/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
 * All Rights Reserved
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/dirent.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/pathconf.h>
#include <sys/utsname.h>
#include <sys/dnlc.h>
#include <sys/acl.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/sdt.h>
#include <sys/list.h>
#include <sys/stat.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>
#include <nfs/nfs4.h>
#include <nfs/nfs4_kprot.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kpm.h>
#include <vm/seg_vn.h>

#include <fs/fs_subr.h>

#include <sys/ddi.h>
#include <sys/int_fmtio.h>

typedef struct {
	nfs4_ga_res_t	*di_garp;
	cred_t		*di_cred;
	hrtime_t	di_time_call;
} dirattr_info_t;

typedef enum nfs4_acl_op {
	NFS4_ACL_GET,
	NFS4_ACL_SET
} nfs4_acl_op_t;

static void	nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *,
			char *, dirattr_info_t *);

static void	nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *,
			nfs4_open_stream_t *, int *, int *, nfs4_close_type_t,
			nfs4_error_t *, int *);
static int	nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *);
static int	nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
			stable_how4 *);
static int	nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *,
			cred_t *, bool_t, struct uio *);
static int	nfs4setattr(vnode_t *, struct vattr *, int, cred_t *,
			vsecattr_t *);
static int	nfs4openattr(vnode_t *, vnode_t **, int, cred_t *);
static int	nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int);
static int	nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *);
static int	nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *);
static int	nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *);
static int	nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *);
static int	nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **,
			cred_t *, int, int, enum createmode4, int);
static int	nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *);
static int	nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *,
			vnode_t *, char *, cred_t *, nfsstat4 *);
static int	nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *,
			vnode_t *, char *, cred_t *, nfsstat4 *);
static int	do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
static void	nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
static int	nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t);
static int	nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static void	nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
			cred_t *);
static int	nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static int	nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static int	nfs4_commit(vnode_t *, offset4, count4, cred_t *);
static void	nfs4_set_mod(vnode_t *);
static void	nfs4_get_commit(vnode_t *);
static void	nfs4_get_commit_range(vnode_t *, u_offset_t, size_t);
static int	nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
static int	nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int);
static int	nfs4_sync_commit(vnode_t *, page_t *, offset3, count3,
			cred_t *);
static void	do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3,
			cred_t *);
static int	nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *,
			hrtime_t, vnode_t *, cred_t *);
static int	nfs4_open_non_reg_file(vnode_t **, int, cred_t *);
static int	nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *);
static void	nfs4_register_lock_locally(vnode_t *, struct flock64 *, int,
			u_offset_t);
static int	nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *);
static int	nfs4_block_and_wait(clock_t *, rnode4_t *);
static cred_t  *state_to_cred(nfs4_open_stream_t *);
static int	vtoname(vnode_t *, char *, ssize_t);
static void	denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *);
static pid_t	lo_to_pid(lock_owner4 *);
static void	nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *,
			cred_t *, nfs4_lock_owner_t *);
static void	push_reinstate(vnode_t *, int, flock64_t *, cred_t *,
			nfs4_lock_owner_t *);
static int	open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **);
static void	nfs4_delmap_callback(struct as *, void *, uint_t);
static void	nfs4_free_delmapcall(nfs4_delmapcall_t *);
static nfs4_delmapcall_t	*nfs4_init_delmapcall();
static int	nfs4_find_and_delete_delmapcall(rnode4_t *, int *);
static int	nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t);
static int	nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *,
			uid_t, gid_t, int);

/*
 * Routines that implement the setting of v4 args for the misc. ops
 */
static void	nfs4args_lock_free(nfs_argop4 *);
static void	nfs4args_lockt_free(nfs_argop4 *);
static void	nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *,
			int, rnode4_t *, cred_t *, bitmap4, int *,
			nfs4_stateid_types_t *);
static void	nfs4args_setattr_free(nfs_argop4 *);
static int	nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4,
			bitmap4);
static void	nfs4args_verify_free(nfs_argop4 *);
static void	nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *,
			WRITE4args **, nfs4_stateid_types_t *);

/*
 * These are the vnode ops functions that implement the vnode interface to
 * the networked file system.  See more comments below at nfs4_vnodeops.
 */
static int	nfs4_open(vnode_t **, int, cred_t *);
static int	nfs4_close(vnode_t *, int, int, offset_t, cred_t *);
static int	nfs4_read(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs4_write(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *);
static int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *);
static int	nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *,
			caller_context_t *);
static int	nfs4_access(vnode_t *, int, int, cred_t *);
static int	nfs4_readlink(vnode_t *, struct uio *, cred_t *);
static int	nfs4_fsync(vnode_t *, int, cred_t *);
static void	nfs4_inactive(vnode_t *, cred_t *);
static int	nfs4_lookup(vnode_t *, char *, vnode_t **,
			struct pathname *, int, vnode_t *, cred_t *);
static int	nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *, int);
static int	nfs4_remove(vnode_t *, char *, cred_t *);
static int	nfs4_link(vnode_t *, vnode_t *, char *, cred_t *);
static int	nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *);
static int	nfs4_mkdir(vnode_t *, char *, struct vattr *,
			vnode_t **, cred_t *);
static int	nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *);
static int	nfs4_symlink(vnode_t *, char *, struct vattr *, char *,
			cred_t *);
static int	nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *);
static int	nfs4_fid(vnode_t *, fid_t *);
static int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
static void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
static int	nfs4_seek(vnode_t *, offset_t, offset_t *);
static int	nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static int	nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *);
static int	nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *,
			size_t, uchar_t, uchar_t, uint_t, cred_t *);
static int	nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t,
			size_t, uchar_t, uchar_t, uint_t, cred_t *);
static int	nfs4_cmp(vnode_t *, vnode_t *);
static int	nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
			struct flk_callback *, cred_t *);
static int	nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t,
			cred_t *, caller_context_t *);
static int	nfs4_realvp(vnode_t *, vnode_t **);
static int	nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t,
			size_t, uint_t, uint_t, uint_t, cred_t *);
static int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *);
static int	nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *);
static void	nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *);
static int	nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
static int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *);
static int	nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *);

/*
 * Used for nfs4_commit_vp() to indicate if we should
 * wait on pending writes.
 */
#define	NFS4_WRITE_NOWAIT	0
#define	NFS4_WRITE_WAIT		1

#define	NFS4_BASE_WAIT_TIME 1	/* 1 second */

/*
 * Error flags used to pass information about certain special errors
 * which need to be handled specially.
 */
#define	NFS_EOF			-98
#define	NFS_VERF_MISMATCH	-97

/*
 * Flags used to differentiate between which operation drove the
 * potential CLOSE OTW. (see nfs4_close_otw_if_necessary)
 */
#define	NFS4_CLOSE_OP		0x1
#define	NFS4_DELMAP_OP		0x2
#define	NFS4_INACTIVE_OP	0x3

#define	ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO))

/* ALIGN64 aligns the given buffer pointer and adjusts the size to 64 bits */
#define	ALIGN64(x, ptr, sz)						\
	x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);		\
	if (x) {							\
		x = sizeof (uint64_t) - (x);				\
		sz -= (x);						\
		ptr += (x);						\
	}
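
/*
 * Illustrative example (not part of the original source): if ptr ends
 * in 0xc and sz is 100, then x = 4, so ptr advances 4 bytes to the
 * next 64-bit boundary and sz shrinks to 96.
 */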

#ifdef DEBUG
int nfs4_client_attr_debug = 0;
int nfs4_client_state_debug = 0;
int nfs4_client_shadow_debug = 0;
int nfs4_client_lock_debug = 0;
int nfs4_seqid_sync = 0;
int nfs4_client_map_debug = 0;
static int nfs4_pageio_debug = 0;
int nfs4_client_inactive_debug = 0;
int nfs4_client_recov_debug = 0;
int nfs4_client_recov_stub_debug = 0;
int nfs4_client_failover_debug = 0;
int nfs4_client_call_debug = 0;
int nfs4_client_lookup_debug = 0;
int nfs4_client_zone_debug = 0;
int nfs4_lost_rqst_debug = 0;
int nfs4_rdattrerr_debug = 0;
int nfs4_open_stream_debug = 0;

int nfs4read_error_inject;

static int nfs4_create_misses = 0;

static int nfs4_readdir_cache_shorts = 0;
static int nfs4_readdir_readahead = 0;

static int nfs4_bio_do_stop = 0;

static int nfs4_lostpage = 0;	/* number of times we lost original page */

int nfs4_mmap_debug = 0;

static int nfs4_pathconf_cache_hits = 0;
static int nfs4_pathconf_cache_misses = 0;

int nfs4close_all_cnt;
int nfs4close_one_debug = 0;
int nfs4close_notw_debug = 0;

int denied_to_flk_debug = 0;
void *lockt_denied_debug;

#endif

/*
 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
 * or NFS4ERR_RESOURCE.
 */
static int confirm_retry_sec = 30;

static int nfs4_lookup_neg_cache = 1;

/*
 * number of pages to read ahead
 * optimized for 100 base-T.
 */
static int nfs4_nra = 4;

static int nfs4_do_symlink_cache = 1;

static int nfs4_pathconf_disable_cache = 0;

/*
 * These are the vnode ops routines which implement the vnode interface to
 * the networked file system.  These routines just take their parameters,
 * make them look networkish by putting the right info into interface structs,
 * and then call the appropriate remote routine(s) to do the work.
 *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
 * we purge the directory cache relative to that vnode.  This way, the
 * user won't get burned by the cache repeatedly.  See <nfs/rnode4.h> for
 * more details on rnode locking.
 */
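
/*
 * Note (editorial, an assumption not stated in this file): the
 * template below only pairs VOPNAME_* names with the handlers declared
 * above; nfs4_vnodeops itself is filled in at module initialization
 * time, presumably via vn_make_ops() on this template.
 */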

struct vnodeops *nfs4_vnodeops;

const fs_operation_def_t nfs4_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs4_open },
	VOPNAME_CLOSE,		{ .vop_close = nfs4_close },
	VOPNAME_READ,		{ .vop_read = nfs4_read },
	VOPNAME_WRITE,		{ .vop_write = nfs4_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = nfs4_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = nfs4_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = nfs4_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = nfs4_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = nfs4_frlock },
	VOPNAME_SPACE,		{ .vop_space = nfs4_space },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = nfs4_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs4_putpage },
	VOPNAME_MAP,		{ .vop_map = nfs4_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = nfs4_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = nfs4_delmap },
	/* no separate nfs4_dump */
	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = nfs4_pageio },
	VOPNAME_DISPOSE,	{ .vop_dispose = nfs4_dispose },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs4_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs4_shrlock },
	NULL,			NULL
};

/*
 * The following are subroutines and definitions to set args or get res
 * for the different nfsv4 ops
 */

void
nfs4args_lookup_free(nfs_argop4 *argop, int arglen)
{
	int i;

	for (i = 0; i < arglen; i++) {
		if (argop[i].argop == OP_LOOKUP)
			kmem_free(
			    argop[i].nfs_argop4_u.oplookup.objname.utf8string_val,
			    argop[i].nfs_argop4_u.oplookup.objname.utf8string_len);
	}
}

static void
nfs4args_lock_free(nfs_argop4 *argop)
{
	locker4 *locker = &argop->nfs_argop4_u.oplock.locker;

	if (locker->new_lock_owner == TRUE) {
		open_to_lock_owner4 *open_owner;

		open_owner = &locker->locker4_u.open_owner;
		if (open_owner->lock_owner.owner_val != NULL) {
			kmem_free(open_owner->lock_owner.owner_val,
			    open_owner->lock_owner.owner_len);
		}
	}
}

static void
nfs4args_lockt_free(nfs_argop4 *argop)
{
	lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner;

	if (lowner->owner_val != NULL) {
		kmem_free(lowner->owner_val, lowner->owner_len);
	}
}
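
/*
 * A minimal calling sketch for the setattr arg helpers below
 * (illustrative only; assumes the caller owns the argop array and
 * initializes stateid-type tracking with nfs4_init_stateid_types()
 * from the client state code):
 *
 *	nfs4_stateid_types_t sid_types;
 *	int error;
 *
 *	nfs4_init_stateid_types(&sid_types);
 *	nfs4args_setattr(&argop[i], vap, NULL, 0, rp, cr,
 *	    supp_attrs, &error, &sid_types);
 *	...
 *	nfs4args_setattr_free(&argop[i]);
 */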

static void
nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags,
		rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error,
		nfs4_stateid_types_t *sid_types)
{
	fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes;
	mntinfo4_t *mi;

	argop->argop = OP_SETATTR;
	/*
	 * The stateid is set to 0 if the client is not modifying the size
	 * and otherwise to whatever nfs4_get_stateid() returns.
	 *
	 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no
	 * state struct could be found for the process/file pair.  We may
	 * want to change this in the future (by OPENing the file).  See
	 * bug # 4474852.
	 */
	if (vap->va_mask & AT_SIZE) {

		ASSERT(rp != NULL);
		mi = VTOMI4(RTOV4(rp));

		argop->nfs_argop4_u.opsetattr.stateid =
		    nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
			OP_SETATTR, sid_types, FALSE);
	} else {
		bzero(&argop->nfs_argop4_u.opsetattr.stateid,
		    sizeof (stateid4));
	}

	*error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp);
	if (*error)
		bzero(attr, sizeof (*attr));
}

static void
nfs4args_setattr_free(nfs_argop4 *argop)
{
	nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes);
}

static int
nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op,
		bitmap4 supp)
{
	fattr4 *attr;
	int error = 0;

	argop->argop = op;
	switch (op) {
	case OP_VERIFY:
		attr = &argop->nfs_argop4_u.opverify.obj_attributes;
		break;
	case OP_NVERIFY:
		attr = &argop->nfs_argop4_u.opnverify.obj_attributes;
		break;
	default:
		return (EINVAL);
	}
	if (!error)
		error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp);
	if (error)
		bzero(attr, sizeof (*attr));
	return (error);
}

static void
nfs4args_verify_free(nfs_argop4 *argop)
{
	switch (argop->argop) {
	case OP_VERIFY:
		nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes);
		break;
	case OP_NVERIFY:
		nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes);
		break;
	default:
		break;
	}
}

static void
nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr,
		WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp)
{
	WRITE4args *wargs = &argop->nfs_argop4_u.opwrite;
	mntinfo4_t *mi = VTOMI4(RTOV4(rp));

	argop->argop = OP_WRITE;
	wargs->stable = stable;
	wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id,
	    mi, OP_WRITE, sid_tp);
	wargs->mblk = NULL;
	*wargs_pp = wargs;
}

void
nfs4args_copen_free(OPEN4cargs *open_args)
{
	if (open_args->owner.owner_val) {
		kmem_free(open_args->owner.owner_val,
		    open_args->owner.owner_len);
	}
	if ((open_args->opentype == OPEN4_CREATE) &&
	    (open_args->mode != EXCLUSIVE4)) {
		nfs4_fattr4_free(&open_args->createhow4_u.createattrs);
	}
}

/*
 * XXX:  This is referenced in modstubs.s
 */
struct vnodeops *
nfs4_getvnodeops(void)
{
	return (nfs4_vnodeops);
}

/*
 * The OPEN operation opens a regular file.
 *
 * ARGSUSED
 */
static int
nfs4_open(vnode_t **vpp, int flag, cred_t *cr)
{
	vnode_t *dvp = NULL;
	rnode4_t *rp, *drp;
	int error;
	int just_been_created;
	char fn[MAXNAMELEN];

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: "));
	if (nfs_zone() != VTOMI4(*vpp)->mi_zone)
		return (EIO);
	rp = VTOR4(*vpp);

	/*
	 * Check to see if opening something besides a regular file;
	 * if so skip the OTW call
	 */
	if ((*vpp)->v_type != VREG) {
		error = nfs4_open_non_reg_file(vpp, flag, cr);
		return (error);
	}

	/*
	 * XXX - would like a check right here to know if the file is
	 * executable or not, so as to skip OTW
	 */

	if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0)
		return (error);

	drp = VTOR4(dvp);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
		return (EINTR);

	if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) {
		nfs_rw_exit(&drp->r_rwlock);
		return (error);
	}

	/*
	 * See if this file has just been CREATEd.
	 * If so, clear the flag and update the dnlc, which was previously
	 * skipped in nfs4_create.
	 * XXX need better serialization on this.
	 * XXX move this into the nfs4open_otw call, after we have
	 * XXX acquired the open owner seqid sync.
	 */
	mutex_enter(&rp->r_statev4_lock);
	if (rp->created_v4) {
		rp->created_v4 = 0;
		mutex_exit(&rp->r_statev4_lock);

		dnlc_update(dvp, fn, *vpp);
		/* This is needed so we don't bump the open ref count */
		just_been_created = 1;
	} else {
		mutex_exit(&rp->r_statev4_lock);
		just_been_created = 0;
	}

	/*
	 * If caller specified O_TRUNC/FTRUNC, then be sure to set
	 * FWRITE (to drive successful setattr(size=0) after open)
	 */
	if (flag & FTRUNC)
		flag |= FWRITE;

	error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0,
	    just_been_created);

	if (!error && !((*vpp)->v_flag & VROOT))
		dnlc_update(dvp, fn, *vpp);

	nfs_rw_exit(&drp->r_rwlock);

	/* release the hold from vtodv */
	VN_RELE(dvp);

	/* exchange the shadow for the master vnode, if needed */

	if (error == 0 && IS_SHADOW(*vpp, rp))
		sv_exchange(vpp);

	return (error);
}

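/*
 * Note (editorial): a saved lost request is replayed later by the
 * recovery framework (see the nfs4_save_lost_rqst reference below);
 * the routine that follows only captures enough OPEN state to let
 * the call be reissued.
 */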
/*
 * See if there's a "lost open" request to be saved and recovered.
 */
static void
nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
	nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp,
	vnode_t *dvp, OPEN4cargs *open_args)
{
	vfs_t *vfsp;
	char *srccfp;

	vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp);

	if (error != ETIMEDOUT && error != EINTR &&
	    !NFS4_FRC_UNMT_ERR(error, vfsp)) {
		lost_rqstp->lr_op = 0;
		return;
	}

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
	    "nfs4open_save_lost_rqst: error %d", error));

	lost_rqstp->lr_op = OP_OPEN;
	/*
	 * The vp (if it is not NULL) and dvp are held and rele'd via
	 * the recovery code.  See nfs4_save_lost_rqst.
	 */
	lost_rqstp->lr_vp = vp;
	lost_rqstp->lr_dvp = dvp;
	lost_rqstp->lr_oop = oop;
	lost_rqstp->lr_osp = NULL;
	lost_rqstp->lr_lop = NULL;
	lost_rqstp->lr_cr = cr;
	lost_rqstp->lr_flk = NULL;
	lost_rqstp->lr_oacc = open_args->share_access;
	lost_rqstp->lr_odeny = open_args->share_deny;
	lost_rqstp->lr_oclaim = open_args->claim;
	if (open_args->claim == CLAIM_DELEGATE_CUR) {
		lost_rqstp->lr_ostateid =
		    open_args->open_claim4_u.delegate_cur_info.delegate_stateid;
		srccfp = open_args->open_claim4_u.delegate_cur_info.cfile;
	} else {
		srccfp = open_args->open_claim4_u.cfile;
	}
	lost_rqstp->lr_ofile.utf8string_len = 0;
	lost_rqstp->lr_ofile.utf8string_val = NULL;
	(void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile);
	lost_rqstp->lr_putfirst = FALSE;
}

/* Layout-compatible with the 64-bit createverf it is copied into below. */
struct nfs4_excl_time {
	uint32 seconds;
	uint32 nseconds;
};

/*
 * The OPEN operation creates and/or opens a regular file
 *
 * ARGSUSED
 */
static int
nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va,
	vnode_t **vpp, cred_t *cr, int create_flag, int open_flag,
	enum createmode4 createmode, int file_just_been_created)
{
	rnode4_t *rp;
	rnode4_t *drp = VTOR4(dvp);
	vnode_t *vp = NULL;
	vnode_t *vpi = *vpp;
	bool_t needrecov = FALSE;

	int doqueue = 1;

	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	int argoplist_size;
	int idx_open, idx_fattr;

	GETFH4res *gf_res = NULL;
	OPEN4res *op_res = NULL;
	nfs4_ga_res_t *garp;
	fattr4 *attr = NULL;
	struct nfs4_excl_time verf;
	bool_t did_excl_setup = FALSE;
	int created_osp;

	OPEN4cargs *open_args;
	nfs4_open_owner_t	*oop = NULL;
	nfs4_open_stream_t	*osp = NULL;
	seqid4 seqid = 0;
	bool_t retry_open = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_lost_rqst_t lost_rqst;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	hrtime_t t;
	int acc = 0;
	cred_t *cred_otw = NULL;	/* cred used to do the RPC call */
	cred_t *ncr = NULL;

	nfs4_sharedfh_t *otw_sfh;
	nfs4_sharedfh_t *orig_sfh;
	int fh_differs = 0;
	int numops, setgid_flag;
	int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1;

	/*
	 * Make sure we properly deal with setting the right gid on
	 * a newly created file to reflect the parent's setgid bit
	 */
	setgid_flag = 0;
	if (create_flag && in_va) {

		/*
		 * If the parent's directory has the setgid bit set
		 * _and_ the client was able to get a valid mapping
		 * for the parent dir's owner_group, we want to
		 * append NVERIFY(owner_group == dva.va_gid) and
		 * SETATTR to the CREATE compound.
		 */
		mutex_enter(&drp->r_statelock);
		if (drp->r_attr.va_mode & VSGID &&
		    drp->r_attr.va_gid != GID_NOBODY) {
			in_va->va_gid = drp->r_attr.va_gid;
			setgid_flag = 1;
		}
		mutex_exit(&drp->r_statelock);
	}

	/*
	 * Normal/non-create compound:
	 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new)
	 *
	 * Open(create) compound no setgid:
	 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) +
	 * RESTOREFH + GETATTR
	 *
	 * Open(create) setgid:
	 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) +
	 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH +
	 * NVERIFY(grp) + SETATTR
	 */
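	/*
	 * Index sketch for the three layouts above (0-based indices
	 * into argop[], derived from the assignments that follow):
	 *
	 *	non-create:	0 PUTFH, 1 OPEN, 2 GETFH, 3 GETATTR(new)
	 *	create:		0 PUTFH, 1 SAVEFH, 2 OPEN, 3 GETFH,
	 *			4 GETATTR(new), 5 RESTOREFH, 6 GETATTR(dvp)
	 *	create+setgid:	0 PUTFH, 1 OPEN, 2 GETFH, 3 GETATTR(new),
	 *			4 SAVEFH, 5 PUTFH(dfh), 6 GETATTR(dvp),
	 *			7 RESTOREFH, 8 NVERIFY, 9 SETATTR
	 */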
	if (setgid_flag) {
		numops = 10;
		idx_open = 1;
		idx_fattr = 3;
	} else if (create_flag) {
		numops = 7;
		idx_open = 2;
		idx_fattr = 4;
	} else {
		numops = 4;
		idx_open = 1;
		idx_fattr = 3;
	}

	args.array_len = numops;
	argoplist_size = numops * sizeof (nfs_argop4);
	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: "
	    "open %s open flag 0x%x cred %p", file_name, open_flag,
	    (void *)cr));

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
	if (create_flag) {
		/*
		 * We are to create a file.  Initialize the passed in vnode
		 * pointer.
		 */
		vpi = NULL;
	} else {
		/*
		 * Check to see if the client owns a read delegation and is
		 * trying to open for write.  If so, then return the delegation
		 * to avoid the server doing a cb_recall and returning DELAY.
		 * NB - we don't use the statev4_lock here because we'd have
		 * to drop the lock anyway and the result would be stale.
		 */
		if ((open_flag & FWRITE) &&
		    VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ)
			(void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN);

		/*
		 * If the file has a delegation, then do an access check up
		 * front.  This avoids having to do an access check later
		 * after we've already done start_op, which could deadlock.
		 */
		if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) {
			if (open_flag & FREAD &&
			    nfs4_access(vpi, VREAD, 0, cr) == 0)
				acc |= VREAD;
			if (open_flag & FWRITE &&
			    nfs4_access(vpi, VWRITE, 0, cr) == 0)
				acc |= VWRITE;
		}
	}

	drp = VTOR4(dvp);

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
	cred_otw = cr;

recov_retry:
	fh_differs = 0;
	nfs4_error_zinit(&e);

	e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state);
	if (e.error) {
		if (ncr != NULL)
			crfree(ncr);
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	args.ctag = TAG_OPEN;
	args.array_len = numops;
	args.array = argop;

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

	/* OPEN: either op 1 or op 2 depending upon create/setgid flags */
	argop[idx_open].argop = OP_COPEN;
	open_args = &argop[idx_open].nfs_argop4_u.opcopen;
	open_args->claim = CLAIM_NULL;

	/* name of file */
	open_args->open_claim4_u.cfile = file_name;
	open_args->owner.owner_len = 0;
	open_args->owner.owner_val = NULL;

	if (create_flag) {
		/* CREATE a file */
		open_args->opentype = OPEN4_CREATE;
		open_args->mode = createmode;
		if (createmode == EXCLUSIVE4) {
			if (did_excl_setup == FALSE) {
				verf.seconds = nfs_atoi(hw_serial);
				if (verf.seconds != 0)
					verf.nseconds = newnum();
				else {
					timestruc_t now;

					gethrestime(&now);
					verf.seconds = now.tv_sec;
					verf.nseconds = now.tv_nsec;
				}
				/*
				 * Since the server will use this value for the
				 * mtime, make sure that it can't overflow. Zero
				 * out the MSB.  The actual value does not matter
				 * here, only its uniqueness.
				 */
				verf.seconds &= INT32_MAX;
				did_excl_setup = TRUE;
			}

			/* Now copy over verifier to OPEN4args. */
			open_args->createhow4_u.createverf = *(uint64_t *)&verf;
		} else {
			int v_error;
			bitmap4 supp_attrs;
			servinfo4_t *svp;

			attr = &open_args->createhow4_u.createattrs;

			svp = drp->r_server;
			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			supp_attrs = svp->sv_supp_attrs;
			nfs_rw_exit(&svp->sv_lock);

			/* GUARDED4 or UNCHECKED4 */
			v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN,
			    supp_attrs);
			if (v_error) {
				bzero(attr, sizeof (*attr));
				nfs4args_copen_free(open_args);
				nfs4_end_op(VTOMI4(dvp), dvp, vpi,
				    &recov_state, FALSE);
				if (ncr != NULL)
					crfree(ncr);
				kmem_free(argop, argoplist_size);
				return (v_error);
			}
		}
	} else {
		/* NO CREATE */
		open_args->opentype = OPEN4_NOCREATE;
	}

	if (recov_state.rs_sp != NULL) {
		mutex_enter(&recov_state.rs_sp->s_lock);
		open_args->owner.clientid = recov_state.rs_sp->clientid;
		mutex_exit(&recov_state.rs_sp->s_lock);
	} else {
		/* XXX should we just fail here? */
		open_args->owner.clientid = 0;
	}
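
	/*
	 * Note (editorial): the clientid chosen above plus the open owner
	 * name copied in below (oo_name) together form the open_owner4
	 * the server uses to sequence this client's OPEN requests.
	 */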
	/*
	 * This increments oop's ref count or creates a temporary
	 * 'just_created' open owner that will become valid when this
	 * OPEN/OPEN_CONFIRM call completes.
	 */
	mutex_enter(&VTOMI4(dvp)->mi_lock);

	/* See if a permanent or just created open owner exists */
	oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp));
	if (!oop) {
		/*
		 * This open owner does not exist so create a temporary
		 * just created one.
		 */
		oop = create_open_owner(cr, VTOMI4(dvp));
		ASSERT(oop != NULL);
	}
	mutex_exit(&VTOMI4(dvp)->mi_lock);

	/* this length never changes, do alloc before seqid sync */
	open_args->owner.owner_len = sizeof (oop->oo_name);
	open_args->owner.owner_val =
	    kmem_alloc(open_args->owner.owner_len, KM_SLEEP);

	e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp));
	if (e.error == EAGAIN) {
		open_owner_rele(oop);
		nfs4args_copen_free(open_args);
		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
		if (ncr != NULL) {
			crfree(ncr);
			ncr = NULL;
		}
		goto recov_retry;
	}

	/* Check to see if we need to do the OTW call */
	if (!create_flag) {
		if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi,
		    file_just_been_created, &e.error, acc, &recov_state)) {

			/*
			 * The OTW open is not necessary.  Either
			 * the open can succeed without it (e.g.
			 * delegation, error == 0) or the open
			 * must fail due to an access failure
			 * (error != 0).  In either case, tidy
			 * up and return.
			 */

			nfs4_end_open_seqid_sync(oop);
			open_owner_rele(oop);
			nfs4args_copen_free(open_args);
			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE);
			if (ncr != NULL)
				crfree(ncr);
			kmem_free(argop, argoplist_size);
			return (e.error);
		}
	}

	bcopy(&oop->oo_name, open_args->owner.owner_val,
	    open_args->owner.owner_len);

	seqid = nfs4_get_open_seqid(oop) + 1;
	open_args->seqid = seqid;
	open_args->share_access = 0;
	if (open_flag & FREAD)
		open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
	if (open_flag & FWRITE)
		open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
	open_args->share_deny = OPEN4_SHARE_DENY_NONE;

	/*
	 * getfh w/sanity check for idx_open/idx_fattr
	 */
	ASSERT((idx_open + 1) == (idx_fattr - 1));
	argop[idx_open + 1].argop = OP_GETFH;

	/* getattr */
	argop[idx_fattr].argop = OP_GETATTR;
	argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	if (setgid_flag) {
		vattr_t	_v;
		servinfo4_t *svp;
		bitmap4	supp_attrs;

		svp = drp->r_server;
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		supp_attrs = svp->sv_supp_attrs;
		nfs_rw_exit(&svp->sv_lock);

		/*
		 * For setgid case, we need to:
		 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
		 */
		argop[4].argop = OP_SAVEFH;

		argop[5].argop = OP_CPUTFH;
		argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

		argop[6].argop = OP_GETATTR;
		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

		argop[7].argop = OP_RESTOREFH;

		/*
		 * nverify
		 */
		_v.va_mask = AT_GID;
		_v.va_gid = in_va->va_gid;
		if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
		    supp_attrs))) {

			/*
			 * setattr
			 *
			 * We _know_ we're not messing with AT_SIZE or
			 * AT_XTIME, so no need for stateid or flags.
			 * Also we specify NULL rp since we're only
			 * interested in setting owner_group attributes.
			 */
			nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr,
			    supp_attrs, &e.error, 0);
			if (e.error)
				nfs4args_verify_free(&argop[8]);
		}

		if (e.error) {
			/*
			 * XXX - Revisit the last argument to nfs4_end_op()
			 *	 once 5020486 is fixed.
			 */
			nfs4_end_open_seqid_sync(oop);
			open_owner_rele(oop);
			nfs4args_copen_free(open_args);
			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
			if (ncr != NULL)
				crfree(ncr);
			kmem_free(argop, argoplist_size);
			return (e.error);
		}
	} else if (create_flag) {
		/*
		 * For the non-setgid create case, we need to:
		 * 1:savefh(dir) 5:restorefh(dir) 6:getattr(dir)
		 */
		argop[1].argop = OP_SAVEFH;

		argop[5].argop = OP_RESTOREFH;

		argop[6].argop = OP_GETATTR;
		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
	}

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4open_otw: %s call, nm %s, rp %s",
	    needrecov ? "recov" : "first", file_name,
	    rnode4info(VTOR4(dvp))));

	t = gethrtime();

	rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e);

	if (!e.error && nfs4_need_to_bump_seqid(&res))
		nfs4_set_open_seqid(seqid, oop, args.ctag);

	needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp);

	if (e.error || needrecov) {
		bool_t abort = FALSE;

		if (needrecov) {
			nfs4_bseqid_entry_t *bsep = NULL;

			nfs4open_save_lost_rqst(e.error, &lost_rqst, oop,
			    cred_otw, vpi, dvp, open_args);

			if (!e.error && res.status == NFS4ERR_BAD_SEQID) {
				bsep = nfs4_create_bseqid_entry(oop, NULL,
				    vpi, 0, args.ctag, open_args->seqid);
				num_bseqid_retry--;
			}

			abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi,
			    NULL, lost_rqst.lr_op == OP_OPEN ?
			    &lost_rqst : NULL, OP_OPEN, bsep);

			if (bsep)
				kmem_free(bsep, sizeof (*bsep));
			/* give up if we keep getting BAD_SEQID */
			if (num_bseqid_retry == 0)
				abort = TRUE;
			if (abort == TRUE && e.error == 0)
				e.error = geterrno4(res.status);
		}
		nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
		nfs4args_copen_free(open_args);
		if (setgid_flag) {
			nfs4args_verify_free(&argop[8]);
			nfs4args_setattr_free(&argop[9]);
		}
		if (!e.error)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		if (ncr != NULL) {
			crfree(ncr);
			ncr = NULL;
		}
		if (!needrecov || abort == TRUE || e.error == EINTR ||
		    NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) {
			kmem_free(argop, argoplist_size);
			return (e.error);
		}
		goto recov_retry;
	}
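
	/*
	 * Past this point the compound was transmitted and no recovery
	 * is pending; what remains is NFS-level status handling and
	 * decoding of the OPEN/GETFH/GETATTR results.
	 */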
	/*
	 * Will check and update lease after checking the rflag for
	 * OPEN_CONFIRM in the successful OPEN call.
	 */
	if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {

		/*
		 * XXX what if we're crossing mount points from server1:/drp
		 * to server2:/drp/rp.
		 */

		/* Signal our end of use of the open seqid */
		nfs4_end_open_seqid_sync(oop);

		/*
		 * This will destroy the open owner if it was just created,
		 * and no one else has put a reference on it.
		 */
		open_owner_rele(oop);
		if (create_flag && (createmode != EXCLUSIVE4) &&
		    res.status == NFS4ERR_BADOWNER)
			nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);

		e.error = geterrno4(res.status);
		nfs4args_copen_free(open_args);
		if (setgid_flag) {
			nfs4args_verify_free(&argop[8]);
			nfs4args_setattr_free(&argop[9]);
		}
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
		/*
		 * If the reply is NFS4ERR_ACCESS, it may be because
		 * we are root (no root net access).  If the real uid
		 * is not root, then retry with the real uid instead.
		 */
		if (ncr != NULL) {
			crfree(ncr);
			ncr = NULL;
		}
		if (res.status == NFS4ERR_ACCESS &&
		    (ncr = crnetadjust(cred_otw)) != NULL) {
			cred_otw = ncr;
			goto recov_retry;
		}
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	resop = &res.array[idx_open];	/* open res */
	op_res = &resop->nfs_resop4_u.opopen;

#ifdef DEBUG
	/*
	 * verify attrset bitmap
	 */
	if (create_flag &&
	    (createmode == UNCHECKED4 || createmode == GUARDED4)) {
		/* make sure attrset returned is what we asked for */
		/* XXX Ignore this 'error' for now */
		if (attr->attrmask != op_res->attrset)
			/* EMPTY */;
	}
#endif

	if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) {
		mutex_enter(&VTOMI4(dvp)->mi_lock);
		VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK;
		mutex_exit(&VTOMI4(dvp)->mi_lock);
	}

	resop = &res.array[idx_open + 1];	/* getfh res */
	gf_res = &resop->nfs_resop4_u.opgetfh;

	otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));

	/*
	 * The open stateid has been updated on the server but not
	 * on the client yet.  There is a path: makenfs4node->nfs4_attr_cache->
	 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW
	 * WRITE call.  That, however, will use the old stateid, so go ahead
	 * and update the open stateid now, before any call to makenfs4node.
	 */
	if (vpi) {
		nfs4_open_stream_t	*tmp_osp;
		rnode4_t		*tmp_rp = VTOR4(vpi);

		tmp_osp = find_open_stream(oop, tmp_rp);
		if (tmp_osp) {
			tmp_osp->open_stateid = op_res->stateid;
			mutex_exit(&tmp_osp->os_sync_lock);
			open_stream_rele(tmp_osp, tmp_rp);
		}

		/*
		 * We must determine if the file handle given by the otw open
		 * is the same as the file handle which was passed in with
		 * *vpp.  This case can be reached if the file we are trying
		 * to open has been removed and another file has been created
		 * having the same file name.  The passed in vnode is released
		 * later.
		 */
		orig_sfh = VTOR4(vpi)->r_fh;
		fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh);
	}

	garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res;
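
	/*
	 * Note (editorial): makenfs4node() below returns a held vnode;
	 * every error path past that point must drop the hold with
	 * VN_RELE(), as the cleanup code does.
	 */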
	if (create_flag || fh_differs) {
		int rnode_err = 0;

		vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr,
		    dvp, fn_get(VTOSV(dvp)->sv_name, file_name));

		if (e.error)
			PURGE_ATTRCACHE4(vp);
		/*
		 * For the newly created vp case, make sure the rnode
		 * isn't bad before using it.
		 */
		mutex_enter(&(VTOR4(vp))->r_statelock);
		if (VTOR4(vp)->r_flags & R4RECOVERR)
			rnode_err = EIO;
		mutex_exit(&(VTOR4(vp))->r_statelock);

		if (rnode_err) {
			nfs4_end_open_seqid_sync(oop);
			nfs4args_copen_free(open_args);
			if (setgid_flag) {
				nfs4args_verify_free(&argop[8]);
				nfs4args_setattr_free(&argop[9]);
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
			    needrecov);
			open_owner_rele(oop);
			VN_RELE(vp);
			if (ncr != NULL)
				crfree(ncr);
			sfh4_rele(&otw_sfh);
			kmem_free(argop, argoplist_size);
			return (EIO);
		}
	} else {
		vp = vpi;
	}
	sfh4_rele(&otw_sfh);

	/*
	 * It seems odd to get a full set of attrs and then not update
	 * the object's attrcache in the non-create case.  Create case uses
	 * the attrs since makenfs4node checks to see if the attrs need to
	 * be updated (and then updates them).  The non-create case should
	 * update attrs also.
	 */
	if (! create_flag && ! fh_differs && !e.error) {
		nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
	}

	nfs4_error_zinit(&e);
	if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
		/* This does not do recovery for vp explicitly. */
		nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE,
		    &retry_open, oop, FALSE, &e, &num_bseqid_retry);

		if (e.error || e.stat) {
			nfs4_end_open_seqid_sync(oop);
			nfs4args_copen_free(open_args);
			if (setgid_flag) {
				nfs4args_verify_free(&argop[8]);
				nfs4args_setattr_free(&argop[9]);
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
			    needrecov);
			open_owner_rele(oop);
			if (create_flag || fh_differs) {
				/* rele the makenfs4node */
				VN_RELE(vp);
			}
			if (ncr != NULL) {
				crfree(ncr);
				ncr = NULL;
			}
			if (retry_open == TRUE) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "nfs4open_otw: retry the open since OPEN "
				    "CONFIRM failed with error %d stat %d",
				    e.error, e.stat));
				if (create_flag && createmode == GUARDED4) {
					NFS4_DEBUG(nfs4_client_recov_debug,
					    (CE_NOTE, "nfs4open_otw: switch "
					    "createmode from GUARDED4 to "
					    "UNCHECKED4"));
					createmode = UNCHECKED4;
				}
				goto recov_retry;
			}
			if (!e.error) {
				if (create_flag && (createmode != EXCLUSIVE4) &&
				    e.stat == NFS4ERR_BADOWNER)
					nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);

				e.error = geterrno4(e.stat);
			}
			kmem_free(argop, argoplist_size);
			return (e.error);
		}
	}

	rp = VTOR4(vp);

	mutex_enter(&rp->r_statev4_lock);
	if (create_flag)
		rp->created_v4 = 1;
	mutex_exit(&rp->r_statev4_lock);

	mutex_enter(&oop->oo_lock);
	/* Doesn't matter if 'oo_just_created' was already set; overwrite it */
	oop->oo_just_created = NFS4_PERM_CREATED;
	if (oop->oo_cred_otw)
		crfree(oop->oo_cred_otw);
	oop->oo_cred_otw = cred_otw;
	crhold(oop->oo_cred_otw);
	mutex_exit(&oop->oo_lock);
	/* returns with 'os_sync_lock' held */
	osp = find_or_create_open_stream(oop, rp, &created_osp);
	if (!osp) {
		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
		    "nfs4open_otw: failed to create an open stream"));
		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: "
		    "signal our end of use of the open seqid"));

		nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
		nfs4args_copen_free(open_args);
		if (setgid_flag) {
			nfs4args_verify_free(&argop[8]);
			nfs4args_setattr_free(&argop[9]);
		}
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
		if (create_flag || fh_differs)
			VN_RELE(vp);
		if (ncr != NULL)
			crfree(ncr);

		kmem_free(argop, argoplist_size);
		return (EINVAL);

	}

	osp->open_stateid = op_res->stateid;

	if (open_flag & FREAD)
		osp->os_share_acc_read++;
	if (open_flag & FWRITE)
		osp->os_share_acc_write++;
	osp->os_share_deny_none++;

	/*
	 * Need to reset this bitfield for the possible case where we were
	 * going to OTW CLOSE the file, got a non-recoverable error, and before
	 * we could retry the CLOSE, OPENed the file again.
	 */
	ASSERT(osp->os_open_owner->oo_seqid_inuse);
	osp->os_final_close = 0;
	osp->os_force_close = 0;
#ifdef DEBUG
	if (osp->os_failed_reopen)
		NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:"
		    " clearing os_failed_reopen for osp %p, cr %p, rp %s",
		    (void *)osp, (void *)cr, rnode4info(rp)));
#endif
	osp->os_failed_reopen = 0;

	mutex_exit(&osp->os_sync_lock);

	nfs4_end_open_seqid_sync(oop);

	if (created_osp && recov_state.rs_sp != NULL) {
		mutex_enter(&recov_state.rs_sp->s_lock);
		nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp));
		mutex_exit(&recov_state.rs_sp->s_lock);
	}

	/* get rid of our reference to find oop */
	open_owner_rele(oop);

	open_stream_rele(osp, rp);

	/* accept delegation, if any */
	nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw);

	nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);

	if (createmode == EXCLUSIVE4 &&
	    (in_va->va_mask & ~(AT_GID | AT_SIZE))) {
		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:"
		    " EXCLUSIVE4: sending a SETATTR"));
		/*
		 * If doing an exclusive create, then generate
		 * a SETATTR to set the initial attributes.
		 * Try to set the mtime and the atime to the
		 * server's current time.  It is somewhat
		 * expected that these fields will be used to
		 * store the exclusive create cookie.  If not,
		 * server implementors will need to know that
		 * a SETATTR will follow an exclusive create
		 * and the cookie should be destroyed if
		 * appropriate.
		 *
		 * The AT_GID and AT_SIZE bits are turned off
		 * so that the SETATTR request will not attempt
		 * to process these.  The gid will be set
		 * separately if appropriate.  The size is turned
		 * off because it is assumed that a new file will
		 * be created empty and if the file wasn't empty,
		 * then the exclusive create will have failed
		 * because the file must have existed already.
		 * Therefore, no truncate operation is needed.
		 */
		in_va->va_mask &= ~(AT_GID | AT_SIZE);
		in_va->va_mask |= (AT_MTIME | AT_ATIME);
		e.error = nfs4setattr(vp, in_va, 0, cr, NULL);
		if (e.error) {
			/*
			 * Couldn't correct the attributes of
			 * the newly created file and the
			 * attributes are wrong.  Remove the
			 * file and return an error to the
			 * application.
			 */
			/* XXX will this take care of client state ? */
			NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
			    "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:"
			    " remove file", e.error));
			VN_RELE(vp);
			(void) nfs4_remove(dvp, file_name, cr);
			/*
			 * Since we've rele'd the vnode and removed
			 * the file we now need to return the error.
			 * At this point we don't want to update the
			 * dircaches, call nfs4_waitfor_purge_complete
			 * or set vpp to vp so we need to skip these
			 * as well.
			 */
			goto skip_update_dircaches;
		}
	}

	/*
	 * If we created or found the correct vnode, due to create_flag or
	 * fh_differs being set, then update directory cache attribute, readdir
	 * and dnlc caches.
	 */
	if (create_flag || fh_differs) {
		dirattr_info_t dinfo, *dinfop;

		/*
		 * Make sure getattr succeeded before using results.
		 * note: op 7 (res.array[6]) is getattr(dir) for both
		 * flavors of open(create).
		 */
		if (create_flag && res.status == NFS4_OK) {
			dinfo.di_time_call = t;
			dinfo.di_cred = cr;
			dinfo.di_garp =
				&res.array[6].nfs_resop4_u.opgetattr.ga_res;
			dinfop = &dinfo;
		} else {
			dinfop = NULL;
		}

		nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name,
		    dinfop);
	}

	/*
	 * If the page cache for this file was flushed from actions
	 * above, it was done asynchronously and if that is true,
	 * there is a need to wait here for it to complete.  This must
	 * be done outside of start_fop/end_fop.
	 */
	(void) nfs4_waitfor_purge_complete(vp);

	/*
	 * It is implicit that we are in the open case (create_flag == 0) since
	 * fh_differs can only be set to a non-zero value in the open case.
	 */
	if (fh_differs != 0 && vpi != NULL)
		VN_RELE(vpi);

	/*
	 * Be sure to set *vpp to the correct value before returning.
	 */
	*vpp = vp;

skip_update_dircaches:

	nfs4args_copen_free(open_args);
	if (setgid_flag) {
		nfs4args_verify_free(&argop[8]);
		nfs4args_setattr_free(&argop[9]);
	}
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	if (ncr)
		crfree(ncr);
	kmem_free(argop, argoplist_size);
	return (e.error);
}

/*
 * Reopen an open instance.  cf. nfs4open_otw().
 *
 * Errors are returned by the nfs4_error_t parameter.
 * - ep->error contains an errno value or zero.
 * - if it is zero, ep->stat is set to an NFS status code, if any.
 *   If the file could not be reopened, but the caller should continue, the
 *   file is marked dead and no error values are returned.  If the caller
 *   should stop recovering open files and start over, either the ep->error
 *   value or ep->stat will indicate an error (either something that requires
 *   recovery or EAGAIN).  Note that some recovery (e.g., expired volatile
 *   filehandles) may be handled silently by this routine.
 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state
 *   will be started, so the caller should not do it.
 *
 * Gotos:
 * - kill_file : reopen failed in such a fashion to constitute marking the
 *   file dead and setting the open stream's 'os_failed_reopen' as 1.  This
 *   is for cases where recovery is not possible.
 * - failed_reopen : same as above, except that the file has already been
 *   marked dead, so no need to do it again.
 * - bailout : reopen failed but we are able to recover and retry the reopen -
 *   either within this function immediately or via the calling function.
 */

void
nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep,
	    open_claim_type4 claim, bool_t frc_use_claim_previous,
	    bool_t is_recov)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 argop[4];
	nfs_resop4 *resop;
	OPEN4res *op_res = NULL;
	OPEN4cargs *open_args;
	GETFH4res *gf_res;
	rnode4_t *rp = VTOR4(vp);
	int doqueue = 1;
	cred_t *cr = NULL, *cred_otw = NULL;
	nfs4_open_owner_t *oop = NULL;
	seqid4 seqid;
	nfs4_ga_res_t *garp;
	char fn[MAXNAMELEN];
	nfs4_recov_state_t recov = {NULL, 0};
	nfs4_lost_rqst_t lost_rqst;
	mntinfo4_t *mi = VTOMI4(vp);
	bool_t abort;
	char *failed_msg = "";
	int fh_different;
	hrtime_t t;
	nfs4_bseqid_entry_t *bsep = NULL;

	ASSERT(nfs4_consistent_type(vp));
	ASSERT(nfs_zone() == mi->mi_zone);

	nfs4_error_zinit(ep);

	/* this is the cred used to find the open owner */
	cr = state_to_cred(osp);
	if (cr == NULL) {
		failed_msg = "Couldn't reopen: no cred";
		goto kill_file;
	}
	/* use this cred for OTW operations */
	cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner);

top:
	nfs4_error_zinit(ep);

	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
		/* File system has been unmounted, quit */
		ep->error = EIO;
		failed_msg = "Couldn't reopen: file system has been unmounted";
		goto kill_file;
	}

	oop = osp->os_open_owner;

	ASSERT(oop != NULL);
	if (oop == NULL) {	/* be defensive in non-DEBUG */
		failed_msg = "can't reopen: no open owner";
		goto kill_file;
	}
	open_owner_hold(oop);

	ep->error = nfs4_start_open_seqid_sync(oop, mi);
	if (ep->error) {
		open_owner_rele(oop);
		oop = NULL;
		goto bailout;
	}

	/*
	 * If the rnode has a delegation and the delegation has been
	 * recovered and the server didn't request a recall and the caller
	 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during
	 * recovery) and the rnode hasn't been marked dead, then install
	 * the delegation stateid in the open stream.  Otherwise, proceed
	 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN.
	 */
	mutex_enter(&rp->r_statev4_lock);
	if (rp->r_deleg_type != OPEN_DELEGATE_NONE &&
	    !rp->r_deleg_return_pending &&
	    (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) &&
	    !rp->r_deleg_needs_recall &&
	    claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous &&
	    !(rp->r_flags & R4RECOVERR)) {
		mutex_enter(&osp->os_sync_lock);
		osp->os_delegation = 1;
		osp->open_stateid = rp->r_deleg_stateid;
		mutex_exit(&osp->os_sync_lock);
		mutex_exit(&rp->r_statev4_lock);
		goto bailout;
	}
	mutex_exit(&rp->r_statev4_lock);
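
	/*
	 * Note (editorial): if the delegation stateid was installed above
	 * we already bailed out; from here on an actual OTW OPEN compound
	 * (PUTFH + OPEN + GETFH + GETATTR) is built and sent to
	 * re-establish the open state.
	 */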
	/*
	 * If the file failed recovery, just quit.  This failure need not
	 * affect other reopens, so don't return an error.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERR) {
		mutex_exit(&rp->r_statelock);
		ep->error = 0;
		goto failed_reopen;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * argop is empty here
	 *
	 * PUTFH, OPEN, GETFH, GETATTR
	 */
	args.ctag = TAG_REOPEN;
	args.array_len = 4;
	args.array = argop;

	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
	    "nfs4_reopen: file is type %d, id %s",
	    vp->v_type, rnode4info(VTOR4(vp))));

	argop[0].argop = OP_CPUTFH;

	if (claim != CLAIM_PREVIOUS) {
		/*
		 * if this is a file mount then
		 * use the mntinfo parentfh
		 */
		argop[0].nfs_argop4_u.opcputfh.sfh =
			(vp->v_flag & VROOT) ? mi->mi_srvparentfh :
				VTOSV(vp)->sv_dfh;
	} else {
		/* putfh fh to reopen */
		argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
	}
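
	/*
	 * Claim handling sketch (from the cases below): CLAIM_NULL
	 * re-opens by (directory fh, name); CLAIM_PREVIOUS reclaims by
	 * filehandle using the saved delegation type; CLAIM_DELEGATE_CUR
	 * converts a delegation into a regular open and must carry both
	 * the delegation stateid and the file name.
	 */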
1799 rp->r_deleg_type : 1800 rp->r_deleg_needs_recovery; 1801 mutex_exit(&rp->r_statev4_lock); 1802 1803 } else if (claim == CLAIM_DELEGATE_CUR) { 1804 1805 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1806 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1807 "failed for vp 0x%p for CLAIM_DELEGATE_CUR " 1808 "with %m", (void *)vp); 1809 failed_msg = "Couldn't reopen: vtoname failed for " 1810 "CLAIM_DELEGATE_CUR"; 1811 /* nothing allocated yet */ 1812 goto kill_file; 1813 } 1814 1815 mutex_enter(&rp->r_statev4_lock); 1816 open_args->open_claim4_u.delegate_cur_info.delegate_stateid = 1817 rp->r_deleg_stateid; 1818 mutex_exit(&rp->r_statev4_lock); 1819 1820 open_args->open_claim4_u.delegate_cur_info.cfile = fn; 1821 } 1822 open_args->opentype = OPEN4_NOCREATE; 1823 open_args->owner.clientid = mi2clientid(mi); 1824 open_args->owner.owner_len = sizeof (oop->oo_name); 1825 open_args->owner.owner_val = 1826 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1827 bcopy(&oop->oo_name, open_args->owner.owner_val, 1828 open_args->owner.owner_len); 1829 open_args->share_access = 0; 1830 open_args->share_deny = 0; 1831 1832 mutex_enter(&osp->os_sync_lock); 1833 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp " 1834 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: " 1835 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ", 1836 (void *)osp, (void *)rp, osp->os_share_acc_read, 1837 osp->os_share_acc_write, osp->os_open_ref_count, 1838 osp->os_mmap_read, osp->os_mmap_write, claim)); 1839 1840 if (osp->os_share_acc_read || osp->os_mmap_read) 1841 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1842 if (osp->os_share_acc_write || osp->os_mmap_write) 1843 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1844 if (osp->os_share_deny_read) 1845 open_args->share_deny |= OPEN4_SHARE_DENY_READ; 1846 if (osp->os_share_deny_write) 1847 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE; 1848 mutex_exit(&osp->os_sync_lock); 1849 1850 seqid = nfs4_get_open_seqid(oop) + 1; 1851 open_args->seqid = seqid; 1852 1853 /* Construct the getfh part of the compound */ 1854 argop[2].argop = OP_GETFH; 1855 1856 /* Construct the getattr part of the compound */ 1857 argop[3].argop = OP_GETATTR; 1858 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1859 argop[3].nfs_argop4_u.opgetattr.mi = mi; 1860 1861 t = gethrtime(); 1862 1863 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 1864 1865 if (ep->error) { 1866 if (!is_recov && !frc_use_claim_previous && 1867 (ep->error == EINTR || ep->error == ETIMEDOUT || 1868 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) { 1869 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop, 1870 cred_otw, vp, NULL, open_args); 1871 abort = nfs4_start_recovery(ep, 1872 VTOMI4(vp), vp, NULL, NULL, 1873 lost_rqst.lr_op == OP_OPEN ? 
1874 &lost_rqst : NULL, OP_OPEN, NULL); 1875 nfs4args_copen_free(open_args); 1876 goto bailout; 1877 } 1878 1879 nfs4args_copen_free(open_args); 1880 1881 if (ep->error == EACCES && cred_otw != cr) { 1882 crfree(cred_otw); 1883 cred_otw = cr; 1884 crhold(cred_otw); 1885 nfs4_end_open_seqid_sync(oop); 1886 open_owner_rele(oop); 1887 oop = NULL; 1888 goto top; 1889 } 1890 if (ep->error == ETIMEDOUT) 1891 goto bailout; 1892 failed_msg = "Couldn't reopen: rpc error"; 1893 goto kill_file; 1894 } 1895 1896 if (nfs4_need_to_bump_seqid(&res)) 1897 nfs4_set_open_seqid(seqid, oop, args.ctag); 1898 1899 switch (res.status) { 1900 case NFS4_OK: 1901 if (recov.rs_flags & NFS4_RS_DELAY_MSG) { 1902 mutex_enter(&rp->r_statelock); 1903 rp->r_delay_interval = 0; 1904 mutex_exit(&rp->r_statelock); 1905 } 1906 break; 1907 case NFS4ERR_BAD_SEQID: 1908 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0, 1909 args.ctag, open_args->seqid); 1910 1911 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, 1912 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst : 1913 NULL, OP_OPEN, bsep); 1914 1915 nfs4args_copen_free(open_args); 1916 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1917 nfs4_end_open_seqid_sync(oop); 1918 open_owner_rele(oop); 1919 oop = NULL; 1920 kmem_free(bsep, sizeof (*bsep)); 1921 1922 goto kill_file; 1923 case NFS4ERR_NO_GRACE: 1924 nfs4args_copen_free(open_args); 1925 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1926 nfs4_end_open_seqid_sync(oop); 1927 open_owner_rele(oop); 1928 oop = NULL; 1929 if (claim == CLAIM_PREVIOUS) { 1930 /* 1931 * Retry as a plain open. We don't need to worry about 1932 * checking the changeinfo: it is acceptable for a 1933 * client to re-open a file and continue processing 1934 * (in the absence of locks). 1935 */ 1936 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1937 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; " 1938 "will retry as CLAIM_NULL")); 1939 claim = CLAIM_NULL; 1940 nfs4_mi_kstat_inc_no_grace(mi); 1941 goto top; 1942 } 1943 failed_msg = 1944 "Couldn't reopen: tried reclaim outside grace period. "; 1945 goto kill_file; 1946 case NFS4ERR_GRACE: 1947 nfs4_set_grace_wait(mi); 1948 nfs4args_copen_free(open_args); 1949 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1950 nfs4_end_open_seqid_sync(oop); 1951 open_owner_rele(oop); 1952 oop = NULL; 1953 ep->error = nfs4_wait_for_grace(mi, &recov); 1954 if (ep->error != 0) 1955 goto bailout; 1956 goto top; 1957 case NFS4ERR_DELAY: 1958 nfs4_set_delay_wait(vp); 1959 nfs4args_copen_free(open_args); 1960 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1961 nfs4_end_open_seqid_sync(oop); 1962 open_owner_rele(oop); 1963 oop = NULL; 1964 ep->error = nfs4_wait_for_delay(vp, &recov); 1965 nfs4_mi_kstat_inc_delay(mi); 1966 if (ep->error != 0) 1967 goto bailout; 1968 goto top; 1969 case NFS4ERR_FHEXPIRED: 1970 /* recover filehandle and retry */ 1971 abort = nfs4_start_recovery(ep, 1972 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL); 1973 nfs4args_copen_free(open_args); 1974 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1975 nfs4_end_open_seqid_sync(oop); 1976 open_owner_rele(oop); 1977 oop = NULL; 1978 if (abort == FALSE) 1979 goto top; 1980 failed_msg = "Couldn't reopen: recovery aborted"; 1981 goto kill_file; 1982 case NFS4ERR_RESOURCE: 1983 case NFS4ERR_STALE_CLIENTID: 1984 case NFS4ERR_WRONGSEC: 1985 case NFS4ERR_EXPIRED: 1986 /* 1987 * Do not mark the file dead and let the calling 1988 * function initiate recovery. 
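 * The error is left in *ep; the bailout path below only drops the open
 * seqid sync, the open owner hold, and the credentials.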
1989 */
1990 nfs4args_copen_free(open_args);
1991 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1992 nfs4_end_open_seqid_sync(oop);
1993 open_owner_rele(oop);
1994 oop = NULL;
1995 goto bailout;
1996 case NFS4ERR_ACCESS:
1997 if (cred_otw != cr) {
1998 crfree(cred_otw);
1999 cred_otw = cr;
2000 crhold(cred_otw);
2001 nfs4args_copen_free(open_args);
2002 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2003 nfs4_end_open_seqid_sync(oop);
2004 open_owner_rele(oop);
2005 oop = NULL;
2006 goto top;
2007 }
2008 /* fall through */
2009 default:
2010 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2011 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s",
2012 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv,
2013 rnode4info(VTOR4(vp))));
2014 failed_msg = "Couldn't reopen: NFSv4 error";
2015 nfs4args_copen_free(open_args);
2016 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2017 goto kill_file;
2018 }
2019
2020 resop = &res.array[1]; /* open res */
2021 op_res = &resop->nfs_resop4_u.opopen;
2022
2023 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2024
2025 /*
2026 * Check if the path we reopened really is the same
2027 * file. We could end up in a situation where the file
2028 * was removed and a new file created with the same name.
2029 */
2030 resop = &res.array[2];
2031 gf_res = &resop->nfs_resop4_u.opgetfh;
2032 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2033 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2034 if (fh_different) {
2035 if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2036 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2037 /* Oops, we don't have the same file */
2038 if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2039 failed_msg = "Couldn't reopen: Persistent "
2040 "file handle changed";
2041 else
2042 failed_msg = "Couldn't reopen: Volatile "
2043 "(no expire on open) file handle changed";
2044
2045 nfs4args_copen_free(open_args);
2046 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2047 nfs_rw_exit(&mi->mi_fh_lock);
2048 goto kill_file;
2049
2050 } else {
2051 /*
2052 * We have volatile file handles that don't compare.
2053 * If the fids are the same then we assume that the
2054 * file handle expired but the rnode still refers to
2055 * the same file object.
2056 *
2057 * First check whether we have fids or not.
2058 * If we don't, we have a dumb server, so we will
2059 * just assume everything is ok for now.
2060 */
2061 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID &&
2062 rp->r_attr.va_mask & AT_NODEID &&
2063 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) {
2064 /*
2065 * We have fids, but they don't
2066 * compare. So kill the file.
2067 */
2068 failed_msg =
2069 "Couldn't reopen: file handle changed"
2070 " due to mismatched fids";
2071 nfs4args_copen_free(open_args);
2072 (void) xdr_free(xdr_COMPOUND4res_clnt,
2073 (caddr_t)&res);
2074 nfs_rw_exit(&mi->mi_fh_lock);
2075 goto kill_file;
2076 } else {
2077 /*
2078 * We have volatile file handles that refer
2079 * to the same file (at least they have the
2080 * same fid) or we don't have fids so we
2081 * can't tell. We'll be a kind and accepting
2082 * client so we'll update the rnode's file
2083 * handle with the otw handle.
2084 *
2085 * We need to drop mi->mi_fh_lock since
2086 * sfh4_update acquires it. Since there is
2087 * only one recovery thread there is no
2088 * race.
2089 */ 2090 nfs_rw_exit(&mi->mi_fh_lock); 2091 sfh4_update(rp->r_fh, &gf_res->object); 2092 } 2093 } 2094 } else { 2095 nfs_rw_exit(&mi->mi_fh_lock); 2096 } 2097 2098 ASSERT(nfs4_consistent_type(vp)); 2099 2100 /* 2101 * If the server wanted an OPEN_CONFIRM but that fails, just start 2102 * over. Presumably if there is a persistent error it will show up 2103 * when we resend the OPEN. 2104 */ 2105 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 2106 bool_t retry_open = FALSE; 2107 2108 nfs4open_confirm(vp, &seqid, &op_res->stateid, 2109 cred_otw, is_recov, &retry_open, 2110 oop, FALSE, ep, NULL); 2111 if (ep->error || ep->stat) { 2112 nfs4args_copen_free(open_args); 2113 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2114 nfs4_end_open_seqid_sync(oop); 2115 open_owner_rele(oop); 2116 oop = NULL; 2117 goto top; 2118 } 2119 } 2120 2121 mutex_enter(&osp->os_sync_lock); 2122 osp->open_stateid = op_res->stateid; 2123 osp->os_delegation = 0; 2124 /* 2125 * Need to reset this bitfield for the possible case where we were 2126 * going to OTW CLOSE the file, got a non-recoverable error, and before 2127 * we could retry the CLOSE, OPENed the file again. 2128 */ 2129 ASSERT(osp->os_open_owner->oo_seqid_inuse); 2130 osp->os_final_close = 0; 2131 osp->os_force_close = 0; 2132 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS) 2133 osp->os_dc_openacc = open_args->share_access; 2134 mutex_exit(&osp->os_sync_lock); 2135 2136 nfs4_end_open_seqid_sync(oop); 2137 2138 /* accept delegation, if any */ 2139 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw); 2140 2141 nfs4args_copen_free(open_args); 2142 2143 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 2144 2145 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2146 2147 ASSERT(nfs4_consistent_type(vp)); 2148 2149 open_owner_rele(oop); 2150 crfree(cr); 2151 crfree(cred_otw); 2152 return; 2153 2154 kill_file: 2155 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat); 2156 failed_reopen: 2157 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 2158 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s", 2159 (void *)osp, (void *)cr, rnode4info(rp))); 2160 mutex_enter(&osp->os_sync_lock); 2161 osp->os_failed_reopen = 1; 2162 mutex_exit(&osp->os_sync_lock); 2163 bailout: 2164 if (oop != NULL) { 2165 nfs4_end_open_seqid_sync(oop); 2166 open_owner_rele(oop); 2167 } 2168 if (cr != NULL) 2169 crfree(cr); 2170 if (cred_otw != NULL) 2171 crfree(cred_otw); 2172 } 2173 2174 /* for . and .. OPENs */ 2175 /* ARGSUSED */ 2176 static int 2177 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr) 2178 { 2179 rnode4_t *rp; 2180 nfs4_ga_res_t gar; 2181 2182 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone); 2183 2184 /* 2185 * If close-to-open consistency checking is turned off or 2186 * if there is no cached data, we can avoid 2187 * the over the wire getattr. Otherwise, force a 2188 * call to the server to get fresh attributes and to 2189 * check caches. This is required for close-to-open 2190 * consistency. 
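 * (Close-to-open consistency: an open must observe the file's data as
 * of the most recent close by any client, so the GETATTR here lets the
 * client detect a change and invalidate stale cached data.)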
2191 */
2192 rp = VTOR4(*vpp);
2193 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO ||
2194 (rp->r_dir == NULL && !nfs4_has_pages(*vpp)))
2195 return (0);
2196
2197 gar.n4g_va.va_mask = AT_ALL;
2198 return (nfs4_getattr_otw(*vpp, &gar, cr, 0));
2199 }
2200
2201 /*
2202 * CLOSE a file
2203 */
2204 static int
2205 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr)
2206 {
2207 rnode4_t *rp;
2208 int error = 0;
2209 int r_error = 0;
2210 int n4error = 0;
2211 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
2212
2213 /*
2214 * Remove client state for this (lockowner, file) pair.
2215 * Issue otw v4 call to have the server do the same.
2216 */
2217
2218 rp = VTOR4(vp);
2219
2220 /*
2221 * zone_enter(2) prevents processes from changing zones with NFS files
2222 * open; if we happen to get here from the wrong zone we can't do
2223 * anything over the wire.
2224 */
2225 if (VTOMI4(vp)->mi_zone != nfs_zone()) {
2226 /*
2227 * We could attempt to clean up locks, except we're sure
2228 * that the current process didn't acquire any locks on
2229 * the file: any attempt to lock a file belonging to another
2230 * zone will fail, and one can't lock an NFS file and then
2231 * change zones, as that fails too.
2232 *
2233 * Returning an error here is the sane thing to do.  A
2234 * subsequent call to VN_RELE() which translates to a
2235 * nfs4_inactive() will clean up state: if the zone of the
2236 * vnode's origin is still alive and kicking, the inactive
2237 * thread will handle the request (from the correct zone), and
2238 * everything (minus the OTW close call) should be OK.  If the
2239 * zone is going away nfs4_async_inactive() will throw away
2240 * delegations, open streams and cached pages inline.
2241 */
2242 return (EIO);
2243 }
2244
2245 /*
2246 * If we are using local locking for this filesystem, then
2247 * release all of the SYSV style record locks.  Otherwise,
2248 * we are doing network locking and we need to release all
2249 * of the network locks.  All of the locks held by this
2250 * process on this file are released no matter what the
2251 * incoming reference count is.
2252 */
2253 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) {
2254 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
2255 cleanshares(vp, ttoproc(curthread)->p_pid);
2256 } else
2257 e.error = nfs4_lockrelease(vp, flag, offset, cr);
2258
2259 if (e.error)
2260 return (e.error);
2261
2262 if (count > 1)
2263 return (0);
2264
2265 /*
2266 * If the file has been `unlinked', then purge the
2267 * DNLC so that this vnode will get recycled more quickly
2268 * and the .nfs* file on the server will get removed.
2269 */
2270 if (rp->r_unldvp != NULL)
2271 dnlc_purge_vp(vp);
2272
2273 /*
2274 * If the file was open for write and there are pages,
2275 * do a synchronous flush and commit of all of the
2276 * dirty and uncommitted pages.
2277 */
2278 ASSERT(!e.error);
2279 if ((flag & FWRITE) && nfs4_has_pages(vp))
2280 error = nfs4_putpage_commit(vp, 0, 0, cr);
2281
2282 mutex_enter(&rp->r_statelock);
2283 r_error = rp->r_error;
2284 rp->r_error = 0;
2285 mutex_exit(&rp->r_statelock);
2286
2287 /*
2288 * If this file type is one for which no explicit 'open' was
2289 * done, then bail now (ie. no need for protocol 'close').  If
2290 * there was an error w/the vm subsystem, return _that_ error,
2291 * otherwise, return any errors that may've been reported via
2292 * the rnode.
2293 */
2294 if (vp->v_type != VREG)
2295 return (error ? error : r_error);
2296
2297 /*
2298 * The sync putpage commit may have failed above, but since
2299 * we're working w/a regular file, we need to do the protocol
2300 * 'close' (nfs4close_one will figure out if an otw close is
2301 * needed or not). Report any errors _after_ doing the protocol
2302 * 'close'.
2303 */
2304 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0);
2305 n4error = e.error ? e.error : geterrno4(e.stat);
2306
2307 /*
2308 * Error reporting prio (Hi -> Lo)
2309 *
2310 * i) nfs4_putpage_commit (error)
2311 * ii) rnode's (r_error)
2312 * iii) nfs4close_one (n4error)
2313 */
2314 return (error ? error : (r_error ? r_error : n4error));
2315 }
2316
2317 /*
2318 * Initialize *lost_rqstp.
2319 */
2320
2321 static void
2322 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
2323 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
2324 vnode_t *vp)
2325 {
2326 if (error != ETIMEDOUT && error != EINTR &&
2327 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
2328 lost_rqstp->lr_op = 0;
2329 return;
2330 }
2331
2332 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2333 "nfs4close_save_lost_rqst: error %d", error));
2334
2335 lost_rqstp->lr_op = OP_CLOSE;
2336 /*
2337 * The vp is held and rele'd via the recovery code.
2338 * See nfs4_save_lost_rqst.
2339 */
2340 lost_rqstp->lr_vp = vp;
2341 lost_rqstp->lr_dvp = NULL;
2342 lost_rqstp->lr_oop = oop;
2343 lost_rqstp->lr_osp = osp;
2344 ASSERT(osp != NULL);
2345 ASSERT(mutex_owned(&osp->os_sync_lock));
2346 osp->os_pending_close = 1;
2347 lost_rqstp->lr_lop = NULL;
2348 lost_rqstp->lr_cr = cr;
2349 lost_rqstp->lr_flk = NULL;
2350 lost_rqstp->lr_putfirst = FALSE;
2351 }
2352
2353 /*
2354 * Assumes you already have the open seqid sync grabbed as well as the
2355 * 'os_sync_lock'.  Note: this will release the open seqid sync and
2356 * 'os_sync_lock' if client recovery starts.  Calling functions have to
2357 * be prepared to handle this.
2358 *
2359 * 'recov' is returned as 1 if the CLOSE operation detected that client
2360 * recovery was needed and started it, in which case the calling function
2361 * should retry this function; otherwise it is returned as 0.
2362 *
2363 * Errors are returned via the nfs4_error_t parameter.
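 * The request sent over the wire is the compound
 * { PUTFH; GETATTR; CLOSE }, so fresh attributes come back with every
 * successful close.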
2364 */
2365 static void
2366 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
2367 nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
2368 nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
2369 {
2370 COMPOUND4args_clnt args;
2371 COMPOUND4res_clnt res;
2372 CLOSE4args *close_args;
2373 nfs_resop4 *resop;
2374 nfs_argop4 argop[3];
2375 int doqueue = 1;
2376 mntinfo4_t *mi;
2377 seqid4 seqid;
2378 vnode_t *vp;
2379 bool_t needrecov = FALSE;
2380 nfs4_lost_rqst_t lost_rqst;
2381 hrtime_t t;
2382
2383 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
2384
2385 ASSERT(MUTEX_HELD(&osp->os_sync_lock));
2386
2387 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));
2388
2389 /* Only set this to 1 if recovery is started */
2390 *recov = 0;
2391
2392 /* do the OTW call to close the file */
2393
2394 if (close_type == CLOSE_RESEND)
2395 args.ctag = TAG_CLOSE_LOST;
2396 else if (close_type == CLOSE_AFTER_RESEND)
2397 args.ctag = TAG_CLOSE_UNDO;
2398 else
2399 args.ctag = TAG_CLOSE;
2400
2401 args.array_len = 3;
2402 args.array = argop;
2403
2404 vp = RTOV4(rp);
2405
2406 mi = VTOMI4(vp);
2407
2408 /* putfh target fh */
2409 argop[0].argop = OP_CPUTFH;
2410 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
2411
2412 argop[1].argop = OP_GETATTR;
2413 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
2414 argop[1].nfs_argop4_u.opgetattr.mi = mi;
2415
2416 argop[2].argop = OP_CLOSE;
2417 close_args = &argop[2].nfs_argop4_u.opclose;
2418
2419 seqid = nfs4_get_open_seqid(oop) + 1;
2420
2421 close_args->seqid = seqid;
2422 close_args->open_stateid = osp->open_stateid;
2423
2424 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
2425 "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
2426 rnode4info(rp)));
2427
2428 t = gethrtime();
2429
2430 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
2431
2432 if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
2433 nfs4_set_open_seqid(seqid, oop, args.ctag);
2434 }
2435
2436 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
2437 if (ep->error && !needrecov) {
2438 /*
2439 * if there was an error and no recovery is to be done
2440 * then set up the file to flush its cache if
2441 * needed for the next caller.
2442 */
2443 mutex_enter(&rp->r_statelock);
2444 PURGE_ATTRCACHE4_LOCKED(rp);
2445 rp->r_flags &= ~R4WRITEMODIFIED;
2446 mutex_exit(&rp->r_statelock);
2447 return;
2448 }
2449
2450 if (needrecov) {
2451 bool_t abort;
2452 nfs4_bseqid_entry_t *bsep = NULL;
2453
2454 if (close_type != CLOSE_RESEND)
2455 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
2456 osp, cred_otw, vp);
2457
2458 if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
2459 bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
2460 0, args.ctag, close_args->seqid);
2461
2462 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2463 "nfs4close_otw: initiating recovery. error %d "
2464 "res.status %d", ep->error, res.status));
2465
2466 /*
2467 * Drop the 'os_sync_lock' here so we don't hit
2468 * a potential recursive mutex_enter via an
2469 * 'open_stream_hold()'.
2470 */
2471 mutex_exit(&osp->os_sync_lock);
2472 *have_sync_lockp = 0;
2473 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
2474 (close_type != CLOSE_RESEND &&
2475 lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
2476 OP_CLOSE, bsep);
2477
2478 /* drop open seq sync, and let the calling function regrab it */
2479 nfs4_end_open_seqid_sync(oop);
2480 *did_start_seqid_syncp = 0;
2481
2482 if (bsep)
2483 kmem_free(bsep, sizeof (*bsep));
2484 /*
2485 * For signals, the caller wants to quit, so don't say to
2486 * retry.  For forced unmount, if it's a user thread, it
2487 * wants to quit.  If it's a recovery thread, the retry
2488 * will happen higher-up on the call stack.  Either way,
2489 * don't say to retry.
2490 */
2491 if (abort == FALSE && ep->error != EINTR &&
2492 !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
2493 close_type != CLOSE_RESEND &&
2494 close_type != CLOSE_AFTER_RESEND)
2495 *recov = 1;
2496 else
2497 *recov = 0;
2498
2499 if (!ep->error)
2500 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2501 return;
2502 }
2503
2504 if (res.status) {
2505 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2506 return;
2507 }
2508
2509 mutex_enter(&rp->r_statev4_lock);
2510 rp->created_v4 = 0;
2511 mutex_exit(&rp->r_statev4_lock);
2512
2513 resop = &res.array[2];
2514 osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
2515 osp->os_valid = 0;
2516
2517 /*
2518 * This removes the reference obtained at OPEN; ie, when the
2519 * open stream structure was created.
2520 *
2521 * We don't have to worry about calling 'open_stream_rele'
2522 * since we are currently holding a reference to the open
2523 * stream which means the count cannot go to 0 with this
2524 * decrement.
2525 */
2526 ASSERT(osp->os_ref_count >= 2);
2527 osp->os_ref_count--;
2528
2529 if (!ep->error)
2530 nfs4_attr_cache(vp,
2531 &res.array[1].nfs_resop4_u.opgetattr.ga_res,
2532 t, cred_otw, TRUE, NULL);
2533
2534 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
2535 " returning %d", ep->error));
2536
2537 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2538 }
2539
2540 /* ARGSUSED */
2541 static int
2542 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2543 caller_context_t *ct)
2544 {
2545 rnode4_t *rp;
2546 u_offset_t off;
2547 offset_t diff;
2548 uint_t on;
2549 uint_t n;
2550 caddr_t base;
2551 uint_t flags;
2552 int error;
2553 mntinfo4_t *mi;
2554
2555 rp = VTOR4(vp);
2556
2557 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2558
2559 if (IS_SHADOW(vp, rp))
2560 vp = RTOV4(rp);
2561
2562 if (vp->v_type != VREG)
2563 return (EISDIR);
2564
2565 mi = VTOMI4(vp);
2566
2567 if (nfs_zone() != mi->mi_zone)
2568 return (EIO);
2569
2570 if (uiop->uio_resid == 0)
2571 return (0);
2572
2573 if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
2574 return (EINVAL);
2575
2576 mutex_enter(&rp->r_statelock);
2577 if (rp->r_flags & R4RECOVERRP)
2578 error = (rp->r_error ? rp->r_error : EIO);
2579 else
2580 error = 0;
2581 mutex_exit(&rp->r_statelock);
2582 if (error)
2583 return (error);
2584
2585 /*
2586 * Bypass VM if caching has been disabled (e.g., locking) or if
2587 * using client-side direct I/O and the file is not mmap'd and
2588 * there are no cached pages.
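 * In the bypass case, the uio is handed straight to nfs4read() below,
 * which issues the OTW READs without staging data through the page
 * cache.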
2589 */ 2590 if ((vp->v_flag & VNOCACHE) || 2591 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2592 rp->r_mapcnt == 0 && !nfs4_has_pages(vp))) { 2593 size_t resid = 0; 2594 2595 return (nfs4read(vp, NULL, uiop->uio_loffset, 2596 uiop->uio_resid, &resid, cr, FALSE, uiop)); 2597 } 2598 2599 error = 0; 2600 2601 do { 2602 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2603 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2604 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2605 2606 if (error = nfs4_validate_caches(vp, cr)) 2607 break; 2608 2609 mutex_enter(&rp->r_statelock); 2610 diff = rp->r_size - uiop->uio_loffset; 2611 mutex_exit(&rp->r_statelock); 2612 if (diff <= 0) 2613 break; 2614 if (diff < n) 2615 n = (uint_t)diff; 2616 2617 if (vpm_enable) { 2618 /* 2619 * Copy data. 2620 */ 2621 error = vpm_data_copy(vp, off + on, n, uiop, 2622 1, NULL, 0, S_READ); 2623 2624 } else { 2625 base = segmap_getmapflt(segkmap, vp, off + on, n, 1, 2626 S_READ); 2627 2628 error = uiomove(base + on, n, UIO_READ, uiop); 2629 } 2630 2631 if (!error) { 2632 /* 2633 * If read a whole block or read to eof, 2634 * won't need this buffer again soon. 2635 */ 2636 mutex_enter(&rp->r_statelock); 2637 if (n + on == MAXBSIZE || 2638 uiop->uio_loffset == rp->r_size) 2639 flags = SM_DONTNEED; 2640 else 2641 flags = 0; 2642 mutex_exit(&rp->r_statelock); 2643 if (vpm_enable) { 2644 error = vpm_sync_pages(vp, off, n, flags); 2645 } else { 2646 error = segmap_release(segkmap, base, flags); 2647 } 2648 } else { 2649 if (vpm_enable) { 2650 (void) vpm_sync_pages(vp, off, n, 0); 2651 } else { 2652 (void) segmap_release(segkmap, base, 0); 2653 } 2654 } 2655 } while (!error && uiop->uio_resid > 0); 2656 2657 return (error); 2658 } 2659 2660 /* ARGSUSED */ 2661 static int 2662 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2663 caller_context_t *ct) 2664 { 2665 rlim64_t limit = uiop->uio_llimit; 2666 rnode4_t *rp; 2667 u_offset_t off; 2668 caddr_t base; 2669 uint_t flags; 2670 int remainder; 2671 size_t n; 2672 int on; 2673 int error; 2674 int resid; 2675 u_offset_t offset; 2676 mntinfo4_t *mi; 2677 uint_t bsize; 2678 2679 rp = VTOR4(vp); 2680 2681 if (IS_SHADOW(vp, rp)) 2682 vp = RTOV4(rp); 2683 2684 if (vp->v_type != VREG) 2685 return (EISDIR); 2686 2687 mi = VTOMI4(vp); 2688 2689 if (nfs_zone() != mi->mi_zone) 2690 return (EIO); 2691 2692 if (uiop->uio_resid == 0) 2693 return (0); 2694 2695 mutex_enter(&rp->r_statelock); 2696 if (rp->r_flags & R4RECOVERRP) 2697 error = (rp->r_error ? rp->r_error : EIO); 2698 else 2699 error = 0; 2700 mutex_exit(&rp->r_statelock); 2701 if (error) 2702 return (error); 2703 2704 if (ioflag & FAPPEND) { 2705 struct vattr va; 2706 2707 /* 2708 * Must serialize if appending. 2709 */ 2710 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) { 2711 nfs_rw_exit(&rp->r_rwlock); 2712 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, 2713 INTR(vp))) 2714 return (EINTR); 2715 } 2716 2717 va.va_mask = AT_SIZE; 2718 error = nfs4getattr(vp, &va, cr); 2719 if (error) 2720 return (error); 2721 uiop->uio_loffset = va.va_size; 2722 } 2723 2724 offset = uiop->uio_loffset + uiop->uio_resid; 2725 2726 if (uiop->uio_loffset < (offset_t)0 || offset < 0) 2727 return (EINVAL); 2728 2729 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 2730 limit = MAXOFFSET_T; 2731 2732 /* 2733 * Check to make sure that the process will not exceed 2734 * its limit on file size. It is okay to write up to 2735 * the limit, but not beyond. 
Thus, the write which 2736 * reaches the limit will be short and the next write 2737 * will return an error. 2738 */ 2739 remainder = 0; 2740 if (offset > uiop->uio_llimit) { 2741 remainder = offset - uiop->uio_llimit; 2742 uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset; 2743 if (uiop->uio_resid <= 0) { 2744 proc_t *p = ttoproc(curthread); 2745 2746 uiop->uio_resid += remainder; 2747 mutex_enter(&p->p_lock); 2748 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], 2749 p->p_rctls, p, RCA_UNSAFE_SIGINFO); 2750 mutex_exit(&p->p_lock); 2751 return (EFBIG); 2752 } 2753 } 2754 2755 /* update the change attribute, if we have a write delegation */ 2756 2757 mutex_enter(&rp->r_statev4_lock); 2758 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) 2759 rp->r_deleg_change++; 2760 2761 mutex_exit(&rp->r_statev4_lock); 2762 2763 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) 2764 return (EINTR); 2765 2766 /* 2767 * Bypass VM if caching has been disabled (e.g., locking) or if 2768 * using client-side direct I/O and the file is not mmap'd and 2769 * there are no cached pages. 2770 */ 2771 if ((vp->v_flag & VNOCACHE) || 2772 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2773 rp->r_mapcnt == 0 && !nfs4_has_pages(vp))) { 2774 size_t bufsize; 2775 int count; 2776 u_offset_t org_offset; 2777 stable_how4 stab_comm; 2778 nfs4_fwrite: 2779 if (rp->r_flags & R4STALE) { 2780 resid = uiop->uio_resid; 2781 offset = uiop->uio_loffset; 2782 error = rp->r_error; 2783 goto bottom; 2784 } 2785 2786 bufsize = MIN(uiop->uio_resid, mi->mi_stsize); 2787 base = kmem_alloc(bufsize, KM_SLEEP); 2788 do { 2789 if (ioflag & FDSYNC) 2790 stab_comm = DATA_SYNC4; 2791 else 2792 stab_comm = FILE_SYNC4; 2793 resid = uiop->uio_resid; 2794 offset = uiop->uio_loffset; 2795 count = MIN(uiop->uio_resid, bufsize); 2796 org_offset = uiop->uio_loffset; 2797 error = uiomove(base, count, UIO_WRITE, uiop); 2798 if (!error) { 2799 error = nfs4write(vp, base, org_offset, 2800 count, cr, &stab_comm); 2801 if (!error) { 2802 mutex_enter(&rp->r_statelock); 2803 if (rp->r_size < uiop->uio_loffset) 2804 rp->r_size = uiop->uio_loffset; 2805 mutex_exit(&rp->r_statelock); 2806 } 2807 } 2808 } while (!error && uiop->uio_resid > 0); 2809 kmem_free(base, bufsize); 2810 goto bottom; 2811 } 2812 2813 bsize = vp->v_vfsp->vfs_bsize; 2814 2815 do { 2816 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2817 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2818 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2819 2820 resid = uiop->uio_resid; 2821 offset = uiop->uio_loffset; 2822 2823 if (rp->r_flags & R4STALE) { 2824 error = rp->r_error; 2825 break; 2826 } 2827 2828 /* 2829 * Don't create dirty pages faster than they 2830 * can be cleaned so that the system doesn't 2831 * get imbalanced. If the async queue is 2832 * maxed out, then wait for it to drain before 2833 * creating more dirty pages. Also, wait for 2834 * any threads doing pagewalks in the vop_getattr 2835 * entry points so that they don't block for 2836 * long periods. 2837 */ 2838 mutex_enter(&rp->r_statelock); 2839 while ((mi->mi_max_threads != 0 && 2840 rp->r_awcount > 2 * mi->mi_max_threads) || 2841 rp->r_gcount > 0) 2842 cv_wait(&rp->r_cv, &rp->r_statelock); 2843 mutex_exit(&rp->r_statelock); 2844 2845 if (vpm_enable) { 2846 /* 2847 * It will use kpm mappings, so no need to 2848 * pass an address. 
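 * (vpm copies through permanent kernel (kpm) mappings of the physical
 * pages, so no segmap window has to be set up here.)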
2849 */ 2850 error = writerp4(rp, NULL, n, uiop, 0); 2851 } else { 2852 if (segmap_kpm) { 2853 int pon = uiop->uio_loffset & PAGEOFFSET; 2854 size_t pn = MIN(PAGESIZE - pon, 2855 uiop->uio_resid); 2856 int pagecreate; 2857 2858 mutex_enter(&rp->r_statelock); 2859 pagecreate = (pon == 0) && (pn == PAGESIZE || 2860 uiop->uio_loffset + pn >= rp->r_size); 2861 mutex_exit(&rp->r_statelock); 2862 2863 base = segmap_getmapflt(segkmap, vp, off + on, 2864 pn, !pagecreate, S_WRITE); 2865 2866 error = writerp4(rp, base + pon, n, uiop, 2867 pagecreate); 2868 2869 } else { 2870 base = segmap_getmapflt(segkmap, vp, off + on, 2871 n, 0, S_READ); 2872 error = writerp4(rp, base + on, n, uiop, 0); 2873 } 2874 } 2875 2876 if (!error) { 2877 if (mi->mi_flags & MI4_NOAC) 2878 flags = SM_WRITE; 2879 else if ((uiop->uio_loffset % bsize) == 0 || 2880 IS_SWAPVP(vp)) { 2881 /* 2882 * Have written a whole block. 2883 * Start an asynchronous write 2884 * and mark the buffer to 2885 * indicate that it won't be 2886 * needed again soon. 2887 */ 2888 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 2889 } else 2890 flags = 0; 2891 if ((ioflag & (FSYNC|FDSYNC)) || 2892 (rp->r_flags & R4OUTOFSPACE)) { 2893 flags &= ~SM_ASYNC; 2894 flags |= SM_WRITE; 2895 } 2896 if (vpm_enable) { 2897 error = vpm_sync_pages(vp, off, n, flags); 2898 } else { 2899 error = segmap_release(segkmap, base, flags); 2900 } 2901 } else { 2902 if (vpm_enable) { 2903 (void) vpm_sync_pages(vp, off, n, 0); 2904 } else { 2905 (void) segmap_release(segkmap, base, 0); 2906 } 2907 /* 2908 * In the event that we got an access error while 2909 * faulting in a page for a write-only file just 2910 * force a write. 2911 */ 2912 if (error == EACCES) 2913 goto nfs4_fwrite; 2914 } 2915 } while (!error && uiop->uio_resid > 0); 2916 2917 bottom: 2918 if (error) { 2919 uiop->uio_resid = resid + remainder; 2920 uiop->uio_loffset = offset; 2921 } else { 2922 uiop->uio_resid += remainder; 2923 2924 mutex_enter(&rp->r_statev4_lock); 2925 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 2926 gethrestime(&rp->r_attr.va_mtime); 2927 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 2928 } 2929 mutex_exit(&rp->r_statev4_lock); 2930 } 2931 2932 nfs_rw_exit(&rp->r_lkserlock); 2933 2934 return (error); 2935 } 2936 2937 /* 2938 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED} 2939 */ 2940 static int 2941 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 2942 int flags, cred_t *cr) 2943 { 2944 struct buf *bp; 2945 int error; 2946 page_t *savepp; 2947 uchar_t fsdata; 2948 stable_how4 stab_comm; 2949 2950 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 2951 bp = pageio_setup(pp, len, vp, flags); 2952 ASSERT(bp != NULL); 2953 2954 /* 2955 * pageio_setup should have set b_addr to 0. This 2956 * is correct since we want to do I/O on a page 2957 * boundary. bp_mapin will use this addr to calculate 2958 * an offset, and then set b_addr to the kernel virtual 2959 * address it allocated for us. 
2960 */
2961 ASSERT(bp->b_un.b_addr == 0);
2962
2963 bp->b_edev = 0;
2964 bp->b_dev = 0;
2965 bp->b_lblkno = lbtodb(off);
2966 bp->b_file = vp;
2967 bp->b_offset = (offset_t)off;
2968 bp_mapin(bp);
2969
2970 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
2971 freemem > desfree)
2972 stab_comm = UNSTABLE4;
2973 else
2974 stab_comm = FILE_SYNC4;
2975
2976 error = nfs4_bio(bp, &stab_comm, cr, FALSE);
2977
2978 bp_mapout(bp);
2979 pageio_done(bp);
2980
2981 if (stab_comm == UNSTABLE4)
2982 fsdata = C_DELAYCOMMIT;
2983 else
2984 fsdata = C_NOCOMMIT;
2985
2986 savepp = pp;
2987 do {
2988 pp->p_fsdata = fsdata;
2989 } while ((pp = pp->p_next) != savepp);
2990
2991 return (error);
2992 }
2993
2994 /*
 * Check whether the open stream for this file and credential must be
 * reopened before read or write can proceed (e.g. it is a delegation
 * open stream), and reopen it via nfs4_reopen() if so.
2995 */
2996 static int
2997 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr)
2998 {
2999 nfs4_open_owner_t *oop;
3000 nfs4_open_stream_t *osp;
3001 rnode4_t *rp = VTOR4(vp);
3002 mntinfo4_t *mi = VTOMI4(vp);
3003 int reopen_needed;
3004
3005 ASSERT(nfs_zone() == mi->mi_zone);
3006
3007
3008 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
3009 if (!oop)
3010 return (EIO);
3011
3012 /* returns with 'os_sync_lock' held */
3013 osp = find_open_stream(oop, rp);
3014 if (!osp) {
3015 open_owner_rele(oop);
3016 return (EIO);
3017 }
3018
3019 if (osp->os_failed_reopen) {
3020 mutex_exit(&osp->os_sync_lock);
3021 open_stream_rele(osp, rp);
3022 open_owner_rele(oop);
3023 return (EIO);
3024 }
3025
3026 /*
3027 * Determine whether a reopen is needed.  If this
3028 * is a delegation open stream, then the os_delegation bit
3029 * should be set.
3030 */
3031
3032 reopen_needed = osp->os_delegation;
3033
3034 mutex_exit(&osp->os_sync_lock);
3035 open_owner_rele(oop);
3036
3037 if (reopen_needed) {
3038 nfs4_error_zinit(ep);
3039 nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE);
3040 mutex_enter(&osp->os_sync_lock);
3041 if (ep->error || ep->stat || osp->os_failed_reopen) {
3042 mutex_exit(&osp->os_sync_lock);
3043 open_stream_rele(osp, rp);
3044 return (EIO);
3045 }
3046 mutex_exit(&osp->os_sync_lock);
3047 }
3048 open_stream_rele(osp, rp);
3049
3050 return (0);
3051 }
3052
3053 /*
3054 * Write to file.  Writes to remote server in largest size
3055 * chunks that the server can handle.  Write is synchronous.
3056 */
3057 static int
3058 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
3059 stable_how4 *stab_comm)
3060 {
3061 mntinfo4_t *mi;
3062 COMPOUND4args_clnt args;
3063 COMPOUND4res_clnt res;
3064 WRITE4args *wargs;
3065 WRITE4res *wres;
3066 nfs_argop4 argop[2];
3067 nfs_resop4 *resop;
3068 int tsize;
3069 stable_how4 stable;
3070 rnode4_t *rp;
3071 int doqueue = 1;
3072 bool_t needrecov;
3073 nfs4_recov_state_t recov_state;
3074 nfs4_stateid_types_t sid_types;
3075 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3076
3077 rp = VTOR4(vp);
3078 mi = VTOMI4(vp);
3079
3080 ASSERT(nfs_zone() == mi->mi_zone);
3081
3082 stable = *stab_comm;
3083 *stab_comm = FILE_SYNC4;
3084
3085 needrecov = FALSE;
3086 recov_state.rs_flags = 0;
3087 recov_state.rs_num_retry_despite_err = 0;
3088 nfs4_init_stateid_types(&sid_types);
3089
3090 recov_retry:
3091 args.ctag = TAG_WRITE;
3092 args.array_len = 2;
3093 args.array = argop;
3094
3095 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3096 &recov_state, NULL);
3097 if (e.error)
3098 return (e.error);
3099
3100 /* 0. putfh target fh */
3101 argop[0].argop = OP_CPUTFH;
3102 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3103
3104 /* 1.
write */ 3105 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types); 3106 3107 do { 3108 3109 wargs->offset = (offset4)offset; 3110 wargs->data_val = base; 3111 3112 if (mi->mi_io_kstats) { 3113 mutex_enter(&mi->mi_lock); 3114 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3115 mutex_exit(&mi->mi_lock); 3116 } 3117 3118 if ((vp->v_flag & VNOCACHE) || 3119 (rp->r_flags & R4DIRECTIO) || 3120 (mi->mi_flags & MI4_DIRECTIO)) 3121 tsize = MIN(mi->mi_stsize, count); 3122 else 3123 tsize = MIN(mi->mi_curwrite, count); 3124 wargs->data_len = (uint_t)tsize; 3125 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3126 3127 if (mi->mi_io_kstats) { 3128 mutex_enter(&mi->mi_lock); 3129 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3130 mutex_exit(&mi->mi_lock); 3131 } 3132 3133 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3134 if (e.error && !needrecov) { 3135 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3136 &recov_state, needrecov); 3137 return (e.error); 3138 } 3139 3140 3141 /* 3142 * Do handling of OLD_STATEID outside 3143 * of the normal recovery framework. 3144 * 3145 * If write receives a BAD stateid error while using a 3146 * delegation stateid, retry using the open stateid (if it 3147 * exists). If it doesn't have an open stateid, reopen the 3148 * file first, then retry. 3149 */ 3150 if (!e.error && res.status == NFS4ERR_OLD_STATEID && 3151 sid_types.cur_sid_type != SPEC_SID) { 3152 nfs4_save_stateid(&wargs->stateid, &sid_types); 3153 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3154 &recov_state, needrecov); 3155 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3156 goto recov_retry; 3157 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3158 sid_types.cur_sid_type == DEL_SID) { 3159 nfs4_save_stateid(&wargs->stateid, &sid_types); 3160 mutex_enter(&rp->r_statev4_lock); 3161 rp->r_deleg_return_pending = TRUE; 3162 mutex_exit(&rp->r_statev4_lock); 3163 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3164 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3165 &recov_state, needrecov); 3166 (void) xdr_free(xdr_COMPOUND4res_clnt, 3167 (caddr_t)&res); 3168 return (EIO); 3169 } 3170 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3171 &recov_state, needrecov); 3172 /* hold needed for nfs4delegreturn_thread */ 3173 VN_HOLD(vp); 3174 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3175 NFS4_DR_DISCARD), FALSE); 3176 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3177 goto recov_retry; 3178 } 3179 3180 if (needrecov) { 3181 bool_t abort; 3182 3183 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3184 "nfs4write: client got error %d, res.status %d" 3185 ", so start recovery", e.error, res.status)); 3186 3187 abort = nfs4_start_recovery(&e, 3188 VTOMI4(vp), vp, NULL, &wargs->stateid, 3189 NULL, OP_WRITE, NULL); 3190 if (!e.error) { 3191 e.error = geterrno4(res.status); 3192 (void) xdr_free(xdr_COMPOUND4res_clnt, 3193 (caddr_t)&res); 3194 } 3195 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3196 &recov_state, needrecov); 3197 if (abort == FALSE) 3198 goto recov_retry; 3199 return (e.error); 3200 } 3201 3202 if (res.status) { 3203 e.error = geterrno4(res.status); 3204 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3205 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3206 &recov_state, needrecov); 3207 return (e.error); 3208 } 3209 3210 resop = &res.array[1]; /* write res */ 3211 wres = &resop->nfs_resop4_u.opwrite; 3212 3213 if ((int)wres->count > tsize) { 3214 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3215 3216 zcmn_err(getzoneid(), CE_WARN, 3217 "nfs4write: server 
wrote %u, requested was %u", 3218 (int)wres->count, tsize); 3219 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3220 &recov_state, needrecov); 3221 return (EIO); 3222 } 3223 if (wres->committed == UNSTABLE4) { 3224 *stab_comm = UNSTABLE4; 3225 if (wargs->stable == DATA_SYNC4 || 3226 wargs->stable == FILE_SYNC4) { 3227 (void) xdr_free(xdr_COMPOUND4res_clnt, 3228 (caddr_t)&res); 3229 zcmn_err(getzoneid(), CE_WARN, 3230 "nfs4write: server %s did not commit " 3231 "to stable storage", 3232 rp->r_server->sv_hostname); 3233 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3234 &recov_state, needrecov); 3235 return (EIO); 3236 } 3237 } 3238 3239 tsize = (int)wres->count; 3240 count -= tsize; 3241 base += tsize; 3242 offset += tsize; 3243 if (mi->mi_io_kstats) { 3244 mutex_enter(&mi->mi_lock); 3245 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++; 3246 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten += 3247 tsize; 3248 mutex_exit(&mi->mi_lock); 3249 } 3250 lwp_stat_update(LWP_STAT_OUBLK, 1); 3251 mutex_enter(&rp->r_statelock); 3252 if (rp->r_flags & R4HAVEVERF) { 3253 if (rp->r_writeverf != wres->writeverf) { 3254 nfs4_set_mod(vp); 3255 rp->r_writeverf = wres->writeverf; 3256 } 3257 } else { 3258 rp->r_writeverf = wres->writeverf; 3259 rp->r_flags |= R4HAVEVERF; 3260 } 3261 PURGE_ATTRCACHE4_LOCKED(rp); 3262 rp->r_flags |= R4WRITEMODIFIED; 3263 gethrestime(&rp->r_attr.va_mtime); 3264 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3265 mutex_exit(&rp->r_statelock); 3266 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3267 } while (count); 3268 3269 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state, needrecov); 3270 3271 return (e.error); 3272 } 3273 3274 /* 3275 * Read from a file. Reads data in largest chunks our interface can handle. 3276 */ 3277 static int 3278 nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count, 3279 size_t *residp, cred_t *cr, bool_t async, struct uio *uiop) 3280 { 3281 mntinfo4_t *mi; 3282 COMPOUND4args_clnt args; 3283 COMPOUND4res_clnt res; 3284 READ4args *rargs; 3285 nfs_argop4 argop[2]; 3286 int tsize; 3287 int doqueue; 3288 rnode4_t *rp; 3289 int data_len; 3290 bool_t is_eof; 3291 bool_t needrecov = FALSE; 3292 nfs4_recov_state_t recov_state; 3293 nfs4_stateid_types_t sid_types; 3294 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3295 3296 rp = VTOR4(vp); 3297 mi = VTOMI4(vp); 3298 doqueue = 1; 3299 3300 ASSERT(nfs_zone() == mi->mi_zone); 3301 3302 args.ctag = async ? TAG_READAHEAD : TAG_READ; 3303 3304 args.array_len = 2; 3305 args.array = argop; 3306 3307 nfs4_init_stateid_types(&sid_types); 3308 3309 recov_state.rs_flags = 0; 3310 recov_state.rs_num_retry_despite_err = 0; 3311 3312 recov_retry: 3313 e.error = nfs4_start_fop(mi, vp, NULL, OH_READ, 3314 &recov_state, NULL); 3315 if (e.error) 3316 return (e.error); 3317 3318 /* putfh target fh */ 3319 argop[0].argop = OP_CPUTFH; 3320 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3321 3322 /* read */ 3323 argop[1].argop = OP_READ; 3324 rargs = &argop[1].nfs_argop4_u.opread; 3325 rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 3326 OP_READ, &sid_types, async); 3327 3328 do { 3329 if (mi->mi_io_kstats) { 3330 mutex_enter(&mi->mi_lock); 3331 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3332 mutex_exit(&mi->mi_lock); 3333 } 3334 3335 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3336 "nfs4read: %s call, rp %s", 3337 needrecov ? 
"recov" : "first", 3338 rnode4info(rp))); 3339 3340 if ((vp->v_flag & VNOCACHE) || 3341 (rp->r_flags & R4DIRECTIO) || 3342 (mi->mi_flags & MI4_DIRECTIO)) 3343 tsize = MIN(mi->mi_tsize, count); 3344 else 3345 tsize = MIN(mi->mi_curread, count); 3346 rargs->offset = (offset4)offset; 3347 rargs->count = (count4)tsize; 3348 rargs->res_data_val_alt = NULL; 3349 rargs->res_mblk = NULL; 3350 rargs->res_uiop = NULL; 3351 rargs->res_maxsize = 0; 3352 if (uiop) 3353 rargs->res_uiop = uiop; 3354 else 3355 rargs->res_data_val_alt = base; 3356 rargs->res_maxsize = tsize; 3357 3358 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3359 #ifdef DEBUG 3360 if (nfs4read_error_inject) { 3361 res.status = nfs4read_error_inject; 3362 nfs4read_error_inject = 0; 3363 } 3364 #endif 3365 3366 if (mi->mi_io_kstats) { 3367 mutex_enter(&mi->mi_lock); 3368 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3369 mutex_exit(&mi->mi_lock); 3370 } 3371 3372 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3373 if (e.error != 0 && !needrecov) { 3374 nfs4_end_fop(mi, vp, NULL, OH_READ, 3375 &recov_state, needrecov); 3376 return (e.error); 3377 } 3378 3379 /* 3380 * Do proper retry for OLD and BAD stateid errors outside 3381 * of the normal recovery framework. There are two differences 3382 * between async and sync reads. The first is that we allow 3383 * retry on BAD_STATEID for async reads, but not sync reads. 3384 * The second is that we mark the file dead for a failed 3385 * attempt with a special stateid for sync reads, but just 3386 * return EIO for async reads. 3387 * 3388 * If a sync read receives a BAD stateid error while using a 3389 * delegation stateid, retry using the open stateid (if it 3390 * exists). If it doesn't have an open stateid, reopen the 3391 * file first, then retry. 
3392 */ 3393 if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID || 3394 res.status == NFS4ERR_BAD_STATEID) && async) { 3395 nfs4_end_fop(mi, vp, NULL, OH_READ, 3396 &recov_state, needrecov); 3397 if (sid_types.cur_sid_type == SPEC_SID) { 3398 (void) xdr_free(xdr_COMPOUND4res_clnt, 3399 (caddr_t)&res); 3400 return (EIO); 3401 } 3402 nfs4_save_stateid(&rargs->stateid, &sid_types); 3403 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3404 goto recov_retry; 3405 } else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3406 !async && sid_types.cur_sid_type != SPEC_SID) { 3407 nfs4_save_stateid(&rargs->stateid, &sid_types); 3408 nfs4_end_fop(mi, vp, NULL, OH_READ, 3409 &recov_state, needrecov); 3410 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3411 goto recov_retry; 3412 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3413 sid_types.cur_sid_type == DEL_SID) { 3414 nfs4_save_stateid(&rargs->stateid, &sid_types); 3415 mutex_enter(&rp->r_statev4_lock); 3416 rp->r_deleg_return_pending = TRUE; 3417 mutex_exit(&rp->r_statev4_lock); 3418 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3419 nfs4_end_fop(mi, vp, NULL, OH_READ, 3420 &recov_state, needrecov); 3421 (void) xdr_free(xdr_COMPOUND4res_clnt, 3422 (caddr_t)&res); 3423 return (EIO); 3424 } 3425 nfs4_end_fop(mi, vp, NULL, OH_READ, 3426 &recov_state, needrecov); 3427 /* hold needed for nfs4delegreturn_thread */ 3428 VN_HOLD(vp); 3429 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3430 NFS4_DR_DISCARD), FALSE); 3431 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3432 goto recov_retry; 3433 } 3434 if (needrecov) { 3435 bool_t abort; 3436 3437 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3438 "nfs4read: initiating recovery\n")); 3439 3440 abort = nfs4_start_recovery(&e, 3441 mi, vp, NULL, &rargs->stateid, 3442 NULL, OP_READ, NULL); 3443 nfs4_end_fop(mi, vp, NULL, OH_READ, 3444 &recov_state, needrecov); 3445 /* 3446 * Do not retry if we got OLD_STATEID using a special 3447 * stateid. This avoids looping with a broken server. 3448 */ 3449 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3450 sid_types.cur_sid_type == SPEC_SID) 3451 abort = TRUE; 3452 3453 if (abort == FALSE) { 3454 /* 3455 * Need to retry all possible stateids in 3456 * case the recovery error wasn't stateid 3457 * related or the stateids have become 3458 * stale (server reboot). 
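 * Reinitializing the stateid types makes the next attempt start over
 * from the first stateid choice rather than the one that just failed.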
3459 */
3460 nfs4_init_stateid_types(&sid_types);
3461 (void) xdr_free(xdr_COMPOUND4res_clnt,
3462 (caddr_t)&res);
3463 goto recov_retry;
3464 }
3465
3466 if (!e.error) {
3467 e.error = geterrno4(res.status);
3468 (void) xdr_free(xdr_COMPOUND4res_clnt,
3469 (caddr_t)&res);
3470 }
3471 return (e.error);
3472 }
3473
3474 if (res.status) {
3475 e.error = geterrno4(res.status);
3476 nfs4_end_fop(mi, vp, NULL, OH_READ,
3477 &recov_state, needrecov);
3478 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3479 return (e.error);
3480 }
3481
3482 data_len = res.array[1].nfs_resop4_u.opread.data_len;
3483 count -= data_len;
3484 if (base)
3485 base += data_len;
3486 offset += data_len;
3487 if (mi->mi_io_kstats) {
3488 mutex_enter(&mi->mi_lock);
3489 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3490 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len;
3491 mutex_exit(&mi->mi_lock);
3492 }
3493 lwp_stat_update(LWP_STAT_INBLK, 1);
3494 is_eof = res.array[1].nfs_resop4_u.opread.eof;
3495 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3496
3497 } while (count && !is_eof);
3498
3499 *residp = count;
3500
3501 nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);
3502
3503 return (e.error);
3504 }
3505
3506 /* ARGSUSED */
3507 static int
3508 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
3509 {
3510 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3511 return (EIO);
3512 switch (cmd) {
3513 case _FIODIRECTIO:
3514 return (nfs4_directio(vp, (int)arg, cr));
3515 default:
3516 return (ENOTTY);
3517 }
3518 }
3519
3520 static int
3521 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr)
3522 {
3523 int error;
3524 rnode4_t *rp = VTOR4(vp);
3525
3526 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3527 return (EIO);
3528 /*
3529 * If it has been specified that the return value will
3530 * just be used as a hint, and we are only being asked
3531 * for size, fsid or rdevid, then return the client's
3532 * notion of these values without checking to make sure
3533 * that the attribute cache is up to date.
3534 * The whole point is to avoid an over the wire GETATTR
3535 * call.
3536 */
3537 if (flags & ATTR_HINT) {
3538 if (vap->va_mask ==
3539 (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) {
3540 mutex_enter(&rp->r_statelock);
3541 if (vap->va_mask & AT_SIZE)
3542 vap->va_size = rp->r_size;
3543 if (vap->va_mask & AT_FSID)
3544 vap->va_fsid = rp->r_attr.va_fsid;
3545 if (vap->va_mask & AT_RDEV)
3546 vap->va_rdev = rp->r_attr.va_rdev;
3547 mutex_exit(&rp->r_statelock);
3548 return (0);
3549 }
3550 }
3551
3552 /*
3553 * Only need to flush pages if asking for the mtime
3554 * and if there are any dirty pages or any outstanding
3555 * asynchronous (write) requests for this file.
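 * (With a write delegation the client maintains the mtime itself, so
 * the flush is skipped in that case below.)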
3556 */
3557 if (vap->va_mask & AT_MTIME) {
3558 rp = VTOR4(vp);
3559 if (nfs4_has_pages(vp)) {
3560 mutex_enter(&rp->r_statev4_lock);
3561 if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) {
3562 mutex_exit(&rp->r_statev4_lock);
3563 if (rp->r_flags & R4DIRTY ||
3564 rp->r_awcount > 0) {
3565 mutex_enter(&rp->r_statelock);
3566 rp->r_gcount++;
3567 mutex_exit(&rp->r_statelock);
3568 error =
3569 nfs4_putpage(vp, (u_offset_t)0,
3570 0, 0, cr);
3571 mutex_enter(&rp->r_statelock);
3572 if (error && (error == ENOSPC ||
3573 error == EDQUOT)) {
3574 if (!rp->r_error)
3575 rp->r_error = error;
3576 }
3577 if (--rp->r_gcount == 0)
3578 cv_broadcast(&rp->r_cv);
3579 mutex_exit(&rp->r_statelock);
3580 }
3581 } else {
3582 mutex_exit(&rp->r_statev4_lock);
3583 }
3584 }
3585 }
3586 return (nfs4getattr(vp, vap, cr));
3587 }
3588
3589 int
3590 nfs4_compare_modes(mode_t from_server, mode_t on_client)
3591 {
3592 /*
3593 * The server is allowed to clear the S_ISUID and S_ISGID bits;
3594 * if those are the only two bits cleared on the server then
3595 * return 0 (OK), else return 1 (BAD).
3596 */
3597 on_client &= ~(S_ISUID|S_ISGID);
3598 if (on_client == from_server)
3599 return (0);
3600 else
3601 return (1);
3602 }
3603
3604 /*ARGSUSED4*/
3605 static int
3606 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3607 caller_context_t *ct)
3608 {
3609 if (vap->va_mask & AT_NOSET)
3610 return (EINVAL);
3611
3612 if (nfs_zone() != VTOMI4(vp)->mi_zone)
3613 return (EIO);
3614
3615 /*
3616 * Don't call secpolicy_vnode_setattr, the client cannot
3617 * use its cached attributes to make security decisions
3618 * as the server may be faking mode bits or mapping uid/gid.
3619 * Always just let the server do the checking.
3620 * If we provide the ability to remove basic privileges
3621 * to setattr (e.g. basic without chmod) then we will
3622 * need to add a check here before calling the server.
3623 */
3624
3625 return (nfs4setattr(vp, vap, flags, cr, NULL));
3626 }
3627
3628 /*
3629 * To replace the "guarded" version 3 setattr, we use two types of compound
3630 * setattr requests:
3631 * 1. The "normal" setattr, used when the size of the file isn't being
3632 *    changed - { Putfh <fh>; Setattr; Getattr }.
3633 * 2. If the size is changed, precede Setattr with: Getattr; Verify
3634 *    with only ctime as the argument. If the server ctime differs from
3635 *    what is cached on the client, the verify will fail, but we would
3636 *    already have the ctime from the preceding getattr, so just set it
3637 *    and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify;
3638 *    Setattr; Getattr }.
3639 *
3640 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in
3641 * this setattr and NULL if they are not.
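 * For example, if the client's cached ctime is stale, the VERIFY fails,
 * but the GETATTR that precedes it in the same compound returns the
 * server's current ctime, which seeds the retry.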
3642 */ 3643 static int 3644 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3645 vsecattr_t *vsap) 3646 { 3647 COMPOUND4args_clnt args; 3648 COMPOUND4res_clnt res, *resp = NULL; 3649 nfs4_ga_res_t *garp = NULL; 3650 int numops = 3; /* { Putfh; Setattr; Getattr } */ 3651 nfs_argop4 argop[5]; 3652 int verify_argop = -1; 3653 int setattr_argop = 1; 3654 nfs_resop4 *resop; 3655 vattr_t va; 3656 rnode4_t *rp; 3657 int doqueue = 1; 3658 uint_t mask = vap->va_mask; 3659 mode_t omode; 3660 vsecattr_t *vsp; 3661 timestruc_t ctime; 3662 bool_t needrecov = FALSE; 3663 nfs4_recov_state_t recov_state; 3664 nfs4_stateid_types_t sid_types; 3665 stateid4 stateid; 3666 hrtime_t t; 3667 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3668 servinfo4_t *svp; 3669 bitmap4 supp_attrs; 3670 3671 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3672 rp = VTOR4(vp); 3673 nfs4_init_stateid_types(&sid_types); 3674 3675 /* 3676 * Only need to flush pages if there are any pages and 3677 * if the file is marked as dirty in some fashion. The 3678 * file must be flushed so that we can accurately 3679 * determine the size of the file and the cached data 3680 * after the SETATTR returns. A file is considered to 3681 * be dirty if it is either marked with R4DIRTY, has 3682 * outstanding i/o's active, or is mmap'd. In this 3683 * last case, we can't tell whether there are dirty 3684 * pages, so we flush just to be sure. 3685 */ 3686 if (nfs4_has_pages(vp) && 3687 ((rp->r_flags & R4DIRTY) || 3688 rp->r_count > 0 || 3689 rp->r_mapcnt > 0)) { 3690 ASSERT(vp->v_type != VCHR); 3691 e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr); 3692 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 3693 mutex_enter(&rp->r_statelock); 3694 if (!rp->r_error) 3695 rp->r_error = e.error; 3696 mutex_exit(&rp->r_statelock); 3697 } 3698 } 3699 3700 if (mask & AT_SIZE) { 3701 /* 3702 * Verification setattr compound for non-deleg AT_SIZE: 3703 * { Putfh; Getattr; Verify; Setattr; Getattr } 3704 * Set ctime local here (outside the do_again label) 3705 * so that subsequent retries (after failed VERIFY) 3706 * will use ctime from GETATTR results (from failed 3707 * verify compound) as VERIFY arg. 3708 * If file has delegation, then VERIFY(time_metadata) 3709 * is of little added value, so don't bother. 3710 */ 3711 mutex_enter(&rp->r_statev4_lock); 3712 if (rp->r_deleg_type == OPEN_DELEGATE_NONE || 3713 rp->r_deleg_return_pending) { 3714 numops = 5; 3715 ctime = rp->r_attr.va_ctime; 3716 } 3717 mutex_exit(&rp->r_statev4_lock); 3718 } 3719 3720 recov_state.rs_flags = 0; 3721 recov_state.rs_num_retry_despite_err = 0; 3722 3723 args.ctag = TAG_SETATTR; 3724 do_again: 3725 recov_retry: 3726 setattr_argop = numops - 2; 3727 3728 args.array = argop; 3729 args.array_len = numops; 3730 3731 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 3732 if (e.error) 3733 return (e.error); 3734 3735 3736 /* putfh target fh */ 3737 argop[0].argop = OP_CPUTFH; 3738 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3739 3740 if (numops == 5) { 3741 /* 3742 * We only care about the ctime, but need to get mtime 3743 * and size for proper cache update. 
3744 */
3745 /* getattr */
3746 argop[1].argop = OP_GETATTR;
3747 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3748 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3749
3750 /* verify - set later in loop */
3751 verify_argop = 2;
3752 }
3753
3754 /* setattr */
3755 svp = rp->r_server;
3756 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3757 supp_attrs = svp->sv_supp_attrs;
3758 nfs_rw_exit(&svp->sv_lock);
3759
3760 nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr,
3761 supp_attrs, &e.error, &sid_types);
3762 stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid;
3763 if (e.error) {
3764 /* req time field(s) overflow - return immediately */
3765 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
3766 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3767 opsetattr.obj_attributes);
3768 return (e.error);
3769 }
3770 omode = rp->r_attr.va_mode;
3771
3772 /* getattr */
3773 argop[numops-1].argop = OP_GETATTR;
3774 argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3775 /*
3776 * If we are setting the ACL (indicated only by vsap != NULL), request
3777 * the ACL in this getattr. The ACL returned from this getattr will be
3778 * used in updating the ACL cache.
3779 */
3780 if (vsap != NULL)
3781 argop[numops-1].nfs_argop4_u.opgetattr.attr_request |=
3782 FATTR4_ACL_MASK;
3783 argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3784
3785 /*
3786 * setattr iterates if the object size is set and the cached ctime
3787 * does not match the file ctime. In that case, verify the ctime first.
3788 */
3789
3790 do {
3791 if (verify_argop != -1) {
3792 /*
3793 * Verify that the ctime matches before doing the setattr.
3794 */
3795 va.va_mask = AT_CTIME;
3796 va.va_ctime = ctime;
3797 svp = rp->r_server;
3798 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3799 supp_attrs = svp->sv_supp_attrs;
3800 nfs_rw_exit(&svp->sv_lock);
3801 e.error = nfs4args_verify(&argop[verify_argop], &va,
3802 OP_VERIFY, supp_attrs);
3803 if (e.error) {
3804 /* req time field(s) overflow - return */
3805 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3806 needrecov);
3807 break;
3808 }
3809 }
3810
3811 doqueue = 1;
3812
3813 t = gethrtime();
3814
3815 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
3816
3817 /*
3818 * Purge the access cache and ACL cache if changing either the
3819 * owner of the file, the group owner, or the mode. These may
3820 * change the access permissions of the file, so purge old
3821 * information and start over again.
3822 */
3823 if (mask & (AT_UID | AT_GID | AT_MODE)) {
3824 (void) nfs4_access_purge_rp(rp);
3825 if (rp->r_secattr != NULL) {
3826 mutex_enter(&rp->r_statelock);
3827 vsp = rp->r_secattr;
3828 rp->r_secattr = NULL;
3829 mutex_exit(&rp->r_statelock);
3830 if (vsp != NULL)
3831 nfs4_acl_free_cache(vsp);
3832 }
3833 }
3834
3835 /*
3836 * If res.array_len == numops, then everything succeeded,
3837 * except for possibly the final getattr. If only the
3838 * last getattr failed, give up, and don't try recovery.
3839 */
3840 if (res.array_len == numops) {
3841 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3842 needrecov);
3843 if (!e.error)
3844 resp = &res;
3845 break;
3846 }
3847
3848 /*
3849 * if either rpc call failed or completely succeeded - done
3850 */
3851 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
3852 if (e.error) {
3853 PURGE_ATTRCACHE4(vp);
3854 if (!needrecov) {
3855 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3856 needrecov);
3857 break;
3858 }
3859 }
3860
3861 /*
3862 * Do proper retry for OLD_STATEID outside of the normal
3863 * recovery framework.
3864 */
3865 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3866 sid_types.cur_sid_type != SPEC_SID &&
3867 sid_types.cur_sid_type != NO_SID) {
3868 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3869 needrecov);
3870 nfs4_save_stateid(&stateid, &sid_types);
3871 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3872 opsetattr.obj_attributes);
3873 if (verify_argop != -1) {
3874 nfs4args_verify_free(&argop[verify_argop]);
3875 verify_argop = -1;
3876 }
3877 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3878 goto recov_retry;
3879 }
3880
3881 if (needrecov) {
3882 bool_t abort;
3883
3884 abort = nfs4_start_recovery(&e,
3885 VTOMI4(vp), vp, NULL, NULL, NULL,
3886 OP_SETATTR, NULL);
3887 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3888 needrecov);
3889 /*
3890 * Do not retry if we failed with OLD_STATEID using
3891 * a special stateid. This is done to avoid looping
3892 * with a broken server.
3893 */
3894 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3895 (sid_types.cur_sid_type == SPEC_SID ||
3896 sid_types.cur_sid_type == NO_SID))
3897 abort = TRUE;
3898 if (!e.error) {
3899 if (res.status == NFS4ERR_BADOWNER)
3900 nfs4_log_badowner(VTOMI4(vp),
3901 OP_SETATTR);
3902
3903 e.error = geterrno4(res.status);
3904 (void) xdr_free(xdr_COMPOUND4res_clnt,
3905 (caddr_t)&res);
3906 }
3907 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3908 opsetattr.obj_attributes);
3909 if (verify_argop != -1) {
3910 nfs4args_verify_free(&argop[verify_argop]);
3911 verify_argop = -1;
3912 }
3913 if (abort == FALSE) {
3914 /*
3915 * Need to retry all possible stateids in
3916 * case the recovery error wasn't stateid
3917 * related or the stateids have become
3918 * stale (server reboot).
3919 */
3920 nfs4_init_stateid_types(&sid_types);
3921 goto recov_retry;
3922 }
3923 return (e.error);
3924 }
3925
3926 /*
3927 * Need to call nfs4_end_op before nfs4getattr to
3928 * avoid potential nfs4_start_op deadlock. See RFE
3929 * 4777612. Calls to nfs4_invalidate_pages() and
3930 * nfs4_purge_stale_fh() might also generate over the
3931 * wire calls which may cause nfs4_start_op() deadlock.
3932 */
3933 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
3934
3935 /*
3936 * Check to update lease.
3937 */
3938 resp = &res;
3939 if (res.status == NFS4_OK) {
3940 break;
3941 }
3942
3943 /*
3944 * Check if the verify failed to see if we should try again.
3945 */
3946 if ((verify_argop == -1) || (res.array_len != 3)) {
3947 /*
3948 * can't continue...
3949 */
3950 if (res.status == NFS4ERR_BADOWNER)
3951 nfs4_log_badowner(VTOMI4(vp), OP_SETATTR);
3952
3953 e.error = geterrno4(res.status);
3954 } else {
3955 /*
3956 * When the verify request fails, the client ctime is
3957 * not in sync with the server. This is the same as
3958 * the version 3 "not synchronized" error, and we
3959 * handle it in a similar manner (XXX do we need to???).
3960 * Use the ctime returned in the first getattr for
3961 * the input to the next verify.
3962 * If we couldn't get the attributes, then we give up
3963 * because we can't complete the operation as required.
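 *
 * Sketch of the retry cycle (indices match the 5-op compound built
 * above): res.array[1] is the GETATTR that ran before the failed
 * VERIFY, so its ctime becomes the next VERIFY argument and control
 * branches back to do_again with a freshly built compound.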
3964 */
3965 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
3966 }
3967 if (e.error) {
3968 PURGE_ATTRCACHE4(vp);
3969 nfs4_purge_stale_fh(e.error, vp, cr);
3970 } else {
3971 /*
3972 * retry with a new verify value
3973 */
3974 ctime = garp->n4g_va.va_ctime;
3975 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3976 resp = NULL;
3977 }
3978 if (!e.error) {
3979 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3980 opsetattr.obj_attributes);
3981 if (verify_argop != -1) {
3982 nfs4args_verify_free(&argop[verify_argop]);
3983 verify_argop = -1;
3984 }
3985 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3986 goto do_again;
3987 }
3988 } while (!e.error);
3989
3990 if (e.error) {
3991 /*
3992 * If we are here, rfs4call has an irrecoverable error - return
3993 */
3994 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3995 opsetattr.obj_attributes);
3996 if (verify_argop != -1) {
3997 nfs4args_verify_free(&argop[verify_argop]);
3998 verify_argop = -1;
3999 }
4000 if (resp)
4001 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4002 return (e.error);
4003 }
4004
4005
4006
4007 /*
4008 * If changing the size of the file, invalidate
4009 * any local cached data which is no longer part
4010 * of the file. We also possibly invalidate the
4011 * last page in the file. We could use
4012 * pvn_vpzero(), but this would mark the page as
4013 * modified and require it to be written back to
4014 * the server for no particularly good reason.
4015 * This way, if we access it, then we bring it
4016 * back in. A read should be cheaper than a
4017 * write.
4018 */
4019 if (mask & AT_SIZE) {
4020 nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr);
4021 }
4022
4023 /* either no error or one of the postop getattr failed */
4024
4025 /*
4026 * XXX Perform a simplified version of wcc checking. Instead of
4027 * having another getattr to get pre-op attributes, just purge the
4028 * cache if any of the ops prior to and including the getattr failed.
4029 * If the getattr succeeded then update the attrcache accordingly.
4030 */
4031
4032 garp = NULL;
4033 if (res.status == NFS4_OK) {
4034 /*
4035 * Last getattr
4036 */
4037 resop = &res.array[numops - 1];
4038 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4039 }
4040 /*
4041 * In certain cases, nfs4_update_attrcache() will purge the attrcache,
4042 * rather than filling it. See the function itself for details.
4043 */
4044 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4045 if (garp != NULL) {
4046 if (garp->n4g_resbmap & FATTR4_ACL_MASK) {
4047 nfs4_acl_fill_cache(rp, &garp->n4g_vsa);
4048 vs_ace4_destroy(&garp->n4g_vsa);
4049 } else {
4050 if (vsap != NULL) {
4051 /*
4052 * The ACL was supposed to be set and to be
4053 * returned in the last getattr of this
4054 * compound, but for some reason the getattr
4055 * result doesn't contain the ACL. In this
4056 * case, purge the ACL cache.
4057 */
4058 if (rp->r_secattr != NULL) {
4059 mutex_enter(&rp->r_statelock);
4060 vsp = rp->r_secattr;
4061 rp->r_secattr = NULL;
4062 mutex_exit(&rp->r_statelock);
4063 if (vsp != NULL)
4064 nfs4_acl_free_cache(vsp);
4065 }
4066 }
4067 }
4068 }
4069
4070 if (res.status == NFS4_OK && (mask & AT_SIZE)) {
4071 /*
4072 * Set the size, rather than relying on getting it updated
4073 * via a GETATTR. With delegations the client tries to
4074 * suppress GETATTR calls.
4075 */
4076 mutex_enter(&rp->r_statelock);
4077 rp->r_size = vap->va_size;
4078 mutex_exit(&rp->r_statelock);
4079 }
4080
4081 /*
4082 * Can free up request args and res
4083 */
4084 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4085 opsetattr.obj_attributes);
4086 if (verify_argop != -1) {
4087 nfs4args_verify_free(&argop[verify_argop]);
4088 verify_argop = -1;
4089 }
4090 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4091
4092 /*
4093 * Some servers will change the mode to clear the setuid
4094 * and setgid bits when changing the uid or gid. The
4095 * client needs to compensate appropriately.
4096 */
4097 if (mask & (AT_UID | AT_GID)) {
4098 int terror, do_setattr;
4099
4100 do_setattr = 0;
4101 va.va_mask = AT_MODE;
4102 terror = nfs4getattr(vp, &va, cr);
4103 if (!terror &&
4104 (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
4105 (!(mask & AT_MODE) && va.va_mode != omode))) {
4106 va.va_mask = AT_MODE;
4107 if (mask & AT_MODE) {
4108 /*
4109 * We asked the mode to be changed and what
4110 * we just got from the server in getattr is
4111 * not what we wanted it to be, so set it now.
4112 */
4113 va.va_mode = vap->va_mode;
4114 do_setattr = 1;
4115 } else {
4116 /*
4117 * We did not ask for the mode to be changed;
4118 * check to see that the server just cleared
4119 * S_ISUID and S_ISGID from it. If not, then
4120 * set mode to omode with the setuid/setgid
4121 * bits cleared.
4122 */
4122 if (nfs4_compare_modes(va.va_mode, omode)) {
4123 omode &= ~(S_ISUID|S_ISGID);
4124 va.va_mode = omode;
4125 do_setattr = 1;
4126 }
4127 }
4128
4129 if (do_setattr)
4130 (void) nfs4setattr(vp, &va, 0, cr, NULL);
4131 }
4132 }
4133
4134 return (e.error);
4135 }
4136
4137 /* ARGSUSED */
4138 static int
4139 nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr)
4140 {
4141 COMPOUND4args_clnt args;
4142 COMPOUND4res_clnt res;
4143 int doqueue;
4144 uint32_t acc, resacc, argacc;
4145 rnode4_t *rp;
4146 cred_t *cred, *ncr, *ncrfree = NULL;
4147 nfs4_access_type_t cacc;
4148 int num_ops;
4149 nfs_argop4 argop[3];
4150 nfs_resop4 *resop;
4151 bool_t needrecov = FALSE, do_getattr;
4152 nfs4_recov_state_t recov_state;
4153 int rpc_error;
4154 hrtime_t t;
4155 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4156 mntinfo4_t *mi = VTOMI4(vp);
4157
4158 if (nfs_zone() != mi->mi_zone)
4159 return (EIO);
4160
4161 acc = 0;
4162 if (mode & VREAD)
4163 acc |= ACCESS4_READ;
4164 if (mode & VWRITE) {
4165 if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type))
4166 return (EROFS);
4167 if (vp->v_type == VDIR)
4168 acc |= ACCESS4_DELETE;
4169 acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
4170 }
4171 if (mode & VEXEC) {
4172 if (vp->v_type == VDIR)
4173 acc |= ACCESS4_LOOKUP;
4174 else
4175 acc |= ACCESS4_EXECUTE;
4176 }
4177
4178 if (VTOR4(vp)->r_acache != NULL) {
4179 e.error = nfs4_validate_caches(vp, cr);
4180 if (e.error)
4181 return (e.error);
4182 }
4183
4184 rp = VTOR4(vp);
4185 if (vp->v_type == VDIR) {
4186 argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY |
4187 ACCESS4_EXTEND | ACCESS4_LOOKUP;
4188 } else {
4189 argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND |
4190 ACCESS4_EXECUTE;
4191 }
4192 recov_state.rs_flags = 0;
4193 recov_state.rs_num_retry_despite_err = 0;
4194
4195 cred = cr;
4196 /*
4197 * ncr and ncrfree both initially
4198 * point to the memory area returned
4199 * by crnetadjust();
4200 * ncrfree not NULL when exiting means
4201 * that we need to release it
4202 */
4203 ncr = crnetadjust(cred);
4204 ncrfree = ncr;
4205
4206 tryagain:
4207 cacc = nfs4_access_check(rp, acc, cred);
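/*
 * Illustrative example (derived from the mapping code above): for
 * VOP_ACCESS(vp, VREAD|VWRITE) on a regular file, acc is
 * ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND. The cached check
 * below answers allowed/denied for exactly that bitmask and cred;
 * anything else falls through to an over-the-wire ACCESS.
 */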
4208 if (cacc == NFS4_ACCESS_ALLOWED) {
4209 if (ncrfree != NULL)
4210 crfree(ncrfree);
4211 return (0);
4212 }
4213 if (cacc == NFS4_ACCESS_DENIED) {
4214 /*
4215 * If the cred can be adjusted, try again
4216 * with the new cred.
4217 */
4218 if (ncr != NULL) {
4219 cred = ncr;
4220 ncr = NULL;
4221 goto tryagain;
4222 }
4223 if (ncrfree != NULL)
4224 crfree(ncrfree);
4225 return (EACCES);
4226 }
4227
4228 recov_retry:
4229 /*
4230 * Don't take the r_statev4_lock here. r_deleg_type could
4231 * change as soon as the lock is released. Since it is an int,
4232 * there is no atomicity issue.
4233 */
4234 do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE);
4235 num_ops = do_getattr ? 3 : 2;
4236
4237 args.ctag = TAG_ACCESS;
4238
4239 args.array_len = num_ops;
4240 args.array = argop;
4241
4242 if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS,
4243 &recov_state, NULL)) {
4244 if (ncrfree != NULL)
4245 crfree(ncrfree);
4246 return (e.error);
4247 }
4248
4249 /* putfh target fh */
4250 argop[0].argop = OP_CPUTFH;
4251 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
4252
4253 /* access */
4254 argop[1].argop = OP_ACCESS;
4255 argop[1].nfs_argop4_u.opaccess.access = argacc;
4256
4257 /* getattr */
4258 if (do_getattr) {
4259 argop[2].argop = OP_GETATTR;
4260 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
4261 argop[2].nfs_argop4_u.opgetattr.mi = mi;
4262 }
4263
4264 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
4265 "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first",
4266 rnode4info(VTOR4(vp))));
4267
4268 doqueue = 1;
4269 t = gethrtime();
4270 rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e);
4271 rpc_error = e.error;
4272
4273 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4274 if (needrecov) {
4275 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
4276 "nfs4_access: initiating recovery\n"));
4277
4278 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
4279 NULL, OP_ACCESS, NULL) == FALSE) {
4280 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS,
4281 &recov_state, needrecov);
4282 if (!e.error)
4283 (void) xdr_free(xdr_COMPOUND4res_clnt,
4284 (caddr_t)&res);
4285 goto recov_retry;
4286 }
4287 }
4288 nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov);
4289
4290 if (e.error)
4291 goto out;
4292
4293 if (res.status) {
4294 e.error = geterrno4(res.status);
4295 /*
4296 * This might generate over the wire calls through
4297 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
4298 * here to avoid a deadlock.
4299 */
4300 nfs4_purge_stale_fh(e.error, vp, cr);
4301 goto out;
4302 }
4303 resop = &res.array[1]; /* access res */
4304
4305 resacc = resop->nfs_resop4_u.opaccess.access;
4306
4307 if (do_getattr) {
4308 resop++; /* getattr res */
4309 nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res,
4310 t, cr, FALSE, NULL);
4311 }
4312
4313 if (!e.error) {
4314 nfs4_access_cache(rp, argacc, resacc, cred);
4315 /*
4316 * we just cached results with cred; if cred is the
4317 * adjusted credentials from crnetadjust, we do not want
4318 * to release them before exiting: hence setting ncrfree
4319 * to NULL
4320 */
4321 if (cred != cr)
4322 ncrfree = NULL;
4323 /* XXX check the supported bits too? */
4324 if ((acc & resacc) != acc) {
4325 /*
4326 * The following code implements the semantic
4327 * that a setuid root program has *at least* the
4328 * permissions of the user that is running the
4329 * program. See rfs3call() for more portions
4330 * of the implementation of this functionality.
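 *
 * Sketch: here cred may still be the caller's (e.g. root) cred; if
 * the server granted fewer bits than requested, we retry once below
 * with the crnetadjust() cred before giving up with EACCES.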
4331 */
4332 /* XXX-LP */
4333 if (ncr != NULL) {
4334 (void) xdr_free(xdr_COMPOUND4res_clnt,
4335 (caddr_t)&res);
4336 cred = ncr;
4337 ncr = NULL;
4338 goto tryagain;
4339 }
4340 e.error = EACCES;
4341 }
4342 }
4343
4344 out:
4345 if (!rpc_error)
4346 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4347
4348 if (ncrfree != NULL)
4349 crfree(ncrfree);
4350
4351 return (e.error);
4352 }
4353
4354 static int
4355 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr)
4356 {
4357 COMPOUND4args_clnt args;
4358 COMPOUND4res_clnt res;
4359 int doqueue;
4360 rnode4_t *rp;
4361 nfs_argop4 argop[3];
4362 nfs_resop4 *resop;
4363 READLINK4res *lr_res;
4364 nfs4_ga_res_t *garp = NULL;
4365 uint_t len;
4366 char *linkdata;
4367 bool_t needrecov = FALSE;
4368 nfs4_recov_state_t recov_state;
4369 hrtime_t t;
4370 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4371
4372 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4373 return (EIO);
4374 /*
4375 * Can't readlink anything other than a symbolic link.
4376 */
4377 if (vp->v_type != VLNK)
4378 return (EINVAL);
4379
4380 rp = VTOR4(vp);
4381 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) {
4382 e.error = nfs4_validate_caches(vp, cr);
4383 if (e.error)
4384 return (e.error);
4385 mutex_enter(&rp->r_statelock);
4386 if (rp->r_symlink.contents != NULL) {
4387 e.error = uiomove(rp->r_symlink.contents,
4388 rp->r_symlink.len, UIO_READ, uiop);
4389 mutex_exit(&rp->r_statelock);
4390 return (e.error);
4391 }
4392 mutex_exit(&rp->r_statelock);
4393 }
4394 recov_state.rs_flags = 0;
4395 recov_state.rs_num_retry_despite_err = 0;
4396
4397 recov_retry:
4398 args.array_len = 3;
4399 args.array = argop;
4400 args.ctag = TAG_READLINK;
4401
4402 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
4403 if (e.error) {
4404 return (e.error);
4405 }
4406
4407 /* 0. putfh symlink fh */
4408 argop[0].argop = OP_CPUTFH;
4409 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
4410
4411 /* 1. readlink */
4412 argop[1].argop = OP_READLINK;
4413
4414 /* 2. getattr */
4415 argop[2].argop = OP_GETATTR;
4416 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
4417 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
4418
4419 doqueue = 1;
4420
4421 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
4422 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first",
4423 rnode4info(VTOR4(vp))));
4424
4425 t = gethrtime();
4426
4427 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
4428
4429 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4430 if (needrecov) {
4431 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
4432 "nfs4_readlink: initiating recovery\n"));
4433
4434 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
4435 NULL, OP_READLINK, NULL) == FALSE) {
4436 if (!e.error)
4437 (void) xdr_free(xdr_COMPOUND4res_clnt,
4438 (caddr_t)&res);
4439
4440 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4441 needrecov);
4442 goto recov_retry;
4443 }
4444 }
4445
4446 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4447
4448 if (e.error)
4449 return (e.error);
4450
4451 /*
4452 * There is a path in the code below which calls
4453 * nfs4_purge_stale_fh(), which may generate otw calls through
4454 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
4455 * here to avoid nfs4_start_op() deadlock.
4456 */
4457
4458 if (res.status && (res.array_len < args.array_len)) {
4459 /*
4460 * either the Putfh or the Readlink failed
4461 */
4462 e.error = geterrno4(res.status);
4463 nfs4_purge_stale_fh(e.error, vp, cr);
4464 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4465 return (e.error);
4466 }
4467
4468 resop = &res.array[1]; /* readlink res */
4469 lr_res = &resop->nfs_resop4_u.opreadlink;
4470
4471 /*
4472 * treat symlink names as data
4473 */
4474 linkdata = utf8_to_str(&lr_res->link, &len, NULL);
4475 if (linkdata != NULL) {
4476 int uio_len = len - 1;
4477 /* len includes null byte, which we won't uiomove */
4478 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop);
4479 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
4480 mutex_enter(&rp->r_statelock);
4481 if (rp->r_symlink.contents == NULL) {
4482 rp->r_symlink.contents = linkdata;
4483 rp->r_symlink.len = uio_len;
4484 rp->r_symlink.size = len;
4485 mutex_exit(&rp->r_statelock);
4486 } else {
4487 mutex_exit(&rp->r_statelock);
4488 kmem_free(linkdata, len);
4489 }
4490 } else {
4491 kmem_free(linkdata, len);
4492 }
4493 }
4494 if (res.status == NFS4_OK) {
4495 resop++; /* getattr res */
4496 garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4497 }
4498 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4499
4500 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4501
4502 /*
4503 * The over the wire error for attempting to readlink something
4504 * other than a symbolic link is ENXIO. However, we need to
4505 * return EINVAL instead of ENXIO, so we map it here.
4506 */
4507 return (e.error == ENXIO ? EINVAL : e.error);
4508 }
4509
4510 /*
4511 * Flush local dirty pages to stable storage on the server.
4512 *
4513 * If FNODSYNC is specified, then there is nothing to do because
4514 * metadata changes are not cached on the client before being
4515 * sent to the server.
4516 */
4517 static int
4518 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr)
4519 {
4520 int error;
4521
4522 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
4523 return (0);
4524 if (nfs_zone() != VTOMI4(vp)->mi_zone)
4525 return (EIO);
4526 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr);
4527 if (!error)
4528 error = VTOR4(vp)->r_error;
4529 return (error);
4530 }
4531
4532 /*
4533 * Weirdness: if the file was removed or the target of a rename
4534 * operation while it was open, it got renamed instead. Here we
4535 * remove the renamed file.
4536 */
4537 static void
4538 nfs4_inactive(vnode_t *vp, cred_t *cr)
4539 {
4540 rnode4_t *rp;
4541
4542 ASSERT(vp != DNLC_NO_VNODE);
4543
4544 rp = VTOR4(vp);
4545
4546 if (IS_SHADOW(vp, rp)) {
4547 sv_inactive(vp);
4548 return;
4549 }
4550
4551 /*
4552 * If this is coming from the wrong zone, we let someone in the right
4553 * zone take care of it asynchronously. We can get here due to
4554 * VN_RELE() being called from pageout() or fsflush(). This call may
4555 * potentially turn into an expensive no-op if, for instance, v_count
4556 * gets incremented in the meantime, but it's still correct.
4557 */
4558 if (nfs_zone() != VTOMI4(vp)->mi_zone) {
4559 nfs4_async_inactive(vp, cr);
4560 return;
4561 }
4562
4563 /*
4564 * Some of the cleanup steps might require over-the-wire
4565 * operations. Since VOP_INACTIVE can get called as a result of
4566 * other over-the-wire operations (e.g., an attribute cache update
4567 * can lead to a DNLC purge), doing those steps now would lead to a
4568 * nested call to the recovery framework, which can deadlock. So
So 4569 * do any over-the-wire cleanups asynchronously, in a separate 4570 * thread. 4571 */ 4572 4573 mutex_enter(&rp->r_os_lock); 4574 mutex_enter(&rp->r_statelock); 4575 mutex_enter(&rp->r_statev4_lock); 4576 4577 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) { 4578 mutex_exit(&rp->r_statev4_lock); 4579 mutex_exit(&rp->r_statelock); 4580 mutex_exit(&rp->r_os_lock); 4581 nfs4_async_inactive(vp, cr); 4582 return; 4583 } 4584 4585 if (rp->r_deleg_type == OPEN_DELEGATE_READ || 4586 rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 4587 mutex_exit(&rp->r_statev4_lock); 4588 mutex_exit(&rp->r_statelock); 4589 mutex_exit(&rp->r_os_lock); 4590 nfs4_async_inactive(vp, cr); 4591 return; 4592 } 4593 4594 if (rp->r_unldvp != NULL) { 4595 mutex_exit(&rp->r_statev4_lock); 4596 mutex_exit(&rp->r_statelock); 4597 mutex_exit(&rp->r_os_lock); 4598 nfs4_async_inactive(vp, cr); 4599 return; 4600 } 4601 mutex_exit(&rp->r_statev4_lock); 4602 mutex_exit(&rp->r_statelock); 4603 mutex_exit(&rp->r_os_lock); 4604 4605 rp4_addfree(rp, cr); 4606 } 4607 4608 /* 4609 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up 4610 * various bits of state. The caller must not refer to vp after this call. 4611 */ 4612 4613 void 4614 nfs4_inactive_otw(vnode_t *vp, cred_t *cr) 4615 { 4616 rnode4_t *rp = VTOR4(vp); 4617 nfs4_recov_state_t recov_state; 4618 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4619 vnode_t *unldvp; 4620 char *unlname; 4621 cred_t *unlcred; 4622 COMPOUND4args_clnt args; 4623 COMPOUND4res_clnt res, *resp; 4624 nfs_argop4 argop[2]; 4625 int doqueue; 4626 #ifdef DEBUG 4627 char *name; 4628 #endif 4629 4630 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 4631 ASSERT(!IS_SHADOW(vp, rp)); 4632 4633 #ifdef DEBUG 4634 name = fn_name(VTOSV(vp)->sv_name); 4635 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: " 4636 "release vnode %s", name)); 4637 kmem_free(name, MAXNAMELEN); 4638 #endif 4639 4640 if (vp->v_type == VREG) { 4641 bool_t recov_failed = FALSE; 4642 4643 e.error = nfs4close_all(vp, cr); 4644 if (e.error) { 4645 /* Check to see if recovery failed */ 4646 mutex_enter(&(VTOMI4(vp)->mi_lock)); 4647 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL) 4648 recov_failed = TRUE; 4649 mutex_exit(&(VTOMI4(vp)->mi_lock)); 4650 if (!recov_failed) { 4651 mutex_enter(&rp->r_statelock); 4652 if (rp->r_flags & R4RECOVERR) 4653 recov_failed = TRUE; 4654 mutex_exit(&rp->r_statelock); 4655 } 4656 if (recov_failed) { 4657 NFS4_DEBUG(nfs4_client_recov_debug, 4658 (CE_NOTE, "nfs4_inactive_otw: " 4659 "close failed (recovery failure)")); 4660 } 4661 } 4662 } 4663 4664 redo: 4665 if (rp->r_unldvp == NULL) { 4666 rp4_addfree(rp, cr); 4667 return; 4668 } 4669 4670 /* 4671 * Save the vnode pointer for the directory where the 4672 * unlinked-open file got renamed, then set it to NULL 4673 * to prevent another thread from getting here before 4674 * we're done with the remove. While we have the 4675 * statelock, make local copies of the pertinent rnode 4676 * fields. If we weren't to do this in an atomic way, the 4677 * the unl* fields could become inconsistent with respect 4678 * to each other due to a race condition between this 4679 * code and nfs_remove(). See bug report 1034328. 
4680 */
4681 mutex_enter(&rp->r_statelock);
4682 if (rp->r_unldvp == NULL) {
4683 mutex_exit(&rp->r_statelock);
4684 rp4_addfree(rp, cr);
4685 return;
4686 }
4687
4688 unldvp = rp->r_unldvp;
4689 rp->r_unldvp = NULL;
4690 unlname = rp->r_unlname;
4691 rp->r_unlname = NULL;
4692 unlcred = rp->r_unlcred;
4693 rp->r_unlcred = NULL;
4694 mutex_exit(&rp->r_statelock);
4695
4696 /*
4697 * If there are any dirty pages left, then flush
4698 * them. This is unfortunate because they just
4699 * may get thrown away during the remove operation,
4700 * but we have to do this for correctness.
4701 */
4702 if (nfs4_has_pages(vp) &&
4703 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
4704 ASSERT(vp->v_type != VCHR);
4705 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr);
4706 if (e.error) {
4707 mutex_enter(&rp->r_statelock);
4708 if (!rp->r_error)
4709 rp->r_error = e.error;
4710 mutex_exit(&rp->r_statelock);
4711 }
4712 }
4713
4714 recov_state.rs_flags = 0;
4715 recov_state.rs_num_retry_despite_err = 0;
4716 recov_retry_remove:
4717 /*
4718 * Do the remove operation on the renamed file
4719 */
4720 args.ctag = TAG_INACTIVE;
4721
4722 /*
4723 * Remove ops: putfh dir; remove
4724 */
4725 args.array_len = 2;
4726 args.array = argop;
4727
4728 e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state);
4729 if (e.error) {
4730 kmem_free(unlname, MAXNAMELEN);
4731 crfree(unlcred);
4732 VN_RELE(unldvp);
4733 /*
4734 * Try again; this time around r_unldvp will be NULL, so we'll
4735 * just call rp4_addfree() and return.
4736 */
4737 goto redo;
4738 }
4739
4740 /* putfh directory */
4741 argop[0].argop = OP_CPUTFH;
4742 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh;
4743
4744 /* remove */
4745 argop[1].argop = OP_CREMOVE;
4746 argop[1].nfs_argop4_u.opcremove.ctarget = unlname;
4747
4748 doqueue = 1;
4749 resp = &res;
4750
4751 #if 0 /* notyet */
4752 /*
4753 * Can't do this yet. We may be being called from
4754 * dnlc_purge_XXX while that routine is holding a
4755 * mutex lock to the nc_rele list. The calls to
4756 * nfs3_cache_wcc_data may result in calls to
4757 * dnlc_purge_XXX. This will result in a deadlock.
4758 */
4759 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4760 if (e.error) {
4761 PURGE_ATTRCACHE4(unldvp);
4762 resp = NULL;
4763 } else if (res.status) {
4764 e.error = geterrno4(res.status);
4765 PURGE_ATTRCACHE4(unldvp);
4766 /*
4767 * This code is inactive right now
4768 * but if made active there should
4769 * be a nfs4_end_op() call before
4770 * nfs4_purge_stale_fh to avoid start_op()
4771 * deadlock. See BugId: 4948726
4772 */
4773 nfs4_purge_stale_fh(e.error, unldvp, cr);
4774 } else {
4775 nfs_resop4 *resop;
4776 REMOVE4res *rm_res;
4777
4778 resop = &res.array[1];
4779 rm_res = &resop->nfs_resop4_u.opremove;
4780 /*
4781 * Update directory cache attribute,
4782 * readdir and dnlc caches.
4783 */ 4784 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL); 4785 } 4786 #else 4787 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4788 4789 PURGE_ATTRCACHE4(unldvp); 4790 #endif 4791 4792 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) { 4793 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL, 4794 NULL, NULL, OP_REMOVE, NULL) == FALSE) { 4795 if (!e.error) 4796 (void) xdr_free(xdr_COMPOUND4res_clnt, 4797 (caddr_t)&res); 4798 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, 4799 &recov_state, TRUE); 4800 goto recov_retry_remove; 4801 } 4802 } 4803 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE); 4804 4805 /* 4806 * Release stuff held for the remove 4807 */ 4808 VN_RELE(unldvp); 4809 if (!e.error && resp) 4810 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4811 4812 kmem_free(unlname, MAXNAMELEN); 4813 crfree(unlcred); 4814 goto redo; 4815 } 4816 4817 /* 4818 * Remote file system operations having to do with directory manipulation. 4819 */ 4820 /* ARGSUSED3 */ 4821 static int 4822 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 4823 int flags, vnode_t *rdir, cred_t *cr) 4824 { 4825 int error; 4826 vnode_t *vp, *avp = NULL; 4827 rnode4_t *drp; 4828 4829 *vpp = NULL; 4830 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 4831 return (EPERM); 4832 /* 4833 * if LOOKUP_XATTR, must replace dvp (object) with 4834 * object's attrdir before continuing with lookup 4835 */ 4836 if (flags & LOOKUP_XATTR) { 4837 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr); 4838 if (error) 4839 return (error); 4840 4841 dvp = avp; 4842 4843 /* 4844 * If lookup is for "", just return dvp now. The attrdir 4845 * has already been activated (from nfs4lookup_xattr), and 4846 * the caller will RELE the original dvp -- not 4847 * the attrdir. So, set vpp and return. 4848 * Currently, when the LOOKUP_XATTR flag is 4849 * passed to VOP_LOOKUP, the name is always empty, and 4850 * shortcircuiting here avoids 3 unneeded lock/unlock 4851 * pairs. 4852 * 4853 * If a non-empty name was provided, then it is the 4854 * attribute name, and it will be looked up below. 4855 */ 4856 if (*nm == '\0') { 4857 *vpp = dvp; 4858 return (0); 4859 } 4860 4861 /* 4862 * The vfs layer never sends a name when asking for the 4863 * attrdir, so we should never get here (unless of course 4864 * name is passed at some time in future -- at which time 4865 * we'll blow up here). 4866 */ 4867 ASSERT(0); 4868 } 4869 4870 drp = VTOR4(dvp); 4871 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 4872 return (EINTR); 4873 4874 error = nfs4lookup(dvp, nm, vpp, cr, 0); 4875 nfs_rw_exit(&drp->r_rwlock); 4876 4877 /* 4878 * If vnode is a device, create special vnode. 
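 *
 * Sketch: specvp() below returns a specfs vnode keyed by the device's
 * v_rdev and v_type, and the reference on the looked-up NFS vnode is
 * released, so callers of lookup always see the special vnode for
 * device files.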
4879 */
4880 if (!error && ISVDEV((*vpp)->v_type)) {
4881 vp = *vpp;
4882 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
4883 VN_RELE(vp);
4884 }
4885
4886 return (error);
4887 }
4888
4889 /* ARGSUSED */
4890 static int
4891 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr)
4892 {
4893 int error;
4894 rnode4_t *drp;
4895 int cflag = ((flags & CREATE_XATTR_DIR) != 0);
4896 mntinfo4_t *mi;
4897
4898 mi = VTOMI4(dvp);
4899 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR))
4900 return (EINVAL);
4901
4902 drp = VTOR4(dvp);
4903 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
4904 return (EINTR);
4905
4906 mutex_enter(&drp->r_statelock);
4907 /*
4908 * If the server doesn't support xattrs just return EINVAL
4909 */
4910 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) {
4911 mutex_exit(&drp->r_statelock);
4912 nfs_rw_exit(&drp->r_rwlock);
4913 return (EINVAL);
4914 }
4915
4916 /*
4917 * If there is a cached xattr directory entry,
4918 * use it as long as the attributes are valid. If the
4919 * attributes are not valid, take the simple approach and
4920 * free the cached value and re-fetch a new value.
4921 *
4922 * We don't cache negative entries for now; if we did, we
4923 * would need to check if the file has changed on every
4924 * lookup. But xattrs don't exist very often and failing
4925 * an openattr is not much more expensive than an NVERIFY or GETATTR,
4926 * so do an openattr over the wire for now.
4927 */
4928 if (drp->r_xattr_dir != NULL) {
4929 if (ATTRCACHE4_VALID(dvp)) {
4930 VN_HOLD(drp->r_xattr_dir);
4931 *vpp = drp->r_xattr_dir;
4932 mutex_exit(&drp->r_statelock);
4933 nfs_rw_exit(&drp->r_rwlock);
4934 return (0);
4935 }
4936 VN_RELE(drp->r_xattr_dir);
4937 drp->r_xattr_dir = NULL;
4938 }
4939 mutex_exit(&drp->r_statelock);
4940
4941 error = nfs4openattr(dvp, vpp, cflag, cr);
4942
4943 nfs_rw_exit(&drp->r_rwlock);
4944
4945 return (error);
4946 }
4947
4948 static int
4949 nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc)
4950 {
4951 int error;
4952 rnode4_t *drp;
4953
4954 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
4955
4956 /*
4957 * If lookup is for "", just return dvp. Don't need
4958 * to send it over the wire, look it up in the dnlc,
4959 * or perform any access checks.
4960 */
4961 if (*nm == '\0') {
4962 VN_HOLD(dvp);
4963 *vpp = dvp;
4964 return (0);
4965 }
4966
4967 /*
4968 * Can't do lookups in non-directories.
4969 */
4970 if (dvp->v_type != VDIR)
4971 return (ENOTDIR);
4972
4973 /*
4974 * If lookup is for ".", just return dvp. Don't need
4975 * to send it over the wire or look it up in the dnlc,
4976 * just need to check access.
4977 */
4978 if (nm[0] == '.' && nm[1] == '\0') {
4979 error = nfs4_access(dvp, VEXEC, 0, cr);
4980 if (error)
4981 return (error);
4982 VN_HOLD(dvp);
4983 *vpp = dvp;
4984 return (0);
4985 }
4986
4987 drp = VTOR4(dvp);
4988 if (!(drp->r_flags & R4LOOKUP)) {
4989 mutex_enter(&drp->r_statelock);
4990 drp->r_flags |= R4LOOKUP;
4991 mutex_exit(&drp->r_statelock);
4992 }
4993
4994 *vpp = NULL;
4995 /*
4996 * Lookup this name in the DNLC. If there is no entry
4997 * lookup over the wire.
4998 */
4999 if (!skipdnlc)
5000 *vpp = dnlc_lookup(dvp, nm);
5001 if (*vpp == NULL) {
5002 /*
5003 * We need to go over the wire to lookup the name.
5004 */
5005 return (nfs4lookupnew_otw(dvp, nm, vpp, cr));
5006 }
5007
5008 /*
5009 * We hit on the dnlc
5010 */
5011 if (*vpp != DNLC_NO_VNODE ||
5012 (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
5013 /*
5014 * But our attrs may not be valid.
5015 */
5016 if (ATTRCACHE4_VALID(dvp)) {
5017 error = nfs4_waitfor_purge_complete(dvp);
5018 if (error) {
5019 VN_RELE(*vpp);
5020 *vpp = NULL;
5021 return (error);
5022 }
5023
5024 /*
5025 * After the purge completes, check to make sure
5026 * our attrs are still valid.
5027 */
5028 if (ATTRCACHE4_VALID(dvp)) {
5029 /*
5030 * If we waited for a purge we may have
5031 * lost our vnode so look it up again.
5032 */
5033 VN_RELE(*vpp);
5034 *vpp = dnlc_lookup(dvp, nm);
5035 if (*vpp == NULL)
5036 return (nfs4lookupnew_otw(dvp,
5037 nm, vpp, cr));
5038
5039 /*
5040 * The access cache should almost always hit
5041 */
5042 error = nfs4_access(dvp, VEXEC, 0, cr);
5043
5044 if (error) {
5045 VN_RELE(*vpp);
5046 *vpp = NULL;
5047 return (error);
5048 }
5049 if (*vpp == DNLC_NO_VNODE) {
5050 VN_RELE(*vpp);
5051 *vpp = NULL;
5052 return (ENOENT);
5053 }
5054 return (0);
5055 }
5056 }
5057 }
5058
5059 ASSERT(*vpp != NULL);
5060
5061 /*
5062 * We may have gotten here because we have one of the following cases:
5063 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we
5064 * need to validate them.
5065 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always
5066 * must validate.
5067 *
5068 * Go to the server and check if the directory has changed, if
5069 * it hasn't we are done and can use the dnlc entry.
5070 */
5071 return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr));
5072 }
5073
5074 /*
5075 * Go to the server and check if the directory has changed, if
5076 * it hasn't we are done and can use the dnlc entry. If it
5077 * has changed we get a new copy of its attributes and check
5078 * the access for VEXEC, then relookup the filename and
5079 * get its filehandle and attributes.
5080 *
5081 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR
5082 * if the NVERIFY failed we must
5083 * purge the caches
5084 * cache new attributes (will set r_time_attr_inval)
5085 * cache new access
5086 * recheck VEXEC access
5087 * add name to dnlc, possibly negative
5088 * if LOOKUP succeeded
5089 * cache new attributes
5090 * else
5091 * set a new r_time_attr_inval for dvp
5092 * check to make sure we have access
5093 *
5094 * The vpp returned is the vnode passed in if the directory is valid,
5095 * a new vnode if successful lookup, or NULL on error.
5096 */
5097 static int
5098 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
5099 {
5100 COMPOUND4args_clnt args;
5101 COMPOUND4res_clnt res;
5102 fattr4 *ver_fattr;
5103 fattr4_change dchange;
5104 int32_t *ptr;
5105 int argoplist_size = 7 * sizeof (nfs_argop4);
5106 nfs_argop4 *argop;
5107 int doqueue;
5108 mntinfo4_t *mi;
5109 nfs4_recov_state_t recov_state;
5110 hrtime_t t;
5111 int isdotdot;
5112 vnode_t *nvp;
5113 nfs_fh4 *fhp;
5114 nfs4_sharedfh_t *sfhp;
5115 nfs4_access_type_t cacc;
5116 rnode4_t *nrp;
5117 rnode4_t *drp = VTOR4(dvp);
5118 nfs4_ga_res_t *garp = NULL;
5119 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
5120
5121 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5122 ASSERT(nm != NULL);
5123 ASSERT(nm[0] != '\0');
5124 ASSERT(dvp->v_type == VDIR);
5125 ASSERT(nm[0] != '.' || nm[1] != '\0');
5126 ASSERT(*vpp != NULL);
5127
5128 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
5129 isdotdot = 1;
5130 args.ctag = TAG_LOOKUP_VPARENT;
5131 } else {
5132 /*
5133 * Do not allow crossing of server mount points. The
5134 * only visible entries in a SRVSTUB dir are . and ..
5135 * This code handles the non-.. case. We can't even get
5136 * this far if looking up ".".
5137 */ 5138 if (VTOR4(dvp)->r_flags & R4SRVSTUB) { 5139 VN_RELE(*vpp); 5140 *vpp = NULL; 5141 return (ENOENT); 5142 } 5143 isdotdot = 0; 5144 args.ctag = TAG_LOOKUP_VALID; 5145 } 5146 5147 mi = VTOMI4(dvp); 5148 recov_state.rs_flags = 0; 5149 recov_state.rs_num_retry_despite_err = 0; 5150 5151 nvp = NULL; 5152 5153 /* Save the original mount point security information */ 5154 (void) save_mnt_secinfo(mi->mi_curr_serv); 5155 5156 recov_retry: 5157 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5158 &recov_state, NULL); 5159 if (e.error) { 5160 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5161 VN_RELE(*vpp); 5162 *vpp = NULL; 5163 return (e.error); 5164 } 5165 5166 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5167 5168 /* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */ 5169 args.array_len = 7; 5170 args.array = argop; 5171 5172 /* 0. putfh file */ 5173 argop[0].argop = OP_CPUTFH; 5174 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5175 5176 /* 1. nverify the change info */ 5177 argop[1].argop = OP_NVERIFY; 5178 ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes; 5179 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5180 ver_fattr->attrlist4 = (char *)&dchange; 5181 ptr = (int32_t *)&dchange; 5182 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5183 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5184 5185 /* 2. getattr directory */ 5186 argop[2].argop = OP_GETATTR; 5187 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5188 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5189 5190 /* 3. access directory */ 5191 argop[3].argop = OP_ACCESS; 5192 argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5193 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5194 5195 /* 4. lookup name */ 5196 if (isdotdot) { 5197 argop[4].argop = OP_LOOKUPP; 5198 } else { 5199 argop[4].argop = OP_CLOOKUP; 5200 argop[4].nfs_argop4_u.opclookup.cname = nm; 5201 } 5202 5203 /* 5. resulting file handle */ 5204 argop[5].argop = OP_GETFH; 5205 5206 /* 6. resulting file attributes */ 5207 argop[6].argop = OP_GETATTR; 5208 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5209 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5210 5211 doqueue = 1; 5212 t = gethrtime(); 5213 5214 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5215 5216 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5217 /* 5218 * For WRONGSEC of a non-dotdot case, send secinfo directly 5219 * from this thread, do not go thru the recovery thread since 5220 * we need the nm information. 5221 * 5222 * Not doing dotdot case because there is no specification 5223 * for (PUTFH, SECINFO "..") yet. 
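 *
 * Sketch of the WRONGSEC path handled below: nfs4_secinfo_vnode_otw()
 * renegotiates the security flavor for "nm" in this thread; on
 * success the current reply is freed and we branch to recov_retry so
 * the whole compound is re-sent under the new flavor.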
5224 */
5225 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
5226 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) {
5227 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5228 &recov_state, FALSE);
5229 } else {
5230 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5231 &recov_state, TRUE);
5232 }
5233 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5234 kmem_free(argop, argoplist_size);
5235 if (!e.error)
5236 goto recov_retry;
5237 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5238 VN_RELE(*vpp);
5239 *vpp = NULL;
5240 return (e.error);
5241 }
5242
5243 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
5244 OP_LOOKUP, NULL) == FALSE) {
5245 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5246 &recov_state, TRUE);
5247
5248 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5249 kmem_free(argop, argoplist_size);
5250 goto recov_retry;
5251 }
5252 }
5253
5254 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);
5255
5256 if (e.error || res.array_len == 0) {
5257 /*
5258 * If e.error isn't set, then reply has no ops (or we couldn't
5259 * be here). The only legal way to reply without an op array
5260 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should
5261 * be in the reply for all other status values.
5262 *
5263 * For valid replies without an ops array, return ENOTSUP
5264 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies,
5265 * return EIO -- don't trust status.
5266 */
5267 if (e.error == 0)
5268 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
5269 ENOTSUP : EIO;
5270 VN_RELE(*vpp);
5271 *vpp = NULL;
5272 kmem_free(argop, argoplist_size);
5273 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5274 return (e.error);
5275 }
5276
5277 if (res.status != NFS4ERR_SAME) {
5278 e.error = geterrno4(res.status);
5279
5280 /*
5281 * The NVERIFY "failed" so the directory has changed.
5282 * First make sure the PUTFH succeeded and the NVERIFY
5283 * "failed" cleanly.
5284 */
5285 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
5286 (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) {
5287 nfs4_purge_stale_fh(e.error, dvp, cr);
5288 VN_RELE(*vpp);
5289 *vpp = NULL;
5290 goto exit;
5291 }
5292
5293 /*
5294 * We know the NVERIFY "failed" so we must:
5295 * purge the caches (access and indirectly dnlc if needed)
5296 */
5297 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);
5298
5299 if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5300 nfs4_purge_stale_fh(e.error, dvp, cr);
5301 VN_RELE(*vpp);
5302 *vpp = NULL;
5303 goto exit;
5304 }
5305
5306 /*
5307 * Install new cached attributes for the directory
5308 */
5309 nfs4_attr_cache(dvp,
5310 &res.array[2].nfs_resop4_u.opgetattr.ga_res,
5311 t, cr, FALSE, NULL);
5312
5313 if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) {
5314 nfs4_purge_stale_fh(e.error, dvp, cr);
5315 VN_RELE(*vpp);
5316 *vpp = NULL;
5317 e.error = geterrno4(res.status);
5318 goto exit;
5319 }
5320
5321 /*
5322 * Now we know the directory is valid,
5323 * cache new directory access
5324 */
5325 nfs4_access_cache(drp,
5326 args.array[3].nfs_argop4_u.opaccess.access,
5327 res.array[3].nfs_resop4_u.opaccess.access, cr);
5328
5329 /*
5330 * recheck VEXEC access
5331 */
5332 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
5333 if (cacc != NFS4_ACCESS_ALLOWED) {
5334 /*
5335 * Directory permissions might have been revoked
5336 */
5337 if (cacc == NFS4_ACCESS_DENIED) {
5338 e.error = EACCES;
5339 VN_RELE(*vpp);
5340 *vpp = NULL;
5341 goto exit;
5342 }
5343
5344 /*
5345 * Somehow we must not have asked for enough,
5346 * so try a singleton ACCESS; this should never happen.
5347 */
5348 e.error = nfs4_access(dvp, VEXEC, 0, cr);
5349 if (e.error) {
5350 VN_RELE(*vpp);
5351 *vpp = NULL;
5352 goto exit;
5353 }
5354 }
5355
5356 e.error = geterrno4(res.status);
5357 if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) {
5358 /*
5359 * The lookup failed, probably no entry
5360 */
5361 if (e.error == ENOENT && nfs4_lookup_neg_cache) {
5362 dnlc_update(dvp, nm, DNLC_NO_VNODE);
5363 } else {
5364 /*
5365 * Might be some other error, so remove
5366 * the dnlc entry to make sure we start all
5367 * over again, next time.
5368 */
5369 dnlc_remove(dvp, nm);
5370 }
5371 VN_RELE(*vpp);
5372 *vpp = NULL;
5373 goto exit;
5374 }
5375
5376 if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) {
5377 /*
5378 * The file exists but we can't get its fh for
5379 * some unknown reason. Remove it from the dnlc
5380 * and error out to be safe.
5381 */
5382 dnlc_remove(dvp, nm);
5383 VN_RELE(*vpp);
5384 *vpp = NULL;
5385 goto exit;
5386 }
5387 fhp = &res.array[5].nfs_resop4_u.opgetfh.object;
5388 if (fhp->nfs_fh4_len == 0) {
5389 /*
5390 * The file exists, but we got a bogus fh for
5391 * some unknown reason. Remove it from the dnlc
5392 * and error out to be safe.
5393 */
5394 e.error = ENOENT;
5395 dnlc_remove(dvp, nm);
5396 VN_RELE(*vpp);
5397 *vpp = NULL;
5398 goto exit;
5399 }
5400 sfhp = sfh4_get(fhp, mi);
5401
5402 if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK)
5403 garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
5404
5405 /*
5406 * Make the new rnode
5407 */
5408 if (isdotdot) {
5409 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
5410 if (e.error) {
5411 sfh4_rele(&sfhp);
5412 VN_RELE(*vpp);
5413 *vpp = NULL;
5414 goto exit;
5415 }
5416 /*
5417 * XXX if nfs4_make_dotdot uses an existing rnode
5418 * XXX it doesn't update the attributes.
5419 * XXX for now just save them again to save an OTW
5420 */
5421 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
5422 } else {
5423 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
5424 dvp, fn_get(VTOSV(dvp)->sv_name, nm));
5425 /*
5426 * If v_type == VNON, then garp was NULL because
5427 * the last op in the compound failed and makenfs4node
5428 * could not find the vnode for sfhp. It created
5429 * a new vnode, so we have nothing to purge here.
5430 */
5431 if (nvp->v_type == VNON) {
5432 vattr_t vattr;
5433
5434 vattr.va_mask = AT_TYPE;
5435 /*
5436 * N.B. We've already called nfs4_end_fop above.
5437 */
5438 e.error = nfs4getattr(nvp, &vattr, cr);
5439 if (e.error) {
5440 sfh4_rele(&sfhp);
5441 VN_RELE(*vpp);
5442 *vpp = NULL;
5443 VN_RELE(nvp);
5444 goto exit;
5445 }
5446 nvp->v_type = vattr.va_type;
5447 }
5448 }
5449 sfh4_rele(&sfhp);
5450
5451 nrp = VTOR4(nvp);
5452 mutex_enter(&nrp->r_statev4_lock);
5453 if (!nrp->created_v4) {
5454 mutex_exit(&nrp->r_statev4_lock);
5455 dnlc_update(dvp, nm, nvp);
5456 } else
5457 mutex_exit(&nrp->r_statev4_lock);
5458
5459 VN_RELE(*vpp);
5460 *vpp = nvp;
5461 } else {
5462 hrtime_t now;
5463 hrtime_t delta = 0;
5464
5465 e.error = 0;
5466
5467 /*
5468 * Because the NVERIFY "succeeded" we know that the
5469 * directory attributes are still valid
5470 * so update r_time_attr_inval
5471 */
5472 now = gethrtime();
5473 mutex_enter(&drp->r_statelock);
5474 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
5475 delta = now - drp->r_time_attr_saved;
5476 if (delta < mi->mi_acdirmin)
5477 delta = mi->mi_acdirmin;
5478 else if (delta > mi->mi_acdirmax)
5479 delta = mi->mi_acdirmax;
5480 }
5481 drp->r_time_attr_inval = now + delta;
5482 mutex_exit(&drp->r_statelock);
5483 dnlc_update(dvp, nm, *vpp);
5484
5485 /*
5486 * Even though we have a valid directory attr cache
5487 * and dnlc entry, we may not have access.
5488 * This should almost always hit the cache.
5489 */
5490 e.error = nfs4_access(dvp, VEXEC, 0, cr);
5491 if (e.error) {
5492 VN_RELE(*vpp);
5493 *vpp = NULL;
5494 }
5495
5496 if (*vpp == DNLC_NO_VNODE) {
5497 VN_RELE(*vpp);
5498 *vpp = NULL;
5499 e.error = ENOENT;
5500 }
5501 }
5502
5503 exit:
5504 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5505 kmem_free(argop, argoplist_size);
5506 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5507 return (e.error);
5508 }
5509
5510 /*
5511 * We need to go over the wire to lookup the name, but
5512 * while we are there verify the directory has not changed;
5513 * if it has, get new attributes and check access.
5514 *
5515 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH
5516 * NVERIFY GETATTR ACCESS
5517 *
5518 * With the results:
5519 * if the NVERIFY failed we must purge the caches, add new attributes,
5520 * and cache new access.
5521 * set a new r_time_attr_inval 5522 * add name to dnlc, possibly negative 5523 * if LOOKUP succeeded 5524 * cache new attributes 5525 */ 5526 static int 5527 nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5528 { 5529 COMPOUND4args_clnt args; 5530 COMPOUND4res_clnt res; 5531 fattr4 *ver_fattr; 5532 fattr4_change dchange; 5533 int32_t *ptr; 5534 nfs4_ga_res_t *garp = NULL; 5535 int argoplist_size = 9 * sizeof (nfs_argop4); 5536 nfs_argop4 *argop; 5537 int doqueue; 5538 mntinfo4_t *mi; 5539 nfs4_recov_state_t recov_state; 5540 hrtime_t t; 5541 int isdotdot; 5542 vnode_t *nvp; 5543 nfs_fh4 *fhp; 5544 nfs4_sharedfh_t *sfhp; 5545 nfs4_access_type_t cacc; 5546 rnode4_t *nrp; 5547 rnode4_t *drp = VTOR4(dvp); 5548 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5549 5550 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5551 ASSERT(nm != NULL); 5552 ASSERT(nm[0] != '\0'); 5553 ASSERT(dvp->v_type == VDIR); 5554 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5555 ASSERT(*vpp == NULL); 5556 5557 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') { 5558 isdotdot = 1; 5559 args.ctag = TAG_LOOKUP_PARENT; 5560 } else { 5561 /* 5562 * Do not allow crossing of server mount points. The 5563 * only visible entries in a SRVSTUB dir are . and .. 5564 * This code handles the non-.. case. We can't even get 5565 * this far if looking up ".". 5566 */ 5567 if (VTOR4(dvp)->r_flags & R4SRVSTUB) 5568 return (ENOENT); 5569 5570 isdotdot = 0; 5571 args.ctag = TAG_LOOKUP; 5572 } 5573 5574 mi = VTOMI4(dvp); 5575 recov_state.rs_flags = 0; 5576 recov_state.rs_num_retry_despite_err = 0; 5577 5578 nvp = NULL; 5579 5580 /* Save the original mount point security information */ 5581 (void) save_mnt_secinfo(mi->mi_curr_serv); 5582 5583 recov_retry: 5584 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5585 &recov_state, NULL); 5586 if (e.error) { 5587 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5588 return (e.error); 5589 } 5590 5591 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5592 5593 /* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */ 5594 args.array_len = 9; 5595 args.array = argop; 5596 5597 /* 0. putfh file */ 5598 argop[0].argop = OP_CPUTFH; 5599 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5600 5601 /* 1. savefh for the nverify */ 5602 argop[1].argop = OP_SAVEFH; 5603 5604 /* 2. lookup name */ 5605 if (isdotdot) { 5606 argop[2].argop = OP_LOOKUPP; 5607 } else { 5608 argop[2].argop = OP_CLOOKUP; 5609 argop[2].nfs_argop4_u.opclookup.cname = nm; 5610 } 5611 5612 /* 3. resulting file handle */ 5613 argop[3].argop = OP_GETFH; 5614 5615 /* 4. resulting file attributes */ 5616 argop[4].argop = OP_GETATTR; 5617 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5618 argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5619 5620 /* 5. restorefh back the directory for the nverify */ 5621 argop[5].argop = OP_RESTOREFH; 5622 5623 /* 6. nverify the change info */ 5624 argop[6].argop = OP_NVERIFY; 5625 ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes; 5626 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5627 ver_fattr->attrlist4 = (char *)&dchange; 5628 ptr = (int32_t *)&dchange; 5629 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5630 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5631 5632 /* 7. getattr directory */ 5633 argop[7].argop = OP_GETATTR; 5634 argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5635 argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5636 5637 /* 8. 
access directory */
5638 argop[8].argop = OP_ACCESS;
5639 argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
5640 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;
5641
5642 doqueue = 1;
5643 t = gethrtime();
5644
5645 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
5646
5647 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
5648 /*
5649 * For WRONGSEC of a non-dotdot case, send secinfo directly
5650 * from this thread, do not go thru the recovery thread since
5651 * we need the nm information.
5652 *
5653 * Not doing dotdot case because there is no specification
5654 * for (PUTFH, SECINFO "..") yet.
5655 */
5656 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
5657 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) {
5658 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5659 &recov_state, FALSE);
5660 } else {
5661 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5662 &recov_state, TRUE);
5663 }
5664 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5665 kmem_free(argop, argoplist_size);
5666 if (!e.error)
5667 goto recov_retry;
5668 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5669 return (e.error);
5670 }
5671
5672 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
5673 OP_LOOKUP, NULL) == FALSE) {
5674 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
5675 &recov_state, TRUE);
5676
5677 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5678 kmem_free(argop, argoplist_size);
5679 goto recov_retry;
5680 }
5681 }
5682
5683 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);
5684
5685 if (e.error || res.array_len == 0) {
5686 /*
5687 * If e.error isn't set, then reply has no ops (or we couldn't
5688 * be here). The only legal way to reply without an op array
5689 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should
5690 * be in the reply for all other status values.
5691 *
5692 * For valid replies without an ops array, return ENOTSUP
5693 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies,
5694 * return EIO -- don't trust status.
5695 */
5696 if (e.error == 0)
5697 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
5698 ENOTSUP : EIO;
5699
5700 kmem_free(argop, argoplist_size);
5701 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5702 return (e.error);
5703 }
5704
5705 e.error = geterrno4(res.status);
5706
5707 /*
5708 * The PUTFH and SAVEFH may have failed.
5709 */
5710 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
5711 (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) {
5712 nfs4_purge_stale_fh(e.error, dvp, cr);
5713 goto exit;
5714 }
5715
5716 /*
5717 * Check if the file exists; if it does, delay entering
5718 * it into the dnlc until after we update the directory
5719 * attributes so we don't cause it to get purged immediately.
5720 */
5721 if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) {
5722 /*
5723 * The lookup failed, probably no entry
5724 */
5725 if (e.error == ENOENT && nfs4_lookup_neg_cache) {
5726 dnlc_update(dvp, nm, DNLC_NO_VNODE);
5727 }
5728 goto exit;
5729 }
5730
5731 if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) {
5732 /*
5733 * The file exists but we can't get its fh for
5734 * some unknown reason. Error out to be safe.
5735 */
5736 goto exit;
5737 }
5738
5739 fhp = &res.array[3].nfs_resop4_u.opgetfh.object;
5740 if (fhp->nfs_fh4_len == 0) {
5741 /*
5742 * The file exists, but we got a bogus fh for
5743 * some unknown reason. Error out to be safe.
5744 */
5745 e.error = EIO;
5746 goto exit;
5747 }
5748 sfhp = sfh4_get(fhp, mi);
5749
5750 if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5751 sfh4_rele(&sfhp);
5752 e.error = EIO;
5753 goto exit;
5754 }
5755 garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
5756
5757 /*
5758 * The RESTOREFH may have failed
5759 */
5760 if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) {
5761 sfh4_rele(&sfhp);
5762 e.error = EIO;
5763 goto exit;
5764 }
5765
5766 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) {
5767 /*
5768 * First make sure the NVERIFY failed as we expected;
5769 * if it didn't, be conservative and error out,
5770 * as we can't trust the directory.
5771 */
5772 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) {
5773 sfh4_rele(&sfhp);
5774 e.error = EIO;
5775 goto exit;
5776 }
5777
5778 /*
5779 * We know the NVERIFY "failed" so the directory has changed,
5780 * so we must:
5781 * purge the caches (access and indirectly dnlc if needed)
5782 */
5783 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);
5784
5785 if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) {
5786 sfh4_rele(&sfhp);
5787 goto exit;
5788 }
5789 nfs4_attr_cache(dvp,
5790 &res.array[7].nfs_resop4_u.opgetattr.ga_res,
5791 t, cr, FALSE, NULL);
5792
5793 if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) {
5794 nfs4_purge_stale_fh(e.error, dvp, cr);
5795 sfh4_rele(&sfhp);
5796 e.error = geterrno4(res.status);
5797 goto exit;
5798 }
5799
5800 /*
5801 * Now we know the directory is valid,
5802 * cache new directory access
5803 */
5804 nfs4_access_cache(drp,
5805 args.array[8].nfs_argop4_u.opaccess.access,
5806 res.array[8].nfs_resop4_u.opaccess.access, cr);
5807
5808 /*
5809 * recheck VEXEC access
5810 */
5811 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
5812 if (cacc != NFS4_ACCESS_ALLOWED) {
5813 /*
5814 * Directory permissions might have been revoked
5815 */
5816 if (cacc == NFS4_ACCESS_DENIED) {
5817 sfh4_rele(&sfhp);
5818 e.error = EACCES;
5819 goto exit;
5820 }
5821
5822 /*
5823 * Somehow we must not have asked for enough access,
5824 * so try a singleton ACCESS; this should never happen.
5825 */
5826 e.error = nfs4_access(dvp, VEXEC, 0, cr);
5827 if (e.error) {
5828 sfh4_rele(&sfhp);
5829 goto exit;
5830 }
5831 }
5832
5833 e.error = geterrno4(res.status);
5834 } else {
5835 hrtime_t now;
5836 hrtime_t delta = 0;
5837
5838 e.error = 0;
5839
5840 /*
5841 * Because the NVERIFY "succeeded" we know that the
5842 * directory attributes are still valid,
5843 * so update r_time_attr_inval
5844 */
5845 now = gethrtime();
5846 mutex_enter(&drp->r_statelock);
5847 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
5848 delta = now - drp->r_time_attr_saved;
5849 if (delta < mi->mi_acdirmin)
5850 delta = mi->mi_acdirmin;
5851 else if (delta > mi->mi_acdirmax)
5852 delta = mi->mi_acdirmax;
5853 }
5854 drp->r_time_attr_inval = now + delta;
5855 mutex_exit(&drp->r_statelock);
5856
5857 /*
5858 * Even though we have a valid directory attr cache,
5859 * we may not have access.
5860 * This should almost always hit the cache.
5861 */
5862 e.error = nfs4_access(dvp, VEXEC, 0, cr);
5863 if (e.error) {
5864 sfh4_rele(&sfhp);
5865 goto exit;
5866 }
5867 }
5868
5869 /*
5870 * Now we have successfully completed the lookup; if the
5871 * directory has changed, we now have the valid attributes.
5872 * We also know we have directory access.
5873 * Create the new rnode and insert it in the dnlc.
5874 */
5875 if (isdotdot) {
5876 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
5877 if (e.error) {
5878 sfh4_rele(&sfhp);
5879 goto exit;
5880 }
5881 /*
5882 * XXX if nfs4_make_dotdot uses an existing rnode
5883 * XXX it doesn't update the attributes.
5884 * XXX for now just save them again to save an OTW
5885 */
5886 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
5887 } else {
5888 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
5889 dvp, fn_get(VTOSV(dvp)->sv_name, nm));
5890 }
5891 sfh4_rele(&sfhp);
5892
5893 nrp = VTOR4(nvp);
5894 mutex_enter(&nrp->r_statev4_lock);
5895 if (!nrp->created_v4) {
5896 mutex_exit(&nrp->r_statev4_lock);
5897 dnlc_update(dvp, nm, nvp);
5898 } else
5899 mutex_exit(&nrp->r_statev4_lock);
5900
5901 *vpp = nvp;
5902
5903 exit:
5904 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
5905 kmem_free(argop, argoplist_size);
5906 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
5907 return (e.error);
5908 }
5909
5910 #ifdef DEBUG
5911 void
5912 nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt)
5913 {
5914 uint_t i, len;
5915 zoneid_t zoneid = getzoneid();
5916 char *s;
5917
5918 zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where);
5919 for (i = 0; i < argcnt; i++) {
5920 nfs_argop4 *op = &argbase[i];
5921 switch (op->argop) {
5922 case OP_CPUTFH:
5923 case OP_PUTFH:
5924 zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i);
5925 break;
5926 case OP_PUTROOTFH:
5927 zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i);
5928 break;
5929 case OP_CLOOKUP:
5930 s = op->nfs_argop4_u.opclookup.cname;
5931 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
5932 break;
5933 case OP_LOOKUP:
5934 s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname,
5935 &len, NULL);
5936 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
5937 kmem_free(s, len);
5938 break;
5939 case OP_LOOKUPP:
5940 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i);
5941 break;
5942 case OP_GETFH:
5943 zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i);
5944 break;
5945 case OP_GETATTR:
5946 zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i);
5947 break;
5948 case OP_OPENATTR:
5949 zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i);
5950 break;
5951 default:
5952 zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", i,
5953 op->argop);
5954 break;
5955 }
5956 }
5957 }
5958 #endif
5959
5960 /*
5961 * nfs4lookup_setup - constructs a multi-lookup compound request.
5962 *
5963 * Given the path "nm1/nm2/.../nmn", the following compound requests
5964 * may be created:
5965 *
5966 * Note: Getfh should not be needed because the filehandle attr is
5967 * mandatory, but it is faster, for now.
5968 *
5969 * l4_getattrs indicates the type of compound requested.
5970 *
5971 * LKP4_NO_ATTRIBUTES - no attributes (used by secinfo):
5972 *
5973 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} }
5974 *
5975 * total number of ops is n + 1.
5976 *
5977 * LKP4_LAST_NAMED_ATTR - multi-component path for a named
5978 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR
5979 * before the last component, and only get attributes
5980 * for the last component. Note that the second-to-last
5981 * pathname component is XATTR_RPATH, which does NOT go
5982 * over-the-wire as a lookup.
5983 *
5984 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2};
5985 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr }
5986 *
5987 * and total number of ops is n + 5.
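 *
 * E.g. (illustrative): for nm = "dir/<XATTR_RPATH>/attr", n = 3, and
 * the request is:
 *
 * compound { Put*fh; Lookup {dir}; Openattr; Getfh; Getattr;
 * Lookup {attr}; Getfh; Getattr }
 *
 * i.e. n + 5 = 8 ops.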
5988 *
5989 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named
5990 * attribute directory: create lookups plus an OPENATTR
5991 * replacing the last lookup. Note that the last pathname
5992 * component is XATTR_RPATH, which does NOT go over-the-wire
5993 * as a lookup.
5994 *
5995 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr;
5996 * Openattr; Getfh; Getattr }
5997 *
5998 * and total number of ops is n + 5.
5999 *
6000 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate
6001 * nodes too.
6002 *
6003 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr;
6004 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr }
6005 *
6006 * and total number of ops is 3*n + 1.
6007 *
6008 * All cases: returns the index in the arg array of the final LOOKUP op, or
6009 * -1 if no LOOKUPs were used.
6010 */
6011 int
6012 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh)
6013 {
6014 enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs;
6015 nfs_argop4 *argbase, *argop;
6016 int arglen, argcnt;
6017 int n = 1; /* number of components */
6018 int nga = 1; /* number of Getattr's in request */
6019 char c = '\0', *s, *p;
6020 int lookup_idx = -1;
6021 int argoplist_size;
6022
6023 /* set lookuparg response result to 0 */
6024 lookupargp->resp->status = NFS4_OK;
6025
6026 /* skip leading "/" or "." components, e.g. ".//./", if present */
6027 for (; ; nm++) {
6028 if (*nm != '/' && *nm != '.')
6029 break;
6030
6031 /* ".." is counted as 1 component */
6032 if (*nm == '.' && *(nm + 1) == '.')
6033 break;
6034 }
6035
6036 /*
6037 * Find n = number of components - nm must be null terminated
6038 * Skip "." components.
6039 */
6040 if (*nm != '\0') {
6041 for (n = 1, s = nm; *s != '\0'; s++) {
6042 if ((*s == '/') && (*(s + 1) != '/') &&
6043 (*(s + 1) != '\0') &&
6044 !(*(s + 1) == '.' && (*(s + 2) == '/' ||
6045 *(s + 2) == '\0')))
6046 n++;
6047 }
6048 } else
6049 n = 0;
6050
6051 /*
6052 * nga is number of components that need Getfh+Getattr
6053 */
6054 switch (l4_getattrs) {
6055 case LKP4_NO_ATTRIBUTES:
6056 nga = 0;
6057 break;
6058 case LKP4_ALL_ATTRIBUTES:
6059 nga = n;
6060 /*
6061 * Always have at least 1 getfh, getattr pair
6062 */
6063 if (nga == 0)
6064 nga++;
6065 break;
6066 case LKP4_LAST_ATTRDIR:
6067 case LKP4_LAST_NAMED_ATTR:
6068 nga = n+1;
6069 break;
6070 }
6071
6072 /*
6073 * If we change to use the filehandle attr instead of getfh,
6074 * the following line can be deleted.
6075 */
6076 nga *= 2;
6077
6078 /*
6079 * calculate number of ops in request as
6080 * header + trailer + lookups + getattrs
6081 */
6082 arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga;
6083
6084 argoplist_size = arglen * sizeof (nfs_argop4);
6085 argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP);
6086 lookupargp->argsp->array = argop;
6087
6088 argcnt = lookupargp->header_len;
6089 argop += argcnt;
6090
6091 /*
6092 * loop and create a lookup op and possibly getattr/getfh for
6093 * each component. Skip "." components.
6094 */
6095 for (s = nm; *s != '\0'; s = p) {
6096 /*
6097 * Set up a pathname struct for each component if needed
6098 */
6099 while (*s == '/')
6100 s++;
6101 if (*s == '\0')
6102 break;
6103 for (p = s; (*p != '/') && (*p != '\0'); p++);
6104 c = *p;
6105 *p = '\0';
6106
6107 if (s[0] == '.'
&& s[1] == '\0') { 6108 *p = c; 6109 continue; 6110 } 6111 if (l4_getattrs == LKP4_LAST_ATTRDIR && 6112 strcmp(s, XATTR_RPATH) == 0) { 6113 /* getfh XXX may not be needed in future */ 6114 argop->argop = OP_GETFH; 6115 argop++; 6116 argcnt++; 6117 6118 /* getattr */ 6119 argop->argop = OP_GETATTR; 6120 argop->nfs_argop4_u.opgetattr.attr_request = 6121 lookupargp->ga_bits; 6122 argop->nfs_argop4_u.opgetattr.mi = 6123 lookupargp->mi; 6124 argop++; 6125 argcnt++; 6126 6127 /* openattr */ 6128 argop->argop = OP_OPENATTR; 6129 } else if (l4_getattrs == LKP4_LAST_NAMED_ATTR && 6130 strcmp(s, XATTR_RPATH) == 0) { 6131 /* openattr */ 6132 argop->argop = OP_OPENATTR; 6133 argop++; 6134 argcnt++; 6135 6136 /* getfh XXX may not be needed in future */ 6137 argop->argop = OP_GETFH; 6138 argop++; 6139 argcnt++; 6140 6141 /* getattr */ 6142 argop->argop = OP_GETATTR; 6143 argop->nfs_argop4_u.opgetattr.attr_request = 6144 lookupargp->ga_bits; 6145 argop->nfs_argop4_u.opgetattr.mi = 6146 lookupargp->mi; 6147 argop++; 6148 argcnt++; 6149 *p = c; 6150 continue; 6151 } else if (s[0] == '.' && s[1] == '.' && s[2] == '\0') { 6152 /* lookupp */ 6153 argop->argop = OP_LOOKUPP; 6154 } else { 6155 /* lookup */ 6156 argop->argop = OP_LOOKUP; 6157 (void) str_to_utf8(s, 6158 &argop->nfs_argop4_u.oplookup.objname); 6159 } 6160 lookup_idx = argcnt; 6161 argop++; 6162 argcnt++; 6163 6164 *p = c; 6165 6166 if (l4_getattrs == LKP4_ALL_ATTRIBUTES) { 6167 /* getfh XXX may not be needed in future */ 6168 argop->argop = OP_GETFH; 6169 argop++; 6170 argcnt++; 6171 6172 /* getattr */ 6173 argop->argop = OP_GETATTR; 6174 argop->nfs_argop4_u.opgetattr.attr_request = 6175 lookupargp->ga_bits; 6176 argop->nfs_argop4_u.opgetattr.mi = 6177 lookupargp->mi; 6178 argop++; 6179 argcnt++; 6180 } 6181 } 6182 6183 if ((l4_getattrs != LKP4_NO_ATTRIBUTES) && 6184 ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) { 6185 if (needgetfh) { 6186 /* stick in a post-lookup getfh */ 6187 argop->argop = OP_GETFH; 6188 argcnt++; 6189 argop++; 6190 } 6191 /* post-lookup getattr */ 6192 argop->argop = OP_GETATTR; 6193 argop->nfs_argop4_u.opgetattr.attr_request = 6194 lookupargp->ga_bits; 6195 argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi; 6196 argcnt++; 6197 } 6198 argcnt += lookupargp->trailer_len; /* actual op count */ 6199 lookupargp->argsp->array_len = argcnt; 6200 lookupargp->arglen = arglen; 6201 6202 #ifdef DEBUG 6203 if (nfs4_client_lookup_debug) 6204 nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt); 6205 #endif 6206 6207 return (lookup_idx); 6208 } 6209 6210 static int 6211 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr) 6212 { 6213 COMPOUND4args_clnt args; 6214 COMPOUND4res_clnt res; 6215 GETFH4res *gf_res = NULL; 6216 nfs_argop4 argop[4]; 6217 nfs_resop4 *resop = NULL; 6218 nfs4_sharedfh_t *sfhp; 6219 hrtime_t t; 6220 nfs4_error_t e; 6221 6222 rnode4_t *drp; 6223 int doqueue = 1; 6224 vnode_t *vp; 6225 int needrecov = 0; 6226 nfs4_recov_state_t recov_state; 6227 6228 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 6229 6230 *avp = NULL; 6231 recov_state.rs_flags = 0; 6232 recov_state.rs_num_retry_despite_err = 0; 6233 6234 recov_retry: 6235 /* COMPOUND: putfh, openattr, getfh, getattr */ 6236 args.array_len = 4; 6237 args.array = argop; 6238 args.ctag = TAG_OPENATTR; 6239 6240 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 6241 if (e.error) 6242 return (e.error); 6243 6244 drp = VTOR4(dvp); 6245 6246 /* putfh */ 6247 argop[0].argop = OP_CPUTFH; 6248 argop[0].nfs_argop4_u.opcputfh.sfh = 
drp->r_fh;
6249
6250 /* openattr */
6251 argop[1].argop = OP_OPENATTR;
6252 argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE);
6253
6254 /* getfh */
6255 argop[2].argop = OP_GETFH;
6256
6257 /* getattr */
6258 argop[3].argop = OP_GETATTR;
6259 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6260 argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
6261
6262 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
6263 "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first",
6264 rnode4info(drp)));
6265
6266 t = gethrtime();
6267
6268 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
6269
6270 needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp);
6271 if (needrecov) {
6272 bool_t abort;
6273
6274 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
6275 "nfs4openattr: initiating recovery\n"));
6276
6277 abort = nfs4_start_recovery(&e,
6278 VTOMI4(dvp), dvp, NULL, NULL, NULL,
6279 OP_OPENATTR, NULL);
6280 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6281 if (!e.error) {
6282 e.error = geterrno4(res.status);
6283 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6284 }
6285 if (abort == FALSE)
6286 goto recov_retry;
6287 return (e.error);
6288 }
6289
6290 if (e.error) {
6291 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6292 return (e.error);
6293 }
6294
6295 if (res.status) {
6296 /*
6297 * If the OTW error is NOTSUPP, then it should be
6298 * translated to EINVAL. All Solaris file system
6299 * implementations return EINVAL to the syscall layer
6300 * when the attrdir cannot be created due to an
6301 * implementation restriction or noxattr mount option.
6302 */
6303 if (res.status == NFS4ERR_NOTSUPP) {
6304 mutex_enter(&drp->r_statelock);
6305 if (drp->r_xattr_dir)
6306 VN_RELE(drp->r_xattr_dir);
6307 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP);
6308 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP;
6309 mutex_exit(&drp->r_statelock);
6310
6311 e.error = EINVAL;
6312 } else {
6313 e.error = geterrno4(res.status);
6314 }
6315
6316 if (e.error) {
6317 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6318 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
6319 needrecov);
6320 return (e.error);
6321 }
6322 }
6323
6324 resop = &res.array[0]; /* putfh res */
6325 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK);
6326
6327 resop = &res.array[1]; /* openattr res */
6328 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK);
6329
6330 resop = &res.array[2]; /* getfh res */
6331 gf_res = &resop->nfs_resop4_u.opgetfh;
6332 if (gf_res->object.nfs_fh4_len == 0) {
6333 *avp = NULL;
6334 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6335 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6336 return (ENOENT);
6337 }
6338
6339 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp));
6340 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res,
6341 dvp->v_vfsp, t, cr, dvp,
6342 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH));
6343 sfh4_rele(&sfhp);
6344
6345 if (e.error)
6346 PURGE_ATTRCACHE4(vp);
6347
6348 mutex_enter(&vp->v_lock);
6349 vp->v_flag |= V_XATTRDIR;
6350 mutex_exit(&vp->v_lock);
6351
6352 *avp = vp;
6353
6354 mutex_enter(&drp->r_statelock);
6355 if (drp->r_xattr_dir)
6356 VN_RELE(drp->r_xattr_dir);
6357 VN_HOLD(vp);
6358 drp->r_xattr_dir = vp;
6359
6360 /*
6361 * Invalidate pathconf4 cache because r_xattr_dir is no longer
6362 * NULL. xattrs could be created at any time, and we have no
6363 * way to update pc4_xattr_exists in the base object if/when
6364 * it happens.
6365 */
6366 drp->r_pathconf.pc4_xattr_valid = 0;
6367
6368 mutex_exit(&drp->r_statelock);
6369
6370 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6371
6372 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6373
6374 return (0);
6375 }
6376
6377 /* ARGSUSED */
6378 static int
6379 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
6380 int mode, vnode_t **vpp, cred_t *cr, int flags)
6381 {
6382 int error;
6383 vnode_t *vp = NULL;
6384 rnode4_t *rp;
6385 struct vattr vattr;
6386 rnode4_t *drp;
6387 vnode_t *tempvp;
6388 enum createmode4 createmode;
6389 bool_t must_trunc = FALSE;
6390
6391 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
6392 return (EPERM);
6393 if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) {
6394 return (EINVAL);
6395 }
6396
6397 /* . and .. have special meaning in the protocol, reject them. */
6398
6399 if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0')))
6400 return (EISDIR);
6401
6402 drp = VTOR4(dvp);
6403
6404 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
6405 return (EINTR);
6406
6407 top:
6408 /*
6409 * We make a copy of the attributes because the caller does not
6410 * expect us to change what va points to.
6411 */
6412 vattr = *va;
6413
6414 /*
6415 * If the pathname is "", then dvp is the root vnode of
6416 * a remote file system mounted over a local directory.
6417 * All that needs to be done is access
6418 * checking and truncation. Note that we avoid doing
6419 * open w/ create because the parent directory might
6420 * be in pseudo-fs and the open would fail.
6421 */
6422 if (*nm == '\0') {
6423 error = 0;
6424 VN_HOLD(dvp);
6425 vp = dvp;
6426 must_trunc = TRUE;
6427 } else {
6428 /*
6429 * We need to go over the wire, just to be sure whether the
6430 * file exists or not. Using the DNLC can be dangerous in
6431 * this case when making a decision regarding existence.
6432 */
6433 error = nfs4lookup(dvp, nm, &vp, cr, 1);
6434 }
6435
6436 if (exclusive)
6437 createmode = EXCLUSIVE4;
6438 else
6439 createmode = GUARDED4;
6440
6441 /*
6442 * error would be set if the file does not exist on the
6443 * server, so let's go create it.
6444 */
6445 if (error) {
6446 goto create_otw;
6447 }
6448
6449 /*
6450 * File does exist on the server
6451 */
6452 if (exclusive == EXCL)
6453 error = EEXIST;
6454 else if (vp->v_type == VDIR && (mode & VWRITE))
6455 error = EISDIR;
6456 else {
6457 /*
6458 * If vnode is a device, create special vnode.
6459 */
6460 if (ISVDEV(vp->v_type)) {
6461 tempvp = vp;
6462 vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
6463 VN_RELE(tempvp);
6464 }
6465 if (!(error = VOP_ACCESS(vp, mode, 0, cr))) {
6466 if ((vattr.va_mask & AT_SIZE) &&
6467 vp->v_type == VREG) {
6468 rp = VTOR4(vp);
6469 /*
6470 * Check here for large file handled
6471 * by LF-unaware process (as
6472 * ufs_create() does)
6473 */
6474 if (!(flags & FOFFMAX)) {
6475 mutex_enter(&rp->r_statelock);
6476 if (rp->r_size > MAXOFF32_T)
6477 error = EOVERFLOW;
6478 mutex_exit(&rp->r_statelock);
6479 }
6480
6481 /* if error is set then we need to return */
6482 if (error) {
6483 nfs_rw_exit(&drp->r_rwlock);
6484 VN_RELE(vp);
6485 return (error);
6486 }
6487
6488 if (must_trunc) {
6489 vattr.va_mask = AT_SIZE;
6490 error = nfs4setattr(vp, &vattr, 0, cr,
6491 NULL);
6492 } else {
6493 /*
6494 * we know we have a regular file that already
6495 * exists and we may end up truncating the file
6496 * as a result of the open_otw, so flush out
6497 * any dirty pages for this file first.
6498 */
6499 if (nfs4_has_pages(vp) &&
6500 ((rp->r_flags & R4DIRTY) ||
6501 rp->r_count > 0 ||
6502 rp->r_mapcnt > 0)) {
6503 error = nfs4_putpage(vp,
6504 (offset_t)0, 0, 0, cr);
6505 if (error && (error == ENOSPC ||
6506 error == EDQUOT)) {
6507 mutex_enter(
6508 &rp->r_statelock);
6509 if (!rp->r_error)
6510 rp->r_error =
6511 error;
6512 mutex_exit(
6513 &rp->r_statelock);
6514 }
6515 }
6516 vattr.va_mask = (AT_SIZE |
6517 AT_TYPE | AT_MODE);
6518 vattr.va_type = VREG;
6519 createmode = UNCHECKED4;
6520 goto create_otw;
6521 }
6522 }
6523 }
6524 }
6525 nfs_rw_exit(&drp->r_rwlock);
6526 if (error) {
6527 VN_RELE(vp);
6528 } else {
6529 *vpp = vp;
6530 }
6531 return (error);
6532
6533 create_otw:
6534 dnlc_remove(dvp, nm);
6535
6536 ASSERT(vattr.va_mask & AT_TYPE);
6537
6538 /*
6539 * If it is not a regular file, let nfs4mknod() handle it.
6540 */
6541 if (vattr.va_type != VREG) {
6542 error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
6543 nfs_rw_exit(&drp->r_rwlock);
6544 return (error);
6545 }
6546
6547 /*
6548 * It _is_ a regular file.
6549 */
6550 ASSERT(vattr.va_mask & AT_MODE);
6551 if (MANDMODE(vattr.va_mode)) {
6552 nfs_rw_exit(&drp->r_rwlock);
6553 return (EACCES);
6554 }
6555
6556 /*
6557 * If this happens to be a mknod of a regular file, then flags will
6558 * have neither FREAD nor FWRITE. However, we must set at least one
6559 * for the call to nfs4open_otw. If it's open(O_CREAT) driving
6560 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been
6561 * set (based on openmode specified by app).
6562 */
6563 if ((flags & (FREAD|FWRITE)) == 0)
6564 flags |= (FREAD|FWRITE);
6565
6566 error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0);
6567
6568 if (vp != NULL) {
6569 /* if create was successful, throw away the file's pages */
6570 if (!error && (vattr.va_mask & AT_SIZE))
6571 nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK),
6572 cr);
6573 /* release the lookup hold */
6574 VN_RELE(vp);
6575 vp = NULL;
6576 }
6577
6578 /*
6579 * validate that we opened a regular file. This handles a misbehaving
6580 * server that returns an incorrect FH.
6581 */
6582 if ((error == 0) && *vpp && (*vpp)->v_type != VREG) {
6583 error = EISDIR;
6584 VN_RELE(*vpp);
6585 }
6586
6587 /*
6588 * If this is not an exclusive create, then the CREATE
6589 * request will be made with the GUARDED mode set. This
6590 * means that the server will return EEXIST if the file
6591 * exists. The file could exist because of a retransmitted
6592 * request. In this case, we recover by starting over and
6593 * checking to see whether the file exists. This second
6594 * time through it should exist, and a CREATE request will
6595 * not be sent.
6596 *
6597 * This handles the problem of a dangling CREATE request
6598 * which contains attributes which indicate that the file
6599 * should be truncated. This retransmitted request could
6600 * possibly truncate valid data in the file if not caught
6601 * by the duplicate request mechanism on the server or if
6602 * not caught by other means. The scenario is:
6603 *
6604 * Client transmits CREATE request with size = 0
6605 * Client times out, retransmits request.
6606 * Response to the first request arrives from the server
6607 * and the client proceeds on.
6608 * Client writes data to the file.
6609 * The server now processes retransmitted CREATE request
6610 * and truncates file.
6611 *
6612 * The use of the GUARDED CREATE request prevents this from
6613 * happening because the retransmitted CREATE would fail
6614 * with EEXIST and would not truncate the file.
6615 */
6616 if (error == EEXIST && exclusive == NONEXCL) {
6617 #ifdef DEBUG
6618 nfs4_create_misses++;
6619 #endif
6620 goto top;
6621 }
6622 nfs_rw_exit(&drp->r_rwlock);
6623 return (error);
6624 }
6625
6626 /*
6627 * Create compound (for mkdir, mknod, symlink):
6628 * { Putfh <dfh>; Create; Getfh; Getattr }
6629 * It's okay if setattr failed to set gid - this is not considered
6630 * an error, but purge attrs in that case.
6631 */
6632 static int
6633 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va,
6634 vnode_t **vpp, cred_t *cr, nfs_ftype4 type)
6635 {
6636 int need_end_op = FALSE;
6637 COMPOUND4args_clnt args;
6638 COMPOUND4res_clnt res, *resp = NULL;
6639 nfs_argop4 *argop;
6640 nfs_resop4 *resop;
6641 int doqueue;
6642 mntinfo4_t *mi;
6643 rnode4_t *drp = VTOR4(dvp);
6644 change_info4 *cinfo;
6645 GETFH4res *gf_res;
6646 struct vattr vattr;
6647 vnode_t *vp;
6648 fattr4 *crattr;
6649 bool_t needrecov = FALSE;
6650 nfs4_recov_state_t recov_state;
6651 nfs4_sharedfh_t *sfhp = NULL;
6652 hrtime_t t;
6653 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
6654 int numops, argoplist_size, setgid_flag, idx_create, idx_fattr;
6655 dirattr_info_t dinfo, *dinfop;
6656 servinfo4_t *svp;
6657 bitmap4 supp_attrs;
6658
6659 ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK ||
6660 type == NF4CHR || type == NF4SOCK || type == NF4FIFO);
6661
6662 mi = VTOMI4(dvp);
6663
6664 /*
6665 * Make sure we properly deal with setting the right gid
6666 * on a new directory to reflect the parent's setgid bit
6667 */
6668 setgid_flag = 0;
6669 if (type == NF4DIR) {
6670 struct vattr dva;
6671
6672 va->va_mode &= ~VSGID;
6673 dva.va_mask = AT_MODE | AT_GID;
6674 if (VOP_GETATTR(dvp, &dva, 0, cr) == 0) {
6675
6676 /*
6677 * If the parent directory has the setgid bit set
6678 * _and_ the client was able to get a valid mapping
6679 * for the parent dir's owner_group, we want to
6680 * append NVERIFY(owner_group == dva.va_gid) and
6681 * SETATTR to the CREATE compound.
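 *
 * Sketch of the compound tail this produces (G being the
 * parent dir's gid):
 *
 * ... Nverify { owner_group == G }; Setattr { owner_group = G }
 *
 * If the new object was already created with group G, the
 * NVERIFY fails with NFS4ERR_SAME and the server stops before
 * the SETATTR, which is exactly the no-op we want in that case.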
6682 */ 6683 if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) { 6684 setgid_flag = 1; 6685 va->va_mode |= VSGID; 6686 if (dva.va_gid != GID_NOBODY) { 6687 va->va_mask |= AT_GID; 6688 va->va_gid = dva.va_gid; 6689 } 6690 } 6691 } 6692 } 6693 6694 /* 6695 * Create ops: 6696 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new) 6697 * 5:restorefh(dir) 6:getattr(dir) 6698 * 6699 * if (setgid) 6700 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new) 6701 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 6702 * 8:nverify 9:setattr 6703 */ 6704 if (setgid_flag) { 6705 numops = 10; 6706 idx_create = 1; 6707 idx_fattr = 3; 6708 } else { 6709 numops = 7; 6710 idx_create = 2; 6711 idx_fattr = 4; 6712 } 6713 6714 ASSERT(nfs_zone() == mi->mi_zone); 6715 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) { 6716 return (EINTR); 6717 } 6718 recov_state.rs_flags = 0; 6719 recov_state.rs_num_retry_despite_err = 0; 6720 6721 argoplist_size = numops * sizeof (nfs_argop4); 6722 argop = kmem_alloc(argoplist_size, KM_SLEEP); 6723 6724 recov_retry: 6725 if (type == NF4LNK) 6726 args.ctag = TAG_SYMLINK; 6727 else if (type == NF4DIR) 6728 args.ctag = TAG_MKDIR; 6729 else 6730 args.ctag = TAG_MKNOD; 6731 6732 args.array_len = numops; 6733 args.array = argop; 6734 6735 if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) { 6736 nfs_rw_exit(&drp->r_rwlock); 6737 kmem_free(argop, argoplist_size); 6738 return (e.error); 6739 } 6740 need_end_op = TRUE; 6741 6742 6743 /* 0: putfh directory */ 6744 argop[0].argop = OP_CPUTFH; 6745 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6746 6747 /* 1/2: Create object */ 6748 argop[idx_create].argop = OP_CCREATE; 6749 argop[idx_create].nfs_argop4_u.opccreate.cname = nm; 6750 argop[idx_create].nfs_argop4_u.opccreate.type = type; 6751 if (type == NF4LNK) { 6752 /* 6753 * symlink, treat name as data 6754 */ 6755 ASSERT(data != NULL); 6756 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata = 6757 (char *)data; 6758 } 6759 if (type == NF4BLK || type == NF4CHR) { 6760 ASSERT(data != NULL); 6761 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata = 6762 *((specdata4 *)data); 6763 } 6764 6765 crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs; 6766 6767 svp = drp->r_server; 6768 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 6769 supp_attrs = svp->sv_supp_attrs; 6770 nfs_rw_exit(&svp->sv_lock); 6771 6772 if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) { 6773 nfs_rw_exit(&drp->r_rwlock); 6774 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 6775 e.error = EINVAL; 6776 kmem_free(argop, argoplist_size); 6777 return (e.error); 6778 } 6779 6780 /* 2/3: getfh fh of created object */ 6781 ASSERT(idx_create + 1 == idx_fattr - 1); 6782 argop[idx_create + 1].argop = OP_GETFH; 6783 6784 /* 3/4: getattr of new object */ 6785 argop[idx_fattr].argop = OP_GETATTR; 6786 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6787 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi; 6788 6789 if (setgid_flag) { 6790 vattr_t _v; 6791 6792 argop[4].argop = OP_SAVEFH; 6793 6794 argop[5].argop = OP_CPUTFH; 6795 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6796 6797 argop[6].argop = OP_GETATTR; 6798 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6799 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6800 6801 argop[7].argop = OP_RESTOREFH; 6802 6803 /* 6804 * nverify 6805 * 6806 * XXX - Revisit the last argument to nfs4_end_op() 6807 * once 5020486 is fixed. 
6808 */ 6809 _v.va_mask = AT_GID; 6810 _v.va_gid = va->va_gid; 6811 if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 6812 supp_attrs)) { 6813 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6814 nfs_rw_exit(&drp->r_rwlock); 6815 nfs4_fattr4_free(crattr); 6816 kmem_free(argop, argoplist_size); 6817 return (e.error); 6818 } 6819 6820 /* 6821 * setattr 6822 * 6823 * We _know_ we're not messing with AT_SIZE or AT_XTIME, 6824 * so no need for stateid or flags. Also we specify NULL 6825 * rp since we're only interested in setting owner_group 6826 * attributes. 6827 */ 6828 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs, 6829 &e.error, 0); 6830 6831 if (e.error) { 6832 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6833 nfs_rw_exit(&drp->r_rwlock); 6834 nfs4_fattr4_free(crattr); 6835 nfs4args_verify_free(&argop[8]); 6836 kmem_free(argop, argoplist_size); 6837 return (e.error); 6838 } 6839 } else { 6840 argop[1].argop = OP_SAVEFH; 6841 6842 argop[5].argop = OP_RESTOREFH; 6843 6844 argop[6].argop = OP_GETATTR; 6845 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6846 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6847 } 6848 6849 dnlc_remove(dvp, nm); 6850 6851 doqueue = 1; 6852 t = gethrtime(); 6853 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 6854 6855 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 6856 if (e.error) { 6857 PURGE_ATTRCACHE4(dvp); 6858 if (!needrecov) 6859 goto out; 6860 } 6861 6862 if (needrecov) { 6863 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 6864 OP_CREATE, NULL) == FALSE) { 6865 nfs4_end_op(mi, dvp, NULL, &recov_state, 6866 needrecov); 6867 need_end_op = FALSE; 6868 nfs4_fattr4_free(crattr); 6869 if (setgid_flag) { 6870 nfs4args_verify_free(&argop[8]); 6871 nfs4args_setattr_free(&argop[9]); 6872 } 6873 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6874 goto recov_retry; 6875 } 6876 } 6877 6878 resp = &res; 6879 6880 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 6881 6882 if (res.status == NFS4ERR_BADOWNER) 6883 nfs4_log_badowner(mi, OP_CREATE); 6884 6885 e.error = geterrno4(res.status); 6886 6887 /* 6888 * This check is left over from when create was implemented 6889 * using a setattr op (instead of createattrs). If the 6890 * putfh/create/getfh failed, the error was returned. If 6891 * setattr/getattr failed, we keep going. 6892 * 6893 * It might be better to get rid of the GETFH also, and just 6894 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory. 6895 * Then if any of the operations failed, we could return the 6896 * error now, and remove much of the error code below. 6897 */ 6898 if (res.array_len <= idx_fattr) { 6899 /* 6900 * Either Putfh, Create or Getfh failed. 6901 */ 6902 PURGE_ATTRCACHE4(dvp); 6903 /* 6904 * nfs4_purge_stale_fh() may generate otw calls through 6905 * nfs4_invalidate_pages. Hence the need to call 6906 * nfs4_end_op() here to avoid nfs4_start_op() deadlock. 
6907 */ 6908 nfs4_end_op(mi, dvp, NULL, &recov_state, 6909 needrecov); 6910 need_end_op = FALSE; 6911 nfs4_purge_stale_fh(e.error, dvp, cr); 6912 goto out; 6913 } 6914 } 6915 6916 resop = &res.array[idx_create]; /* create res */ 6917 cinfo = &resop->nfs_resop4_u.opcreate.cinfo; 6918 6919 resop = &res.array[idx_create + 1]; /* getfh res */ 6920 gf_res = &resop->nfs_resop4_u.opgetfh; 6921 6922 sfhp = sfh4_get(&gf_res->object, mi); 6923 if (e.error) { 6924 *vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp, 6925 fn_get(VTOSV(dvp)->sv_name, nm)); 6926 if (vp->v_type == VNON) { 6927 vattr.va_mask = AT_TYPE; 6928 /* 6929 * Need to call nfs4_end_op before nfs4getattr to avoid 6930 * potential nfs4_start_op deadlock. See RFE 4777612. 6931 */ 6932 nfs4_end_op(mi, dvp, NULL, &recov_state, 6933 needrecov); 6934 need_end_op = FALSE; 6935 e.error = nfs4getattr(vp, &vattr, cr); 6936 if (e.error) { 6937 VN_RELE(vp); 6938 *vpp = NULL; 6939 goto out; 6940 } 6941 vp->v_type = vattr.va_type; 6942 } 6943 e.error = 0; 6944 } else { 6945 *vpp = vp = makenfs4node(sfhp, 6946 &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res, 6947 dvp->v_vfsp, t, cr, 6948 dvp, fn_get(VTOSV(dvp)->sv_name, nm)); 6949 } 6950 6951 /* 6952 * If compound succeeded, then update dir attrs 6953 */ 6954 if (res.status == NFS4_OK) { 6955 dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 6956 dinfo.di_cred = cr; 6957 dinfo.di_time_call = t; 6958 dinfop = &dinfo; 6959 } else 6960 dinfop = NULL; 6961 6962 /* Update directory cache attribute, readdir and dnlc caches */ 6963 nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop); 6964 6965 out: 6966 if (sfhp != NULL) 6967 sfh4_rele(&sfhp); 6968 nfs_rw_exit(&drp->r_rwlock); 6969 nfs4_fattr4_free(crattr); 6970 if (setgid_flag) { 6971 nfs4args_verify_free(&argop[8]); 6972 nfs4args_setattr_free(&argop[9]); 6973 } 6974 if (resp) 6975 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 6976 if (need_end_op) 6977 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 6978 6979 kmem_free(argop, argoplist_size); 6980 return (e.error); 6981 } 6982 6983 /* ARGSUSED */ 6984 static int 6985 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 6986 int mode, vnode_t **vpp, cred_t *cr) 6987 { 6988 int error; 6989 vnode_t *vp; 6990 nfs_ftype4 type; 6991 specdata4 spec, *specp = NULL; 6992 6993 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 6994 6995 switch (va->va_type) { 6996 case VCHR: 6997 case VBLK: 6998 type = (va->va_type == VCHR) ? NF4CHR : NF4BLK; 6999 spec.specdata1 = getmajor(va->va_rdev); 7000 spec.specdata2 = getminor(va->va_rdev); 7001 specp = &spec; 7002 break; 7003 7004 case VFIFO: 7005 type = NF4FIFO; 7006 break; 7007 case VSOCK: 7008 type = NF4SOCK; 7009 break; 7010 7011 default: 7012 return (EINVAL); 7013 } 7014 7015 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type); 7016 if (error) { 7017 return (error); 7018 } 7019 7020 /* 7021 * This might not be needed any more; special case to deal 7022 * with problematic v2/v3 servers. Since create was unable 7023 * to set group correctly, not sure what hope setattr has. 
7024 */
7025 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) {
7026 va->va_mask = AT_GID;
7027 (void) nfs4setattr(vp, va, 0, cr, NULL);
7028 }
7029
7030 /*
7031 * If vnode is a device create special vnode
7032 */
7033 if (ISVDEV(vp->v_type)) {
7034 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
7035 VN_RELE(vp);
7036 } else {
7037 *vpp = vp;
7038 }
7039 return (error);
7040 }
7041
7042 /*
7043 * Remove requires that the current fh be the target directory.
7044 * After the operation, the current fh is unchanged.
7045 * The compound op structure is:
7046 * PUTFH(targetdir), REMOVE, GETATTR(targetdir)
7047 *
7048 * Weirdness: if the vnode to be removed is open,
7049 * we rename it instead of removing it, and nfs_inactive
7050 * will remove the new name.
7051 */
7052 static int
7053 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr)
7054 {
7055 COMPOUND4args_clnt args;
7056 COMPOUND4res_clnt res, *resp = NULL;
7057 REMOVE4res *rm_res;
7058 nfs_argop4 argop[3];
7059 nfs_resop4 *resop;
7060 vnode_t *vp;
7061 char *tmpname;
7062 int doqueue;
7063 mntinfo4_t *mi;
7064 rnode4_t *rp;
7065 rnode4_t *drp;
7066 int needrecov = 0;
7067 nfs4_recov_state_t recov_state;
7068 int isopen;
7069 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7070 dirattr_info_t dinfo;
7071
7072 if (nfs_zone() != VTOMI4(dvp)->mi_zone)
7073 return (EPERM);
7074 drp = VTOR4(dvp);
7075 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
7076 return (EINTR);
7077
7078 e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
7079 if (e.error) {
7080 nfs_rw_exit(&drp->r_rwlock);
7081 return (e.error);
7082 }
7083
7084 if (vp->v_type == VDIR) {
7085 VN_RELE(vp);
7086 nfs_rw_exit(&drp->r_rwlock);
7087 return (EISDIR);
7088 }
7089
7090 /*
7091 * First just remove the entry from the name cache, as it
7092 * is most likely the only entry for this vp.
7093 */
7094 dnlc_remove(dvp, nm);
7095
7096 rp = VTOR4(vp);
7097
7098 /*
7099 * For regular file types, check to see if the file is open by looking
7100 * at the open streams.
7101 * For all other types, check the reference count on the vnode. Since
7102 * they are not opened OTW, they never have an open stream.
7103 *
7104 * If the file is open, rename it to .nfsXXXX.
7105 */
7106 if (vp->v_type != VREG) {
7107 /*
7108 * If the file has a v_count > 1 then there may be more than one
7109 * entry in the name cache due to multiple links or an open file,
7110 * but we don't have the real reference count so flush all
7111 * possible entries.
7112 */
7113 if (vp->v_count > 1)
7114 dnlc_purge_vp(vp);
7115
7116 /*
7117 * Now we have the real reference count.
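 * A count of exactly 1 here means the hold from the lookup
 * above is the only remaining reference, i.e. the file is not
 * otherwise active.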
7118 */
7119 isopen = vp->v_count > 1;
7120 } else {
7121 mutex_enter(&rp->r_os_lock);
7122 isopen = list_head(&rp->r_open_streams) != NULL;
7123 mutex_exit(&rp->r_os_lock);
7124 }
7125
7126 mutex_enter(&rp->r_statelock);
7127 if (isopen &&
7128 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
7129 mutex_exit(&rp->r_statelock);
7130 tmpname = newname();
7131 e.error = nfs4rename(dvp, nm, dvp, tmpname, cr);
7132 if (e.error)
7133 kmem_free(tmpname, MAXNAMELEN);
7134 else {
7135 mutex_enter(&rp->r_statelock);
7136 if (rp->r_unldvp == NULL) {
7137 VN_HOLD(dvp);
7138 rp->r_unldvp = dvp;
7139 if (rp->r_unlcred != NULL)
7140 crfree(rp->r_unlcred);
7141 crhold(cr);
7142 rp->r_unlcred = cr;
7143 rp->r_unlname = tmpname;
7144 } else {
7145 kmem_free(rp->r_unlname, MAXNAMELEN);
7146 rp->r_unlname = tmpname;
7147 }
7148 mutex_exit(&rp->r_statelock);
7149 }
7150 VN_RELE(vp);
7151 nfs_rw_exit(&drp->r_rwlock);
7152 return (e.error);
7153 }
7154 /*
7155 * Actually remove the file/dir
7156 */
7157 mutex_exit(&rp->r_statelock);
7158
7159 /*
7160 * We need to flush any dirty pages which happen to
7161 * be hanging around before removing the file.
7162 * This shouldn't happen very often since in NFSv4
7163 * we should be close to open consistent.
7164 */
7165 if (nfs4_has_pages(vp) &&
7166 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
7167 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr);
7168 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
7169 mutex_enter(&rp->r_statelock);
7170 if (!rp->r_error)
7171 rp->r_error = e.error;
7172 mutex_exit(&rp->r_statelock);
7173 }
7174 }
7175
7176 mi = VTOMI4(dvp);
7177
7178 (void) nfs4delegreturn(rp, NFS4_DR_REOPEN);
7179 recov_state.rs_flags = 0;
7180 recov_state.rs_num_retry_despite_err = 0;
7181
7182 recov_retry:
7183 /*
7184 * Remove ops: putfh dir; remove; getattr dir
7185 */
7186 args.ctag = TAG_REMOVE;
7187 args.array_len = 3;
7188 args.array = argop;
7189
7190 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
7191 if (e.error) {
7192 nfs_rw_exit(&drp->r_rwlock);
7193 VN_RELE(vp);
7194 return (e.error);
7195 }
7196
7197 /* putfh directory */
7198 argop[0].argop = OP_CPUTFH;
7199 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
7200
7201 /* remove */
7202 argop[1].argop = OP_CREMOVE;
7203 argop[1].nfs_argop4_u.opcremove.ctarget = nm;
7204
7205 /* getattr dir */
7206 argop[2].argop = OP_GETATTR;
7207 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7208 argop[2].nfs_argop4_u.opgetattr.mi = mi;
7209
7210 doqueue = 1;
7211 dinfo.di_time_call = gethrtime();
7212 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7213
7214 PURGE_ATTRCACHE4(vp);
7215
7216 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7217 if (e.error)
7218 PURGE_ATTRCACHE4(dvp);
7219
7220 if (needrecov) {
7221 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp,
7222 NULL, NULL, NULL, OP_REMOVE, NULL) == FALSE) {
7223 if (!e.error)
7224 (void) xdr_free(xdr_COMPOUND4res_clnt,
7225 (caddr_t)&res);
7226 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
7227 needrecov);
7228 goto recov_retry;
7229 }
7230 }
7231
7232 /*
7233 * Matching nfs4_end_op() for start_op() above.
7234 * There is a path in the code below which calls
7235 * nfs4_purge_stale_fh(), which may generate otw calls through
7236 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
7237 * here to avoid nfs4_start_op() deadlock.
7238 */
7239 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
7240
7241 if (!e.error) {
7242 resp = &res;
7243
7244 if (res.status) {
7245 e.error = geterrno4(res.status);
7246 PURGE_ATTRCACHE4(dvp);
7247 nfs4_purge_stale_fh(e.error, dvp, cr);
7248 } else {
7249 resop = &res.array[1]; /* remove res */
7250 rm_res = &resop->nfs_resop4_u.opremove;
7251
7252 dinfo.di_garp =
7253 &res.array[2].nfs_resop4_u.opgetattr.ga_res;
7254 dinfo.di_cred = cr;
7255
7256 /* Update directory attr, readdir and dnlc caches */
7257 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
7258 &dinfo);
7259 }
7260 }
7261 nfs_rw_exit(&drp->r_rwlock);
7262 if (resp)
7263 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7264
7265 VN_RELE(vp);
7266 return (e.error);
7267 }
7268
7269 /*
7270 * Link requires that the current fh be the target directory and the
7271 * saved fh be the source fh. After the operation, the current fh is unchanged.
7272 * Thus the compound op structure is:
7273 * PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, GETATTR(targetdir),
7274 * RESTOREFH, GETATTR(file)
7275 */
7276 static int
7277 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr)
7278 {
7279 COMPOUND4args_clnt args;
7280 COMPOUND4res_clnt res, *resp = NULL;
7281 LINK4res *ln_res;
7282 int argoplist_size = 7 * sizeof (nfs_argop4);
7283 nfs_argop4 *argop;
7284 nfs_resop4 *resop;
7285 vnode_t *realvp, *nvp;
7286 int doqueue;
7287 mntinfo4_t *mi;
7288 rnode4_t *tdrp;
7289 bool_t needrecov = FALSE;
7290 nfs4_recov_state_t recov_state;
7291 hrtime_t t;
7292 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7293 dirattr_info_t dinfo;
7294
7295 ASSERT(*tnm != '\0');
7296 ASSERT(tdvp->v_type == VDIR);
7297 ASSERT(nfs4_consistent_type(tdvp));
7298 ASSERT(nfs4_consistent_type(svp));
7299
7300 if (nfs_zone() != VTOMI4(tdvp)->mi_zone)
7301 return (EPERM);
7302 if (VOP_REALVP(svp, &realvp) == 0) {
7303 svp = realvp;
7304 ASSERT(nfs4_consistent_type(svp));
7305 }
7306
7307 tdrp = VTOR4(tdvp);
7308 mi = VTOMI4(svp);
7309
7310 if (!(mi->mi_flags & MI4_LINK)) {
7311 return (EOPNOTSUPP);
7312 }
7313 recov_state.rs_flags = 0;
7314 recov_state.rs_num_retry_despite_err = 0;
7315
7316 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp)))
7317 return (EINTR);
7318
7319 recov_retry:
7320 argop = kmem_alloc(argoplist_size, KM_SLEEP);
7321
7322 args.ctag = TAG_LINK;
7323
7324 /*
7325 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir);
7326 * restorefh; getattr(fl)
7327 */
7328 args.array_len = 7;
7329 args.array = argop;
7330
7331 e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state);
7332 if (e.error) {
7333 kmem_free(argop, argoplist_size);
7334 nfs_rw_exit(&tdrp->r_rwlock);
7335 return (e.error);
7336 }
7337
7338 /* 0. putfh file */
7339 argop[0].argop = OP_CPUTFH;
7340 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh;
7341
7342 /* 1. save current fh to free up the space for the dir */
7343 argop[1].argop = OP_SAVEFH;
7344
7345 /* 2. putfh targetdir */
7346 argop[2].argop = OP_CPUTFH;
7347 argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh;
7348
7349 /* 3. link: current_fh is targetdir, saved_fh is source */
7350 argop[3].argop = OP_CLINK;
7351 argop[3].nfs_argop4_u.opclink.cnewname = tnm;
7352
7353 /* 4. Get attributes of dir */
7354 argop[4].argop = OP_GETATTR;
7355 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7356 argop[4].nfs_argop4_u.opgetattr.mi = mi;
7357
7358 /* 5. If link was successful, restore current vp to file */
7359 argop[5].argop = OP_RESTOREFH;
7360
7361 /* 6.
Get attributes of linked object */
7362 argop[6].argop = OP_GETATTR;
7363 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7364 argop[6].nfs_argop4_u.opgetattr.mi = mi;
7365
7366 dnlc_remove(tdvp, tnm);
7367
7368 doqueue = 1;
7369 t = gethrtime();
7370
7371 rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e);
7372
7373 needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp);
7374 if (e.error != 0 && !needrecov) {
7375 PURGE_ATTRCACHE4(tdvp);
7376 PURGE_ATTRCACHE4(svp);
7377 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7378 goto out;
7379 }
7380
7381 if (needrecov) {
7382 bool_t abort;
7383
7384 abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp,
7385 NULL, NULL, OP_LINK, NULL);
7386 if (abort == FALSE) {
7387 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state,
7388 needrecov);
7389 kmem_free(argop, argoplist_size);
7390 if (!e.error)
7391 (void) xdr_free(xdr_COMPOUND4res_clnt,
7392 (caddr_t)&res);
7393 goto recov_retry;
7394 } else {
7395 if (e.error != 0) {
7396 PURGE_ATTRCACHE4(tdvp);
7397 PURGE_ATTRCACHE4(svp);
7398 nfs4_end_op(VTOMI4(svp), svp, tdvp,
7399 &recov_state, needrecov);
7400 goto out;
7401 }
7402 /* fall through for res.status case */
7403 }
7404 }
7405
7406 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7407
7408 resp = &res;
7409 if (res.status) {
7410 /* If link succeeded, then don't return error */
7411 e.error = geterrno4(res.status);
7412 if (res.array_len <= 4) {
7413 /*
7414 * Either Putfh, Savefh, Putfh dir, or Link failed
7415 */
7416 PURGE_ATTRCACHE4(svp);
7417 PURGE_ATTRCACHE4(tdvp);
7418 if (e.error == EOPNOTSUPP) {
7419 mutex_enter(&mi->mi_lock);
7420 mi->mi_flags &= ~MI4_LINK;
7421 mutex_exit(&mi->mi_lock);
7422 }
7423 /* Remap EISDIR to EPERM for non-root user for SVVS */
7424 /* XXX-LP */
7425 if (e.error == EISDIR && crgetuid(cr) != 0)
7426 e.error = EPERM;
7427 goto out;
7428 }
7429 }
7430
7431 /* either no error or one of the postop getattr failed */
7432
7433 /*
7434 * XXX - if LINK succeeded, but no attrs were returned for link
7435 * file, purge its cache.
7436 *
7437 * XXX Perform a simplified version of wcc checking. Instead of
7438 * having another getattr to get pre-op, just purge cache if
7439 * any of the ops prior to and including the getattr failed.
7440 * If the getattr succeeded then update the attrcache accordingly.
7441 */
7442
7443 /*
7444 * update cache with link file postattrs.
7445 * Note: at this point resop points to link res.
7446 */
7447 resop = &res.array[3]; /* link res */
7448 ln_res = &resop->nfs_resop4_u.oplink;
7449 if (res.status == NFS4_OK) {
7450 e.error = nfs4_update_attrcache(res.status,
7451 &res.array[6].nfs_resop4_u.opgetattr.ga_res,
7452 t, svp, cr);
7453 }
7454
7455 /*
7456 * Call makenfs4node to create the new shadow vp for tnm.
7457 * We pass NULL attrs because we just cached attrs for
7458 * the src object. All we're trying to accomplish is
7459 * to create the new shadow vnode.
7460 */
7461 nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr,
7462 tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm));
7463
7464 /* Update target cache attribute, readdir and dnlc caches */
7465 dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
7466 dinfo.di_time_call = t;
7467 dinfo.di_cred = cr;
7468
7469 nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo);
7470 ASSERT(nfs4_consistent_type(tdvp));
7471 ASSERT(nfs4_consistent_type(svp));
7472 ASSERT(nfs4_consistent_type(nvp));
7473 VN_RELE(nvp);
7474
7475 out:
7476 kmem_free(argop, argoplist_size);
7477 if (resp)
7478 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7479
7480 nfs_rw_exit(&tdrp->r_rwlock);
7481
7482 return (e.error);
7483 }
7484
7485 static int
7486 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr)
7487 {
7488 vnode_t *realvp;
7489
7490 if (nfs_zone() != VTOMI4(odvp)->mi_zone)
7491 return (EPERM);
7492 if (VOP_REALVP(ndvp, &realvp) == 0)
7493 ndvp = realvp;
7494
7495 return (nfs4rename(odvp, onm, ndvp, nnm, cr));
7496 }
7497
7498 /*
7499 * nfs4rename does the real work of renaming in NFS Version 4.
7500 *
7501 * A file handle is considered volatile for renaming purposes if either
7502 * of the volatile bits is turned on. However, the compound may differ
7503 * based on the likelihood of the filehandle changing during rename.
7504 */
7505 static int
7506 nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr)
7507 {
7508 int error;
7509 mntinfo4_t *mi;
7510 vnode_t *nvp;
7511 vnode_t *ovp = NULL;
7512 char *tmpname = NULL;
7513 rnode4_t *rp;
7514 rnode4_t *odrp;
7515 rnode4_t *ndrp;
7516 int did_link = 0;
7517 int do_link = 1;
7518 nfsstat4 stat = NFS4_OK;
7519
7520 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
7521 ASSERT(nfs4_consistent_type(odvp));
7522 ASSERT(nfs4_consistent_type(ndvp));
7523
7524 if (onm[0] == '.' && (onm[1] == '\0' ||
7525 (onm[1] == '.' && onm[2] == '\0')))
7526 return (EINVAL);
7527
7528 if (nnm[0] == '.' && (nnm[1] == '\0' ||
7529 (nnm[1] == '.' && nnm[2] == '\0')))
7530 return (EINVAL);
7531
7532 odrp = VTOR4(odvp);
7533 ndrp = VTOR4(ndvp);
7534 if ((intptr_t)odrp < (intptr_t)ndrp) {
7535 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp)))
7536 return (EINTR);
7537 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) {
7538 nfs_rw_exit(&odrp->r_rwlock);
7539 return (EINTR);
7540 }
7541 } else {
7542 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp)))
7543 return (EINTR);
7544 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) {
7545 nfs_rw_exit(&ndrp->r_rwlock);
7546 return (EINTR);
7547 }
7548 }
7549
7550 /*
7551 * Lookup the target file. If it exists, it needs to be
7552 * checked to see whether it is a mount point and whether
7553 * it is active (open).
7554 */
7555 error = nfs4lookup(ndvp, nnm, &nvp, cr, 0);
7556 if (!error) {
7557 int isactive;
7558
7559 ASSERT(nfs4_consistent_type(nvp));
7560 /*
7561 * If this file has been mounted on, then just
7562 * return busy because renaming to it would remove
7563 * the mounted file system from the name space.
7564 */
7565 if (vn_ismntpt(nvp)) {
7566 VN_RELE(nvp);
7567 nfs_rw_exit(&odrp->r_rwlock);
7568 nfs_rw_exit(&ndrp->r_rwlock);
7569 return (EBUSY);
7570 }
7571
7572 /*
7573 * First just remove the entry from the name cache, as it
7574 * is most likely the only entry for this vp.
7575 */
7576 dnlc_remove(ndvp, nnm);
7577
7578 rp = VTOR4(nvp);
7579
7580 if (nvp->v_type != VREG) {
7581 /*
7582 * Purge the name cache of all references to this vnode
7583 * so that we can check the reference count to infer
7584 * whether it is active or not.
7585 */
7586 if (nvp->v_count > 1)
7587 dnlc_purge_vp(nvp);
7588
7589 isactive = nvp->v_count > 1;
7590 } else {
7591 mutex_enter(&rp->r_os_lock);
7592 isactive = list_head(&rp->r_open_streams) != NULL;
7593 mutex_exit(&rp->r_os_lock);
7594 }
7595
7596 /*
7597 * If the vnode is active and is not a directory,
7598 * arrange to rename it to a
7599 * temporary file so that it will continue to be
7600 * accessible. This implements the "unlink-open-file"
7601 * semantics for the target of a rename operation.
7602 * Before doing this though, make sure that the
7603 * source and target files are not already the same.
7604 */
7605 if (isactive && nvp->v_type != VDIR) {
7606 /*
7607 * Lookup the source name.
7608 */
7609 error = nfs4lookup(odvp, onm, &ovp, cr, 0);
7610
7611 /*
7612 * The source name *should* already exist.
7613 */
7614 if (error) {
7615 VN_RELE(nvp);
7616 nfs_rw_exit(&odrp->r_rwlock);
7617 nfs_rw_exit(&ndrp->r_rwlock);
7618 return (error);
7619 }
7620
7621 ASSERT(nfs4_consistent_type(ovp));
7622
7623 /*
7624 * Compare the two vnodes. If they are the same,
7625 * just release all held vnodes and return success.
7626 */
7627 if (VN_CMP(ovp, nvp)) {
7628 VN_RELE(ovp);
7629 VN_RELE(nvp);
7630 nfs_rw_exit(&odrp->r_rwlock);
7631 nfs_rw_exit(&ndrp->r_rwlock);
7632 return (0);
7633 }
7634
7635 /*
7636 * Can't mix and match directories and non-
7637 * directories in rename operations. We already
7638 * know that the target is not a directory. If
7639 * the source is a directory, return an error.
7640 */
7641 if (ovp->v_type == VDIR) {
7642 VN_RELE(ovp);
7643 VN_RELE(nvp);
7644 nfs_rw_exit(&odrp->r_rwlock);
7645 nfs_rw_exit(&ndrp->r_rwlock);
7646 return (ENOTDIR);
7647 }
7648 link_call:
7649 /*
7650 * The target file exists, is not the same as
7651 * the source file, and is active. We first
7652 * try to Link it to a temporary filename to
7653 * avoid having the server remove the file
7654 * completely (which could cause data loss from
7655 * the user's POV in the event the Rename fails
7656 * -- see bug 1165874).
7657 */
7658 /*
7659 * The do_link and did_link booleans are
7660 * introduced in the event we get NFS4ERR_FILE_OPEN
7661 * returned for the Rename. Some servers
7662 * cannot Rename over an Open file, so they return
7663 * this error. The client needs to Remove the
7664 * newly created Link and do two Renames, just
7665 * as if the server didn't support LINK.
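 *
 * Illustrative sequence of over-the-wire calls for the
 * two cases (names as in this function):
 *	link path: LINK(nvp -> tmpname), then RENAME(onm -> nnm)
 *	fallback:  RENAME(nnm -> tmpname), then RENAME(onm -> nnm)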
7666 */
7667 tmpname = newname();
7668 error = 0;
7669
7670 if (do_link) {
7671 error = nfs4_link(ndvp, nvp, tmpname, cr);
7672 }
7673 if (error == EOPNOTSUPP || !do_link) {
7674 error = nfs4_rename(ndvp, nnm, ndvp, tmpname,
7675 cr);
7676 did_link = 0;
7677 } else {
7678 did_link = 1;
7679 }
7680 if (error) {
7681 kmem_free(tmpname, MAXNAMELEN);
7682 VN_RELE(ovp);
7683 VN_RELE(nvp);
7684 nfs_rw_exit(&odrp->r_rwlock);
7685 nfs_rw_exit(&ndrp->r_rwlock);
7686 return (error);
7687 }
7688
7689 mutex_enter(&rp->r_statelock);
7690 if (rp->r_unldvp == NULL) {
7691 VN_HOLD(ndvp);
7692 rp->r_unldvp = ndvp;
7693 if (rp->r_unlcred != NULL)
7694 crfree(rp->r_unlcred);
7695 crhold(cr);
7696 rp->r_unlcred = cr;
7697 rp->r_unlname = tmpname;
7698 } else {
7699 if (rp->r_unlname)
7700 kmem_free(rp->r_unlname, MAXNAMELEN);
7701 rp->r_unlname = tmpname;
7702 }
7703 mutex_exit(&rp->r_statelock);
7704 }
7705
7706 (void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN);
7707
7708 ASSERT(nfs4_consistent_type(nvp));
7709 VN_RELE(nvp);
7710 }
7711
7712 if (ovp == NULL) {
7713 /*
7714 * When renaming directories to be a subdirectory of a
7715 * different parent, the dnlc entry for ".." will no
7716 * longer be valid, so it must be removed.
7717 *
7718 * We do a lookup here to determine whether we are renaming
7719 * a directory and we need to check if we are renaming
7720 * an unlinked file. This might have already been done
7721 * in previous code, so we check ovp == NULL to avoid
7722 * doing it twice.
7723 */
7724 error = nfs4lookup(odvp, onm, &ovp, cr, 0);
7725 /*
7726 * The source name *should* already exist.
7727 */
7728 if (error) {
7729 nfs_rw_exit(&odrp->r_rwlock);
7730 nfs_rw_exit(&ndrp->r_rwlock);
7731 return (error);
7732 }
7733 ASSERT(ovp != NULL);
7734 ASSERT(nfs4_consistent_type(ovp));
7735 }
7736
7737 /*
7738 * Is the object being renamed a dir, and if so, is
7739 * it being renamed to a child of itself? The underlying
7740 * fs should ultimately return EINVAL for this case;
7741 * however, buggy beta non-Solaris NFSv4 servers at
7742 * interop testing events have allowed this behavior,
7743 * and it caused our client to panic due to a recursive
7744 * mutex_enter in fn_move.
7745 *
7746 * The tedious locking in fn_move could be changed to
7747 * deal with this case, and the client could avoid the
7748 * panic; however, the client would just confuse itself
7749 * later and misbehave. A better way to handle the broken
7750 * server is to detect this condition and return EINVAL
7751 * without ever sending the bogus rename to the server.
7752 * We know the rename is invalid -- just fail it now.
7753 */
7754 if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) {
7755 VN_RELE(ovp);
7756 nfs_rw_exit(&odrp->r_rwlock);
7757 nfs_rw_exit(&ndrp->r_rwlock);
7758 return (EINVAL);
7759 }
7760
7761 (void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN);
7762
7763 /*
7764 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is
7765 * possible for the filehandle to change due to the rename.
7766 * If neither of these bits is set, but FH4_VOL_MIGRATION is set,
7767 * the fh will not change because of the rename, but we still need
7768 * to update its rnode entry with the new name for
7769 * an eventual fh change due to migration. The FH4_NOEXPIRE_ON_OPEN
7770 * flag has no effect on these for now, but for future improvements,
7771 * we might want to use it too to simplify handling of files
7772 * that are open with that flag on.
(XXX) 7773 */ 7774 mi = VTOMI4(odvp); 7775 if (NFS4_VOLATILE_FH(mi)) { 7776 error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr, 7777 &stat); 7778 } else { 7779 error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr, 7780 &stat); 7781 } 7782 ASSERT(nfs4_consistent_type(odvp)); 7783 ASSERT(nfs4_consistent_type(ndvp)); 7784 ASSERT(nfs4_consistent_type(ovp)); 7785 7786 if (stat == NFS4ERR_FILE_OPEN && did_link) { 7787 do_link = 0; 7788 /* 7789 * Before the 'link_call' code, we did an nfs4_lookup 7790 * that puts a VN_HOLD on nvp. After the nfs4_link 7791 * call we call VN_RELE to match that hold. We need 7792 * to place an additional VN_HOLD here since we will 7793 * be hitting that VN_RELE again. 7794 */ 7795 VN_HOLD(nvp); 7796 7797 (void) nfs4_remove(ndvp, tmpname, cr); 7798 7799 /* Undo the unlinked file naming stuff we just did */ 7800 mutex_enter(&rp->r_statelock); 7801 if (rp->r_unldvp) { 7802 VN_RELE(ndvp); 7803 rp->r_unldvp = NULL; 7804 if (rp->r_unlcred != NULL) 7805 crfree(rp->r_unlcred); 7806 rp->r_unlcred = NULL; 7807 /* rp->r_unlname points to tmpname */ 7808 if (rp->r_unlname) 7809 kmem_free(rp->r_unlname, MAXNAMELEN); 7810 rp->r_unlname = NULL; 7811 } 7812 mutex_exit(&rp->r_statelock); 7813 7814 goto link_call; 7815 } 7816 7817 if (error) { 7818 VN_RELE(ovp); 7819 nfs_rw_exit(&odrp->r_rwlock); 7820 nfs_rw_exit(&ndrp->r_rwlock); 7821 return (error); 7822 } 7823 7824 /* 7825 * When renaming directories to be a subdirectory of a 7826 * different parent, the dnlc entry for ".." will no 7827 * longer be valid, so it must be removed. 7828 */ 7829 rp = VTOR4(ovp); 7830 if (ndvp != odvp) { 7831 if (ovp->v_type == VDIR) { 7832 dnlc_remove(ovp, ".."); 7833 if (rp->r_dir != NULL) 7834 nfs4_purge_rddir_cache(ovp); 7835 } 7836 } 7837 7838 /* 7839 * If we are renaming the unlinked file, update the 7840 * r_unldvp and r_unlname as needed. 7841 */ 7842 mutex_enter(&rp->r_statelock); 7843 if (rp->r_unldvp != NULL) { 7844 if (strcmp(rp->r_unlname, onm) == 0) { 7845 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN); 7846 rp->r_unlname[MAXNAMELEN - 1] = '\0'; 7847 if (ndvp != rp->r_unldvp) { 7848 VN_RELE(rp->r_unldvp); 7849 rp->r_unldvp = ndvp; 7850 VN_HOLD(ndvp); 7851 } 7852 } 7853 } 7854 mutex_exit(&rp->r_statelock); 7855 7856 VN_RELE(ovp); 7857 7858 nfs_rw_exit(&odrp->r_rwlock); 7859 nfs_rw_exit(&ndrp->r_rwlock); 7860 7861 return (error); 7862 } 7863 7864 /* 7865 * nfs4rename_persistent_fh does the otw portion of renaming in NFS Version 4, 7866 * when it is known that the filehandle is persistent through rename. 7867 * 7868 * Rename requires that the current fh be the target directory and the 7869 * saved fh be the source directory. After the operation, the current fh 7870 * is unchanged. 7871 * The compound op structure for persistent fh rename is: 7872 * PUTFH(sourcedir), SAVEFH, PUTFH(targetdir), RENAME 7873 * Rather than bother with the directory postop args, we'll simply 7874 * update that a change occurred in the cache, so no post-op getattrs.
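 *
 * As a sketch, the argop array built in the code below lays out as
 * (the last two ops are added only when odvp != ndvp):
 *
 *	0: PUTFH(sourcedir)	1: SAVEFH
 *	2: PUTFH(targetdir)	3: RENAME	4: GETATTR(targetdir)
 *	5: PUTFH(sourcedir)	6: GETATTR(sourcedir)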
7875 */ 7876 static int 7877 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp, 7878 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 7879 { 7880 COMPOUND4args_clnt args; 7881 COMPOUND4res_clnt res, *resp = NULL; 7882 nfs_argop4 *argop; 7883 nfs_resop4 *resop; 7884 int doqueue, argoplist_size; 7885 mntinfo4_t *mi; 7886 rnode4_t *odrp = VTOR4(odvp); 7887 rnode4_t *ndrp = VTOR4(ndvp); 7888 RENAME4res *rn_res; 7889 bool_t needrecov; 7890 nfs4_recov_state_t recov_state; 7891 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7892 dirattr_info_t dinfo, *dinfop; 7893 7894 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 7895 7896 recov_state.rs_flags = 0; 7897 recov_state.rs_num_retry_despite_err = 0; 7898 7899 /* 7900 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir 7901 * 7902 * If source/target are different dirs, then append putfh(src); getattr 7903 */ 7904 args.array_len = (odvp == ndvp) ? 5 : 7; 7905 argoplist_size = args.array_len * sizeof (nfs_argop4); 7906 args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP); 7907 7908 recov_retry: 7909 *statp = NFS4_OK; 7910 7911 /* No need to Lookup the file, persistent fh */ 7912 args.ctag = TAG_RENAME; 7913 7914 mi = VTOMI4(odvp); 7915 e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state); 7916 if (e.error) { 7917 kmem_free(argop, argoplist_size); 7918 return (e.error); 7919 } 7920 7921 /* 0: putfh source directory */ 7922 argop[0].argop = OP_CPUTFH; 7923 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 7924 7925 /* 1: Save source fh to free up current for target */ 7926 argop[1].argop = OP_SAVEFH; 7927 7928 /* 2: putfh targetdir */ 7929 argop[2].argop = OP_CPUTFH; 7930 argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 7931 7932 /* 3: current_fh is targetdir, saved_fh is sourcedir */ 7933 argop[3].argop = OP_CRENAME; 7934 argop[3].nfs_argop4_u.opcrename.coldname = onm; 7935 argop[3].nfs_argop4_u.opcrename.cnewname = nnm; 7936 7937 /* 4: getattr (targetdir) */ 7938 argop[4].argop = OP_GETATTR; 7939 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7940 argop[4].nfs_argop4_u.opgetattr.mi = mi; 7941 7942 if (ndvp != odvp) { 7943 7944 /* 5: putfh (sourcedir) */ 7945 argop[5].argop = OP_CPUTFH; 7946 argop[5].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 7947 7948 /* 6: getattr (sourcedir) */ 7949 argop[6].argop = OP_GETATTR; 7950 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7951 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7952 } 7953 7954 dnlc_remove(odvp, onm); 7955 dnlc_remove(ndvp, nnm); 7956 7957 doqueue = 1; 7958 dinfo.di_time_call = gethrtime(); 7959 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7960 7961 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7962 if (e.error) { 7963 PURGE_ATTRCACHE4(odvp); 7964 PURGE_ATTRCACHE4(ndvp); 7965 } else { 7966 *statp = res.status; 7967 } 7968 7969 if (needrecov) { 7970 if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 7971 OP_RENAME, NULL) == FALSE) { 7972 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 7973 if (!e.error) 7974 (void) xdr_free(xdr_COMPOUND4res_clnt, 7975 (caddr_t)&res); 7976 goto recov_retry; 7977 } 7978 } 7979 7980 if (!e.error) { 7981 resp = &res; 7982 /* 7983 * Fail only if OP_RENAME (index 3) or an earlier op failed; 7984 * a failure in the trailing getattrs is ignored. */ 7985 if (res.status != NFS4_OK && res.array_len <= 4) { 7986 e.error = geterrno4(res.status); 7987 PURGE_ATTRCACHE4(odvp); 7988 PURGE_ATTRCACHE4(ndvp); 7989 /* 7990 * System V defines rename to return EEXIST, not 7991 * ENOTEMPTY if the target directory is not empty.
7992 * Over the wire, the error is NFSERR_ENOTEMPTY 7993 * which geterrno4 maps to ENOTEMPTY. 7994 */ 7995 if (e.error == ENOTEMPTY) 7996 e.error = EEXIST; 7997 } else { 7998 7999 resop = &res.array[3]; /* rename res */ 8000 rn_res = &resop->nfs_resop4_u.oprename; 8001 8002 if (res.status == NFS4_OK) { 8003 /* 8004 * Update target attribute, readdir and dnlc 8005 * caches. 8006 */ 8007 dinfo.di_garp = 8008 &res.array[4].nfs_resop4_u.opgetattr.ga_res; 8009 dinfo.di_cred = cr; 8010 dinfop = &dinfo; 8011 } else 8012 dinfop = NULL; 8013 8014 nfs4_update_dircaches(&rn_res->target_cinfo, 8015 ndvp, NULL, NULL, dinfop); 8016 8017 /* 8018 * Update source attribute, readdir and dnlc caches 8019 * 8020 */ 8021 if (ndvp != odvp) { 8022 if (dinfop) 8023 dinfo.di_garp = 8024 &(res.array[6].nfs_resop4_u. 8025 opgetattr.ga_res); 8026 8027 nfs4_update_dircaches(&rn_res->source_cinfo, 8028 odvp, NULL, NULL, dinfop); 8029 } 8030 8031 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, 8032 nnm); 8033 } 8034 } 8035 8036 if (resp) 8037 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8038 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 8039 kmem_free(argop, argoplist_size); 8040 8041 return (e.error); 8042 } 8043 8044 /* 8045 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when 8046 * it is possible for the filehandle to change due to the rename. 8047 * 8048 * The compound req in this case includes a post-rename lookup and getattr 8049 * to ensure that we have the correct fh and attributes for the object. 8050 * 8051 * Rename requires that the current fh be the target directory and the 8052 * saved fh be the source directory. After the operation, the current fh 8053 * is unchanged. 8054 * 8055 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can 8056 * update the filehandle for the renamed object. We also get the old 8057 * filehandle for historical reasons; this should be taken out sometime. 8058 * This results in a rather cumbersome compound... 8059 * 8060 * PUTFH(sourcedir), SAVEFH, LOOKUP(src), GETFH(old), 8061 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR 8062 * 8063 */ 8064 static int 8065 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp, 8066 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 8067 { 8068 COMPOUND4args_clnt args; 8069 COMPOUND4res_clnt res, *resp = NULL; 8070 int argoplist_size; 8071 nfs_argop4 *argop; 8072 nfs_resop4 *resop; 8073 int doqueue; 8074 mntinfo4_t *mi; 8075 rnode4_t *odrp = VTOR4(odvp); /* old directory */ 8076 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */ 8077 rnode4_t *orp = VTOR4(ovp); /* object being renamed */ 8078 RENAME4res *rn_res; 8079 GETFH4res *ngf_res; 8080 bool_t needrecov; 8081 nfs4_recov_state_t recov_state; 8082 hrtime_t t; 8083 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8084 dirattr_info_t dinfo, *dinfop = &dinfo; 8085 8086 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 8087 8088 recov_state.rs_flags = 0; 8089 recov_state.rs_num_retry_despite_err = 0; 8090 8091 recov_retry: 8092 *statp = NFS4_OK; 8093 8094 /* 8095 * There is a window between the RPC and updating the path and 8096 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery 8097 * code, so that it doesn't try to use the old path during that 8098 * window.
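 *
 * Sketch of the interlock implemented below (not part of the
 * original comment); R4RECEXPFH in r_flags marks the window:
 *
 *	wait while (r_flags & R4RECEXPFH)	(cv_wait_sig on r_cv)
 *	r_flags |= R4RECEXPFH;
 *	... OTW rename, nfs4rename_update() ...
 *	r_flags &= ~R4RECEXPFH; cv_broadcast(&r_cv);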
8099 */ 8100 mutex_enter(&orp->r_statelock); 8101 while (orp->r_flags & R4RECEXPFH) { 8102 klwp_t *lwp = ttolwp(curthread); 8103 8104 if (lwp != NULL) 8105 lwp->lwp_nostop++; 8106 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) { 8107 mutex_exit(&orp->r_statelock); 8108 if (lwp != NULL) 8109 lwp->lwp_nostop--; 8110 return (EINTR); 8111 } 8112 if (lwp != NULL) 8113 lwp->lwp_nostop--; 8114 } 8115 orp->r_flags |= R4RECEXPFH; 8116 mutex_exit(&orp->r_statelock); 8117 8118 mi = VTOMI4(odvp); 8119 8120 args.ctag = TAG_RENAME_VFH; 8121 args.array_len = (odvp == ndvp) ? 10 : 12; 8122 argoplist_size = args.array_len * sizeof (nfs_argop4); 8123 argop = kmem_alloc(argoplist_size, KM_SLEEP); 8124 8125 /* 8126 * Rename ops: 8127 * PUTFH(sourcedir), SAVEFH, LOOKUP(src), GETFH(old), 8128 * PUTFH(targetdir), RENAME, GETATTR(targetdir) 8129 * LOOKUP(trgt), GETFH(new), GETATTR, 8130 * 8131 * if (odvp != ndvp) 8132 * add putfh(sourcedir), getattr(sourcedir) 8133 */ 8134 args.array = argop; 8135 8136 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8137 &recov_state, NULL); 8138 if (e.error) { 8139 kmem_free(argop, argoplist_size); 8140 mutex_enter(&orp->r_statelock); 8141 orp->r_flags &= ~R4RECEXPFH; 8142 cv_broadcast(&orp->r_cv); 8143 mutex_exit(&orp->r_statelock); 8144 return (e.error); 8145 } 8146 8147 /* 0: putfh source directory */ 8148 argop[0].argop = OP_CPUTFH; 8149 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8150 8151 /* 1: Save source fh to free up current for target */ 8152 argop[1].argop = OP_SAVEFH; 8153 8154 /* 2: Lookup pre-rename fh of renamed object */ 8155 argop[2].argop = OP_CLOOKUP; 8156 argop[2].nfs_argop4_u.opclookup.cname = onm; 8157 8158 /* 3: getfh fh of renamed object (before rename) */ 8159 argop[3].argop = OP_GETFH; 8160 8161 /* 4: putfh targetdir */ 8162 argop[4].argop = OP_CPUTFH; 8163 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8164 8165 /* 5: current_fh is targetdir, saved_fh is sourcedir */ 8166 argop[5].argop = OP_CRENAME; 8167 argop[5].nfs_argop4_u.opcrename.coldname = onm; 8168 argop[5].nfs_argop4_u.opcrename.cnewname = nnm; 8169 8170 /* 6: getattr of target dir (post op attrs) */ 8171 argop[6].argop = OP_GETATTR; 8172 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8173 argop[6].nfs_argop4_u.opgetattr.mi = mi; 8174 8175 /* 7: Lookup post-rename fh of renamed object */ 8176 argop[7].argop = OP_CLOOKUP; 8177 argop[7].nfs_argop4_u.opclookup.cname = nnm; 8178 8179 /* 8: getfh fh of renamed object (after rename) */ 8180 argop[8].argop = OP_GETFH; 8181 8182 /* 9: getattr of renamed object */ 8183 argop[9].argop = OP_GETATTR; 8184 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8185 argop[9].nfs_argop4_u.opgetattr.mi = mi; 8186 8187 /* 8188 * If source/target dirs are different, then get new post-op 8189 * attrs for source dir also.
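 *
 * Sketch of the indices as built above and below (ops 10-11 only
 * when odvp != ndvp):
 *
 *	0: PUTFH(sourcedir)	1: SAVEFH		2: LOOKUP(onm)
 *	3: GETFH(old)		4: PUTFH(targetdir)	5: RENAME
 *	6: GETATTR(targetdir)	7: LOOKUP(nnm)		8: GETFH(new)
 *	9: GETATTR(object)	10: PUTFH(sourcedir)	11: GETATTR(sourcedir)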
8190 */ 8191 if (ndvp != odvp) { 8192 /* 10: putfh (sourcedir) */ 8193 argop[10].argop = OP_CPUTFH; 8194 argop[10].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8195 8196 /* 11: getattr (sourcedir) */ 8197 argop[11].argop = OP_GETATTR; 8198 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8199 argop[11].nfs_argop4_u.opgetattr.mi = mi; 8200 } 8201 8202 dnlc_remove(odvp, onm); 8203 dnlc_remove(ndvp, nnm); 8204 8205 doqueue = 1; 8206 t = gethrtime(); 8207 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8208 8209 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8210 if (e.error) { 8211 PURGE_ATTRCACHE4(odvp); 8212 PURGE_ATTRCACHE4(ndvp); 8213 if (!needrecov) { 8214 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8215 &recov_state, needrecov); 8216 goto out; 8217 } 8218 } else { 8219 *statp = res.status; 8220 } 8221 8222 if (needrecov) { 8223 bool_t abort; 8224 8225 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 8226 OP_RENAME, NULL); 8227 if (abort == FALSE) { 8228 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8229 &recov_state, needrecov); 8230 kmem_free(argop, argoplist_size); 8231 if (!e.error) 8232 (void) xdr_free(xdr_COMPOUND4res_clnt, 8233 (caddr_t)&res); 8234 mutex_enter(&orp->r_statelock); 8235 orp->r_flags &= ~R4RECEXPFH; 8236 cv_broadcast(&orp->r_cv); 8237 mutex_exit(&orp->r_statelock); 8238 goto recov_retry; 8239 } else { 8240 if (e.error != 0) { 8241 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8242 &recov_state, needrecov); 8243 goto out; 8244 } 8245 /* fall through for res.status case */ 8246 } 8247 } 8248 8249 resp = &res; 8250 /* 8251 * If OP_RENAME (or any prev op) failed, then return an error. 8252 * OP_RENAME is index 5, so if array len <= 6 we return an error. 8253 */ 8254 if ((res.status != NFS4_OK) && (res.array_len <= 6)) { 8255 /* 8256 * Error in an op other than last Getattr 8257 */ 8258 e.error = geterrno4(res.status); 8259 PURGE_ATTRCACHE4(odvp); 8260 PURGE_ATTRCACHE4(ndvp); 8261 /* 8262 * System V defines rename to return EEXIST, not 8263 * ENOTEMPTY if the target directory is not empty. 8264 * Over the wire, the error is NFSERR_ENOTEMPTY 8265 * which geterrno4 maps to ENOTEMPTY. 8266 */ 8267 if (e.error == ENOTEMPTY) 8268 e.error = EEXIST; 8269 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, 8270 needrecov); 8271 goto out; 8272 } 8273 8274 /* rename results */ 8275 rn_res = &res.array[5].nfs_resop4_u.oprename; 8276 8277 if (res.status == NFS4_OK) { 8278 /* Update target attribute, readdir and dnlc caches */ 8279 dinfo.di_garp = 8280 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 8281 dinfo.di_cred = cr; 8282 dinfo.di_time_call = t; 8283 } else 8284 dinfop = NULL; 8285 8286 /* Update target cache attribute, readdir and dnlc caches */ 8287 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop); 8288 8289 /* Update source cache attribute, readdir and dnlc caches */ 8290 if (ndvp != odvp) { 8291 8292 /* 8293 * If dinfop is non-NULL, then the compound succeeded, so 8294 * set di_garp to attrs for source dir. dinfop is only 8295 * set to NULL when the compound fails. 8296 */ 8297 if (dinfop) 8298 dinfo.di_garp = 8299 &res.array[11].nfs_resop4_u.opgetattr.ga_res; 8300 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL, 8301 dinfop); 8302 } 8303 8304 /* 8305 * Update the rnode with the new component name and args, 8306 * and if the file handle changed, also update it with the new fh. 8307 * This is only necessary if the target object has an rnode 8308 * entry and there is no need to create one for it.
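 *
 * (Sketch: res.array[8] below is the post-rename GETFH; its
 * filehandle, together with the new name nnm, is what
 * nfs4rename_update() installs in the rnode.)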
8309 */ 8310 resop = &res.array[8]; /* getfh new res */ 8311 ngf_res = &resop->nfs_resop4_u.opgetfh; 8312 8313 /* 8314 * Update the path and filehandle for the renamed object. 8315 */ 8316 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm); 8317 8318 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov); 8319 8320 if (res.status == NFS4_OK) { 8321 resop++; /* getattr res */ 8322 e.error = nfs4_update_attrcache(res.status, 8323 &resop->nfs_resop4_u.opgetattr.ga_res, 8324 t, ovp, cr); 8325 } 8326 8327 out: 8328 kmem_free(argop, argoplist_size); 8329 if (resp) 8330 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8331 mutex_enter(&orp->r_statelock); 8332 orp->r_flags &= ~R4RECEXPFH; 8333 cv_broadcast(&orp->r_cv); 8334 mutex_exit(&orp->r_statelock); 8335 8336 return (e.error); 8337 } 8338 8339 static int 8340 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr) 8341 { 8342 int error; 8343 vnode_t *vp; 8344 8345 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8346 return (EPERM); 8347 /* 8348 * As ".." has special meaning and rather than send a mkdir 8349 * over the wire to just let the server freak out, we just 8350 * short circuit it here and return EEXIST 8351 */ 8352 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8353 return (EEXIST); 8354 8355 /* 8356 * Decision to get the right gid and setgid bit of the 8357 * new directory is now made in call_nfs4_create_req. 8358 */ 8359 va->va_mask |= AT_MODE; 8360 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR); 8361 if (error) 8362 return (error); 8363 8364 *vpp = vp; 8365 return (0); 8366 } 8367 8368 8369 /* 8370 * rmdir is using the same remove v4 op as does remove. 8371 * Remove requires that the current fh be the target directory. 8372 * After the operation, the current fh is unchanged. 8373 * The compound op structure is: 8374 * PUTFH(targetdir), REMOVE 8375 */ 8376 static int 8377 nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr) 8378 { 8379 int need_end_op = FALSE; 8380 COMPOUND4args_clnt args; 8381 COMPOUND4res_clnt res, *resp = NULL; 8382 REMOVE4res *rm_res; 8383 nfs_argop4 argop[3]; 8384 nfs_resop4 *resop; 8385 vnode_t *vp; 8386 int doqueue; 8387 mntinfo4_t *mi; 8388 rnode4_t *drp; 8389 bool_t needrecov = FALSE; 8390 nfs4_recov_state_t recov_state; 8391 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8392 dirattr_info_t dinfo, *dinfop; 8393 8394 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8395 return (EPERM); 8396 /* 8397 * As ".." has special meaning and rather than send a rmdir 8398 * over the wire to just let the server freak out, we just 8399 * short circuit it here and return EEXIST 8400 */ 8401 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8402 return (EEXIST); 8403 8404 drp = VTOR4(dvp); 8405 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 8406 return (EINTR); 8407 8408 /* 8409 * Attempt to prevent a rmdir(".") from succeeding. 8410 */ 8411 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 8412 if (e.error) { 8413 nfs_rw_exit(&drp->r_rwlock); 8414 return (e.error); 8415 } 8416 if (vp == cdir) { 8417 VN_RELE(vp); 8418 nfs_rw_exit(&drp->r_rwlock); 8419 return (EINVAL); 8420 } 8421 8422 /* 8423 * Since nfsv4 remove op works on both files and directories, 8424 * check that the removed object is indeed a directory. 8425 */ 8426 if (vp->v_type != VDIR) { 8427 VN_RELE(vp); 8428 nfs_rw_exit(&drp->r_rwlock); 8429 return (ENOTDIR); 8430 } 8431 8432 /* 8433 * First just remove the entry from the name cache, as it 8434 * is most likely an entry for this vp. 
8435 */ 8436 dnlc_remove(dvp, nm); 8437 8438 /* 8439 * If the vnode reference count is greater than one, then 8440 * there may be additional references in the DNLC which will 8441 * need to be purged. First, try removing the entry for 8442 * the parent directory and see if that removes the additional 8443 * reference(s). If that doesn't do it, then use dnlc_purge_vp 8444 * to completely remove any references to the directory which 8445 * might still exist in the DNLC. 8446 */ 8447 if (vp->v_count > 1) { 8448 dnlc_remove(vp, ".."); 8449 if (vp->v_count > 1) 8450 dnlc_purge_vp(vp); 8451 } 8452 8453 mi = VTOMI4(dvp); 8454 recov_state.rs_flags = 0; 8455 recov_state.rs_num_retry_despite_err = 0; 8456 8457 recov_retry: 8458 args.ctag = TAG_RMDIR; 8459 8460 /* 8461 * Rmdir ops: putfh dir; remove; getattr 8462 */ 8463 args.array_len = 3; 8464 args.array = argop; 8465 8466 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 8467 if (e.error) { 8468 nfs_rw_exit(&drp->r_rwlock); 8469 return (e.error); 8470 } 8471 need_end_op = TRUE; 8472 8473 /* putfh directory */ 8474 argop[0].argop = OP_CPUTFH; 8475 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 8476 8477 /* remove */ 8478 argop[1].argop = OP_CREMOVE; 8479 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 8480 8481 /* getattr (postop attrs for dir that contained removed dir) */ 8482 argop[2].argop = OP_GETATTR; 8483 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8484 argop[2].nfs_argop4_u.opgetattr.mi = mi; 8485 8486 dinfo.di_time_call = gethrtime(); 8487 doqueue = 1; 8488 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8489 8490 PURGE_ATTRCACHE4(vp); 8491 8492 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8493 if (e.error) { 8494 PURGE_ATTRCACHE4(dvp); 8495 } 8496 8497 if (needrecov) { 8498 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL, 8499 NULL, OP_REMOVE, NULL) == FALSE) { 8500 if (!e.error) 8501 (void) xdr_free(xdr_COMPOUND4res_clnt, 8502 (caddr_t)&res); 8503 8504 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 8505 needrecov); 8506 need_end_op = FALSE; 8507 goto recov_retry; 8508 } 8509 } 8510 8511 if (!e.error) { 8512 resp = &res; 8513 8514 /* 8515 * Only return error if first 2 ops (OP_REMOVE or earlier) 8516 * failed. 8517 */ 8518 if (res.status != NFS4_OK && res.array_len <= 2) { 8519 e.error = geterrno4(res.status); 8520 PURGE_ATTRCACHE4(dvp); 8521 nfs4_end_op(VTOMI4(dvp), dvp, NULL, 8522 &recov_state, needrecov); 8523 need_end_op = FALSE; 8524 nfs4_purge_stale_fh(e.error, dvp, cr); 8525 /* 8526 * System V defines rmdir to return EEXIST, not 8527 * ENOTEMPTY if the directory is not empty. Over 8528 * the wire, the error is NFSERR_ENOTEMPTY which 8529 * geterrno4 maps to ENOTEMPTY.
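 *
 * I.e., the translation chain (sketch):
 *
 *	NFSERR_ENOTEMPTY (wire) -> ENOTEMPTY (geterrno4) -> EEXIST (returned)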
8530 */ 8531 if (e.error == ENOTEMPTY) 8532 e.error = EEXIST; 8533 } else { 8534 resop = &res.array[1]; /* remove res */ 8535 rm_res = &resop->nfs_resop4_u.opremove; 8536 8537 if (res.status == NFS4_OK) { 8538 resop = &res.array[2]; /* dir attrs */ 8539 dinfo.di_garp = 8540 &resop->nfs_resop4_u.opgetattr.ga_res; 8541 dinfo.di_cred = cr; 8542 dinfop = &dinfo; 8543 } else 8544 dinfop = NULL; 8545 8546 /* Update dir attribute, readdir and dnlc caches */ 8547 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 8548 dinfop); 8549 8550 /* destroy rddir cache for dir that was removed */ 8551 if (VTOR4(vp)->r_dir != NULL) 8552 nfs4_purge_rddir_cache(vp); 8553 } 8554 } 8555 8556 if (need_end_op) 8557 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 8558 8559 nfs_rw_exit(&drp->r_rwlock); 8560 8561 if (resp) 8562 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8563 8564 VN_RELE(vp); 8565 8566 return (e.error); 8567 } 8568 8569 static int 8570 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr) 8571 { 8572 int error; 8573 vnode_t *vp; 8574 rnode4_t *rp; 8575 char *contents; 8576 mntinfo4_t *mi = VTOMI4(dvp); 8577 8578 if (nfs_zone() != mi->mi_zone) 8579 return (EPERM); 8580 if (!(mi->mi_flags & MI4_SYMLINK)) 8581 return (EOPNOTSUPP); 8582 8583 error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK); 8584 if (error) { 8585 return (error); 8586 } 8587 8588 ASSERT(nfs4_consistent_type(vp)); 8589 rp = VTOR4(vp); 8590 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 8591 8592 contents = kmem_alloc(MAXPATHLEN, KM_SLEEP); 8593 8594 if (contents != NULL) { 8595 mutex_enter(&rp->r_statelock); 8596 if (rp->r_symlink.contents == NULL) { 8597 rp->r_symlink.len = strlen(tnm); 8598 bcopy(tnm, contents, rp->r_symlink.len); 8599 rp->r_symlink.contents = contents; 8600 rp->r_symlink.size = MAXPATHLEN; 8601 mutex_exit(&rp->r_statelock); 8602 } else { 8603 mutex_exit(&rp->r_statelock); 8604 kmem_free((void *)contents, MAXPATHLEN); 8605 } 8606 } 8607 } 8608 VN_RELE(vp); 8609 8610 return (error); 8611 } 8612 8613 8614 /* 8615 * Read directory entries. 8616 * There are some weird things to look out for here. The uio_loffset 8617 * field is either 0 or it is the offset returned from a previous 8618 * readdir. It is an opaque value used by the server to find the 8619 * correct directory block to read. The count field is the number 8620 * of blocks to read on the server. This is advisory only, the server 8621 * may return only one block's worth of entries. Entries may be compressed 8622 * on the server. 8623 */ 8624 static int 8625 nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp) 8626 { 8627 int error; 8628 uint_t count; 8629 rnode4_t *rp; 8630 rddir4_cache *rdc; 8631 rddir4_cache *rrdc; 8632 8633 if (nfs_zone() != VTOMI4(vp)->mi_zone) 8634 return (EIO); 8635 rp = VTOR4(vp); 8636 8637 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 8638 8639 /* 8640 * Make sure that the directory cache is valid. 8641 */ 8642 if (rp->r_dir != NULL) { 8643 if (nfs_disable_rddir_cache != 0) { 8644 /* 8645 * Setting nfs_disable_rddir_cache in /etc/system 8646 * allows interoperability with servers that do not 8647 * properly update the attributes of directories. 8648 * Any cached information gets purged before an 8649 * access is made to it. 
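 *
 * E.g. (assumed /etc/system syntax for the tunable named above):
 *
 *	set nfs:nfs_disable_rddir_cache = 1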
8650 */ 8651 nfs4_purge_rddir_cache(vp); 8652 } 8653 8654 error = nfs4_validate_caches(vp, cr); 8655 if (error) 8656 return (error); 8657 } 8658 8659 count = MIN(uiop->uio_iov->iov_len, MAXBSIZE); 8660 8661 /* 8662 * Short circuit last readdir which always returns 0 bytes. 8663 * This can be done after the directory has been read through 8664 * completely at least once. This will set r_direof which 8665 * can be used to find the value of the last cookie. 8666 */ 8667 mutex_enter(&rp->r_statelock); 8668 if (rp->r_direof != NULL && 8669 uiop->uio_loffset == rp->r_direof->nfs4_ncookie) { 8670 mutex_exit(&rp->r_statelock); 8671 #ifdef DEBUG 8672 nfs4_readdir_cache_shorts++; 8673 #endif 8674 if (eofp) 8675 *eofp = 1; 8676 return (0); 8677 } 8678 8679 /* 8680 * Look for a cache entry. Cache entries are identified 8681 * by the NFS cookie value and the byte count requested. 8682 */ 8683 rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count); 8684 8685 /* 8686 * If rdc is NULL then the lookup resulted in an unrecoverable error. 8687 */ 8688 if (rdc == NULL) { 8689 mutex_exit(&rp->r_statelock); 8690 return (EINTR); 8691 } 8692 8693 /* 8694 * Check to see if we need to fill this entry in. 8695 */ 8696 if (rdc->flags & RDDIRREQ) { 8697 rdc->flags &= ~RDDIRREQ; 8698 rdc->flags |= RDDIR; 8699 mutex_exit(&rp->r_statelock); 8700 8701 /* 8702 * Do the readdir. 8703 */ 8704 nfs4readdir(vp, rdc, cr); 8705 8706 /* 8707 * Reacquire the lock, so that we can continue 8708 */ 8709 mutex_enter(&rp->r_statelock); 8710 /* 8711 * The entry is now complete 8712 */ 8713 rdc->flags &= ~RDDIR; 8714 } 8715 8716 ASSERT(!(rdc->flags & RDDIR)); 8717 8718 /* 8719 * If an error occurred while attempting 8720 * to fill the cache entry, mark the entry invalid and 8721 * just return the error. 8722 */ 8723 if (rdc->error) { 8724 error = rdc->error; 8725 rdc->flags |= RDDIRREQ; 8726 rddir4_cache_rele(rp, rdc); 8727 mutex_exit(&rp->r_statelock); 8728 return (error); 8729 } 8730 8731 /* 8732 * The cache entry is complete and good, 8733 * copyout the dirent structs to the calling 8734 * thread. 8735 */ 8736 error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop); 8737 8738 /* 8739 * If no error occurred during the copyout, 8740 * update the offset in the uio struct to 8741 * contain the value of the next NFS 4 cookie 8742 * and set the eof value appropriately. 8743 */ 8744 if (!error) { 8745 uiop->uio_loffset = rdc->nfs4_ncookie; 8746 if (eofp) 8747 *eofp = rdc->eof; 8748 } 8749 8750 /* 8751 * Decide whether to do readahead. Don't if we 8752 * have already read to the end of directory. 8753 */ 8754 if (rdc->eof) { 8755 /* 8756 * Make the entry the direof only if it is cached 8757 */ 8758 if (rdc->flags & RDDIRCACHED) 8759 rp->r_direof = rdc; 8760 rddir4_cache_rele(rp, rdc); 8761 mutex_exit(&rp->r_statelock); 8762 return (error); 8763 } 8764 8765 /* Determine if a readdir readahead should be done */ 8766 if (!(rp->r_flags & R4LOOKUP)) { 8767 rddir4_cache_rele(rp, rdc); 8768 mutex_exit(&rp->r_statelock); 8769 return (error); 8770 } 8771 8772 /* 8773 * Now look for a readahead entry. 8774 * 8775 * Check to see whether we found an entry for the readahead. 8776 * If so, we don't need to do anything further, so free the new 8777 * entry if one was allocated. Otherwise, allocate a new entry, add 8778 * it to the cache, and then initiate an asynchronous readdir 8779 * operation to fill it.
8780 */ 8781 rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count); 8782 8783 /* 8784 * A readdir cache entry could not be obtained for the readahead. In 8785 * this case we skip the readahead and return. 8786 */ 8787 if (rrdc == NULL) { 8788 rddir4_cache_rele(rp, rdc); 8789 mutex_exit(&rp->r_statelock); 8790 return (error); 8791 } 8792 8793 /* 8794 * Check to see if we need to fill this entry in. 8795 */ 8796 if (rrdc->flags & RDDIRREQ) { 8797 rrdc->flags &= ~RDDIRREQ; 8798 rrdc->flags |= RDDIR; 8799 rddir4_cache_rele(rp, rdc); 8800 mutex_exit(&rp->r_statelock); 8801 #ifdef DEBUG 8802 nfs4_readdir_readahead++; 8803 #endif 8804 /* 8805 * Do the readdir. 8806 */ 8807 nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir); 8808 return (error); 8809 } 8810 8811 rddir4_cache_rele(rp, rrdc); 8812 rddir4_cache_rele(rp, rdc); 8813 mutex_exit(&rp->r_statelock); 8814 return (error); 8815 } 8816 8817 static int 8818 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 8819 { 8820 int error; 8821 rnode4_t *rp; 8822 8823 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 8824 8825 rp = VTOR4(vp); 8826 8827 /* 8828 * Obtain the readdir results for the caller. 8829 */ 8830 nfs4readdir(vp, rdc, cr); 8831 8832 mutex_enter(&rp->r_statelock); 8833 /* 8834 * The entry is now complete 8835 */ 8836 rdc->flags &= ~RDDIR; 8837 8838 error = rdc->error; 8839 if (error) 8840 rdc->flags |= RDDIRREQ; 8841 rddir4_cache_rele(rp, rdc); 8842 mutex_exit(&rp->r_statelock); 8843 8844 return (error); 8845 } 8846 8847 static void 8848 nfs4readdir_stub(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 8849 { 8850 int stublength; 8851 dirent64_t *dp; 8852 u_longlong_t nodeid, pnodeid; 8853 vnode_t *dotdotvp = NULL; 8854 rnode4_t *rp = VTOR4(vp); 8855 nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie; 8856 8857 rdc->error = 0; 8858 rdc->entries = 0; 8859 rdc->actlen = rdc->entlen = 0; 8860 rdc->eof = TRUE; 8861 8862 /* Check for EOF case for readdir of stub */ 8863 if (cookie != 0 && cookie != 1) 8864 return; 8865 8866 nodeid = rp->r_attr.va_nodeid; 8867 if (vp->v_flag & VROOT) { 8868 pnodeid = nodeid; /* root of mount point */ 8869 } else { 8870 if (rdc->error = nfs4_lookup(vp, "..", &dotdotvp, 0, 0, 0, cr)) 8871 return; 8872 pnodeid = VTOR4(dotdotvp)->r_attr.va_nodeid; 8873 VN_RELE(dotdotvp); 8874 } 8875 8876 stublength = DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2); 8877 rdc->entries = kmem_alloc(stublength, KM_SLEEP); 8878 rdc->entlen = rdc->buflen = stublength; 8879 rdc->eof = TRUE; 8880 8881 dp = (dirent64_t *)rdc->entries; 8882 8883 if (rdc->nfs4_cookie == (nfs_cookie4)0) { 8884 bcopy(nfs4_dot_entries, rdc->entries, 8885 DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2)); 8886 dp->d_ino = nodeid; 8887 dp = (struct dirent64 *)(((char *)dp) + DIRENT64_RECLEN(1)); 8888 dp->d_ino = pnodeid; 8889 rdc->actlen = DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2); 8890 } else { /* for ".." entry */ 8891 bcopy(nfs4_dot_dot_entry, rdc->entries, DIRENT64_RECLEN(2)); 8892 dp->d_ino = pnodeid; 8893 rdc->actlen = DIRENT64_RECLEN(2); 8894 } 8895 rdc->nfs4_ncookie = rdc->actlen; 8896 } 8897 8898 /* 8899 * Read directory entries. 8900 * There are some weird things to look out for here. The uio_loffset 8901 * field is either 0 or it is the offset returned from a previous 8902 * readdir. It is an opaque value used by the server to find the 8903 * correct directory block to read. The count field is the number 8904 * of blocks to read on the server. This is advisory only, the server 8905 * may return only one block's worth of entries. 
Entries may be compressed 8906 * on the server. 8907 * 8908 * Generates the following compound request: 8909 * 1. If readdir offset is zero and no dnlc entry for parent exists, 8910 * must include a Lookupp as well. In this case, send: 8911 * { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr } 8912 * 2. Otherwise just do: { Putfh <fh>; Readdir } 8913 * 8914 * Get complete attributes and filehandles for entries if this is the 8915 * first read of the directory. Otherwise, just get fileid's. 8916 */ 8917 static void 8918 nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 8919 { 8920 COMPOUND4args_clnt args; 8921 COMPOUND4res_clnt res; 8922 READDIR4args *rargs; 8923 READDIR4res_clnt *rd_res; 8924 bitmap4 rd_bitsval; 8925 nfs_argop4 argop[5]; 8926 nfs_resop4 *resop; 8927 rnode4_t *rp = VTOR4(vp); 8928 mntinfo4_t *mi = VTOMI4(vp); 8929 int doqueue; 8930 u_longlong_t nodeid, pnodeid; /* id's of dir and its parents */ 8931 vnode_t *dvp; 8932 nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie; 8933 int num_ops, res_opcnt; 8934 bool_t needrecov = FALSE; 8935 nfs4_recov_state_t recov_state; 8936 hrtime_t t; 8937 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8938 8939 ASSERT(nfs_zone() == mi->mi_zone); 8940 ASSERT(rdc->flags & RDDIR); 8941 ASSERT(rdc->entries == NULL); 8942 8943 if (rp->r_flags & R4SRVSTUB) { 8944 nfs4readdir_stub(vp, rdc, cr); 8945 return; 8946 } 8947 8948 num_ops = 2; 8949 if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) { 8950 /* 8951 * Since nfsv4 readdir may not return entries for "." and "..", 8952 * the client must recreate them: 8953 * To find the correct nodeid, do the following: 8954 * For current node, get nodeid from dnlc. 8955 * - if current node is rootvp, set pnodeid to nodeid. 8956 * - else if parent is in the dnlc, get its nodeid from there. 8957 * - else add LOOKUPP+GETATTR to compound. 8958 */ 8959 nodeid = rp->r_attr.va_nodeid; 8960 if (vp->v_flag & VROOT) { 8961 pnodeid = nodeid; /* root of mount point */ 8962 } else { 8963 dvp = dnlc_lookup(vp, ".."); 8964 if (dvp != NULL && dvp != DNLC_NO_VNODE) { 8965 /* parent in dnlc cache - no need for otw */ 8966 pnodeid = VTOR4(dvp)->r_attr.va_nodeid; 8967 } else { 8968 /* 8969 * parent not in dnlc cache, 8970 * do lookupp to get its id 8971 */ 8972 num_ops = 5; 8973 pnodeid = 0; /* set later by getattr parent */ 8974 } 8975 if (dvp) 8976 VN_RELE(dvp); 8977 } 8978 } 8979 recov_state.rs_flags = 0; 8980 recov_state.rs_num_retry_despite_err = 0; 8981 8982 /* Save the original mount point security flavor */ 8983 (void) save_mnt_secinfo(mi->mi_curr_serv); 8984 8985 recov_retry: 8986 args.ctag = TAG_READDIR; 8987 8988 args.array = argop; 8989 args.array_len = num_ops; 8990 8991 if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 8992 &recov_state, NULL)) { 8993 /* 8994 * If readdir a node that is a stub for a crossed mount point, 8995 * keep the original secinfo flavor for the current file 8996 * system, not the crossed one. 8997 */ 8998 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 8999 rdc->error = e.error; 9000 return; 9001 } 9002 9003 /* 9004 * Determine which attrs to request for dirents. This code 9005 * must be protected by nfs4_start/end_fop because of r_server 9006 * (which will change during failover recovery). 
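 *
 * Sketch of the resulting attr-request bitmask (from the code below):
 *
 *	first read (R4LOOKUP/R4READDIRWATTR set):
 *		NFS4_VATTR_MASK | FATTR4_RDATTR_ERROR_MASK |
 *		FATTR4_FILEHANDLE_MASK
 *	rereads: FATTR4_RDATTR_ERROR_MASK, plus
 *		FATTR4_MOUNTED_ON_FILEID_MASK if the server supports it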
9007 * 9008 */ 9009 if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) { 9010 /* 9011 * Get all vattr attrs plus filehandle and rdattr_error 9012 */ 9013 rd_bitsval = NFS4_VATTR_MASK | 9014 FATTR4_RDATTR_ERROR_MASK | 9015 FATTR4_FILEHANDLE_MASK; 9016 9017 if (rp->r_flags & R4READDIRWATTR) { 9018 mutex_enter(&rp->r_statelock); 9019 rp->r_flags &= ~R4READDIRWATTR; 9020 mutex_exit(&rp->r_statelock); 9021 } 9022 } else { 9023 servinfo4_t *svp = rp->r_server; 9024 9025 /* 9026 * Already read directory. Use readdir with 9027 * no attrs (except for mounted_on_fileid) for updates. 9028 */ 9029 rd_bitsval = FATTR4_RDATTR_ERROR_MASK; 9030 9031 /* 9032 * Request mounted on fileid if supported, else request 9033 * fileid. Maybe we should verify that fileid is supported 9034 * and request something else if not. 9035 */ 9036 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 9037 if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK) 9038 rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK; 9039 nfs_rw_exit(&svp->sv_lock); 9040 } 9041 9042 /* putfh directory fh */ 9043 argop[0].argop = OP_CPUTFH; 9044 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 9045 9046 argop[1].argop = OP_READDIR; 9047 rargs = &argop[1].nfs_argop4_u.opreaddir; 9048 /* 9049 * 1 and 2 are reserved for client "." and ".." entry offsets. 9050 * cookie 0 should be used over-the-wire to start reading at 9051 * the beginning of the directory excluding "." and "..". 9052 */ 9053 if (rdc->nfs4_cookie == 0 || 9054 rdc->nfs4_cookie == 1 || 9055 rdc->nfs4_cookie == 2) { 9056 rargs->cookie = (nfs_cookie4)0; 9057 rargs->cookieverf = 0; 9058 } else { 9059 rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie; 9060 mutex_enter(&rp->r_statelock); 9061 rargs->cookieverf = rp->r_cookieverf4; 9062 mutex_exit(&rp->r_statelock); 9063 } 9064 rargs->dircount = MIN(rdc->buflen, mi->mi_tsize); 9065 rargs->maxcount = mi->mi_tsize; 9066 rargs->attr_request = rd_bitsval; 9067 rargs->rdc = rdc; 9068 rargs->dvp = vp; 9069 rargs->mi = mi; 9070 rargs->cr = cr; 9071 9072 9073 /* 9074 * If count is less than the minimum required, we return no entries 9075 * and fail with EINVAL 9076 */ 9077 if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) { 9078 rdc->error = EINVAL; 9079 goto out; 9080 } 9081 9082 if (args.array_len == 5) { 9083 /* 9084 * Add lookupp and getattr for parent nodeid. 9085 */ 9086 argop[2].argop = OP_LOOKUPP; 9087 9088 argop[3].argop = OP_GETFH; 9089 9090 /* getattr parent */ 9091 argop[4].argop = OP_GETATTR; 9092 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 9093 argop[4].nfs_argop4_u.opgetattr.mi = mi; 9094 } 9095 9096 doqueue = 1; 9097 9098 if (mi->mi_io_kstats) { 9099 mutex_enter(&mi->mi_lock); 9100 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 9101 mutex_exit(&mi->mi_lock); 9102 } 9103 9104 /* capture the time of this call */ 9105 rargs->t = t = gethrtime(); 9106 9107 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 9108 9109 if (mi->mi_io_kstats) { 9110 mutex_enter(&mi->mi_lock); 9111 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 9112 mutex_exit(&mi->mi_lock); 9113 } 9114 9115 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 9116 9117 /* 9118 * If an RPC error occurred and it isn't an error that 9119 * triggers recovery, then go ahead and fail now.
9120 */ 9121 if (e.error != 0 && !needrecov) { 9122 rdc->error = e.error; 9123 goto out; 9124 } 9125 9126 if (needrecov) { 9127 bool_t abort; 9128 9129 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 9130 "nfs4readdir: initiating recovery.\n")); 9131 9132 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 9133 NULL, OP_READDIR, NULL); 9134 if (abort == FALSE) { 9135 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 9136 &recov_state, needrecov); 9137 if (!e.error) 9138 (void) xdr_free(xdr_COMPOUND4res_clnt, 9139 (caddr_t)&res); 9140 if (rdc->entries != NULL) { 9141 kmem_free(rdc->entries, rdc->entlen); 9142 rdc->entries = NULL; 9143 } 9144 goto recov_retry; 9145 } 9146 9147 if (e.error != 0) { 9148 rdc->error = e.error; 9149 goto out; 9150 } 9151 9152 /* fall through for res.status case */ 9153 } 9154 9155 res_opcnt = res.array_len; 9156 9157 /* 9158 * If compound failed first 2 ops (PUTFH+READDIR), then return 9159 * failure here. Subsequent ops are for filling out dot-dot 9160 * dirent, and if they fail, we still want to give the caller 9161 * the dirents returned by (the successful) READDIR op, so we need 9162 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR). 9163 * 9164 * One example where PUTFH+READDIR ops would succeed but 9165 * LOOKUPP+GETATTR would fail would be a dir that has r perm 9166 * but lacks x. In this case, a POSIX server's VOP_READDIR 9167 * would succeed; however, VOP_LOOKUP(..) would fail since no 9168 * x perm. We need to come up with a non-vendor-specific way 9169 * for a POSIX server to return d_ino from dotdot's dirent if 9170 * client only requests mounted_on_fileid, and just say the 9171 * LOOKUPP succeeded and fill out the GETATTR. However, if 9172 * client requested any mandatory attrs, server would be required 9173 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR 9174 * for dotdot. 9175 */ 9176 9177 if (res.status) { 9178 if (res_opcnt <= 2) { 9179 e.error = geterrno4(res.status); 9180 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 9181 &recov_state, needrecov); 9182 nfs4_purge_stale_fh(e.error, vp, cr); 9183 rdc->error = e.error; 9184 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 9185 if (rdc->entries != NULL) { 9186 kmem_free(rdc->entries, rdc->entlen); 9187 rdc->entries = NULL; 9188 } 9189 /* 9190 * If readdir a node that is a stub for a 9191 * crossed mount point, keep the original 9192 * secinfo flavor for the current file system, 9193 * not the crossed one. 9194 */ 9195 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 9196 return; 9197 } 9198 } 9199 9200 resop = &res.array[1]; /* readdir res */ 9201 rd_res = &resop->nfs_resop4_u.opreaddirclnt; 9202 9203 mutex_enter(&rp->r_statelock); 9204 rp->r_cookieverf4 = rd_res->cookieverf; 9205 mutex_exit(&rp->r_statelock); 9206 9207 /* 9208 * For "." and ".." entries 9209 * e.g. 9210 * seek(cookie=0) -> "." entry with d_off = 1 9211 * seek(cookie=1) -> ".." 
entry with d_off = 2 9212 */ 9213 if (cookie == (nfs_cookie4) 0) { 9214 if (rd_res->dotp) 9215 rd_res->dotp->d_ino = nodeid; 9216 if (rd_res->dotdotp) 9217 rd_res->dotdotp->d_ino = pnodeid; 9218 } 9219 if (cookie == (nfs_cookie4) 1) { 9220 if (rd_res->dotdotp) 9221 rd_res->dotdotp->d_ino = pnodeid; 9222 } 9223 9224 9225 /* LOOKUPP+GETATTR attempted */ 9226 if (args.array_len == 5 && rd_res->dotdotp) { 9227 if (res.status == NFS4_OK && res_opcnt == 5) { 9228 nfs_fh4 *fhp; 9229 nfs4_sharedfh_t *sfhp; 9230 vnode_t *pvp; 9231 nfs4_ga_res_t *garp; 9232 9233 resop++; /* lookupp */ 9234 resop++; /* getfh */ 9235 fhp = &resop->nfs_resop4_u.opgetfh.object; 9236 9237 resop++; /* getattr of parent */ 9238 9239 /* 9240 * First, take care of finishing the 9241 * readdir results. 9242 */ 9243 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 9244 /* 9245 * The d_ino of .. must be the inode number 9246 * of the mounted filesystem. 9247 */ 9248 if (garp->n4g_va.va_mask & AT_NODEID) 9249 rd_res->dotdotp->d_ino = 9250 garp->n4g_va.va_nodeid; 9251 9252 9253 /* 9254 * Next, create the ".." dnlc entry 9255 */ 9256 sfhp = sfh4_get(fhp, mi); 9257 if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) { 9258 dnlc_update(vp, "..", pvp); 9259 VN_RELE(pvp); 9260 } 9261 sfh4_rele(&sfhp); 9262 } 9263 } 9264 9265 if (mi->mi_io_kstats) { 9266 mutex_enter(&mi->mi_lock); 9267 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 9268 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen; 9269 mutex_exit(&mi->mi_lock); 9270 } 9271 9272 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 9273 9274 out: 9275 /* 9276 * If readdir a node that is a stub for a crossed mount point, 9277 * keep the original secinfo flavor for the current file system, 9278 * not the crossed one. 9279 */ 9280 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 9281 9282 nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov); 9283 } 9284 9285 9286 static int 9287 nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead) 9288 { 9289 rnode4_t *rp = VTOR4(bp->b_vp); 9290 int count; 9291 int error; 9292 cred_t *cred_otw = NULL; 9293 offset_t offset; 9294 nfs4_open_stream_t *osp = NULL; 9295 bool_t first_time = TRUE; /* first time getting otw cred */ 9296 bool_t last_time = FALSE; /* last time getting otw cred */ 9297 9298 ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone); 9299 9300 DTRACE_IO1(start, struct buf *, bp); 9301 offset = ldbtob(bp->b_lblkno); 9302 9303 if (bp->b_flags & B_READ) { 9304 read_again: 9305 /* 9306 * Releases the osp, if it is provided. 9307 * Puts a hold on the cred_otw and the new osp (if found). 9308 */ 9309 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 9310 &first_time, &last_time); 9311 error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr, 9312 offset, bp->b_bcount, 9313 &bp->b_resid, cred_otw, 9314 readahead, NULL); 9315 crfree(cred_otw); 9316 if (!error) { 9317 if (bp->b_resid) { 9318 /* 9319 * Didn't get it all because we hit EOF, 9320 * zero all the memory beyond the EOF. 9321 */ 9322 /* bzero(rdaddr + */ 9323 bzero(bp->b_un.b_addr + 9324 bp->b_bcount - bp->b_resid, bp->b_resid); 9325 } 9326 mutex_enter(&rp->r_statelock); 9327 if (bp->b_resid == bp->b_bcount && 9328 offset >= rp->r_size) { 9329 /* 9330 * We didn't read anything at all as we are 9331 * past EOF. Return an error indicator back 9332 * but don't destroy the pages (yet).
9333 */ 9334 error = NFS_EOF; 9335 } 9336 mutex_exit(&rp->r_statelock); 9337 } else if (error == EACCES && last_time == FALSE) { 9338 goto read_again; 9339 } 9340 } else { 9341 if (!(rp->r_flags & R4STALE)) { 9342 write_again: 9343 /* 9344 * Releases the osp, if it is provided. 9345 * Puts a hold on the cred_otw and the new 9346 * osp (if found). 9347 */ 9348 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 9349 &first_time, &last_time); 9350 mutex_enter(&rp->r_statelock); 9351 count = MIN(bp->b_bcount, rp->r_size - offset); 9352 mutex_exit(&rp->r_statelock); 9353 if (count < 0) 9354 cmn_err(CE_PANIC, "nfs4_bio: write count < 0"); 9355 #ifdef DEBUG 9356 if (count == 0) { 9357 zoneid_t zoneid = getzoneid(); 9358 9359 zcmn_err(zoneid, CE_WARN, 9360 "nfs4_bio: zero length write at %lld", 9361 offset); 9362 zcmn_err(zoneid, CE_CONT, "flags=0x%x, " 9363 "b_bcount=%ld, file size=%lld", 9364 rp->r_flags, (long)bp->b_bcount, 9365 rp->r_size); 9366 sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh); 9367 if (nfs4_bio_do_stop) 9368 debug_enter("nfs4_bio"); 9369 } 9370 #endif 9371 error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset, 9372 count, cred_otw, stab_comm); 9373 if (error == EACCES && last_time == FALSE) { 9374 crfree(cred_otw); 9375 goto write_again; 9376 } 9377 bp->b_error = error; 9378 if (error && error != EINTR && 9379 !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) { 9380 /* 9381 * Don't print EDQUOT errors on the console. 9382 * Don't print asynchronous EACCES errors. 9383 * Don't print EFBIG errors. 9384 * Print all other write errors. 9385 */ 9386 if (error != EDQUOT && error != EFBIG && 9387 (error != EACCES || 9388 !(bp->b_flags & B_ASYNC))) 9389 nfs4_write_error(bp->b_vp, 9390 error, cred_otw); 9391 /* 9392 * Update r_error and r_flags as appropriate. 9393 * If the error was ESTALE, then mark the 9394 * rnode as not being writeable and save 9395 * the error status. Otherwise, save any 9396 * errors which occur from asynchronous 9397 * page invalidations. Any errors occurring 9398 * from other operations should be saved 9399 * by the caller. 
9400 */ 9401 mutex_enter(&rp->r_statelock); 9402 if (error == ESTALE) { 9403 rp->r_flags |= R4STALE; 9404 if (!rp->r_error) 9405 rp->r_error = error; 9406 } else if (!rp->r_error && 9407 (bp->b_flags & 9408 (B_INVAL|B_FORCE|B_ASYNC)) == 9409 (B_INVAL|B_FORCE|B_ASYNC)) { 9410 rp->r_error = error; 9411 } 9412 mutex_exit(&rp->r_statelock); 9413 } 9414 crfree(cred_otw); 9415 } else 9416 error = rp->r_error; 9417 } 9418 9419 if (error != 0 && error != NFS_EOF) 9420 bp->b_flags |= B_ERROR; 9421 9422 if (osp) 9423 open_stream_rele(osp, rp); 9424 9425 DTRACE_IO1(done, struct buf *, bp); 9426 9427 return (error); 9428 } 9429 9430 /* ARGSUSED */ 9431 static int 9432 nfs4_fid(vnode_t *vp, fid_t *fidp) 9433 { 9434 return (EREMOTE); 9435 } 9436 9437 /* ARGSUSED2 */ 9438 static int 9439 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9440 { 9441 rnode4_t *rp = VTOR4(vp); 9442 9443 if (!write_lock) { 9444 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9445 return (V_WRITELOCK_FALSE); 9446 } 9447 9448 if ((rp->r_flags & R4DIRECTIO) || 9449 (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) { 9450 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9451 if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp)) 9452 return (V_WRITELOCK_FALSE); 9453 nfs_rw_exit(&rp->r_rwlock); 9454 } 9455 9456 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE); 9457 return (V_WRITELOCK_TRUE); 9458 } 9459 9460 /* ARGSUSED */ 9461 static void 9462 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9463 { 9464 rnode4_t *rp = VTOR4(vp); 9465 9466 nfs_rw_exit(&rp->r_rwlock); 9467 } 9468 9469 /* ARGSUSED */ 9470 static int 9471 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp) 9472 { 9473 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9474 return (EIO); 9475 9476 /* 9477 * Because we stuff the readdir cookie into the offset field 9478 * someone may attempt to do an lseek with the cookie which 9479 * we want to succeed. 9480 */ 9481 if (vp->v_type == VDIR) 9482 return (0); 9483 if (*noffp < 0) 9484 return (EINVAL); 9485 return (0); 9486 } 9487 9488 9489 /* 9490 * Return all the pages from [off..off+len) in file 9491 */ 9492 static int 9493 nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 9494 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 9495 enum seg_rw rw, cred_t *cr) 9496 { 9497 rnode4_t *rp; 9498 int error; 9499 mntinfo4_t *mi; 9500 9501 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9502 return (EIO); 9503 rp = VTOR4(vp); 9504 if (IS_SHADOW(vp, rp)) 9505 vp = RTOV4(rp); 9506 9507 if (vp->v_flag & VNOMAP) 9508 return (ENOSYS); 9509 9510 if (protp != NULL) 9511 *protp = PROT_ALL; 9512 9513 /* 9514 * Now validate that the caches are up to date. 9515 */ 9516 if (error = nfs4_validate_caches(vp, cr)) 9517 return (error); 9518 9519 mi = VTOMI4(vp); 9520 retry: 9521 mutex_enter(&rp->r_statelock); 9522 9523 /* 9524 * Don't create dirty pages faster than they 9525 * can be cleaned so that the system doesn't 9526 * get imbalanced. If the async queue is 9527 * maxed out, then wait for it to drain before 9528 * creating more dirty pages. Also, wait for 9529 * any threads doing pagewalks in the vop_getattr 9530 * entry points so that they don't block for 9531 * long periods. 
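 *
 * Sketch of the throttle applied below (S_CREATE faults only):
 *
 *	while ((mi_max_threads != 0 &&
 *	    r_awcount > 2 * mi_max_threads) || r_gcount > 0)
 *		cv_wait(&r_cv, &r_statelock);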
9532 */ 9533 if (rw == S_CREATE) { 9534 while ((mi->mi_max_threads != 0 && 9535 rp->r_awcount > 2 * mi->mi_max_threads) || 9536 rp->r_gcount > 0) 9537 cv_wait(&rp->r_cv, &rp->r_statelock); 9538 } 9539 9540 /* 9541 * If we are getting called as a side effect of an nfs_write() 9542 * operation the local file size might not be extended yet. 9543 * In this case we want to be able to return pages of zeroes. 9544 */ 9545 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) { 9546 NFS4_DEBUG(nfs4_pageio_debug, 9547 (CE_NOTE, "getpage beyond EOF: off=%lld, " 9548 "len=%llu, size=%llu, attrsize =%llu", off, 9549 (u_longlong_t)len, rp->r_size, rp->r_attr.va_size)); 9550 mutex_exit(&rp->r_statelock); 9551 return (EFAULT); /* beyond EOF */ 9552 } 9553 9554 mutex_exit(&rp->r_statelock); 9555 9556 if (len <= PAGESIZE) { 9557 error = nfs4_getapage(vp, off, len, protp, pl, plsz, 9558 seg, addr, rw, cr); 9559 NFS4_DEBUG(nfs4_pageio_debug && error, 9560 (CE_NOTE, "getpage error %d; off=%lld, " 9561 "len=%lld", error, off, (u_longlong_t)len)); 9562 } else { 9563 error = pvn_getpages(nfs4_getapage, vp, off, len, protp, 9564 pl, plsz, seg, addr, rw, cr); 9565 NFS4_DEBUG(nfs4_pageio_debug && error, 9566 (CE_NOTE, "getpages error %d; off=%lld, " 9567 "len=%lld", error, off, (u_longlong_t)len)); 9568 } 9569 9570 switch (error) { 9571 case NFS_EOF: 9572 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE); 9573 goto retry; 9574 case ESTALE: 9575 nfs4_purge_stale_fh(error, vp, cr); 9576 } 9577 9578 return (error); 9579 } 9580 9581 /* 9582 * Called from pvn_getpages or nfs4_getpage to get a particular page. 9583 */ 9584 /* ARGSUSED */ 9585 static int 9586 nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp, 9587 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 9588 enum seg_rw rw, cred_t *cr) 9589 { 9590 rnode4_t *rp; 9591 uint_t bsize; 9592 struct buf *bp; 9593 page_t *pp; 9594 u_offset_t lbn; 9595 u_offset_t io_off; 9596 u_offset_t blkoff; 9597 u_offset_t rablkoff; 9598 size_t io_len; 9599 uint_t blksize; 9600 int error; 9601 int readahead; 9602 int readahead_issued = 0; 9603 int ra_window; /* readahead window */ 9604 page_t *pagefound; 9605 page_t *savepp; 9606 9607 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9608 return (EIO); 9609 9610 rp = VTOR4(vp); 9611 ASSERT(!IS_SHADOW(vp, rp)); 9612 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 9613 9614 reread: 9615 bp = NULL; 9616 pp = NULL; 9617 pagefound = NULL; 9618 9619 if (pl != NULL) 9620 pl[0] = NULL; 9621 9622 error = 0; 9623 lbn = off / bsize; 9624 blkoff = lbn * bsize; 9625 9626 /* 9627 * Queueing up the readahead before doing the synchronous read 9628 * results in a significant increase in read throughput because 9629 * of the increased parallelism between the async threads and 9630 * the process context. 9631 */ 9632 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 && 9633 rw != S_CREATE && 9634 !(vp->v_flag & VNOCACHE)) { 9635 mutex_enter(&rp->r_statelock); 9636 9637 /* 9638 * Calculate the number of readaheads to do. 9639 * a) No readaheads at offset = 0. 9640 * b) Do maximum(nfs4_nra) readaheads when the readahead 9641 * window is closed. 9642 * c) Do readaheads between 1 and (nfs4_nra - 1) depending 9643 * upon how far the readahead window is open or closed. 9644 * d) No readaheads if rp->r_nextr is not within the scope 9645 * of the readahead window (random i/o).
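 *
 * Worked example of the calculation below (assuming, say,
 * nfs4_nra == 4):
 *
 *	off == 0				-> 0 readaheads	(a)
 *	blkoff == r_nextr (sequential)		-> 4 readaheads	(b)
 *	r_nextr - blkoff == 2 * bsize		-> 4 - 2 = 2	(c)
 *	r_nextr outside the window		-> 0 readaheads	(d)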
9646 */ 9647 9648 if (off == 0) 9649 readahead = 0; 9650 else if (blkoff == rp->r_nextr) 9651 readahead = nfs4_nra; 9652 else if (rp->r_nextr > blkoff && 9653 ((ra_window = (rp->r_nextr - blkoff) / bsize) 9654 <= (nfs4_nra - 1))) 9655 readahead = nfs4_nra - ra_window; 9656 else 9657 readahead = 0; 9658 9659 rablkoff = rp->r_nextr; 9660 while (readahead > 0 && rablkoff + bsize < rp->r_size) { 9661 mutex_exit(&rp->r_statelock); 9662 if (nfs4_async_readahead(vp, rablkoff + bsize, 9663 addr + (rablkoff + bsize - off), 9664 seg, cr, nfs4_readahead) < 0) { 9665 mutex_enter(&rp->r_statelock); 9666 break; 9667 } 9668 readahead--; 9669 rablkoff += bsize; 9670 /* 9671 * Indicate that we did a readahead so 9672 * readahead offset is not updated 9673 * by the synchronous read below. 9674 */ 9675 readahead_issued = 1; 9676 mutex_enter(&rp->r_statelock); 9677 /* 9678 * set readahead offset to 9679 * offset of last async readahead 9680 * request. 9681 */ 9682 rp->r_nextr = rablkoff; 9683 } 9684 mutex_exit(&rp->r_statelock); 9685 } 9686 9687 again: 9688 if ((pagefound = page_exists(vp, off)) == NULL) { 9689 if (pl == NULL) { 9690 (void) nfs4_async_readahead(vp, blkoff, addr, seg, cr, 9691 nfs4_readahead); 9692 } else if (rw == S_CREATE) { 9693 /* 9694 * Block for this page is not allocated, or the offset 9695 * is beyond the current allocation size, or we're 9696 * allocating a swap slot and the page was not found, 9697 * so allocate it and return a zero page. 9698 */ 9699 if ((pp = page_create_va(vp, off, 9700 PAGESIZE, PG_WAIT, seg, addr)) == NULL) 9701 cmn_err(CE_PANIC, "nfs4_getapage: page_create"); 9702 io_len = PAGESIZE; 9703 mutex_enter(&rp->r_statelock); 9704 rp->r_nextr = off + PAGESIZE; 9705 mutex_exit(&rp->r_statelock); 9706 } else { 9707 /* 9708 * Need to go to server to get a block 9709 */ 9710 mutex_enter(&rp->r_statelock); 9711 if (blkoff < rp->r_size && 9712 blkoff + bsize > rp->r_size) { 9713 /* 9714 * If less than a block left in 9715 * file read less than a block. 9716 */ 9717 if (rp->r_size <= off) { 9718 /* 9719 * Trying to access beyond EOF, 9720 * set up to get at least one page. 9721 */ 9722 blksize = off + PAGESIZE - blkoff; 9723 } else 9724 blksize = rp->r_size - blkoff; 9725 } else if ((off == 0) || 9726 (off != rp->r_nextr && !readahead_issued)) { 9727 blksize = PAGESIZE; 9728 blkoff = off; /* block = page here */ 9729 } else 9730 blksize = bsize; 9731 mutex_exit(&rp->r_statelock); 9732 9733 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 9734 &io_len, blkoff, blksize, 0); 9735 9736 /* 9737 * Some other thread has entered the page, 9738 * so just use it. 9739 */ 9740 if (pp == NULL) 9741 goto again; 9742 9743 /* 9744 * Now round the request size up to page boundaries. 9745 * This ensures that the entire page will be 9746 * initialized to zeroes if EOF is encountered. 9747 */ 9748 io_len = ptob(btopr(io_len)); 9749 9750 bp = pageio_setup(pp, io_len, vp, B_READ); 9751 ASSERT(bp != NULL); 9752 9753 /* 9754 * pageio_setup should have set b_addr to 0. This 9755 * is correct since we want to do I/O on a page 9756 * boundary. bp_mapin will use this addr to calculate 9757 * an offset, and then set b_addr to the kernel virtual 9758 * address it allocated for us. 
9759 */ 9760 ASSERT(bp->b_un.b_addr == 0); 9761 9762 bp->b_edev = 0; 9763 bp->b_dev = 0; 9764 bp->b_lblkno = lbtodb(io_off); 9765 bp->b_file = vp; 9766 bp->b_offset = (offset_t)off; 9767 bp_mapin(bp); 9768 9769 /* 9770 * If doing a write beyond what we believe is EOF, 9771 * don't bother trying to read the pages from the 9772 * server, we'll just zero the pages here. We 9773 * don't check that the rw flag is S_WRITE here 9774 * because some implementations may attempt a 9775 * read access to the buffer before copying data. 9776 */ 9777 mutex_enter(&rp->r_statelock); 9778 if (io_off >= rp->r_size && seg == segkmap) { 9779 mutex_exit(&rp->r_statelock); 9780 bzero(bp->b_un.b_addr, io_len); 9781 } else { 9782 mutex_exit(&rp->r_statelock); 9783 error = nfs4_bio(bp, NULL, cr, FALSE); 9784 } 9785 9786 /* 9787 * Unmap the buffer before freeing it. 9788 */ 9789 bp_mapout(bp); 9790 pageio_done(bp); 9791 9792 savepp = pp; 9793 do { 9794 pp->p_fsdata = C_NOCOMMIT; 9795 } while ((pp = pp->p_next) != savepp); 9796 9797 if (error == NFS_EOF) { 9798 /* 9799 * If doing a write system call just return 9800 * zeroed pages, else user tried to get pages 9801 * beyond EOF, return error. We don't check 9802 * that the rw flag is S_WRITE here because 9803 * some implementations may attempt a read 9804 * access to the buffer before copying data. 9805 */ 9806 if (seg == segkmap) 9807 error = 0; 9808 else 9809 error = EFAULT; 9810 } 9811 9812 if (!readahead_issued && !error) { 9813 mutex_enter(&rp->r_statelock); 9814 rp->r_nextr = io_off + io_len; 9815 mutex_exit(&rp->r_statelock); 9816 } 9817 } 9818 } 9819 9820 out: 9821 if (pl == NULL) 9822 return (error); 9823 9824 if (error) { 9825 if (pp != NULL) 9826 pvn_read_done(pp, B_ERROR); 9827 return (error); 9828 } 9829 9830 if (pagefound) { 9831 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED); 9832 9833 /* 9834 * Page exists in the cache, acquire the appropriate lock. 9835 * If this fails, start all over again. 9836 */ 9837 if ((pp = page_lookup(vp, off, se)) == NULL) { 9838 #ifdef DEBUG 9839 nfs4_lostpage++; 9840 #endif 9841 goto reread; 9842 } 9843 pl[0] = pp; 9844 pl[1] = NULL; 9845 return (0); 9846 } 9847 9848 if (pp != NULL) 9849 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 9850 9851 return (error); 9852 } 9853 9854 static void 9855 nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg, 9856 cred_t *cr) 9857 { 9858 int error; 9859 page_t *pp; 9860 u_offset_t io_off; 9861 size_t io_len; 9862 struct buf *bp; 9863 uint_t bsize, blksize; 9864 rnode4_t *rp = VTOR4(vp); 9865 page_t *savepp; 9866 9867 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 9868 9869 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 9870 9871 mutex_enter(&rp->r_statelock); 9872 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) { 9873 /* 9874 * If less than a block left in file read less 9875 * than a block. 9876 */ 9877 blksize = rp->r_size - blkoff; 9878 } else 9879 blksize = bsize; 9880 mutex_exit(&rp->r_statelock); 9881 9882 pp = pvn_read_kluster(vp, blkoff, segkmap, addr, 9883 &io_off, &io_len, blkoff, blksize, 1); 9884 /* 9885 * The isra flag passed to the kluster function is 1, we may have 9886 * gotten a return value of NULL for a variety of reasons (# of free 9887 * pages < minfree, someone entered the page on the vnode etc). In all 9888 * cases, we want to punt on the readahead. 9889 */ 9890 if (pp == NULL) 9891 return; 9892 9893 /* 9894 * Now round the request size up to page boundaries. 
9895 * This ensures that the entire page will be 9896 * initialized to zeroes if EOF is encountered. 9897 */ 9898 io_len = ptob(btopr(io_len)); 9899 9900 bp = pageio_setup(pp, io_len, vp, B_READ); 9901 ASSERT(bp != NULL); 9902 9903 /* 9904 * pageio_setup should have set b_addr to 0. This is correct since 9905 * we want to do I/O on a page boundary. bp_mapin() will use this addr 9906 * to calculate an offset, and then set b_addr to the kernel virtual 9907 * address it allocated for us. 9908 */ 9909 ASSERT(bp->b_un.b_addr == 0); 9910 9911 bp->b_edev = 0; 9912 bp->b_dev = 0; 9913 bp->b_lblkno = lbtodb(io_off); 9914 bp->b_file = vp; 9915 bp->b_offset = (offset_t)blkoff; 9916 bp_mapin(bp); 9917 9918 /* 9919 * If doing a write beyond what we believe is EOF, don't bother trying 9920 * to read the pages from the server, we'll just zero the pages here. 9921 * We don't check that the rw flag is S_WRITE here because some 9922 * implementations may attempt a read access to the buffer before 9923 * copying data. 9924 */ 9925 mutex_enter(&rp->r_statelock); 9926 if (io_off >= rp->r_size && seg == segkmap) { 9927 mutex_exit(&rp->r_statelock); 9928 bzero(bp->b_un.b_addr, io_len); 9929 error = 0; 9930 } else { 9931 mutex_exit(&rp->r_statelock); 9932 error = nfs4_bio(bp, NULL, cr, TRUE); 9933 if (error == NFS_EOF) 9934 error = 0; 9935 } 9936 9937 /* 9938 * Unmap the buffer before freeing it. 9939 */ 9940 bp_mapout(bp); 9941 pageio_done(bp); 9942 9943 savepp = pp; 9944 do { 9945 pp->p_fsdata = C_NOCOMMIT; 9946 } while ((pp = pp->p_next) != savepp); 9947 9948 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 9949 9950 /* 9951 * In case of error set readahead offset 9952 * to the lowest offset. 9953 * pvn_read_done() calls VN_DISPOSE to destroy the pages 9954 */ 9955 if (error && rp->r_nextr > io_off) { 9956 mutex_enter(&rp->r_statelock); 9957 if (rp->r_nextr > io_off) 9958 rp->r_nextr = io_off; 9959 mutex_exit(&rp->r_statelock); 9960 } 9961 } 9962 9963 /* 9964 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 9965 * If len == 0, do from off to EOF. 9966 * 9967 * The normal cases should be len == 0 && off == 0 (entire vp list) or 9968 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 9969 * (from pageout). 9970 */ 9971 static int 9972 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr) 9973 { 9974 int error; 9975 rnode4_t *rp; 9976 9977 ASSERT(cr != NULL); 9978 9979 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 9980 return (EIO); 9981 9982 rp = VTOR4(vp); 9983 if (IS_SHADOW(vp, rp)) 9984 vp = RTOV4(rp); 9985 9986 /* 9987 * XXX - Why should this check be made here? 9988 */ 9989 if (vp->v_flag & VNOMAP) 9990 return (ENOSYS); 9991 9992 if (len == 0 && !(flags & B_INVAL) && 9993 (vp->v_vfsp->vfs_flag & VFS_RDONLY)) 9994 return (0); 9995 9996 mutex_enter(&rp->r_statelock); 9997 rp->r_count++; 9998 mutex_exit(&rp->r_statelock); 9999 error = nfs4_putpages(vp, off, len, flags, cr); 10000 mutex_enter(&rp->r_statelock); 10001 rp->r_count--; 10002 cv_broadcast(&rp->r_cv); 10003 mutex_exit(&rp->r_statelock); 10004 10005 return (error); 10006 } 10007 10008 /* 10009 * Write out a single page, possibly klustering adjacent dirty pages. 
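 *
 * Editor's note (not in the original source): "klustering" means
 * pvn_write_kluster() may widen the request to adjacent dirty pages
 * within the same block so that they go out in a single write. For
 * example, with a hypothetical 32K bsize and 4K pages, a dirty page
 * at offset 40K can be written together with its dirty neighbors in
 * the 32K..64K block.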
10010 */ 10011 int 10012 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, 10013 int flags, cred_t *cr) 10014 { 10015 u_offset_t io_off; 10016 u_offset_t lbn_off; 10017 u_offset_t lbn; 10018 size_t io_len; 10019 uint_t bsize; 10020 int error; 10021 rnode4_t *rp; 10022 10023 ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY)); 10024 ASSERT(pp != NULL); 10025 ASSERT(cr != NULL); 10026 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone); 10027 10028 rp = VTOR4(vp); 10029 ASSERT(rp->r_count > 0); 10030 ASSERT(!IS_SHADOW(vp, rp)); 10031 10032 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 10033 lbn = pp->p_offset / bsize; 10034 lbn_off = lbn * bsize; 10035 10036 /* 10037 * Find a kluster that fits in one block, or in 10038 * one page if pages are bigger than blocks. If 10039 * there is less file space allocated than a whole 10040 * page, we'll shorten the i/o request below. 10041 */ 10042 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off, 10043 roundup(bsize, PAGESIZE), flags); 10044 10045 /* 10046 * pvn_write_kluster shouldn't have returned a page with an offset 10047 * behind the original page we were given. Verify that. 10048 */ 10049 ASSERT((pp->p_offset / bsize) >= lbn); 10050 10051 /* 10052 * Now pp will have the list of kept dirty pages marked for 10053 * write back. It will also handle invalidation and freeing 10054 * of pages that are not dirty. Check for page length rounding 10055 * problems. 10056 */ 10057 if (io_off + io_len > lbn_off + bsize) { 10058 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE); 10059 io_len = lbn_off + bsize - io_off; 10060 } 10061 /* 10062 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10063 * consistent value of r_size. R4MODINPROGRESS is set in writerp4(). 10064 * When R4MODINPROGRESS is set it indicates that a uiomove() is in 10065 * progress and the r_size has not been made consistent with the 10066 * new size of the file. When the uiomove() completes the r_size is 10067 * updated and the R4MODINPROGRESS flag is cleared. 10068 * 10069 * Without this handshaking, it is possible that 10070 * nfs4_bio() picks up the old value of r_size 10071 * before the uiomove() in writerp4() completes, and the 10072 * write through nfs4_bio() would then 10073 * be dropped. 10074 * 10075 * More precisely, there is a window between the time the uiomove() 10076 * completes and the time the r_size is updated. If a VOP_PUTPAGE() 10077 * operation intervenes in this window, the page will be picked up, 10078 * because it is dirty (it will be unlocked, unless it was 10079 * pagecreate'd). When the page is picked up as dirty, the dirty 10080 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is 10081 * checked. This will still be the old size. Therefore the page will 10082 * not be written out. When segmap_release() calls VOP_PUTPAGE(), 10083 * the page will be found to be clean and the write will be dropped. 10084 */ 10085 if (rp->r_flags & R4MODINPROGRESS) { 10086 mutex_enter(&rp->r_statelock); 10087 if ((rp->r_flags & R4MODINPROGRESS) && 10088 rp->r_modaddr + MAXBSIZE > io_off && 10089 rp->r_modaddr < io_off + io_len) { 10090 page_t *plist; 10091 /* 10092 * A write is in progress for this region of the file. 10093 * If we did not detect R4MODINPROGRESS here then this 10094 * path through nfs_putapage() would eventually go to 10095 * nfs4_bio() and may not write out all of the data 10096 * in the pages. We end up losing data.
So we decide 10097 * to set the modified bit on each page in the page 10098 * list and mark the rnode with R4DIRTY. This write 10099 * will be restarted at some later time. 10100 */ 10101 plist = pp; 10102 while (plist != NULL) { 10103 pp = plist; 10104 page_sub(&plist, pp); 10105 hat_setmod(pp); 10106 page_io_unlock(pp); 10107 page_unlock(pp); 10108 } 10109 rp->r_flags |= R4DIRTY; 10110 mutex_exit(&rp->r_statelock); 10111 if (offp) 10112 *offp = io_off; 10113 if (lenp) 10114 *lenp = io_len; 10115 return (0); 10116 } 10117 mutex_exit(&rp->r_statelock); 10118 } 10119 10120 if (flags & B_ASYNC) { 10121 error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr, 10122 nfs4_sync_putapage); 10123 } else 10124 error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr); 10125 10126 if (offp) 10127 *offp = io_off; 10128 if (lenp) 10129 *lenp = io_len; 10130 return (error); 10131 } 10132 10133 static int 10134 nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 10135 int flags, cred_t *cr) 10136 { 10137 int error; 10138 rnode4_t *rp; 10139 10140 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10141 10142 flags |= B_WRITE; 10143 10144 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 10145 10146 rp = VTOR4(vp); 10147 10148 if ((error == ENOSPC || error == EDQUOT || error == EFBIG || 10149 error == EACCES) && 10150 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) { 10151 if (!(rp->r_flags & R4OUTOFSPACE)) { 10152 mutex_enter(&rp->r_statelock); 10153 rp->r_flags |= R4OUTOFSPACE; 10154 mutex_exit(&rp->r_statelock); 10155 } 10156 flags |= B_ERROR; 10157 pvn_write_done(pp, flags); 10158 /* 10159 * If this was not an async thread, then try again to 10160 * write out the pages, but this time, also destroy 10161 * them whether or not the write is successful. This 10162 * will prevent memory from filling up with these 10163 * pages and destroying them is the only alternative 10164 * if they can't be written out. 10165 * 10166 * Don't do this if this is an async thread because 10167 * when the pages are unlocked in pvn_write_done, 10168 * some other thread could have come along, locked 10169 * them, and queued for an async thread. It would be 10170 * possible for all of the async threads to be tied 10171 * up waiting to lock the pages again and they would 10172 * all already be locked and waiting for an async 10173 * thread to handle them. Deadlock. 
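 *
 * Editor's illustration of that deadlock (not in the original
 * source): an async thread calls pvn_write_done(), which unlocks
 * the pages; another thread locks them and queues them for async
 * handling; if every async thread then blocks here trying to
 * re-lock those pages, no thread remains to service the queue, so
 * the B_INVAL | B_FORCE retry below is done synchronously only.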
10174 */ 10175 if (!(flags & B_ASYNC)) { 10176 error = nfs4_putpage(vp, io_off, io_len, 10177 B_INVAL | B_FORCE, cr); 10178 } 10179 } else { 10180 if (error) 10181 flags |= B_ERROR; 10182 else if (rp->r_flags & R4OUTOFSPACE) { 10183 mutex_enter(&rp->r_statelock); 10184 rp->r_flags &= ~R4OUTOFSPACE; 10185 mutex_exit(&rp->r_statelock); 10186 } 10187 pvn_write_done(pp, flags); 10188 if (freemem < desfree) 10189 (void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr, 10190 NFS4_WRITE_NOWAIT); 10191 } 10192 10193 return (error); 10194 } 10195 10196 #ifdef DEBUG 10197 int nfs4_force_open_before_mmap = 0; 10198 #endif 10199 10200 static int 10201 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 10202 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) 10203 { 10204 struct segvn_crargs vn_a; 10205 int error = 0; 10206 rnode4_t *rp = VTOR4(vp); 10207 mntinfo4_t *mi = VTOMI4(vp); 10208 10209 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10210 return (EIO); 10211 10212 if (vp->v_flag & VNOMAP) 10213 return (ENOSYS); 10214 10215 if (off < 0 || (off + len) < 0) 10216 return (ENXIO); 10217 10218 if (vp->v_type != VREG) 10219 return (ENODEV); 10220 10221 /* 10222 * If the file is delegated to the client, don't do anything. 10223 * If the file is not delegated, then validate the data cache. 10224 */ 10225 mutex_enter(&rp->r_statev4_lock); 10226 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) { 10227 mutex_exit(&rp->r_statev4_lock); 10228 error = nfs4_validate_caches(vp, cr); 10229 if (error) 10230 return (error); 10231 } else { 10232 mutex_exit(&rp->r_statev4_lock); 10233 } 10234 10235 /* 10236 * Check to see if the vnode is currently marked as not cachable. 10237 * This means portions of the file are locked (through VOP_FRLOCK). 10238 * In this case the map request must be refused. We use 10239 * rp->r_lkserlock to avoid a race with concurrent lock requests. 10240 */ 10241 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) 10242 return (EINTR); 10243 10244 if (vp->v_flag & VNOCACHE) { 10245 error = EAGAIN; 10246 goto done; 10247 } 10248 10249 /* 10250 * Don't allow concurrent locks and mapping if mandatory locking is 10251 * enabled. 10252 */ 10253 if (flk_has_remote_locks(vp)) { 10254 struct vattr va; 10255 va.va_mask = AT_MODE; 10256 error = nfs4getattr(vp, &va, cr); 10257 if (error != 0) 10258 goto done; 10259 if (MANDLOCK(vp, va.va_mode)) { 10260 error = EAGAIN; 10261 goto done; 10262 } 10263 } 10264 10265 /* 10266 * It is possible that the rnode has a lost lock request that we 10267 * are still trying to recover, and that the request conflicts with 10268 * this map request. 10269 * 10270 * An alternative approach would be for nfs4_safemap() to consider 10271 * queued lock requests when deciding whether to set or clear 10272 * VNOCACHE. This would require the frlock code path to call 10273 * nfs4_safemap() after enqueuing a lost request.
10274 */ 10275 if (nfs4_map_lost_lock_conflict(vp)) { 10276 error = EAGAIN; 10277 goto done; 10278 } 10279 10280 as_rangelock(as); 10281 if (!(flags & MAP_FIXED)) { 10282 map_addr(addrp, len, off, 1, flags); 10283 if (*addrp == NULL) { 10284 as_rangeunlock(as); 10285 error = ENOMEM; 10286 goto done; 10287 } 10288 } else { 10289 /* 10290 * User specified address - blow away any previous mappings 10291 */ 10292 (void) as_unmap(as, *addrp, len); 10293 } 10294 10295 if (vp->v_type == VREG) { 10296 /* 10297 * We need to retrieve the open stream 10298 */ 10299 nfs4_open_stream_t *osp = NULL; 10300 nfs4_open_owner_t *oop = NULL; 10301 10302 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10303 if (oop != NULL) { 10304 /* returns with 'os_sync_lock' held */ 10305 osp = find_open_stream(oop, rp); 10306 open_owner_rele(oop); 10307 } 10308 if (osp == NULL) { 10309 #ifdef DEBUG 10310 if (nfs4_force_open_before_mmap) { 10311 error = EIO; 10312 goto done; 10313 } 10314 #endif 10315 /* returns with 'os_sync_lock' held */ 10316 error = open_and_get_osp(vp, cr, &osp); 10317 if (osp == NULL) { 10318 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10319 "nfs4_map: we tried to OPEN the file " 10320 "but again no osp, so fail with EIO")); 10321 goto done; 10322 } 10323 } 10324 10325 if (osp->os_failed_reopen) { 10326 mutex_exit(&osp->os_sync_lock); 10327 open_stream_rele(osp, rp); 10328 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 10329 "nfs4_map: os_failed_reopen set on " 10330 "osp %p, cr %p, rp %s", (void *)osp, 10331 (void *)cr, rnode4info(rp))); 10332 error = EIO; 10333 goto done; 10334 } 10335 mutex_exit(&osp->os_sync_lock); 10336 open_stream_rele(osp, rp); 10337 } 10338 10339 vn_a.vp = vp; 10340 vn_a.offset = off; 10341 vn_a.type = (flags & MAP_TYPE); 10342 vn_a.prot = (uchar_t)prot; 10343 vn_a.maxprot = (uchar_t)maxprot; 10344 vn_a.flags = (flags & ~MAP_TYPE); 10345 vn_a.cred = cr; 10346 vn_a.amp = NULL; 10347 vn_a.szc = 0; 10348 vn_a.lgrp_mem_policy_flags = 0; 10349 10350 error = as_map(as, *addrp, len, segvn_create, &vn_a); 10351 as_rangeunlock(as); 10352 10353 done: 10354 nfs_rw_exit(&rp->r_lkserlock); 10355 return (error); 10356 } 10357 10358 /* 10359 * We're most likely dealing with a kernel module that likes to READ 10360 * and mmap without OPENing the file (i.e., lookup/read/mmap), so let's 10361 * officially OPEN the file to create the necessary client state 10362 * for bookkeeping of os_mmap_read/write counts. 10363 * 10364 * Since VOP_MAP only passes in a pointer to the vnode rather than 10365 * a double pointer, we can't handle the case where nfs4open_otw() 10366 * returns a different vnode than the one passed into VOP_MAP (since 10367 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case, 10368 * we return NULL and let nfs4_map() fail. Note: the only case where 10369 * this should happen is if the file got removed and replaced with the 10370 * same name on the server (in addition to the fact that we're trying 10371 * to VOP_MAP without VOP_OPENing the file in the first place).
10372 */ 10373 static int 10374 open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp) 10375 { 10376 rnode4_t *rp, *drp; 10377 vnode_t *dvp, *open_vp; 10378 char file_name[MAXNAMELEN]; 10379 int just_created; 10380 nfs4_open_stream_t *osp; 10381 nfs4_open_owner_t *oop; 10382 int error; 10383 10384 *ospp = NULL; 10385 open_vp = map_vp; 10386 10387 rp = VTOR4(open_vp); 10388 if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0) 10389 return (error); 10390 drp = VTOR4(dvp); 10391 10392 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) { 10393 VN_RELE(dvp); 10394 return (EINTR); 10395 } 10396 10397 if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) { 10398 nfs_rw_exit(&drp->r_rwlock); 10399 VN_RELE(dvp); 10400 return (error); 10401 } 10402 10403 mutex_enter(&rp->r_statev4_lock); 10404 if (rp->created_v4) { 10405 rp->created_v4 = 0; 10406 mutex_exit(&rp->r_statev4_lock); 10407 10408 dnlc_update(dvp, file_name, open_vp); 10409 /* This is needed so we don't bump the open ref count */ 10410 just_created = 1; 10411 } else { 10412 mutex_exit(&rp->r_statev4_lock); 10413 just_created = 0; 10414 } 10415 10416 VN_HOLD(map_vp); 10417 10418 error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0, 10419 just_created); 10420 if (error) { 10421 nfs_rw_exit(&drp->r_rwlock); 10422 VN_RELE(dvp); 10423 VN_RELE(map_vp); 10424 return (error); 10425 } 10426 10427 nfs_rw_exit(&drp->r_rwlock); 10428 VN_RELE(dvp); 10429 10430 /* 10431 * If nfs4open_otw() returned a different vnode then "undo" 10432 * the open and return failure to the caller. 10433 */ 10434 if (!VN_CMP(open_vp, map_vp)) { 10435 nfs4_error_t e; 10436 10437 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10438 "open returned a different vnode")); 10439 /* 10440 * If there's an error, ignore it, 10441 * and let VOP_INACTIVE handle it. 10442 */ 10443 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10444 CLOSE_NORM, 0, 0, 0); 10445 VN_RELE(map_vp); 10446 return (EIO); 10447 } 10448 10449 VN_RELE(map_vp); 10450 10451 oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp)); 10452 if (!oop) { 10453 nfs4_error_t e; 10454 10455 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10456 "no open owner")); 10457 /* 10458 * If there's an error, ignore it, 10459 * and let VOP_INACTIVE handle it. 10460 */ 10461 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10462 CLOSE_NORM, 0, 0, 0); 10463 return (EIO); 10464 } 10465 osp = find_open_stream(oop, rp); 10466 open_owner_rele(oop); 10467 *ospp = osp; 10468 return (0); 10469 } 10470 10471 /* 10472 * Please be aware that when this function is called, the address space write 10473 * a_lock is held. Do not put over the wire calls in this function. 10474 */ 10475 /* ARGSUSED */ 10476 static int 10477 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10478 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) 10479 { 10480 rnode4_t *rp; 10481 int error = 0; 10482 mntinfo4_t *mi; 10483 10484 mi = VTOMI4(vp); 10485 rp = VTOR4(vp); 10486 10487 if (nfs_zone() != mi->mi_zone) 10488 return (EIO); 10489 if (vp->v_flag & VNOMAP) 10490 return (ENOSYS); 10491 10492 /* 10493 * Need to hold rwlock while incrementing the mapcnt so that 10494 * mmap'ing can be serialized with writes so that the caching 10495 * can be handled correctly. 
10496 * 10497 * Don't need to update the open stream first, since this 10498 * mmap can't add any additional share access that isn't 10499 * already contained in the open stream (for the case where we 10500 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't 10501 * take into account os_mmap_read[write] counts). 10502 */ 10503 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp))) 10504 return (EINTR); 10505 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len)); 10506 nfs_rw_exit(&rp->r_rwlock); 10507 10508 if (vp->v_type == VREG) { 10509 /* 10510 * We need to retrieve the open stream and update the counts. 10511 * If there is no open stream here, something is wrong. 10512 */ 10513 nfs4_open_stream_t *osp = NULL; 10514 nfs4_open_owner_t *oop = NULL; 10515 10516 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10517 if (oop != NULL) { 10518 /* returns with 'os_sync_lock' held */ 10519 osp = find_open_stream(oop, rp); 10520 open_owner_rele(oop); 10521 } 10522 if (osp == NULL) { 10523 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10524 "nfs4_addmap: we should have an osp" 10525 "but we don't, so fail with EIO")); 10526 error = EIO; 10527 goto out; 10528 } 10529 10530 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p," 10531 " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot)); 10532 10533 /* 10534 * Update the map count in the open stream. 10535 * This is necessary in the case where we 10536 * open/mmap/close/, then the server reboots, and we 10537 * attempt to reopen. If the mmap doesn't add share 10538 * access then we send an invalid reopen with 10539 * access = NONE. 10540 * 10541 * We need to specifically check each PROT_* so a mmap 10542 * call of (PROT_WRITE | PROT_EXEC) will ensure us both 10543 * read and write access. A simple comparison of prot 10544 * to ~PROT_WRITE to determine read access is insufficient 10545 * since prot can be |= with PROT_USER, etc. 10546 */ 10547 10548 /* 10549 * Unless we're MAP_SHARED, no sense in adding os_mmap_write 10550 */ 10551 if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 10552 osp->os_mmap_write += btopr(len); 10553 if (maxprot & PROT_READ) 10554 osp->os_mmap_read += btopr(len); 10555 if (maxprot & PROT_EXEC) 10556 osp->os_mmap_read += btopr(len); 10557 /* 10558 * Ensure that os_mmap_read gets incremented, even if 10559 * maxprot were to look like PROT_NONE. 10560 */ 10561 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 10562 !(maxprot & PROT_EXEC)) 10563 osp->os_mmap_read += btopr(len); 10564 osp->os_mapcnt += btopr(len); 10565 mutex_exit(&osp->os_sync_lock); 10566 open_stream_rele(osp, rp); 10567 } 10568 10569 out: 10570 /* 10571 * If we got an error, then undo our 10572 * incrementing of 'r_mapcnt'. 10573 */ 10574 10575 if (error) { 10576 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len)); 10577 ASSERT(rp->r_mapcnt >= 0); 10578 } 10579 return (error); 10580 } 10581 10582 static int 10583 nfs4_cmp(vnode_t *vp1, vnode_t *vp2) 10584 { 10585 10586 return (VTOR4(vp1) == VTOR4(vp2)); 10587 } 10588 10589 static int 10590 nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10591 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr) 10592 { 10593 int rc; 10594 u_offset_t start, end; 10595 rnode4_t *rp; 10596 int error = 0, intr = INTR4(vp); 10597 nfs4_error_t e; 10598 10599 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10600 return (EIO); 10601 10602 /* check for valid cmd parameter */ 10603 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW) 10604 return (EINVAL); 10605 10606 /* Verify l_type. 
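 * Editor's example (not in the original source): these checks mirror
 * POSIX fcntl() semantics -- e.g., F_SETLK with l_type = F_RDLCK on a
 * descriptor opened O_WRONLY (no FREAD) fails with EBADF, while
 * F_GETLK is permitted regardless of the open mode.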
*/ 10607 switch (bfp->l_type) { 10608 case F_RDLCK: 10609 if (cmd != F_GETLK && !(flag & FREAD)) 10610 return (EBADF); 10611 break; 10612 case F_WRLCK: 10613 if (cmd != F_GETLK && !(flag & FWRITE)) 10614 return (EBADF); 10615 break; 10616 case F_UNLCK: 10617 intr = 0; 10618 break; 10619 10620 default: 10621 return (EINVAL); 10622 } 10623 10624 /* check the validity of the lock range */ 10625 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset)) 10626 return (rc); 10627 if (rc = flk_check_lock_data(start, end, MAXEND)) 10628 return (rc); 10629 10630 /* 10631 * If the filesystem is mounted using local locking, pass the 10632 * request off to the local locking code. 10633 */ 10634 if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) { 10635 if (cmd == F_SETLK || cmd == F_SETLKW) { 10636 /* 10637 * For complete safety, we should be holding 10638 * r_lkserlock. However, we can't call 10639 * nfs4_safelock and then fs_frlock while 10640 * holding r_lkserlock, so just invoke 10641 * nfs4_safelock and expect that this will 10642 * catch enough of the cases. 10643 */ 10644 if (!nfs4_safelock(vp, bfp, cr)) 10645 return (EAGAIN); 10646 } 10647 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr)); 10648 } 10649 10650 rp = VTOR4(vp); 10651 10652 /* 10653 * Check whether the given lock request can proceed, given the 10654 * current file mappings. 10655 */ 10656 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr)) 10657 return (EINTR); 10658 if (cmd == F_SETLK || cmd == F_SETLKW) { 10659 if (!nfs4_safelock(vp, bfp, cr)) { 10660 rc = EAGAIN; 10661 goto done; 10662 } 10663 } 10664 10665 /* 10666 * Flush the cache after waiting for async I/O to finish. For new 10667 * locks, this is so that the process gets the latest bits from the 10668 * server. For unlocks, this is so that other clients see the 10669 * latest bits once the file has been unlocked. If currently dirty 10670 * pages can't be flushed, then don't allow a lock to be set. But 10671 * allow unlocks to succeed, to avoid having orphan locks on the 10672 * server. 10673 */ 10674 if (cmd != F_GETLK) { 10675 mutex_enter(&rp->r_statelock); 10676 while (rp->r_count > 0) { 10677 if (intr) { 10678 klwp_t *lwp = ttolwp(curthread); 10679 10680 if (lwp != NULL) 10681 lwp->lwp_nostop++; 10682 if (cv_wait_sig(&rp->r_cv, &rp->r_statelock) == 0) { 10683 if (lwp != NULL) 10684 lwp->lwp_nostop--; 10685 rc = EINTR; 10686 break; 10687 } 10688 if (lwp != NULL) 10689 lwp->lwp_nostop--; 10690 } else 10691 cv_wait(&rp->r_cv, &rp->r_statelock); 10692 } 10693 mutex_exit(&rp->r_statelock); 10694 if (rc != 0) 10695 goto done; 10696 error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr); 10697 if (error) { 10698 if (error == ENOSPC || error == EDQUOT) { 10699 mutex_enter(&rp->r_statelock); 10700 if (!rp->r_error) 10701 rp->r_error = error; 10702 mutex_exit(&rp->r_statelock); 10703 } 10704 if (bfp->l_type != F_UNLCK) { 10705 rc = ENOLCK; 10706 goto done; 10707 } 10708 } 10709 } 10710 10711 /* 10712 * Call the lock manager to do the real work of contacting 10713 * the server and obtaining the lock. 10714 */ 10715 10716 nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset, 10717 cr, &e, NULL, NULL); 10718 rc = e.error; 10719 10720 if (rc == 0) 10721 nfs4_lockcompletion(vp, cmd); 10722 10723 done: 10724 nfs_rw_exit(&rp->r_lkserlock); 10725 10726 return (rc); 10727 } 10728 10729 /* 10730 * Free storage space associated with the specified vnode. 
The portion 10731 * to be freed is specified by bfp->l_start and bfp->l_len (already 10732 * normalized to a "whence" of 0). 10733 * 10734 * This is an experimental facility whose continued existence is not 10735 * guaranteed. Currently, we only support the special case 10736 * of l_len == 0, meaning free to end of file. 10737 */ 10738 /* ARGSUSED */ 10739 static int 10740 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10741 offset_t offset, cred_t *cr, caller_context_t *ct) 10742 { 10743 int error; 10744 10745 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10746 return (EIO); 10747 ASSERT(vp->v_type == VREG); 10748 if (cmd != F_FREESP) 10749 return (EINVAL); 10750 10751 error = convoff(vp, bfp, 0, offset); 10752 if (!error) { 10753 ASSERT(bfp->l_start >= 0); 10754 if (bfp->l_len == 0) { 10755 struct vattr va; 10756 10757 va.va_mask = AT_SIZE; 10758 va.va_size = bfp->l_start; 10759 error = nfs4setattr(vp, &va, 0, cr, NULL); 10760 } else 10761 error = EINVAL; 10762 } 10763 10764 return (error); 10765 } 10766 10767 /* ARGSUSED */ 10768 static int 10769 nfs4_realvp(vnode_t *vp, vnode_t **vpp) 10770 { 10771 return (EINVAL); 10772 } 10773 10774 /* 10775 * Setup and add an address space callback to do the work of the delmap call. 10776 * The callback will (and must be) deleted in the actual callback function. 10777 * 10778 * This is done in order to take care of the problem that we have with holding 10779 * the address space's a_lock for a long period of time (e.g. if the NFS server 10780 * is down). Callbacks will be executed in the address space code while the 10781 * a_lock is not held. Holding the address space's a_lock causes things such 10782 * as ps and fork to hang because they are trying to acquire this lock as well. 10783 */ 10784 /* ARGSUSED */ 10785 static int 10786 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10787 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr) 10788 { 10789 int caller_found; 10790 int error; 10791 rnode4_t *rp; 10792 nfs4_delmap_args_t *dmapp; 10793 nfs4_delmapcall_t *delmap_call; 10794 10795 if (vp->v_flag & VNOMAP) 10796 return (ENOSYS); 10797 10798 /* 10799 * A process may not change zones if it has NFS pages mmap'ed 10800 * in, so we can't legitimately get here from the wrong zone. 10801 */ 10802 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10803 10804 rp = VTOR4(vp); 10805 10806 /* 10807 * The way that the address space of this process deletes its mapping 10808 * of this file is via the following call chains: 10809 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 10810 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 10811 * 10812 * With the use of address space callbacks we are allowed to drop the 10813 * address space lock, a_lock, while executing the NFS operations that 10814 * need to go over the wire. Returning EAGAIN to the caller of this 10815 * function is what drives the execution of the callback that we add 10816 * below. The callback will be executed by the address space code 10817 * after dropping the a_lock. When the callback is finished, since 10818 * we dropped the a_lock, it must be re-acquired and segvn_unmap() 10819 * is called again on the same segment to finish the rest of the work 10820 * that needs to happen during unmapping. 
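 *
 * Editor's sketch of the resulting flow (not in the original source):
 *	as_unmap() -> segvn_unmap() -> nfs4_delmap()
 *	    [registers the callback, returns EAGAIN]
 *	-> address space code drops a_lock, runs nfs4_delmap_callback()
 *	-> segvn_unmap() retries -> nfs4_delmap()
 *	    [finds this caller in the list, returns the saved error]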
10821 * 10822 * This action of calling back into the segment driver causes 10823 * nfs4_delmap() to get called again, but since the callback was 10824 * already executed at this point, it already did the work and there 10825 * is nothing left for us to do. 10826 * 10827 * To Summarize: 10828 * - The first time nfs4_delmap is called by the current thread is when 10829 * we add the caller associated with this delmap to the delmap caller 10830 * list, add the callback, and return EAGAIN. 10831 * - The second time in this call chain when nfs4_delmap is called we 10832 * will find this caller in the delmap caller list and realize there 10833 * is no more work to do thus removing this caller from the list and 10834 * returning the error that was set in the callback execution. 10835 */ 10836 caller_found = nfs4_find_and_delete_delmapcall(rp, &error); 10837 if (caller_found) { 10838 /* 10839 * 'error' is from the actual delmap operations. To avoid 10840 * hangs, we need to handle the return of EAGAIN differently 10841 * since this is what drives the callback execution. 10842 * In this case, we don't want to return EAGAIN and do the 10843 * callback execution because there are none to execute. 10844 */ 10845 if (error == EAGAIN) 10846 return (0); 10847 else 10848 return (error); 10849 } 10850 10851 /* current caller was not in the list */ 10852 delmap_call = nfs4_init_delmapcall(); 10853 10854 mutex_enter(&rp->r_statelock); 10855 list_insert_tail(&rp->r_indelmap, delmap_call); 10856 mutex_exit(&rp->r_statelock); 10857 10858 dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP); 10859 10860 dmapp->vp = vp; 10861 dmapp->off = off; 10862 dmapp->addr = addr; 10863 dmapp->len = len; 10864 dmapp->prot = prot; 10865 dmapp->maxprot = maxprot; 10866 dmapp->flags = flags; 10867 dmapp->cr = cr; 10868 dmapp->caller = delmap_call; 10869 10870 error = as_add_callback(as, nfs4_delmap_callback, dmapp, 10871 AS_UNMAP_EVENT, addr, len, KM_SLEEP); 10872 10873 return (error ? error : EAGAIN); 10874 } 10875 10876 static nfs4_delmapcall_t * 10877 nfs4_init_delmapcall() 10878 { 10879 nfs4_delmapcall_t *delmap_call; 10880 10881 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP); 10882 delmap_call->call_id = curthread; 10883 delmap_call->error = 0; 10884 10885 return (delmap_call); 10886 } 10887 10888 static void 10889 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call) 10890 { 10891 kmem_free(delmap_call, sizeof (nfs4_delmapcall_t)); 10892 } 10893 10894 /* 10895 * Searches for the current delmap caller (based on curthread) in the list of 10896 * callers. If it is found, we remove it and free the delmap caller. 10897 * Returns: 10898 * 0 if the caller wasn't found 10899 * 1 if the caller was found, removed and freed. *errp will be set 10900 * to what the result of the delmap was. 10901 */ 10902 static int 10903 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp) 10904 { 10905 nfs4_delmapcall_t *delmap_call; 10906 10907 /* 10908 * If the list doesn't exist yet, we create it and return 10909 * that the caller wasn't found. No list = no callers. 
10910 */ 10911 mutex_enter(&rp->r_statelock); 10912 if (!(rp->r_flags & R4DELMAPLIST)) { 10913 /* The list does not exist */ 10914 list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t), 10915 offsetof(nfs4_delmapcall_t, call_node)); 10916 rp->r_flags |= R4DELMAPLIST; 10917 mutex_exit(&rp->r_statelock); 10918 return (0); 10919 } else { 10920 /* The list exists so search it */ 10921 for (delmap_call = list_head(&rp->r_indelmap); 10922 delmap_call != NULL; 10923 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 10924 if (delmap_call->call_id == curthread) { 10925 /* current caller is in the list */ 10926 *errp = delmap_call->error; 10927 list_remove(&rp->r_indelmap, delmap_call); 10928 mutex_exit(&rp->r_statelock); 10929 nfs4_free_delmapcall(delmap_call); 10930 return (1); 10931 } 10932 } 10933 } 10934 mutex_exit(&rp->r_statelock); 10935 return (0); 10936 } 10937 10938 /* 10939 * Remove some pages from an mmap'd vnode. Just update the 10940 * count of pages. If doing close-to-open, then flush and 10941 * commit all of the pages associated with this file. 10942 * Otherwise, start an asynchronous page flush to write out 10943 * any dirty pages. This will also associate a credential 10944 * with the rnode which can be used to write the pages. 10945 */ 10946 /* ARGSUSED */ 10947 static void 10948 nfs4_delmap_callback(struct as *as, void *arg, uint_t event) 10949 { 10950 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 10951 rnode4_t *rp; 10952 mntinfo4_t *mi; 10953 nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg; 10954 10955 rp = VTOR4(dmapp->vp); 10956 mi = VTOMI4(dmapp->vp); 10957 10958 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len)); 10959 ASSERT(rp->r_mapcnt >= 0); 10960 10961 /* 10962 * Initiate a page flush and potential commit if there are 10963 * pages, the file system was not mounted readonly, the segment 10964 * was mapped shared, and the pages themselves were writeable. 10965 */ 10966 if (nfs4_has_pages(dmapp->vp) && 10967 !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) && 10968 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) { 10969 mutex_enter(&rp->r_statelock); 10970 rp->r_flags |= R4DIRTY; 10971 mutex_exit(&rp->r_statelock); 10972 e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off, 10973 dmapp->len, dmapp->cr); 10974 if (!e.error) { 10975 mutex_enter(&rp->r_statelock); 10976 e.error = rp->r_error; 10977 rp->r_error = 0; 10978 mutex_exit(&rp->r_statelock); 10979 } 10980 } else 10981 e.error = 0; 10982 10983 if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) 10984 (void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len, 10985 B_INVAL, dmapp->cr); 10986 10987 if (e.error) { 10988 e.stat = puterrno4(e.error); 10989 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 10990 OP_COMMIT, FALSE, NULL, 0, dmapp->vp); 10991 dmapp->caller->error = e.error; 10992 } 10993 10994 /* Check to see if we need to close the file */ 10995 10996 if (dmapp->vp->v_type == VREG) { 10997 nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e, 10998 CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags); 10999 11000 if (e.error != 0 || e.stat != NFS4_OK) { 11001 /* 11002 * Since it is possible that e.error == 0 and 11003 * e.stat != NFS4_OK (and vice versa), 11004 * we do the proper checking in order to get both 11005 * e.error and e.stat reporting the correct info. 
11006 */ 11007 if (e.stat == NFS4_OK) 11008 e.stat = puterrno4(e.error); 11009 if (e.error == 0) 11010 e.error = geterrno4(e.stat); 11011 11012 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 11013 OP_CLOSE, FALSE, NULL, 0, dmapp->vp); 11014 dmapp->caller->error = e.error; 11015 } 11016 } 11017 11018 (void) as_delete_callback(as, arg); 11019 kmem_free(dmapp, sizeof (nfs4_delmap_args_t)); 11020 } 11021 11022 11023 static uint_t 11024 fattr4_maxfilesize_to_bits(uint64_t ll) 11025 { 11026 uint_t l = 1; 11027 11028 if (ll == 0) { 11029 return (0); 11030 } 11031 11032 if (ll & 0xffffffff00000000) { 11033 l += 32; ll >>= 32; 11034 } 11035 if (ll & 0xffff0000) { 11036 l += 16; ll >>= 16; 11037 } 11038 if (ll & 0xff00) { 11039 l += 8; ll >>= 8; 11040 } 11041 if (ll & 0xf0) { 11042 l += 4; ll >>= 4; 11043 } 11044 if (ll & 0xc) { 11045 l += 2; ll >>= 2; 11046 } 11047 if (ll & 0x2) { 11048 l += 1; 11049 } 11050 return (l); 11051 } 11052 11053 static int 11054 nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr) 11055 { 11056 int error; 11057 hrtime_t t; 11058 rnode4_t *rp; 11059 nfs4_ga_res_t gar; 11060 nfs4_ga_ext_res_t ger; 11061 11062 gar.n4g_ext_res = &ger; 11063 11064 if (nfs_zone() != VTOMI4(vp)->mi_zone) 11065 return (EIO); 11066 if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) { 11067 *valp = MAXPATHLEN; 11068 return (0); 11069 } 11070 if (cmd == _PC_ACL_ENABLED) { 11071 *valp = _ACL_ACE_ENABLED; 11072 return (0); 11073 } 11074 11075 rp = VTOR4(vp); 11076 if (cmd == _PC_XATTR_EXISTS) { 11077 /* 11078 * Eventually we should attempt a small client readdir before 11079 * going otw with GETATTR(FATTR4_NAMED_ATTR). For now 11080 * just drive the OTW getattr. This is required because 11081 * _PC_XATTR_EXISTS can only return true if attributes 11082 * exist -- simply checking for existence of the attrdir 11083 * is not sufficient. 11084 * 11085 * pc4_xattr_valid can only be trusted when r_xattr_dir 11086 * is NULL. Once the xadir vp exists, we can create xattrs, 11087 * and we don't have any way to update the "base" object's 11088 * pc4_xattr_exists from the xattr or xadir. Maybe FEM 11089 * could help out.
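 *
 * Editor's illustration (not in the original source): once a client
 * has obtained r_xattr_dir and created an xattr through it, a cached
 * pc4_xattr_exists of 0 on the base object may be stale; that is why
 * the cached value is used below only while r_xattr_dir is NULL.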
11090 */ 11091 if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid && 11092 rp->r_xattr_dir == NULL) { 11093 *valp = rp->r_pathconf.pc4_xattr_exists; 11094 return (0); 11095 } 11096 } else { /* OLD CODE */ 11097 if (ATTRCACHE4_VALID(vp)) { 11098 mutex_enter(&rp->r_statelock); 11099 if (rp->r_pathconf.pc4_cache_valid) { 11100 error = 0; 11101 switch (cmd) { 11102 case _PC_FILESIZEBITS: 11103 *valp = 11104 rp->r_pathconf.pc4_filesizebits; 11105 break; 11106 case _PC_LINK_MAX: 11107 *valp = 11108 rp->r_pathconf.pc4_link_max; 11109 break; 11110 case _PC_NAME_MAX: 11111 *valp = 11112 rp->r_pathconf.pc4_name_max; 11113 break; 11114 case _PC_CHOWN_RESTRICTED: 11115 *valp = 11116 rp->r_pathconf.pc4_chown_restricted; 11117 break; 11118 case _PC_NO_TRUNC: 11119 *valp = 11120 rp->r_pathconf.pc4_no_trunc; 11121 break; 11122 default: 11123 error = EINVAL; 11124 break; 11125 } 11126 mutex_exit(&rp->r_statelock); 11127 #ifdef DEBUG 11128 nfs4_pathconf_cache_hits++; 11129 #endif 11130 return (error); 11131 } 11132 mutex_exit(&rp->r_statelock); 11133 } 11134 } 11135 #ifdef DEBUG 11136 nfs4_pathconf_cache_misses++; 11137 #endif 11138 11139 t = gethrtime(); 11140 11141 error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr); 11142 11143 if (error) { 11144 mutex_enter(&rp->r_statelock); 11145 rp->r_pathconf.pc4_cache_valid = FALSE; 11146 rp->r_pathconf.pc4_xattr_valid = FALSE; 11147 mutex_exit(&rp->r_statelock); 11148 return (error); 11149 } 11150 11151 /* interpret the max filesize */ 11152 gar.n4g_ext_res->n4g_pc4.pc4_filesizebits = 11153 fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize); 11154 11155 /* Store the attributes we just received */ 11156 nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL); 11157 11158 switch (cmd) { 11159 case _PC_FILESIZEBITS: 11160 *valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits; 11161 break; 11162 case _PC_LINK_MAX: 11163 *valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max; 11164 break; 11165 case _PC_NAME_MAX: 11166 *valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max; 11167 break; 11168 case _PC_CHOWN_RESTRICTED: 11169 *valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted; 11170 break; 11171 case _PC_NO_TRUNC: 11172 *valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc; 11173 break; 11174 case _PC_XATTR_EXISTS: 11175 *valp = gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists; 11176 break; 11177 default: 11178 return (EINVAL); 11179 } 11180 11181 return (0); 11182 } 11183 11184 /* 11185 * Called by async thread to do synchronous pageio. Do the i/o, wait 11186 * for it to complete, and cleanup the page list when done. 11187 */ 11188 static int 11189 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11190 int flags, cred_t *cr) 11191 { 11192 int error; 11193 11194 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11195 11196 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11197 if (flags & B_READ) 11198 pvn_read_done(pp, (error ? B_ERROR : 0) | flags); 11199 else 11200 pvn_write_done(pp, (error ? 
B_ERROR : 0) | flags); 11201 return (error); 11202 } 11203 11204 static int 11205 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11206 int flags, cred_t *cr) 11207 { 11208 int error; 11209 rnode4_t *rp; 11210 11211 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 11212 return (EIO); 11213 11214 if (pp == NULL) 11215 return (EINVAL); 11216 11217 rp = VTOR4(vp); 11218 mutex_enter(&rp->r_statelock); 11219 rp->r_count++; 11220 mutex_exit(&rp->r_statelock); 11221 11222 if (flags & B_ASYNC) { 11223 error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr, 11224 nfs4_sync_pageio); 11225 } else 11226 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11227 mutex_enter(&rp->r_statelock); 11228 rp->r_count--; 11229 cv_broadcast(&rp->r_cv); 11230 mutex_exit(&rp->r_statelock); 11231 return (error); 11232 } 11233 11234 static void 11235 nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr) 11236 { 11237 int error; 11238 rnode4_t *rp; 11239 page_t *plist; 11240 page_t *pptr; 11241 offset3 offset; 11242 count3 len; 11243 k_sigset_t smask; 11244 11245 /* 11246 * We should get called with fl equal to either B_FREE or 11247 * B_INVAL. Any other value is illegal. 11248 * 11249 * The page that we are either supposed to free or destroy 11250 * should be exclusive locked and its io lock should not 11251 * be held. 11252 */ 11253 ASSERT(fl == B_FREE || fl == B_INVAL); 11254 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 11255 11256 rp = VTOR4(vp); 11257 11258 /* 11259 * If the page doesn't need to be committed or we shouldn't 11260 * even bother attempting to commit it, then just make sure 11261 * that the p_fsdata byte is clear and then either free or 11262 * destroy the page as appropriate. 11263 */ 11264 if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) { 11265 pp->p_fsdata = C_NOCOMMIT; 11266 if (fl == B_FREE) 11267 page_free(pp, dn); 11268 else 11269 page_destroy(pp, dn); 11270 return; 11271 } 11272 11273 /* 11274 * If there is a page invalidation operation going on and 11275 * this is one of the pages being destroyed, then just 11276 * clear the p_fsdata byte and then either free or destroy 11277 * the page as appropriate. 11278 */ 11279 mutex_enter(&rp->r_statelock); 11280 if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) { 11281 mutex_exit(&rp->r_statelock); 11282 pp->p_fsdata = C_NOCOMMIT; 11283 if (fl == B_FREE) 11284 page_free(pp, dn); 11285 else 11286 page_destroy(pp, dn); 11287 return; 11288 } 11289 11290 /* 11291 * If we are freeing this page and someone else is already 11292 * waiting to do a commit, then just unlock the page and 11293 * return. That other thread will take care of committing 11294 * this page. The page can be freed sometime after the 11295 * commit has finished. Otherwise, if the page is marked 11296 * as delay commit, then we may be getting called from 11297 * pvn_write_done, one page at a time. This could result 11298 * in one commit per page, so we end up doing lots of small 11299 * commits instead of fewer larger commits. This is bad; 11300 * we want to do as few commits as possible.
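 *
 * Editor's illustration (not in the original source): if
 * pvn_write_done() hands us 16 C_DELAYCOMMIT pages one at a time,
 * marking each C_COMMIT and unlocking it lets one later COMMIT
 * cover all 16 pages instead of 16 one-page commits going out.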
11301 */ 11302 if (fl == B_FREE) { 11303 if (rp->r_flags & R4COMMITWAIT) { 11304 page_unlock(pp); 11305 mutex_exit(&rp->r_statelock); 11306 return; 11307 } 11308 if (pp->p_fsdata == C_DELAYCOMMIT) { 11309 pp->p_fsdata = C_COMMIT; 11310 page_unlock(pp); 11311 mutex_exit(&rp->r_statelock); 11312 return; 11313 } 11314 } 11315 11316 /* 11317 * Check to see if there is a signal which would prevent an 11318 * attempt to commit the pages from being successful. If so, 11319 * then don't bother with all of the work to gather pages and 11320 * generate the unsuccessful RPC. Just return from here and 11321 * let the page be committed at some later time. 11322 */ 11323 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT); 11324 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) { 11325 sigunintr(&smask); 11326 page_unlock(pp); 11327 mutex_exit(&rp->r_statelock); 11328 return; 11329 } 11330 sigunintr(&smask); 11331 11332 /* 11333 * We are starting to need to commit pages, so let's try 11334 * to commit as many as possible at once to reduce the 11335 * overhead. 11336 * 11337 * Set the `commit inprogress' state bit. We must 11338 * first wait until any current one finishes. Then 11339 * we initialize the c_pages list with this page. 11340 */ 11341 while (rp->r_flags & R4COMMIT) { 11342 rp->r_flags |= R4COMMITWAIT; 11343 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 11344 rp->r_flags &= ~R4COMMITWAIT; 11345 } 11346 rp->r_flags |= R4COMMIT; 11347 mutex_exit(&rp->r_statelock); 11348 ASSERT(rp->r_commit.c_pages == NULL); 11349 rp->r_commit.c_pages = pp; 11350 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11351 rp->r_commit.c_commlen = PAGESIZE; 11352 11353 /* 11354 * Gather together all other pages which can be committed. 11355 * They will all be chained off r_commit.c_pages. 11356 */ 11357 nfs4_get_commit(vp); 11358 11359 /* 11360 * Clear the `commit inprogress' status and disconnect 11361 * the list of pages to be committed from the rnode. 11362 * At this same time, we also save the starting offset 11363 * and length of data to be committed on the server. 11364 */ 11365 plist = rp->r_commit.c_pages; 11366 rp->r_commit.c_pages = NULL; 11367 offset = rp->r_commit.c_commbase; 11368 len = rp->r_commit.c_commlen; 11369 mutex_enter(&rp->r_statelock); 11370 rp->r_flags &= ~R4COMMIT; 11371 cv_broadcast(&rp->r_commit.c_cv); 11372 mutex_exit(&rp->r_statelock); 11373 11374 if (curproc == proc_pageout || curproc == proc_fsflush || 11375 nfs_zone() != VTOMI4(vp)->mi_zone) { 11376 nfs4_async_commit(vp, plist, offset, len, 11377 cr, do_nfs4_async_commit); 11378 return; 11379 } 11380 11381 /* 11382 * Actually generate the over-the-wire COMMIT operation. 11383 */ 11384 error = nfs4_commit(vp, (offset4)offset, (count4)len, cr); 11385 11386 /* 11387 * If we got an error during the commit, just unlock all 11388 * of the pages. The pages will get retransmitted to the 11389 * server during a putpage operation. 11390 */ 11391 if (error) { 11392 while (plist != NULL) { 11393 pptr = plist; 11394 page_sub(&plist, pptr); 11395 page_unlock(pptr); 11396 } 11397 return; 11398 } 11399 11400 /* 11401 * We've tried as hard as we can to commit the data to stable 11402 * storage on the server. We just unlock the rest of the pages 11403 * and clear the commit required state. They will be put 11404 * onto the tail of the cachelist if they are no longer 11405 * mapped.
11406 */ 11407 while (plist != pp) { 11408 pptr = plist; 11409 page_sub(&plist, pptr); 11410 pptr->p_fsdata = C_NOCOMMIT; 11411 page_unlock(pptr); 11412 } 11413 11414 /* 11415 * It is possible that nfs4_commit didn't return an error, but 11416 * some other thread has modified the page we are going 11417 * to free/destroy. 11418 * In this case we need to rewrite the page. Do an explicit check 11419 * before attempting to free/destroy the page. If it is modified, it 11420 * needs to be rewritten, so unlock the page and return. 11421 */ 11422 if (hat_ismod(pp)) { 11423 pp->p_fsdata = C_NOCOMMIT; 11424 page_unlock(pp); 11425 return; 11426 } 11427 11428 /* 11429 * Now, as appropriate, either free or destroy the page 11430 * that we were called with. 11431 */ 11432 pp->p_fsdata = C_NOCOMMIT; 11433 if (fl == B_FREE) 11434 page_free(pp, dn); 11435 else 11436 page_destroy(pp, dn); 11437 } 11438 11439 /* 11440 * Commit requires that the current fh be the file written to. 11441 * The compound op structure is: 11442 * PUTFH(file), COMMIT 11443 */ 11444 static int 11445 nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr) 11446 { 11447 COMPOUND4args_clnt args; 11448 COMPOUND4res_clnt res; 11449 COMMIT4res *cm_res; 11450 nfs_argop4 argop[2]; 11451 nfs_resop4 *resop; 11452 int doqueue; 11453 mntinfo4_t *mi; 11454 rnode4_t *rp; 11455 cred_t *cred_otw = NULL; 11456 bool_t needrecov = FALSE; 11457 nfs4_recov_state_t recov_state; 11458 nfs4_open_stream_t *osp = NULL; 11459 bool_t first_time = TRUE; /* first time getting OTW cred */ 11460 bool_t last_time = FALSE; /* last time getting OTW cred */ 11461 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 11462 11463 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11464 11465 rp = VTOR4(vp); 11466 11467 mi = VTOMI4(vp); 11468 recov_state.rs_flags = 0; 11469 recov_state.rs_num_retry_despite_err = 0; 11470 get_commit_cred: 11471 /* 11472 * Releases the osp, if a valid open stream is provided. 11473 * Puts a hold on the cred_otw and the new osp (if found).
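 *
 * Editor's sketch of the credential retry (not in the original
 * source): the first pass uses the cred associated with the open
 * stream (or cr); if the COMMIT later fails with EACCES and
 * last_time is still FALSE, control jumps back to get_commit_cred
 * to try the next candidate cred, until nfs4_get_otw_cred_by_osp()
 * sets last_time.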
11474 */ 11475 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 11476 &first_time, &last_time); 11477 args.ctag = TAG_COMMIT; 11478 recov_retry: 11479 /* 11480 * Commit ops: putfh file; commit 11481 */ 11482 args.array_len = 2; 11483 args.array = argop; 11484 11485 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11486 &recov_state, NULL); 11487 if (e.error) { 11488 crfree(cred_otw); 11489 if (osp != NULL) 11490 open_stream_rele(osp, rp); 11491 return (e.error); 11492 } 11493 11494 /* putfh file */ 11495 argop[0].argop = OP_CPUTFH; 11496 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 11497 11498 /* commit */ 11499 argop[1].argop = OP_COMMIT; 11500 argop[1].nfs_argop4_u.opcommit.offset = offset; 11501 argop[1].nfs_argop4_u.opcommit.count = count; 11502 11503 doqueue = 1; 11504 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e); 11505 11506 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 11507 if (!needrecov && e.error) { 11508 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, 11509 needrecov); 11510 crfree(cred_otw); 11511 if (e.error == EACCES && last_time == FALSE) 11512 goto get_commit_cred; 11513 if (osp != NULL) 11514 open_stream_rele(osp, rp); 11515 return (e.error); 11516 } 11517 11518 if (needrecov) { 11519 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 11520 NULL, OP_COMMIT, NULL) == FALSE) { 11521 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11522 &recov_state, needrecov); 11523 if (!e.error) 11524 (void) xdr_free(xdr_COMPOUND4res_clnt, 11525 (caddr_t)&res); 11526 goto recov_retry; 11527 } 11528 if (e.error) { 11529 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11530 &recov_state, needrecov); 11531 crfree(cred_otw); 11532 if (osp != NULL) 11533 open_stream_rele(osp, rp); 11534 return (e.error); 11535 } 11536 /* fall through for res.status case */ 11537 } 11538 11539 if (res.status) { 11540 e.error = geterrno4(res.status); 11541 if (e.error == EACCES && last_time == FALSE) { 11542 crfree(cred_otw); 11543 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11544 &recov_state, needrecov); 11545 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11546 goto get_commit_cred; 11547 } 11548 /* 11549 * Can't do a nfs4_purge_stale_fh here because this 11550 * can cause a deadlock. nfs4_commit can 11551 * be called from nfs4_dispose which can be called 11552 * indirectly via pvn_vplist_dirty. nfs4_purge_stale_fh 11553 * can call back to pvn_vplist_dirty.
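 *
 * Editor's illustration of the cycle being avoided (not in the
 * original source):
 *	nfs4_dispose() -> nfs4_commit() -> nfs4_purge_stale_fh()
 *	    -> pvn_vplist_dirty() -> nfs4_dispose() -> ...
 * so the rnode is only flagged R4STALE below instead.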
11554 */ 11555 if (e.error == ESTALE) { 11556 mutex_enter(&rp->r_statelock); 11557 rp->r_flags |= R4STALE; 11558 if (!rp->r_error) 11559 rp->r_error = e.error; 11560 mutex_exit(&rp->r_statelock); 11561 PURGE_ATTRCACHE4(vp); 11562 } else { 11563 mutex_enter(&rp->r_statelock); 11564 if (!rp->r_error) 11565 rp->r_error = e.error; 11566 mutex_exit(&rp->r_statelock); 11567 } 11568 } else { 11569 ASSERT(rp->r_flags & R4HAVEVERF); 11570 resop = &res.array[1]; /* commit res */ 11571 cm_res = &resop->nfs_resop4_u.opcommit; 11572 mutex_enter(&rp->r_statelock); 11573 if (cm_res->writeverf == rp->r_writeverf) { 11574 mutex_exit(&rp->r_statelock); 11575 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11576 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11577 &recov_state, needrecov); 11578 crfree(cred_otw); 11579 if (osp != NULL) 11580 open_stream_rele(osp, rp); 11581 return (0); 11582 } 11583 nfs4_set_mod(vp); 11584 rp->r_writeverf = cm_res->writeverf; 11585 mutex_exit(&rp->r_statelock); 11586 e.error = NFS_VERF_MISMATCH; 11587 } 11588 11589 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11590 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov); 11591 crfree(cred_otw); 11592 if (osp != NULL) 11593 open_stream_rele(osp, rp); 11594 11595 return (e.error); 11596 } 11597 11598 static void 11599 nfs4_set_mod(vnode_t *vp) 11600 { 11601 page_t *pp; 11602 kmutex_t *vphm; 11603 rnode4_t *rp; 11604 11605 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11606 11607 /* make sure we're looking at the master vnode, not a shadow */ 11608 11609 rp = VTOR4(vp); 11610 if (IS_SHADOW(vp, rp)) 11611 vp = RTOV4(rp); 11612 11613 vphm = page_vnode_mutex(vp); 11614 mutex_enter(vphm); 11615 /* 11616 * If there are no pages associated with this vnode, then 11617 * just return. 11618 */ 11619 if ((pp = vp->v_pages) == NULL) { 11620 mutex_exit(vphm); 11621 return; 11622 } 11623 11624 do { 11625 if (pp->p_fsdata != C_NOCOMMIT) { 11626 hat_setmod(pp); 11627 pp->p_fsdata = C_NOCOMMIT; 11628 } 11629 } while ((pp = pp->p_vpnext) != vp->v_pages); 11630 mutex_exit(vphm); 11631 } 11632 11633 /* 11634 * This function is used to gather a page list of the pages which 11635 * can be committed on the server. 11636 * 11637 * The calling thread must have set R4COMMIT. This bit is used to 11638 * serialize access to the commit structure in the rnode. As long 11639 * as the thread has set R4COMMIT, then it can manipulate the commit 11640 * structure without requiring any other locks. 11641 * 11642 * When this function is called from nfs4_dispose() the page passed 11643 * into nfs4_dispose() will be SE_EXCL locked, and so this function 11644 * will skip it. This is not a problem since we initially add the 11645 * page to the r_commit page list. 11646 * 11647 */ 11648 static void 11649 nfs4_get_commit(vnode_t *vp) 11650 { 11651 rnode4_t *rp; 11652 page_t *pp; 11653 kmutex_t *vphm; 11654 11655 rp = VTOR4(vp); 11656 11657 ASSERT(rp->r_flags & R4COMMIT); 11658 11659 /* make sure we're looking at the master vnode, not a shadow */ 11660 11661 if (IS_SHADOW(vp, rp)) 11662 vp = RTOV4(rp); 11663 11664 vphm = page_vnode_mutex(vp); 11665 mutex_enter(vphm); 11666 11667 /* 11668 * If there are no pages associated with this vnode, then 11669 * just return. 11670 */ 11671 if ((pp = vp->v_pages) == NULL) { 11672 mutex_exit(vphm); 11673 return; 11674 } 11675 11676 /* 11677 * Step through all of the pages associated with this vnode 11678 * looking for pages which need to be committed. 
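 *
 * Editor's worked example of the range merge below (not in the
 * original source), assuming 4K pages. Starting from one page at
 * offset 8K (c_commbase = 8K, c_commlen = 4K):
 *	a page at 4K:	c_commbase = 4K, c_commlen = 8K - 4K + 4K = 8K
 *	a page at 20K:	c_commlen = 20K - 4K + 4K = 20K
 * leaving [c_commbase, c_commbase + c_commlen) covering every page
 * added so far.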
11679 */ 11680 do { 11681 /* 11682 * First short-cut everything (without the page_lock) 11683 * and see if this page does not need to be committed 11684 * or is modified; if so, then we'll just skip it. 11685 */ 11686 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) 11687 continue; 11688 11689 /* 11690 * Attempt to lock the page. If we can't, then 11691 * someone else is messing with it or we have been 11692 * called from nfs4_dispose and this is the page that 11693 * nfs4_dispose was called with; anyway, just skip it. 11694 */ 11695 if (!page_trylock(pp, SE_EXCL)) 11696 continue; 11697 11698 /* 11699 * Let's check again now that we have the page lock. 11700 */ 11701 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 11702 page_unlock(pp); 11703 continue; 11704 } 11705 11706 /* this had better not be a free page */ 11707 ASSERT(PP_ISFREE(pp) == 0); 11708 11709 /* 11710 * The page needs to be committed and we locked it. 11711 * Update the base and length parameters and add it 11712 * to the commit list. 11713 */ 11714 if (rp->r_commit.c_pages == NULL) { 11715 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11716 rp->r_commit.c_commlen = PAGESIZE; 11717 } else if (pp->p_offset < rp->r_commit.c_commbase) { 11718 rp->r_commit.c_commlen = rp->r_commit.c_commbase - 11719 (offset3)pp->p_offset + rp->r_commit.c_commlen; 11720 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11721 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen) 11722 <= pp->p_offset) { 11723 rp->r_commit.c_commlen = (offset3)pp->p_offset - 11724 rp->r_commit.c_commbase + PAGESIZE; 11725 } 11726 page_add(&rp->r_commit.c_pages, pp); 11727 } while ((pp = pp->p_vpnext) != vp->v_pages); 11728 11729 mutex_exit(vphm); 11730 } 11731 11732 /* 11733 * This routine is used to gather together a page list of the pages 11734 * which are to be committed on the server. This routine must not 11735 * be called if the calling thread holds any locked pages. 11736 * 11737 * The calling thread must have set R4COMMIT. This bit is used to 11738 * serialize access to the commit structure in the rnode. As long 11739 * as the thread has set R4COMMIT, then it can manipulate the commit 11740 * structure without requiring any other locks. 11741 */ 11742 static void 11743 nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len) 11744 { 11745 11746 rnode4_t *rp; 11747 page_t *pp; 11748 u_offset_t end; 11749 u_offset_t off; 11750 ASSERT(len != 0); 11751 rp = VTOR4(vp); 11752 ASSERT(rp->r_flags & R4COMMIT); 11753 11754 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11755 11756 /* make sure we're looking at the master vnode, not a shadow */ 11757 11758 if (IS_SHADOW(vp, rp)) 11759 vp = RTOV4(rp); 11760 11761 /* 11762 * If there are no pages associated with this vnode, then 11763 * just return. 11764 */ 11765 if ((pp = vp->v_pages) == NULL) 11766 return; 11767 /* 11768 * Calculate the ending offset. 11769 */ 11770 end = soff + len; 11771 for (off = soff; off < end; off += PAGESIZE) { 11772 /* 11773 * Lookup each page by vp, offset. 11774 */ 11775 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL) 11776 continue; 11777 /* 11778 * If this page does not need to be committed or is 11779 * modified, then just skip it. 11780 */ 11781 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 11782 page_unlock(pp); 11783 continue; 11784 } 11785 11786 ASSERT(PP_ISFREE(pp) == 0); 11787 /* 11788 * The page needs to be committed and we locked it. 11789 * Update the base and length parameters and add it 11790 * to the commit list.
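 *
 * Worked example for the range maintenance below (illustrative
 * numbers, assuming PAGESIZE is 8192): with c_commbase 0 and
 * c_commlen 8192, finding the page at offset 16384 yields
 * c_commlen 24576; the commit range stays a single span covering
 * any holes, even though only the pages actually found here are
 * linked onto c_pages.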
11791 */ 11792 if (rp->r_commit.c_pages == NULL) { 11793 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11794 rp->r_commit.c_commlen = PAGESIZE; 11795 } else { 11796 rp->r_commit.c_commlen = (offset3)pp->p_offset - 11797 rp->r_commit.c_commbase + PAGESIZE; 11798 } 11799 page_add(&rp->r_commit.c_pages, pp); 11800 } 11801 } 11802 11803 /* 11804 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap(). 11805 * Flushes and commits data to the server. 11806 */ 11807 static int 11808 nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr) 11809 { 11810 int error; 11811 verifier4 write_verf; 11812 rnode4_t *rp = VTOR4(vp); 11813 11814 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11815 11816 /* 11817 * Flush the data portion of the file and then commit any 11818 * portions which need to be committed. This may need to 11819 * be done twice if the server has changed state since 11820 * data was last written. The data will need to be 11821 * rewritten to the server and then a new commit done. 11822 * 11823 * In fact, this may need to be done several times if the 11824 * server is having problems and crashing while we are 11825 * attempting to do this. 11826 */ 11827 11828 top: 11829 /* 11830 * Do a flush based on the poff and plen arguments. This 11831 * will asynchronously write out any modified pages in the 11832 * range specified by (poff, plen). This starts all of the 11833 * i/o operations which will be waited for in the next 11834 * call to nfs4_putpage. 11835 */ 11836 11837 mutex_enter(&rp->r_statelock); 11838 write_verf = rp->r_writeverf; 11839 mutex_exit(&rp->r_statelock); 11840 11841 error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr); 11842 if (error == EAGAIN) 11843 error = 0; 11844 11845 /* 11846 * Do a flush based on the poff and plen arguments. This 11847 * will synchronously write out any modified pages in the 11848 * range specified by (poff, plen) and wait until all of 11849 * the asynchronous i/o's in that range are done as well. 11850 */ 11851 if (!error) 11852 error = nfs4_putpage(vp, poff, plen, 0, cr); 11853 11854 if (error) 11855 return (error); 11856 11857 mutex_enter(&rp->r_statelock); 11858 if (rp->r_writeverf != write_verf) { 11859 mutex_exit(&rp->r_statelock); 11860 goto top; 11861 } 11862 mutex_exit(&rp->r_statelock); 11863 11864 /* 11865 * Now commit any pages which might need to be committed. 11866 * If the error, NFS_VERF_MISMATCH, is returned, then 11867 * start over with the flush operation. 11868 */ 11869 error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT); 11870 11871 if (error == NFS_VERF_MISMATCH) 11872 goto top; 11873 11874 return (error); 11875 } 11876 11877 /* 11878 * nfs4_commit_vp() will wait for other pending commits and 11879 * will either commit the whole file or a range; plen dictates 11880 * whether we commit the whole file. A value of zero indicates the whole 11881 * file.
Called from nfs4_putpage_commit() or nfs4_sync_putapage(). 11882 */ 11883 static int 11884 nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, 11885 cred_t *cr, int wait_on_writes) 11886 { 11887 rnode4_t *rp; 11888 page_t *plist; 11889 offset3 offset; 11890 count3 len; 11891 11892 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11893 11894 rp = VTOR4(vp); 11895 11896 /* 11897 * Before we gather committable pages, make 11898 * sure there are no outstanding async writes. 11899 */ 11900 if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) { 11901 mutex_enter(&rp->r_statelock); 11902 while (rp->r_count > 0) { 11903 cv_wait(&rp->r_cv, &rp->r_statelock); 11904 } 11905 mutex_exit(&rp->r_statelock); 11906 } 11907 11908 /* 11909 * Set the `commit inprogress' state bit. We must 11910 * first wait until any current one finishes. 11911 */ 11912 mutex_enter(&rp->r_statelock); 11913 while (rp->r_flags & R4COMMIT) { 11914 rp->r_flags |= R4COMMITWAIT; 11915 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 11916 rp->r_flags &= ~R4COMMITWAIT; 11917 } 11918 rp->r_flags |= R4COMMIT; 11919 mutex_exit(&rp->r_statelock); 11920 11921 /* 11922 * Gather all of the pages which need to be 11923 * committed. 11924 */ 11925 if (plen == 0) 11926 nfs4_get_commit(vp); 11927 else 11928 nfs4_get_commit_range(vp, poff, plen); 11929 11930 /* 11931 * Clear the `commit inprogress' bit and disconnect the 11932 * page list which was gathered by nfs4_get_commit. 11933 */ 11934 plist = rp->r_commit.c_pages; 11935 rp->r_commit.c_pages = NULL; 11936 offset = rp->r_commit.c_commbase; 11937 len = rp->r_commit.c_commlen; 11938 mutex_enter(&rp->r_statelock); 11939 rp->r_flags &= ~R4COMMIT; 11940 cv_broadcast(&rp->r_commit.c_cv); 11941 mutex_exit(&rp->r_statelock); 11942 11943 /* 11944 * If any pages need to be committed, commit them and 11945 * then unlock them so that they can be freed some 11946 * time later. 11947 */ 11948 if (plist == NULL) 11949 return (0); 11950 11951 /* 11952 * No error occurred during the flush portion 11953 * of this operation, so now attempt to commit 11954 * the data to stable storage on the server. 11955 * 11956 * This will unlock all of the pages on the list. 11957 */ 11958 return (nfs4_sync_commit(vp, plist, offset, len, cr)); 11959 } 11960 11961 static int 11962 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 11963 cred_t *cr) 11964 { 11965 int error; 11966 page_t *pp; 11967 11968 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11969 11970 error = nfs4_commit(vp, (offset4)offset, (count3)count, cr); 11971 11972 /* 11973 * If we got an error, then just unlock all of the pages 11974 * on the list. 11975 */ 11976 if (error) { 11977 while (plist != NULL) { 11978 pp = plist; 11979 page_sub(&plist, pp); 11980 page_unlock(pp); 11981 } 11982 return (error); 11983 } 11984 /* 11985 * We've tried as hard as we can to commit the data to stable 11986 * storage on the server. We just unlock the pages and clear 11987 * the commit required state. They will get freed later.
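 *
 * Concretely, the loop below resets each page's p_fsdata to
 * C_NOCOMMIT and then page_unlock()s it, which is what makes the
 * page reclaimable.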
11988 */ 11989 while (plist != NULL) { 11990 pp = plist; 11991 page_sub(&plist, pp); 11992 pp->p_fsdata = C_NOCOMMIT; 11993 page_unlock(pp); 11994 } 11995 11996 return (error); 11997 } 11998 11999 static void 12000 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 12001 cred_t *cr) 12002 { 12003 12004 (void) nfs4_sync_commit(vp, plist, offset, count, cr); 12005 } 12006 12007 /*ARGSUSED*/ 12008 static int 12009 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr) 12010 { 12011 int error = 0; 12012 mntinfo4_t *mi; 12013 vattr_t va; 12014 vsecattr_t nfsace4_vsap; 12015 12016 mi = VTOMI4(vp); 12017 if (nfs_zone() != mi->mi_zone) 12018 return (EIO); 12019 if (mi->mi_flags & MI4_ACL) { 12020 /* if we have a delegation, return it */ 12021 if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE) 12022 (void) nfs4delegreturn(VTOR4(vp), 12023 NFS4_DR_REOPEN|NFS4_DR_PUSH); 12024 12025 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, 12026 NFS4_ACL_SET); 12027 if (error) /* EINVAL */ 12028 return (error); 12029 12030 if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) { 12031 /* 12032 * These are aclent_t type entries. 12033 */ 12034 error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap, 12035 vp->v_type == VDIR, FALSE); 12036 if (error) 12037 return (error); 12038 } else { 12039 /* 12040 * These are ace_t type entries. 12041 */ 12042 error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap, 12043 FALSE); 12044 if (error) 12045 return (error); 12046 } 12047 bzero(&va, sizeof (va)); 12048 error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap); 12049 vs_ace4_destroy(&nfsace4_vsap); 12050 return (error); 12051 } 12052 return (ENOSYS); 12053 } 12054 12055 static int 12056 nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr) 12057 { 12058 int error; 12059 mntinfo4_t *mi; 12060 nfs4_ga_res_t gar; 12061 rnode4_t *rp = VTOR4(vp); 12062 12063 mi = VTOMI4(vp); 12064 if (nfs_zone() != mi->mi_zone) 12065 return (EIO); 12066 12067 bzero(&gar, sizeof (gar)); 12068 gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask; 12069 12070 /* 12071 * vsecattr->vsa_mask holds the original acl request mask. 12072 * This is needed when determining what to return. 12073 * (See: nfs4_create_getsecattr_return()) 12074 */ 12075 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET); 12076 if (error) /* EINVAL */ 12077 return (error); 12078 12079 if (mi->mi_flags & MI4_ACL) { 12080 /* 12081 * Check if the data is cached and the cache is valid. If it 12082 * is we don't go over the wire. 12083 */ 12084 if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) { 12085 mutex_enter(&rp->r_statelock); 12086 if (rp->r_secattr != NULL) { 12087 error = nfs4_create_getsecattr_return( 12088 rp->r_secattr, vsecattr, rp->r_attr.va_uid, 12089 rp->r_attr.va_gid, 12090 vp->v_type == VDIR); 12091 if (!error) { /* error == 0 - Success! */ 12092 mutex_exit(&rp->r_statelock); 12093 return (error); 12094 } 12095 } 12096 mutex_exit(&rp->r_statelock); 12097 } 12098 12099 /* 12100 * The getattr otw call will always get both the acl, in 12101 * the form of a list of nfsace4's, and the number of acl 12102 * entries; independent of the value of gar.n4g_vsa.vsa_mask. 
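 *
 * If the response bitmap nevertheless carries no ACL, or the server
 * does not support ACLs at all, the code below falls back to
 * fs_fab_acl() to fabricate a vsecattr from the file's mode bits.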
12103 */ 12104 gar.n4g_va.va_mask = AT_ALL; 12105 error = nfs4_getattr_otw(vp, &gar, cr, 1); 12106 if (error) { 12107 vs_ace4_destroy(&gar.n4g_vsa); 12108 if (error == ENOTSUP || error == EOPNOTSUPP) 12109 error = fs_fab_acl(vp, vsecattr, flag, cr); 12110 return (error); 12111 } 12112 12113 if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) { 12114 /* 12115 * No error was returned, but according to the response 12116 * bitmap, neither was an acl. 12117 */ 12118 vs_ace4_destroy(&gar.n4g_vsa); 12119 error = fs_fab_acl(vp, vsecattr, flag, cr); 12120 return (error); 12121 } 12122 12123 /* 12124 * Update the cache with the ACL. 12125 */ 12126 nfs4_acl_fill_cache(rp, &gar.n4g_vsa); 12127 12128 error = nfs4_create_getsecattr_return(&gar.n4g_vsa, 12129 vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid, 12130 vp->v_type == VDIR); 12131 vs_ace4_destroy(&gar.n4g_vsa); 12132 if ((error) && (vsecattr->vsa_mask & 12133 (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) && 12134 (error != EACCES)) { 12135 error = fs_fab_acl(vp, vsecattr, flag, cr); 12136 } 12137 return (error); 12138 } 12139 error = fs_fab_acl(vp, vsecattr, flag, cr); 12140 return (error); 12141 } 12142 12143 /* 12144 * The function returns: 12145 * - 0 (zero) if the passed in "acl_mask" is a valid request. 12146 * - EINVAL if the passed in "acl_mask" is an invalid request. 12147 * 12148 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if: 12149 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12150 * 12151 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if: 12152 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12153 * - We have a count field set without the corresponding acl field set. (e.g. - 12154 * VSA_ACECNT is set, but VSA_ACE is not) 12155 */ 12156 static int 12157 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op) 12158 { 12159 /* Shortcut the masks that are always valid. */ 12160 if (acl_mask == (VSA_ACE | VSA_ACECNT)) 12161 return (0); 12162 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) 12163 return (0); 12164 12165 if (acl_mask & (VSA_ACE | VSA_ACECNT)) { 12166 /* 12167 * We can't have any VSA_ACL type stuff in the mask now. 12168 */ 12169 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12170 VSA_DFACLCNT)) 12171 return (EINVAL); 12172 12173 if (op == NFS4_ACL_SET) { 12174 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE)) 12175 return (EINVAL); 12176 } 12177 } 12178 12179 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) { 12180 /* 12181 * We can't have any VSA_ACE type stuff in the mask now. 12182 */ 12183 if (acl_mask & (VSA_ACE | VSA_ACECNT)) 12184 return (EINVAL); 12185 12186 if (op == NFS4_ACL_SET) { 12187 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL)) 12188 return (EINVAL); 12189 12190 if ((acl_mask & VSA_DFACLCNT) && 12191 !(acl_mask & VSA_DFACL)) 12192 return (EINVAL); 12193 } 12194 } 12195 return (0); 12196 } 12197 12198 /* 12199 * The theory behind creating the correct getsecattr return is simply this: 12200 * "Don't return anything that the caller is not expecting to have to free." 12201 */ 12202 static int 12203 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap, 12204 uid_t uid, gid_t gid, int isdir) 12205 { 12206 int error = 0; 12207 /* Save the mask since the translators modify it. 
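 * (vs_ace4_to_acet() and vs_ace4_to_aent() overwrite vsa_mask as a
 * side effect; it is restored from orig_mask before returning.)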
*/ 12208 uint_t orig_mask = vsap->vsa_mask; 12209 12210 if (orig_mask & (VSA_ACE | VSA_ACECNT)) { 12211 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, 12212 FALSE, ((orig_mask & VSA_ACE) ? FALSE : TRUE)); 12213 12214 if (error) 12215 return (error); 12216 12217 /* 12218 * If the caller only asked for the ace count (VSA_ACECNT), 12219 * don't give them the full acl (VSA_ACE); free it. 12220 */ 12221 if (!(orig_mask & VSA_ACE)) { 12222 if (vsap->vsa_aclentp != NULL) { 12223 kmem_free(vsap->vsa_aclentp, 12224 vsap->vsa_aclcnt * sizeof (ace_t)); 12225 vsap->vsa_aclentp = NULL; 12226 } 12227 } 12228 vsap->vsa_mask = orig_mask; 12229 12230 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12231 VSA_DFACLCNT)) { 12232 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid, 12233 isdir, FALSE, 12234 ((orig_mask & (VSA_ACL | VSA_DFACL)) ? FALSE : TRUE)); 12235 12236 if (error) 12237 return (error); 12238 12239 /* 12240 * If the caller only asked for the acl count (VSA_ACLCNT) 12241 * and/or the default acl count (VSA_DFACLCNT), don't give them 12242 * the acl (VSA_ACL) or default acl (VSA_DFACL); free it. 12243 */ 12244 if (!(orig_mask & VSA_ACL)) { 12245 if (vsap->vsa_aclentp != NULL) { 12246 kmem_free(vsap->vsa_aclentp, 12247 vsap->vsa_aclcnt * sizeof (aclent_t)); 12248 vsap->vsa_aclentp = NULL; 12249 } 12250 } 12251 12252 if (!(orig_mask & VSA_DFACL)) { 12253 if (vsap->vsa_dfaclentp != NULL) { 12254 kmem_free(vsap->vsa_dfaclentp, 12255 vsap->vsa_dfaclcnt * sizeof (aclent_t)); 12256 vsap->vsa_dfaclentp = NULL; 12257 } 12258 } 12259 vsap->vsa_mask = orig_mask; 12260 } 12261 return (0); 12262 } 12263 12264 static int 12265 nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr) 12266 { 12267 int error; 12268 12269 if (nfs_zone() != VTOMI4(vp)->mi_zone) 12270 return (EIO); 12271 /* 12272 * Check for a valid cmd parameter. 12273 */ 12274 if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS) 12275 return (EINVAL); 12276 12277 /* 12278 * Check access permissions. 12279 */ 12280 if ((cmd & F_SHARE) && 12281 (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) || 12282 (shr->s_access == F_WRACC && (flag & FWRITE) == 0))) 12283 return (EBADF); 12284 12285 /* 12286 * If the filesystem is mounted using local locking, pass the 12287 * request off to the local share code. 12288 */ 12289 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) 12290 return (fs_shrlock(vp, cmd, shr, flag, cr)); 12291 12292 switch (cmd) { 12293 case F_SHARE: 12294 case F_UNSHARE: 12295 /* 12296 * This will be properly implemented later, 12297 * see RFE 4823948.
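 * Until that work is done, share reservations are not sent over
 * the wire; the EAGAIN below tells the caller as much.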
12298 */ 12299 error = EAGAIN; 12300 break; 12301 12302 case F_HASREMOTELOCKS: 12303 /* 12304 * NFS client can't store remote locks itself. 12305 */ 12306 shr->s_access = 0; 12307 error = 0; 12308 break; 12309 12310 default: 12311 error = EINVAL; 12312 break; 12313 } 12314 12315 return (error); 12316 } 12317 12318 /* 12319 * Common code called by directory ops to update the attrcache. 12320 */ 12321 static int 12322 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp, 12323 hrtime_t t, vnode_t *vp, cred_t *cr) 12324 { 12325 int error = 0; 12326 12327 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12328 12329 if (status != NFS4_OK) { 12330 /* getattr not done or failed */ 12331 PURGE_ATTRCACHE4(vp); 12332 return (error); 12333 } 12334 12335 if (garp) { 12336 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL); 12337 } else { 12338 PURGE_ATTRCACHE4(vp); 12339 } 12340 return (error); 12341 } 12342 12343 /* 12344 * Update directory caches for directory modification ops (link, rename, etc.) 12345 * When dinfo is NULL, manage dircaches in the old way. 12346 */ 12347 static void 12348 nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm, 12349 dirattr_info_t *dinfo) 12350 { 12351 rnode4_t *drp = VTOR4(dvp); 12352 12353 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 12354 12355 /* Purge rddir cache for dir since it changed */ 12356 if (drp->r_dir != NULL) 12357 nfs4_purge_rddir_cache(dvp); 12358 12359 /* 12360 * If caller provided dinfo, then use it to manage dir caches. 12361 */ 12362 if (dinfo != NULL) { 12363 if (vp != NULL) { 12364 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12365 if (!VTOR4(vp)->created_v4) { 12366 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12367 dnlc_update(dvp, nm, vp); 12368 } else { 12369 /* 12370 * XXX don't update if the created_v4 flag is 12371 * set 12372 */ 12373 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12374 NFS4_DEBUG(nfs4_client_state_debug, 12375 (CE_NOTE, "nfs4_update_dircaches: " 12376 "don't update dnlc: created_v4 flag")); 12377 } 12378 } 12379 12380 nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call, 12381 dinfo->di_cred, FALSE, cinfo); 12382 12383 return; 12384 } 12385 12386 /* 12387 * The caller didn't provide dinfo, so check change_info4 to update the DNLC. 12388 * Since the caller modified the dir but didn't receive post-dirmod-op dir 12389 * attrs, the dir's attrs must be purged. 12390 * 12391 * XXX this check and dnlc update/purge should really be atomic, 12392 * XXX but can't use rnode statelock because it'll deadlock in 12393 * XXX dnlc_purge_vp, however, the risk is minimal even if a race 12394 * XXX does occur. 12395 * 12396 * XXX We also may want to check that atomic is true in the 12397 * XXX change_info struct. If it is not, the change_info may 12398 * XXX reflect changes by more than one client, which means that 12399 * XXX our cache may not be valid.
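 * XXX
 * XXX Such a check might look like this sketch ('atomic' is the
 * XXX change_info4 field defined by RFC 3530; the guard is not
 * XXX implemented here):
 * XXX
 * XXX	if (cinfo->atomic && drp->r_change == cinfo->before)
 * XXX		... the before/after change pair can be trusted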
12400 */ 12401 PURGE_ATTRCACHE4(dvp); 12402 if (drp->r_change == cinfo->before) { 12403 /* no changes took place in the directory prior to our link */ 12404 if (vp != NULL) { 12405 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12406 if (!VTOR4(vp)->created_v4) { 12407 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12408 dnlc_update(dvp, nm, vp); 12409 } else { 12410 /* 12411 * XXX don't update if the created_v4 flag 12412 * is set 12413 */ 12414 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12415 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 12416 "nfs4_update_dircaches: don't" 12417 " update dnlc: created_v4 flag")); 12418 } 12419 } 12420 } else { 12421 /* Another client modified directory - purge its dnlc cache */ 12422 dnlc_purge_vp(dvp); 12423 } 12424 } 12425 12426 /* 12427 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a 12428 * file. 12429 * 12430 * The 'reopening_file' boolean should be set to TRUE if we are reopening this 12431 * file (i.e., client recovery) and otherwise set to FALSE. 12432 * 12433 * 'nfs4_start/end_op' should have been called by the proper (i.e., not 12434 * recovery-initiated) calling functions. 12435 * 12436 * 'resend' is set to TRUE if this is an OPEN_CONFIRM issued as a result 12437 * of resending a 'lost' open request. 12438 * 12439 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken 12440 * server that hands out BAD_SEQID on open confirm. 12441 * 12442 * Errors are returned via the nfs4_error_t parameter. 12443 */ 12444 void 12445 nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr, 12446 bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop, 12447 bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp) 12448 { 12449 COMPOUND4args_clnt args; 12450 COMPOUND4res_clnt res; 12451 nfs_argop4 argop[2]; 12452 nfs_resop4 *resop; 12453 int doqueue = 1; 12454 mntinfo4_t *mi; 12455 OPEN_CONFIRM4args *open_confirm_args; 12456 int needrecov; 12457 12458 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12459 #if DEBUG 12460 mutex_enter(&oop->oo_lock); 12461 ASSERT(oop->oo_seqid_inuse); 12462 mutex_exit(&oop->oo_lock); 12463 #endif 12464 12465 recov_retry_confirm: 12466 nfs4_error_zinit(ep); 12467 *retry_open = FALSE; 12468 12469 if (resend) 12470 args.ctag = TAG_OPEN_CONFIRM_LOST; 12471 else 12472 args.ctag = TAG_OPEN_CONFIRM; 12473 12474 args.array_len = 2; 12475 args.array = argop; 12476 12477 /* putfh target fh */ 12478 argop[0].argop = OP_CPUTFH; 12479 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 12480 12481 argop[1].argop = OP_OPEN_CONFIRM; 12482 open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm; 12483 12484 (*seqid) += 1; 12485 open_confirm_args->seqid = *seqid; 12486 open_confirm_args->open_stateid = *stateid; 12487 12488 mi = VTOMI4(vp); 12489 12490 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep); 12491 12492 if (!ep->error && nfs4_need_to_bump_seqid(&res)) { 12493 nfs4_set_open_seqid((*seqid), oop, args.ctag); 12494 } 12495 12496 needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp); 12497 if (!needrecov && ep->error) 12498 return; 12499 12500 if (needrecov) { 12501 bool_t abort = FALSE; 12502 12503 if (reopening_file == FALSE) { 12504 nfs4_bseqid_entry_t *bsep = NULL; 12505 12506 if (!ep->error && res.status == NFS4ERR_BAD_SEQID) 12507 bsep = nfs4_create_bseqid_entry(oop, NULL, 12508 vp, 0, args.ctag, 12509 open_confirm_args->seqid); 12510 12511 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, 12512 NULL, NULL, NULL, OP_OPEN_CONFIRM, bsep); 12513 if (bsep) { 12514 kmem_free(bsep, sizeof (*bsep));
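/*
 * A broken server can hand out BAD_SEQID on OPEN_CONFIRM
 * indefinitely; stop retrying once the caller's retry budget,
 * if one was supplied, is exhausted.
 */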
12515 if (num_bseqid_retryp && 12516 --(*num_bseqid_retryp) == 0) 12517 abort = TRUE; 12518 } 12519 } 12520 if ((ep->error == ETIMEDOUT || 12521 res.status == NFS4ERR_RESOURCE) && 12522 abort == FALSE && resend == FALSE) { 12523 if (!ep->error) 12524 (void) xdr_free(xdr_COMPOUND4res_clnt, 12525 (caddr_t)&res); 12526 12527 delay(SEC_TO_TICK(confirm_retry_sec)); 12528 goto recov_retry_confirm; 12529 } 12530 /* State may have changed so retry the entire OPEN op */ 12531 if (abort == FALSE) 12532 *retry_open = TRUE; 12533 else 12534 *retry_open = FALSE; 12535 if (!ep->error) 12536 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12537 return; 12538 } 12539 12540 if (res.status) { 12541 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12542 return; 12543 } 12544 12545 resop = &res.array[1]; /* open confirm res */ 12546 bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid, 12547 stateid, sizeof (*stateid)); 12548 12549 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12550 } 12551 12552 /* 12553 * Return the credentials associated with a client state object. The 12554 * caller is responsible for freeing the credentials. 12555 */ 12556 12557 static cred_t * 12558 state_to_cred(nfs4_open_stream_t *osp) 12559 { 12560 cred_t *cr; 12561 12562 /* 12563 * It's ok to not lock the open stream and open owner to get 12564 * the oo_cred since this is only written once (upon creation) 12565 * and will not change. 12566 */ 12567 cr = osp->os_open_owner->oo_cred; 12568 crhold(cr); 12569 12570 return (cr); 12571 } 12572 12573 /* 12574 * nfs4_find_sysid 12575 * 12576 * Find the sysid for the knetconfig associated with the given mi. 12577 */ 12578 static struct lm_sysid * 12579 nfs4_find_sysid(mntinfo4_t *mi) 12580 { 12581 ASSERT(nfs_zone() == mi->mi_zone); 12582 12583 /* 12584 * Switch from RDMA knconf to original mount knconf 12585 */ 12586 return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr, 12587 mi->mi_curr_serv->sv_hostname, NULL)); 12588 } 12589 12590 #ifdef DEBUG 12591 /* 12592 * Return a string version of the call type for easy reading. 12593 */ 12594 static char * 12595 nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype) 12596 { 12597 switch (ctype) { 12598 case NFS4_LCK_CTYPE_NORM: 12599 return ("NORMAL"); 12600 case NFS4_LCK_CTYPE_RECLAIM: 12601 return ("RECLAIM"); 12602 case NFS4_LCK_CTYPE_RESEND: 12603 return ("RESEND"); 12604 case NFS4_LCK_CTYPE_REINSTATE: 12605 return ("REINSTATE"); 12606 default: 12607 cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal " 12608 "type %d", ctype); 12609 return (""); 12610 } 12611 } 12612 #endif 12613 12614 /* 12615 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type 12616 * Unlock requests don't have an over-the-wire locktype, so we just return 12617 * something non-threatening. 12618 */ 12619 12620 static nfs_lock_type4 12621 flk_to_locktype(int cmd, int l_type) 12622 { 12623 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK); 12624 12625 switch (l_type) { 12626 case F_UNLCK: 12627 return (READ_LT); 12628 case F_RDLCK: 12629 if (cmd == F_SETLK) 12630 return (READ_LT); 12631 else 12632 return (READW_LT); 12633 case F_WRLCK: 12634 if (cmd == F_SETLK) 12635 return (WRITE_LT); 12636 else 12637 return (WRITEW_LT); 12638 } 12639 panic("flk_to_locktype"); 12640 /*NOTREACHED*/ 12641 } 12642 12643 /* 12644 * Do some preliminary checks for nfs4frlock. 
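 * Currently these are: verify that the file was opened with a mode
 * compatible with the requested lock type, and convert the caller's
 * flock64 offset via convoff().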
12645 */ 12646 static int 12647 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp, 12648 u_offset_t offset) 12649 { 12650 int error = 0; 12651 12652 /* 12653 * If we are setting a lock, check that the file is opened 12654 * with the correct mode. 12655 */ 12656 if (cmd == F_SETLK || cmd == F_SETLKW) { 12657 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) || 12658 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) { 12659 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12660 "nfs4frlock_validate_args: file was opened with " 12661 "incorrect mode")); 12662 return (EBADF); 12663 } 12664 } 12665 12666 /* Convert the offset. It may need to be restored before returning. */ 12667 if (error = convoff(vp, flk, 0, offset)) { 12668 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12669 "nfs4frlock_validate_args: convoff => error= %d\n", 12670 error)); 12671 return (error); 12672 } 12673 12674 return (error); 12675 } 12676 12677 /* 12678 * Set the flock64's lm_sysid for nfs4frlock. 12679 */ 12680 static int 12681 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk) 12682 { 12683 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12684 12685 /* Find the lm_sysid */ 12686 *lspp = nfs4_find_sysid(VTOMI4(vp)); 12687 12688 if (*lspp == NULL) { 12689 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12690 "nfs4frlock_get_sysid: no sysid, return ENOLCK")); 12691 return (ENOLCK); 12692 } 12693 12694 flk->l_sysid = lm_sysidt(*lspp); 12695 12696 return (0); 12697 } 12698 12699 /* 12700 * Do the remaining preliminary setup for nfs4frlock. 12701 */ 12702 static void 12703 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep, 12704 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr, 12705 cred_t **cred_otw) 12706 { 12707 /* 12708 * set tick_delay to the base delay time. 12709 * (NFS4_BASE_WAIT_TIME is in secs) 12710 */ 12711 12712 *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000); 12713 12714 /* 12715 * If lock is relative to EOF, we need the newest length of the 12716 * file. Therefore invalidate the ATTR_CACHE. 12717 */ 12718 12719 *whencep = flk->l_whence; 12720 12721 if (*whencep == 2) /* SEEK_END */ 12722 PURGE_ATTRCACHE4(vp); 12723 12724 recov_statep->rs_flags = 0; 12725 recov_statep->rs_num_retry_despite_err = 0; 12726 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL); 12727 } 12728 12729 /* 12730 * Initialize and allocate the data structures necessary for 12731 * the nfs4frlock call. 12732 * Allocates argsp's op array, frees up the saved_rqstpp if there is one. 
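 * The compound used by nfs4frlock is always two operations, a PUTFH
 * followed by one of LOCK, LOCKT, or LOCKU, which is why num_ops
 * below is fixed at 2.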
12733 */ 12734 static void 12735 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp, 12736 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd, 12737 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp, 12738 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp) 12739 { 12740 int argoplist_size; 12741 int num_ops = 2; 12742 12743 *retry = FALSE; 12744 *did_start_fop = FALSE; 12745 *skip_get_err = FALSE; 12746 lost_rqstp->lr_op = 0; 12747 argoplist_size = num_ops * sizeof (nfs_argop4); 12748 /* fill array with zero */ 12749 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP); 12750 12751 *argspp = argsp; 12752 *respp = NULL; 12753 12754 argsp->array_len = num_ops; 12755 argsp->array = *argopp; 12756 12757 /* initialize in case of error; will get real value down below */ 12758 argsp->ctag = TAG_NONE; 12759 12760 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) 12761 *op_hintp = OH_LOCKU; 12762 else 12763 *op_hintp = OH_OTHER; 12764 } 12765 12766 /* 12767 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign 12768 * the proper nfs4_server_t for this instance of nfs4frlock. 12769 * Returns 0 (success) or an errno value. 12770 */ 12771 static int 12772 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp, 12773 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep, 12774 bool_t *did_start_fop, bool_t *startrecovp) 12775 { 12776 int error = 0; 12777 rnode4_t *rp; 12778 12779 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12780 12781 if (ctype == NFS4_LCK_CTYPE_NORM) { 12782 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint, 12783 recov_statep, startrecovp); 12784 if (error) 12785 return (error); 12786 *did_start_fop = TRUE; 12787 } else { 12788 *did_start_fop = FALSE; 12789 *startrecovp = FALSE; 12790 } 12791 12792 if (!error) { 12793 rp = VTOR4(vp); 12794 12795 /* If the file failed recovery, just quit. */ 12796 mutex_enter(&rp->r_statelock); 12797 if (rp->r_flags & R4RECOVERR) { 12798 error = EIO; 12799 } 12800 mutex_exit(&rp->r_statelock); 12801 } 12802 12803 return (error); 12804 } 12805 12806 /* 12807 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A 12808 * resend nfs4frlock call is initiated by the recovery framework. 12809 * Acquires the lop and oop seqid synchronization. 12810 */ 12811 static void 12812 nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp, 12813 COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp, 12814 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 12815 LOCK4args **lock_argsp, LOCKU4args **locku_argsp) 12816 { 12817 mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp); 12818 int error; 12819 12820 NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug), 12821 (CE_NOTE, 12822 "nfs4frlock_setup_resend_lock_args: have lost lock to resend")); 12823 ASSERT(resend_rqstp != NULL); 12824 ASSERT(resend_rqstp->lr_op == OP_LOCK || 12825 resend_rqstp->lr_op == OP_LOCKU); 12826 12827 *oopp = resend_rqstp->lr_oop; 12828 if (resend_rqstp->lr_oop) { 12829 open_owner_hold(resend_rqstp->lr_oop); 12830 error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi); 12831 ASSERT(error == 0); /* recov thread always succeeds */ 12832 } 12833 12834 /* Must resend this lost lock/locku request. 
*/ 12835 ASSERT(resend_rqstp->lr_lop != NULL); 12836 *lopp = resend_rqstp->lr_lop; 12837 lock_owner_hold(resend_rqstp->lr_lop); 12838 error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi); 12839 ASSERT(error == 0); /* recov thread always succeeds */ 12840 12841 *ospp = resend_rqstp->lr_osp; 12842 if (*ospp) 12843 open_stream_hold(resend_rqstp->lr_osp); 12844 12845 if (resend_rqstp->lr_op == OP_LOCK) { 12846 LOCK4args *lock_args; 12847 12848 argop->argop = OP_LOCK; 12849 *lock_argsp = lock_args = &argop->nfs_argop4_u.oplock; 12850 lock_args->locktype = resend_rqstp->lr_locktype; 12851 lock_args->reclaim = 12852 (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM); 12853 lock_args->offset = resend_rqstp->lr_flk->l_start; 12854 lock_args->length = resend_rqstp->lr_flk->l_len; 12855 if (lock_args->length == 0) 12856 lock_args->length = ~lock_args->length; 12857 nfs4_setup_lock_args(*lopp, *oopp, *ospp, 12858 mi2clientid(mi), &lock_args->locker); 12859 12860 switch (resend_rqstp->lr_ctype) { 12861 case NFS4_LCK_CTYPE_RESEND: 12862 argsp->ctag = TAG_LOCK_RESEND; 12863 break; 12864 case NFS4_LCK_CTYPE_REINSTATE: 12865 argsp->ctag = TAG_LOCK_REINSTATE; 12866 break; 12867 case NFS4_LCK_CTYPE_RECLAIM: 12868 argsp->ctag = TAG_LOCK_RECLAIM; 12869 break; 12870 default: 12871 argsp->ctag = TAG_LOCK_UNKNOWN; 12872 break; 12873 } 12874 } else { 12875 LOCKU4args *locku_args; 12876 nfs4_lock_owner_t *lop = resend_rqstp->lr_lop; 12877 12878 argop->argop = OP_LOCKU; 12879 *locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku; 12880 locku_args->locktype = READ_LT; 12881 locku_args->seqid = lop->lock_seqid + 1; 12882 mutex_enter(&lop->lo_lock); 12883 locku_args->lock_stateid = lop->lock_stateid; 12884 mutex_exit(&lop->lo_lock); 12885 locku_args->offset = resend_rqstp->lr_flk->l_start; 12886 locku_args->length = resend_rqstp->lr_flk->l_len; 12887 if (locku_args->length == 0) 12888 locku_args->length = ~locku_args->length; 12889 12890 switch (resend_rqstp->lr_ctype) { 12891 case NFS4_LCK_CTYPE_RESEND: 12892 argsp->ctag = TAG_LOCKU_RESEND; 12893 break; 12894 case NFS4_LCK_CTYPE_REINSTATE: 12895 argsp->ctag = TAG_LOCKU_REINSTATE; 12896 break; 12897 default: 12898 argsp->ctag = TAG_LOCK_UNKNOWN; 12899 break; 12900 } 12901 } 12902 } 12903 12904 /* 12905 * Setup the LOCKT4 arguments. 12906 */ 12907 static void 12908 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 12909 LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk, 12910 rnode4_t *rp) 12911 { 12912 LOCKT4args *lockt_args; 12913 12914 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone); 12915 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 12916 argop->argop = OP_LOCKT; 12917 argsp->ctag = TAG_LOCKT; 12918 lockt_args = &argop->nfs_argop4_u.oplockt; 12919 12920 /* 12921 * The locktype will be READ_LT unless it's 12922 * a write lock. We do this because the Solaris 12923 * system call allows the combination of 12924 * F_UNLCK and F_GETLK* and so in that case the 12925 * unlock is mapped to a read. 12926 */ 12927 if (flk->l_type == F_WRLCK) 12928 lockt_args->locktype = WRITE_LT; 12929 else 12930 lockt_args->locktype = READ_LT; 12931 12932 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp))); 12933 /* set the lock owner4 args */ 12934 nfs4_setlockowner_args(&lockt_args->owner, rp, 12935 ctype == NFS4_LCK_CTYPE_NORM ? 
curproc->p_pidp->pid_id : 12936 flk->l_pid); 12937 lockt_args->offset = flk->l_start; 12938 lockt_args->length = flk->l_len; 12939 if (flk->l_len == 0) 12940 lockt_args->length = ~lockt_args->length; 12941 12942 *lockt_argsp = lockt_args; 12943 } 12944 12945 /* 12946 * If the client is holding a delegation, and the open stream to be used 12947 * with this lock request is a delegation open stream, then re-open the stream. 12948 * Sets the nfs4_error_t to all zeros unless the open stream has already 12949 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY 12950 * means the caller should retry (like a recovery retry). 12951 */ 12952 static void 12953 nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt) 12954 { 12955 open_delegation_type4 dt; 12956 bool_t reopen_needed, force; 12957 nfs4_open_stream_t *osp; 12958 open_claim_type4 oclaim; 12959 rnode4_t *rp = VTOR4(vp); 12960 mntinfo4_t *mi = VTOMI4(vp); 12961 12962 ASSERT(nfs_zone() == mi->mi_zone); 12963 12964 nfs4_error_zinit(ep); 12965 12966 mutex_enter(&rp->r_statev4_lock); 12967 dt = rp->r_deleg_type; 12968 mutex_exit(&rp->r_statev4_lock); 12969 12970 if (dt != OPEN_DELEGATE_NONE) { 12971 nfs4_open_owner_t *oop; 12972 12973 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 12974 if (!oop) { 12975 ep->stat = NFS4ERR_IO; 12976 return; 12977 } 12978 /* returns with 'os_sync_lock' held */ 12979 osp = find_open_stream(oop, rp); 12980 if (!osp) { 12981 open_owner_rele(oop); 12982 ep->stat = NFS4ERR_IO; 12983 return; 12984 } 12985 12986 if (osp->os_failed_reopen) { 12987 NFS4_DEBUG((nfs4_open_stream_debug || 12988 nfs4_client_lock_debug), (CE_NOTE, 12989 "nfs4frlock_check_deleg: os_failed_reopen set " 12990 "for osp %p, cr %p, rp %s", (void *)osp, 12991 (void *)cr, rnode4info(rp))); 12992 mutex_exit(&osp->os_sync_lock); 12993 open_stream_rele(osp, rp); 12994 open_owner_rele(oop); 12995 ep->stat = NFS4ERR_IO; 12996 return; 12997 } 12998 12999 /* 13000 * Determine whether a reopen is needed. If this 13001 * is a delegation open stream, then send the open 13002 * to the server to give visibility to the open owner. 13003 * Even if it isn't a delegation open stream, we need 13004 * to check if the previous open CLAIM_DELEGATE_CUR 13005 * was sufficient. 13006 */ 13007 13008 reopen_needed = osp->os_delegation || 13009 ((lt == F_RDLCK && 13010 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) || 13011 (lt == F_WRLCK && 13012 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE))); 13013 13014 mutex_exit(&osp->os_sync_lock); 13015 open_owner_rele(oop); 13016 13017 if (reopen_needed) { 13018 /* 13019 * Always use CLAIM_PREVIOUS after server reboot. 13020 * The server will reject CLAIM_DELEGATE_CUR if 13021 * it is used during the grace period. 13022 */ 13023 mutex_enter(&mi->mi_lock); 13024 if (mi->mi_recovflags & MI4R_SRV_REBOOT) { 13025 oclaim = CLAIM_PREVIOUS; 13026 force = TRUE; 13027 } else { 13028 oclaim = CLAIM_DELEGATE_CUR; 13029 force = FALSE; 13030 } 13031 mutex_exit(&mi->mi_lock); 13032 13033 nfs4_reopen(vp, osp, ep, oclaim, force, FALSE); 13034 if (ep->error == EAGAIN) { 13035 nfs4_error_zinit(ep); 13036 ep->stat = NFS4ERR_DELAY; 13037 } 13038 } 13039 open_stream_rele(osp, rp); 13040 osp = NULL; 13041 } 13042 } 13043 13044 /* 13045 * Setup the LOCKU4 arguments. 13046 * Returns errors via the nfs4_error_t. 13047 * NFS4_OK no problems. *go_otwp is TRUE if call should go 13048 * over-the-wire. The caller must release the 13049 * reference on *lopp. 
13050 * NFS4ERR_DELAY caller should retry (like recovery retry) 13051 * (other) unrecoverable error. 13052 */ 13053 static void 13054 nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 13055 LOCKU4args **locku_argsp, flock64_t *flk, 13056 nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp, 13057 vnode_t *vp, int flag, u_offset_t offset, cred_t *cr, 13058 bool_t *skip_get_err, bool_t *go_otwp) 13059 { 13060 nfs4_lock_owner_t *lop = NULL; 13061 LOCKU4args *locku_args; 13062 pid_t pid; 13063 bool_t is_spec = FALSE; 13064 rnode4_t *rp = VTOR4(vp); 13065 13066 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13067 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13068 13069 nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK); 13070 if (ep->error || ep->stat) 13071 return; 13072 13073 argop->argop = OP_LOCKU; 13074 if (ctype == NFS4_LCK_CTYPE_REINSTATE) 13075 argsp->ctag = TAG_LOCKU_REINSTATE; 13076 else 13077 argsp->ctag = TAG_LOCKU; 13078 locku_args = &argop->nfs_argop4_u.oplocku; 13079 *locku_argsp = locku_args; 13080 13081 /* 13082 * XXX what should locku_args->locktype be? 13083 * setting to ALWAYS be READ_LT so at least 13084 * it is a valid locktype. 13085 */ 13086 13087 locku_args->locktype = READ_LT; 13088 13089 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id : 13090 flk->l_pid; 13091 13092 /* 13093 * Get the lock owner stateid. If no lock owner 13094 * exists, return success. 13095 */ 13096 lop = find_lock_owner(rp, pid, LOWN_ANY); 13097 *lopp = lop; 13098 if (lop && CLNT_ISSPECIAL(&lop->lock_stateid)) 13099 is_spec = TRUE; 13100 if (!lop || is_spec) { 13101 /* 13102 * No lock owner so no locks to unlock. 13103 * Return success. If there was a failed 13104 * reclaim earlier, the lock might still be 13105 * registered with the local locking code, 13106 * so notify it of the unlock. 13107 * 13108 * If the lockowner is using a special stateid, 13109 * then the original lock request (that created 13110 * this lockowner) was never successful, so we 13111 * have no lock to undo OTW. 13112 */ 13113 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13114 "nfs4frlock_setup_locku_args: LOCKU: no lock owner " 13115 "(%ld) so return success", (long)pid)); 13116 13117 if (ctype == NFS4_LCK_CTYPE_NORM) 13118 flk->l_pid = curproc->p_pid; 13119 nfs4_register_lock_locally(vp, flk, flag, offset); 13120 /* 13121 * Release our hold and NULL out so final_cleanup 13122 * doesn't try to end a lock seqid sync we 13123 * never started. 13124 */ 13125 if (is_spec) { 13126 lock_owner_rele(lop); 13127 *lopp = NULL; 13128 } 13129 *skip_get_err = TRUE; 13130 *go_otwp = FALSE; 13131 return; 13132 } 13133 13134 ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp)); 13135 if (ep->error == EAGAIN) { 13136 lock_owner_rele(lop); 13137 *lopp = NULL; 13138 return; 13139 } 13140 13141 mutex_enter(&lop->lo_lock); 13142 locku_args->lock_stateid = lop->lock_stateid; 13143 mutex_exit(&lop->lo_lock); 13144 locku_args->seqid = lop->lock_seqid + 1; 13145 13146 /* leave the ref count on lop, rele after RPC call */ 13147 13148 locku_args->offset = flk->l_start; 13149 locku_args->length = flk->l_len; 13150 if (flk->l_len == 0) 13151 locku_args->length = ~locku_args->length; 13152 13153 *go_otwp = TRUE; 13154 } 13155 13156 /* 13157 * Setup the LOCK4 arguments. 13158 * 13159 * Returns errors via the nfs4_error_t. 
13160 * NFS4_OK no problems 13161 * NFS4ERR_DELAY caller should retry (like recovery retry) 13162 * (other) unrecoverable error 13163 */ 13164 static void 13165 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp, 13166 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13167 nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp, 13168 flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep) 13169 { 13170 LOCK4args *lock_args; 13171 nfs4_open_owner_t *oop = NULL; 13172 nfs4_open_stream_t *osp = NULL; 13173 nfs4_lock_owner_t *lop = NULL; 13174 pid_t pid; 13175 rnode4_t *rp = VTOR4(vp); 13176 13177 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13178 13179 nfs4frlock_check_deleg(vp, ep, cr, flk->l_type); 13180 if (ep->error || ep->stat != NFS4_OK) 13181 return; 13182 13183 argop->argop = OP_LOCK; 13184 if (ctype == NFS4_LCK_CTYPE_NORM) 13185 argsp->ctag = TAG_LOCK; 13186 else if (ctype == NFS4_LCK_CTYPE_RECLAIM) 13187 argsp->ctag = TAG_RELOCK; 13188 else 13189 argsp->ctag = TAG_LOCK_REINSTATE; 13190 lock_args = &argop->nfs_argop4_u.oplock; 13191 lock_args->locktype = flk_to_locktype(cmd, flk->l_type); 13192 lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0; 13193 /* 13194 * Get the lock owner. If no lock owner exists, 13195 * create a 'temporary' one and grab the open seqid 13196 * synchronization (which puts a hold on the open 13197 * owner and open stream). 13198 * This also grabs the lock seqid synchronization. 13199 */ 13200 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : flk->l_pid; 13201 ep->stat = 13202 nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop); 13203 13204 if (ep->stat != NFS4_OK) 13205 goto out; 13206 13207 nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)), 13208 &lock_args->locker); 13209 13210 lock_args->offset = flk->l_start; 13211 lock_args->length = flk->l_len; 13212 if (flk->l_len == 0) 13213 lock_args->length = ~lock_args->length; 13214 *lock_argsp = lock_args; 13215 out: 13216 *oopp = oop; 13217 *ospp = osp; 13218 *lopp = lop; 13219 } 13220 13221 /* 13222 * After we get the reply from the server, record the proper information 13223 * for possible resend lock requests. 13224 * 13225 * Allocates memory for the saved_rqstp if we have a lost lock to save. 13226 */ 13227 static void 13228 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error, 13229 nfs_lock_type4 locktype, nfs4_open_owner_t *oop, 13230 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13231 nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp) 13232 { 13233 bool_t unlock = (flk->l_type == F_UNLCK); 13234 13235 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13236 ASSERT(ctype == NFS4_LCK_CTYPE_NORM || 13237 ctype == NFS4_LCK_CTYPE_REINSTATE); 13238 13239 if (error != 0 && !unlock) { 13240 NFS4_DEBUG((nfs4_lost_rqst_debug || 13241 nfs4_client_lock_debug), (CE_NOTE, 13242 "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 " 13243 " for lop %p", (void *)lop)); 13244 ASSERT(lop != NULL); 13245 mutex_enter(&lop->lo_lock); 13246 lop->lo_pending_rqsts = 1; 13247 mutex_exit(&lop->lo_lock); 13248 } 13249 13250 lost_rqstp->lr_putfirst = FALSE; 13251 lost_rqstp->lr_op = 0; 13252 13253 /* 13254 * For lock/locku requests, we treat EINTR as ETIMEDOUT for 13255 * recovery purposes so that the lock request that was sent 13256 * can be saved and re-issued later. Ditto for EIO from a forced 13257 * unmount. 
This is done to have the client's local locking state 13258 * match the v4 server's state; that is, the request was 13259 * potentially received and accepted by the server but the client 13260 * thinks it was not. 13261 */ 13262 if (error == ETIMEDOUT || error == EINTR || 13263 NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 13264 NFS4_DEBUG((nfs4_lost_rqst_debug || 13265 nfs4_client_lock_debug), (CE_NOTE, 13266 "nfs4frlock_save_lost_rqst: got a lost %s lock for " 13267 "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK", 13268 (void *)lop, (void *)oop, (void *)osp)); 13269 if (unlock) 13270 lost_rqstp->lr_op = OP_LOCKU; 13271 else { 13272 lost_rqstp->lr_op = OP_LOCK; 13273 lost_rqstp->lr_locktype = locktype; 13274 } 13275 /* 13276 * Objects are held and rele'd via the recovery code. 13277 * See nfs4_save_lost_rqst. 13278 */ 13279 lost_rqstp->lr_vp = vp; 13280 lost_rqstp->lr_dvp = NULL; 13281 lost_rqstp->lr_oop = oop; 13282 lost_rqstp->lr_osp = osp; 13283 lost_rqstp->lr_lop = lop; 13284 lost_rqstp->lr_cr = cr; 13285 switch (ctype) { 13286 case NFS4_LCK_CTYPE_NORM: 13287 flk->l_pid = ttoproc(curthread)->p_pid; 13288 lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND; 13289 break; 13290 case NFS4_LCK_CTYPE_REINSTATE: 13291 lost_rqstp->lr_putfirst = TRUE; 13292 lost_rqstp->lr_ctype = ctype; 13293 break; 13294 default: 13295 break; 13296 } 13297 lost_rqstp->lr_flk = flk; 13298 } 13299 } 13300 13301 /* 13302 * Update lop's seqid. Also update the seqid stored in a resend request, 13303 * if any. (Some recovery errors increment the seqid, and we may have to 13304 * send the resend request again.) 13305 */ 13306 13307 static void 13308 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args, 13309 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type) 13310 { 13311 if (lock_args) { 13312 if (lock_args->locker.new_lock_owner == TRUE) 13313 nfs4_get_and_set_next_open_seqid(oop, tag_type); 13314 else { 13315 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13316 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop); 13317 } 13318 } else if (locku_args) { 13319 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13320 nfs4_set_lock_seqid(lop->lock_seqid +1, lop); 13321 } 13322 } 13323 13324 /* 13325 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13326 * COMPOUND4 args/res for calls that need to retry. 13327 * Switches the *cred_otwp to base_cr. 
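 * This is the retry-with-a-different-credential path: after an
 * access-style failure the over-the-wire credential is dropped and
 * replaced with base_cr (typically the caller's original credential)
 * so the next attempt goes out under that credential instead.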
13328 */ 13329 static void 13330 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint, 13331 nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop, 13332 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error, 13333 nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp, 13334 nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp) 13335 { 13336 nfs4_open_owner_t *oop = *oopp; 13337 nfs4_open_stream_t *osp = *ospp; 13338 nfs4_lock_owner_t *lop = *lopp; 13339 nfs_argop4 *argop = (*argspp)->array; 13340 13341 if (*did_start_fop) { 13342 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13343 needrecov); 13344 *did_start_fop = FALSE; 13345 } 13346 ASSERT((*argspp)->array_len == 2); 13347 if (argop[1].argop == OP_LOCK) 13348 nfs4args_lock_free(&argop[1]); 13349 else if (argop[1].argop == OP_LOCKT) 13350 nfs4args_lockt_free(&argop[1]); 13351 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13352 if (!error) 13353 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13354 *argspp = NULL; 13355 *respp = NULL; 13356 13357 if (lop) { 13358 nfs4_end_lock_seqid_sync(lop); 13359 lock_owner_rele(lop); 13360 *lopp = NULL; 13361 } 13362 13363 /* need to free up the reference on osp for lock args */ 13364 if (osp != NULL) { 13365 open_stream_rele(osp, VTOR4(vp)); 13366 *ospp = NULL; 13367 } 13368 13369 /* need to free up the reference on oop for lock args */ 13370 if (oop != NULL) { 13371 nfs4_end_open_seqid_sync(oop); 13372 open_owner_rele(oop); 13373 *oopp = NULL; 13374 } 13375 13376 crfree(*cred_otwp); 13377 *cred_otwp = base_cr; 13378 crhold(*cred_otwp); 13379 } 13380 13381 /* 13382 * Function to process the client's recovery for nfs4frlock. 13383 * Returns TRUE if we should retry the lock request; FALSE otherwise. 13384 * 13385 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13386 * COMPOUND4 args/res for calls that need to retry. 13387 * 13388 * Note: the rp's r_lkserlock is *not* dropped during this path. 13389 */ 13390 static bool_t 13391 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep, 13392 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13393 LOCK4args *lock_args, LOCKU4args *locku_args, 13394 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13395 nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp, 13396 nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint, 13397 bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk) 13398 { 13399 nfs4_open_owner_t *oop = *oopp; 13400 nfs4_open_stream_t *osp = *ospp; 13401 nfs4_lock_owner_t *lop = *lopp; 13402 13403 bool_t abort, retry; 13404 13405 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13406 ASSERT((*argspp) != NULL); 13407 ASSERT((*respp) != NULL); 13408 if (lock_args || locku_args) 13409 ASSERT(lop != NULL); 13410 13411 NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug), 13412 (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n")); 13413 13414 retry = TRUE; 13415 abort = FALSE; 13416 if (needrecov) { 13417 nfs4_bseqid_entry_t *bsep = NULL; 13418 nfs_opnum4 op; 13419 13420 op = lock_args ? OP_LOCK : locku_args ? OP_LOCKU : OP_LOCKT; 13421 13422 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) { 13423 seqid4 seqid; 13424 13425 if (lock_args) { 13426 if (lock_args->locker.new_lock_owner == TRUE) 13427 seqid = lock_args->locker.locker4_u. 13428 open_owner.open_seqid; 13429 else 13430 seqid = lock_args->locker.locker4_u. 
13431 lock_owner.lock_seqid; 13432 } else if (locku_args) { 13433 seqid = locku_args->seqid; 13434 } else { 13435 seqid = 0; 13436 } 13437 13438 bsep = nfs4_create_bseqid_entry(oop, lop, vp, 13439 flk->l_pid, (*argspp)->ctag, seqid); 13440 } 13441 13442 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 13443 (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK || 13444 lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp : 13445 NULL, op, bsep); 13446 13447 if (bsep) 13448 kmem_free(bsep, sizeof (*bsep)); 13449 } 13450 13451 /* 13452 * Return that we do not want to retry the request for 3 cases: 13453 * 1. If we received EINTR or are bailing out because of a forced 13454 * unmount, we came into this code path just for the sake of 13455 * initiating recovery; we now need to return the error. 13456 * 2. If we have aborted recovery. 13457 * 3. We received NFS4ERR_BAD_SEQID. 13458 */ 13459 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) || 13460 abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID)) 13461 retry = FALSE; 13462 13463 if (*did_start_fop == TRUE) { 13464 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13465 needrecov); 13466 *did_start_fop = FALSE; 13467 } 13468 13469 if (retry == TRUE) { 13470 nfs_argop4 *argop; 13471 13472 argop = (*argspp)->array; 13473 ASSERT((*argspp)->array_len == 2); 13474 13475 if (argop[1].argop == OP_LOCK) 13476 nfs4args_lock_free(&argop[1]); 13477 else if (argop[1].argop == OP_LOCKT) 13478 nfs4args_lockt_free(&argop[1]); 13479 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13480 if (!ep->error) 13481 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13482 *respp = NULL; 13483 *argspp = NULL; 13484 } 13485 13486 if (lop != NULL) { 13487 nfs4_end_lock_seqid_sync(lop); 13488 lock_owner_rele(lop); 13489 } 13490 13491 *lopp = NULL; 13492 13493 /* need to free up the reference on osp for lock args */ 13494 if (osp != NULL) { 13495 open_stream_rele(osp, rp); 13496 *ospp = NULL; 13497 } 13498 13499 /* need to free up the reference on oop for lock args */ 13500 if (oop != NULL) { 13501 nfs4_end_open_seqid_sync(oop); 13502 open_owner_rele(oop); 13503 *oopp = NULL; 13504 } 13505 13506 return (retry); 13507 } 13508 13509 /* 13510 * Handles the successful reply from the server for nfs4frlock. 13511 */ 13512 static void 13513 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk, 13514 vnode_t *vp, int flag, u_offset_t offset, 13515 nfs4_lost_rqst_t *resend_rqstp) 13516 { 13517 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13518 if ((cmd == F_SETLK || cmd == F_SETLKW) && 13519 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) { 13520 if (ctype == NFS4_LCK_CTYPE_NORM) { 13521 flk->l_pid = ttoproc(curthread)->p_pid; 13522 /* 13523 * We do not register lost locks locally in 13524 * the 'resend' case since the user/application 13525 * doesn't think we have the lock. 13526 */ 13527 ASSERT(!resend_rqstp); 13528 nfs4_register_lock_locally(vp, flk, flag, offset); 13529 } 13530 } 13531 } 13532 13533 /* 13534 * Handle the DENIED reply from the server for nfs4frlock. 13535 * Returns TRUE if we should retry the request; FALSE otherwise. 13536 * 13537 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13538 * COMPOUND4 args/res for calls that need to retry. Can also 13539 * drop and regrab the r_lkserlock.
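 * The lock is dropped only in the blocking (F_SETLKW) case, around
 * the nfs4_block_and_wait() sleep, and is reacquired before
 * returning.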
/*
 * Handle the DENIED reply from the server for nfs4frlock.
 * Returns TRUE if we should retry the request; FALSE otherwise.
 *
 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
 * COMPOUND4 args/res for calls that need to retry.  Can also
 * drop and regrab the r_lkserlock.
 */
static bool_t
nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args,
    LOCKT4args *lockt_args, nfs4_open_owner_t **oopp,
    nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd,
    vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint,
    nfs4_recov_state_t *recov_statep, int needrecov,
    COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
    clock_t *tick_delayp, short *whencep, int *errorp,
    nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop,
    bool_t *skip_get_err)
{
	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	if (lock_args) {
		nfs4_open_owner_t *oop = *oopp;
		nfs4_open_stream_t *osp = *ospp;
		nfs4_lock_owner_t *lop = *lopp;
		int intr;

		/*
		 * A blocking lock needs to sleep and then retry the request.
		 *
		 * Do not block and wait for 'resend' or 'reinstate'
		 * lock requests, just return the error.
		 *
		 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW.
		 */
		if (cmd == F_SETLKW) {
			rnode4_t *rp = VTOR4(vp);
			nfs_argop4 *argop = (*argspp)->array;

			ASSERT(ctype == NFS4_LCK_CTYPE_NORM);

			nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
			    recov_statep, needrecov);
			*did_start_fop = FALSE;
			ASSERT((*argspp)->array_len == 2);
			if (argop[1].argop == OP_LOCK)
				nfs4args_lock_free(&argop[1]);
			else if (argop[1].argop == OP_LOCKT)
				nfs4args_lockt_free(&argop[1]);
			kmem_free(argop, 2 * sizeof (nfs_argop4));
			if (*respp)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)*respp);
			*argspp = NULL;
			*respp = NULL;
			nfs4_end_lock_seqid_sync(lop);
			lock_owner_rele(lop);
			*lopp = NULL;
			if (osp != NULL) {
				open_stream_rele(osp, rp);
				*ospp = NULL;
			}
			if (oop != NULL) {
				nfs4_end_open_seqid_sync(oop);
				open_owner_rele(oop);
				*oopp = NULL;
			}

			nfs_rw_exit(&rp->r_lkserlock);

			intr = nfs4_block_and_wait(tick_delayp, rp);

			if (intr) {
				(void) nfs_rw_enter_sig(&rp->r_lkserlock,
				    RW_WRITER, FALSE);
				*errorp = EINTR;
				return (FALSE);
			}

			(void) nfs_rw_enter_sig(&rp->r_lkserlock,
			    RW_WRITER, FALSE);

			/*
			 * Make sure we are still safe to lock with
			 * regards to mmapping.
			 */
			if (!nfs4_safelock(vp, flk, cr)) {
				*errorp = EAGAIN;
				return (FALSE);
			}

			return (TRUE);
		}
		if (ctype == NFS4_LCK_CTYPE_NORM)
			*errorp = EAGAIN;
		*skip_get_err = TRUE;
		flk->l_whence = 0;
		*whencep = 0;
		return (FALSE);
	} else if (lockt_args) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock_results_denied: OP_LOCKT DENIED"));

		denied_to_flk(&resop->nfs_resop4_u.oplockt.denied,
		    flk, lockt_args);

		/* according to NLM code */
		*errorp = 0;
		*whencep = 0;
		*skip_get_err = TRUE;
		return (FALSE);
	}
	return (FALSE);
}

/*
 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock.
13650 */ 13651 static void 13652 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp) 13653 { 13654 switch (resp->status) { 13655 case NFS4ERR_ACCESS: 13656 case NFS4ERR_ADMIN_REVOKED: 13657 case NFS4ERR_BADHANDLE: 13658 case NFS4ERR_BAD_RANGE: 13659 case NFS4ERR_BAD_SEQID: 13660 case NFS4ERR_BAD_STATEID: 13661 case NFS4ERR_BADXDR: 13662 case NFS4ERR_DEADLOCK: 13663 case NFS4ERR_DELAY: 13664 case NFS4ERR_EXPIRED: 13665 case NFS4ERR_FHEXPIRED: 13666 case NFS4ERR_GRACE: 13667 case NFS4ERR_INVAL: 13668 case NFS4ERR_ISDIR: 13669 case NFS4ERR_LEASE_MOVED: 13670 case NFS4ERR_LOCK_NOTSUPP: 13671 case NFS4ERR_LOCK_RANGE: 13672 case NFS4ERR_MOVED: 13673 case NFS4ERR_NOFILEHANDLE: 13674 case NFS4ERR_NO_GRACE: 13675 case NFS4ERR_OLD_STATEID: 13676 case NFS4ERR_OPENMODE: 13677 case NFS4ERR_RECLAIM_BAD: 13678 case NFS4ERR_RECLAIM_CONFLICT: 13679 case NFS4ERR_RESOURCE: 13680 case NFS4ERR_SERVERFAULT: 13681 case NFS4ERR_STALE: 13682 case NFS4ERR_STALE_CLIENTID: 13683 case NFS4ERR_STALE_STATEID: 13684 return; 13685 default: 13686 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13687 "nfs4frlock_results_default: got unrecognizable " 13688 "res.status %d", resp->status)); 13689 *errorp = NFS4ERR_INVAL; 13690 } 13691 } 13692 13693 /* 13694 * The lock request was successful, so update the client's state. 13695 */ 13696 static void 13697 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args, 13698 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop, 13699 vnode_t *vp, flock64_t *flk, cred_t *cr, 13700 nfs4_lost_rqst_t *resend_rqstp) 13701 { 13702 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13703 13704 if (lock_args) { 13705 LOCK4res *lock_res; 13706 13707 lock_res = &resop->nfs_resop4_u.oplock; 13708 /* update the stateid with server's response */ 13709 13710 if (lock_args->locker.new_lock_owner == TRUE) { 13711 mutex_enter(&lop->lo_lock); 13712 lop->lo_just_created = NFS4_PERM_CREATED; 13713 mutex_exit(&lop->lo_lock); 13714 } 13715 13716 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid); 13717 13718 /* 13719 * If the lock was the result of a resending a lost 13720 * request, we've synched up the stateid and seqid 13721 * with the server, but now the server might be out of sync 13722 * with what the application thinks it has for locks. 13723 * Clean that up here. It's unclear whether we should do 13724 * this even if the filesystem has been forcibly unmounted. 13725 * For most servers, it's probably wasted effort, but 13726 * RFC3530 lets servers require that unlocks exactly match 13727 * the locks that are held. 13728 */ 13729 if (resend_rqstp != NULL && 13730 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) { 13731 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop); 13732 } else { 13733 flk->l_whence = 0; 13734 } 13735 } else if (locku_args) { 13736 LOCKU4res *locku_res; 13737 13738 locku_res = &resop->nfs_resop4_u.oplocku; 13739 13740 /* Update the stateid with the server's response */ 13741 nfs4_set_lock_stateid(lop, locku_res->lock_stateid); 13742 } else if (lockt_args) { 13743 /* Switch the lock type to express success, see fcntl */ 13744 flk->l_type = F_UNLCK; 13745 flk->l_whence = 0; 13746 } 13747 } 13748 13749 /* 13750 * Do final cleanup before exiting nfs4frlock. 13751 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13752 * COMPOUND4 args/res for calls that haven't already. 
13753 */ 13754 static void 13755 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp, 13756 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint, 13757 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop, 13758 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13759 short whence, u_offset_t offset, struct lm_sysid *ls, 13760 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args, 13761 bool_t did_start_fop, bool_t skip_get_err, 13762 cred_t *cred_otw, cred_t *cred) 13763 { 13764 mntinfo4_t *mi = VTOMI4(vp); 13765 rnode4_t *rp = VTOR4(vp); 13766 int error = *errorp; 13767 nfs_argop4 *argop; 13768 13769 ASSERT(nfs_zone() == mi->mi_zone); 13770 /* 13771 * The client recovery code wants the raw status information, 13772 * so don't map the NFS status code to an errno value for 13773 * non-normal call types. 13774 */ 13775 if (ctype == NFS4_LCK_CTYPE_NORM) { 13776 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE) 13777 *errorp = geterrno4(resp->status); 13778 if (did_start_fop == TRUE) 13779 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep, 13780 needrecov); 13781 13782 if (!error && resp && resp->status == NFS4_OK) { 13783 /* 13784 * We've established a new lock on the server, so invalidate 13785 * the pages associated with the vnode to get the most up to 13786 * date pages from the server after acquiring the lock. We 13787 * want to be sure that the read operation gets the newest data. 13788 * N.B. 13789 * We used to do this in nfs4frlock_results_ok but that doesn't 13790 * work since VOP_PUTPAGE can call nfs4_commit which calls 13791 * nfs4_start_fop. We flush the pages below after calling 13792 * nfs4_end_fop above 13793 */ 13794 int error; 13795 13796 error = VOP_PUTPAGE(vp, (u_offset_t)0, 13797 0, B_INVAL, cred); 13798 13799 if (error && (error == ENOSPC || error == EDQUOT)) { 13800 rnode4_t *rp = VTOR4(vp); 13801 13802 mutex_enter(&rp->r_statelock); 13803 if (!rp->r_error) 13804 rp->r_error = error; 13805 mutex_exit(&rp->r_statelock); 13806 } 13807 } 13808 } 13809 if (argsp) { 13810 ASSERT(argsp->array_len == 2); 13811 argop = argsp->array; 13812 if (argop[1].argop == OP_LOCK) 13813 nfs4args_lock_free(&argop[1]); 13814 else if (argop[1].argop == OP_LOCKT) 13815 nfs4args_lockt_free(&argop[1]); 13816 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13817 if (resp) 13818 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 13819 } 13820 13821 /* free the reference on the lock owner */ 13822 if (lop != NULL) { 13823 nfs4_end_lock_seqid_sync(lop); 13824 lock_owner_rele(lop); 13825 } 13826 13827 /* need to free up the reference on osp for lock args */ 13828 if (osp != NULL) 13829 open_stream_rele(osp, rp); 13830 13831 /* need to free up the reference on oop for lock args */ 13832 if (oop != NULL) { 13833 nfs4_end_open_seqid_sync(oop); 13834 open_owner_rele(oop); 13835 } 13836 13837 (void) convoff(vp, flk, whence, offset); 13838 13839 lm_rel_sysid(ls); 13840 13841 /* 13842 * Record debug information in the event we get EINVAL. 13843 */ 13844 mutex_enter(&mi->mi_lock); 13845 if (*errorp == EINVAL && (lock_args || locku_args) && 13846 (!(mi->mi_flags & MI4_POSIX_LOCK))) { 13847 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) { 13848 zcmn_err(getzoneid(), CE_NOTE, 13849 "%s operation failed with " 13850 "EINVAL probably since the server, %s," 13851 " doesn't support POSIX style locking", 13852 lock_args ? 
"LOCK" : "LOCKU", 13853 mi->mi_curr_serv->sv_hostname); 13854 mi->mi_flags |= MI4_LOCK_DEBUG; 13855 } 13856 } 13857 mutex_exit(&mi->mi_lock); 13858 13859 if (cred_otw) 13860 crfree(cred_otw); 13861 } 13862 13863 /* 13864 * This calls the server and the local locking code. 13865 * 13866 * Client locks are registerred locally by oring the sysid with 13867 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid. 13868 * We need to distinguish between the two to avoid collision in case one 13869 * machine is used as both client and server. 13870 * 13871 * Blocking lock requests will continually retry to acquire the lock 13872 * forever. 13873 * 13874 * The ctype is defined as follows: 13875 * NFS4_LCK_CTYPE_NORM: normal lock request. 13876 * 13877 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client 13878 * recovery, get the pid from flk instead of curproc, and don't reregister 13879 * the lock locally. 13880 * 13881 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition 13882 * that we will use the information passed in via resend_rqstp to setup the 13883 * lock/locku request. This resend is the exact same request as the 'lost 13884 * lock', and is initiated by the recovery framework. A successful resend 13885 * request can initiate one or more reinstate requests. 13886 * 13887 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it 13888 * does not trigger additional reinstate requests. This lock call type is 13889 * set for setting the v4 server's locking state back to match what the 13890 * client's local locking state is in the event of a received 'lost lock'. 13891 * 13892 * Errors are returned via the nfs4_error_t parameter. 13893 */ 13894 void 13895 nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk, 13896 int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep, 13897 nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp) 13898 { 13899 COMPOUND4args_clnt args, *argsp = NULL; 13900 COMPOUND4res_clnt res, *resp = NULL; 13901 nfs_argop4 *argop; 13902 nfs_resop4 *resop; 13903 rnode4_t *rp; 13904 int doqueue = 1; 13905 clock_t tick_delay; /* delay in clock ticks */ 13906 struct lm_sysid *ls; 13907 LOCK4args *lock_args = NULL; 13908 LOCKU4args *locku_args = NULL; 13909 LOCKT4args *lockt_args = NULL; 13910 nfs4_open_owner_t *oop = NULL; 13911 nfs4_open_stream_t *osp = NULL; 13912 nfs4_lock_owner_t *lop = NULL; 13913 bool_t needrecov = FALSE; 13914 nfs4_recov_state_t recov_state; 13915 short whence; 13916 nfs4_op_hint_t op_hint; 13917 nfs4_lost_rqst_t lost_rqst; 13918 bool_t retry = FALSE; 13919 bool_t did_start_fop = FALSE; 13920 bool_t skip_get_err = FALSE; 13921 cred_t *cred_otw = NULL; 13922 bool_t recovonly; /* just queue request */ 13923 int frc_no_reclaim = 0; 13924 #ifdef DEBUG 13925 char *name; 13926 #endif 13927 13928 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13929 13930 #ifdef DEBUG 13931 name = fn_name(VTOSV(vp)->sv_name); 13932 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: " 13933 "%s: cmd %d, type %d, offset %llu, start %"PRIx64", " 13934 "length %"PRIu64", pid %d, sysid %d, call type %s, " 13935 "resend request %s", name, cmd, flk->l_type, offset, flk->l_start, 13936 flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : 13937 flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype), 13938 resend_rqstp ? 
"TRUE" : "FALSE")); 13939 kmem_free(name, MAXNAMELEN); 13940 #endif 13941 13942 nfs4_error_zinit(ep); 13943 ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset); 13944 if (ep->error) 13945 return; 13946 ep->error = nfs4frlock_get_sysid(&ls, vp, flk); 13947 if (ep->error) 13948 return; 13949 nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence, 13950 vp, cr, &cred_otw); 13951 13952 recov_retry: 13953 nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd, 13954 &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst); 13955 rp = VTOR4(vp); 13956 13957 ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state, 13958 &did_start_fop, &recovonly); 13959 13960 if (ep->error) 13961 goto out; 13962 13963 if (recovonly) { 13964 /* 13965 * Leave the request for the recovery system to deal with. 13966 */ 13967 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13968 ASSERT(cmd != F_GETLK); 13969 ASSERT(flk->l_type == F_UNLCK); 13970 13971 nfs4_error_init(ep, EINTR); 13972 needrecov = TRUE; 13973 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 13974 if (lop != NULL) { 13975 nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT, 13976 NULL, NULL, lop, flk, &lost_rqst, cr, vp); 13977 (void) nfs4_start_recovery(ep, 13978 VTOMI4(vp), vp, NULL, NULL, 13979 (lost_rqst.lr_op == OP_LOCK || 13980 lost_rqst.lr_op == OP_LOCKU) ? 13981 &lost_rqst : NULL, OP_LOCKU, NULL); 13982 lock_owner_rele(lop); 13983 lop = NULL; 13984 } 13985 flk->l_pid = curproc->p_pid; 13986 nfs4_register_lock_locally(vp, flk, flag, offset); 13987 goto out; 13988 } 13989 13990 /* putfh directory fh */ 13991 argop[0].argop = OP_CPUTFH; 13992 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 13993 13994 /* 13995 * Set up the over-the-wire arguments and get references to the 13996 * open owner, etc. 
13997 */ 13998 13999 if (ctype == NFS4_LCK_CTYPE_RESEND || 14000 ctype == NFS4_LCK_CTYPE_REINSTATE) { 14001 nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp, 14002 &argop[1], &lop, &oop, &osp, &lock_args, &locku_args); 14003 } else { 14004 bool_t go_otw = TRUE; 14005 14006 ASSERT(resend_rqstp == NULL); 14007 14008 switch (cmd) { 14009 case F_GETLK: 14010 case F_O_GETLK: 14011 nfs4frlock_setup_lockt_args(ctype, &argop[1], 14012 &lockt_args, argsp, flk, rp); 14013 break; 14014 case F_SETLKW: 14015 case F_SETLK: 14016 if (flk->l_type == F_UNLCK) 14017 nfs4frlock_setup_locku_args(ctype, 14018 &argop[1], &locku_args, flk, 14019 &lop, ep, argsp, 14020 vp, flag, offset, cr, 14021 &skip_get_err, &go_otw); 14022 else 14023 nfs4frlock_setup_lock_args(ctype, 14024 &lock_args, &oop, &osp, &lop, &argop[1], 14025 argsp, flk, cmd, vp, cr, ep); 14026 14027 if (ep->error) 14028 goto out; 14029 14030 switch (ep->stat) { 14031 case NFS4_OK: 14032 break; 14033 case NFS4ERR_DELAY: 14034 /* recov thread never gets this error */ 14035 ASSERT(resend_rqstp == NULL); 14036 ASSERT(did_start_fop); 14037 14038 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 14039 &recov_state, TRUE); 14040 did_start_fop = FALSE; 14041 if (argop[1].argop == OP_LOCK) 14042 nfs4args_lock_free(&argop[1]); 14043 else if (argop[1].argop == OP_LOCKT) 14044 nfs4args_lockt_free(&argop[1]); 14045 kmem_free(argop, 2 * sizeof (nfs_argop4)); 14046 argsp = NULL; 14047 goto recov_retry; 14048 default: 14049 ep->error = EIO; 14050 goto out; 14051 } 14052 break; 14053 default: 14054 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14055 "nfs4_frlock: invalid cmd %d", cmd)); 14056 ep->error = EINVAL; 14057 goto out; 14058 } 14059 14060 if (!go_otw) 14061 goto out; 14062 } 14063 14064 /* XXX should we use the local reclock as a cache ? */ 14065 /* 14066 * Unregister the lock with the local locking code before 14067 * contacting the server. This avoids a potential race where 14068 * another process gets notified that it has been granted a lock 14069 * before we can unregister ourselves locally. 14070 */ 14071 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) { 14072 if (ctype == NFS4_LCK_CTYPE_NORM) 14073 flk->l_pid = ttoproc(curthread)->p_pid; 14074 nfs4_register_lock_locally(vp, flk, flag, offset); 14075 } 14076 14077 /* 14078 * Send the server the lock request. Continually loop with a delay 14079 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE. 14080 */ 14081 resp = &res; 14082 14083 NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug), 14084 (CE_NOTE, 14085 "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first", 14086 rnode4info(rp))); 14087 14088 if (lock_args && frc_no_reclaim) { 14089 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM); 14090 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14091 "nfs4frlock: frc_no_reclaim: clearing reclaim")); 14092 lock_args->reclaim = FALSE; 14093 if (did_reclaimp) 14094 *did_reclaimp = 0; 14095 } 14096 14097 /* 14098 * Do the OTW call. 
14099 */ 14100 rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep); 14101 14102 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14103 "nfs4frlock: error %d, status %d", ep->error, resp->status)); 14104 14105 needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp); 14106 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14107 "nfs4frlock: needrecov %d", needrecov)); 14108 14109 if (ep->error == 0 && nfs4_need_to_bump_seqid(resp)) 14110 nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop, 14111 args.ctag); 14112 14113 /* 14114 * Check if one of these mutually exclusive error cases has 14115 * happened: 14116 * need to swap credentials due to access error 14117 * recovery is needed 14118 * different error (only known case is missing Kerberos ticket) 14119 */ 14120 14121 if ((ep->error == EACCES || 14122 (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) && 14123 cred_otw != cr) { 14124 nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov, 14125 &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp, 14126 cr, &cred_otw); 14127 goto recov_retry; 14128 } 14129 14130 if (needrecov) { 14131 /* 14132 * LOCKT requests don't need to recover from lost 14133 * requests since they don't create/modify state. 14134 */ 14135 if ((ep->error == EINTR || 14136 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) && 14137 lockt_args) 14138 goto out; 14139 /* 14140 * Do not attempt recovery for requests initiated by 14141 * the recovery framework. Let the framework redrive them. 14142 */ 14143 if (ctype != NFS4_LCK_CTYPE_NORM) 14144 goto out; 14145 else { 14146 ASSERT(resend_rqstp == NULL); 14147 } 14148 14149 nfs4frlock_save_lost_rqst(ctype, ep->error, 14150 flk_to_locktype(cmd, flk->l_type), 14151 oop, osp, lop, flk, &lost_rqst, cred_otw, vp); 14152 14153 retry = nfs4frlock_recovery(needrecov, ep, &argsp, 14154 &resp, lock_args, locku_args, &oop, &osp, &lop, 14155 rp, vp, &recov_state, op_hint, &did_start_fop, 14156 cmd != F_GETLK ? &lost_rqst : NULL, flk); 14157 14158 if (retry) { 14159 ASSERT(oop == NULL); 14160 ASSERT(osp == NULL); 14161 ASSERT(lop == NULL); 14162 goto recov_retry; 14163 } 14164 goto out; 14165 } 14166 14167 /* 14168 * Bail out if have reached this point with ep->error set. Can 14169 * happen if (ep->error == EACCES && !needrecov && cred_otw == cr). 14170 * This happens if Kerberos ticket has expired or has been 14171 * destroyed. 14172 */ 14173 if (ep->error != 0) 14174 goto out; 14175 14176 /* 14177 * Process the reply. 14178 */ 14179 switch (resp->status) { 14180 case NFS4_OK: 14181 resop = &resp->array[1]; 14182 nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset, 14183 resend_rqstp); 14184 /* 14185 * Have a successful lock operation, now update state. 14186 */ 14187 nfs4frlock_update_state(lock_args, locku_args, lockt_args, 14188 resop, lop, vp, flk, cr, resend_rqstp); 14189 break; 14190 14191 case NFS4ERR_DENIED: 14192 resop = &resp->array[1]; 14193 retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args, 14194 &oop, &osp, &lop, cmd, vp, flk, op_hint, 14195 &recov_state, needrecov, &argsp, &resp, 14196 &tick_delay, &whence, &ep->error, resop, cr, 14197 &did_start_fop, &skip_get_err); 14198 14199 if (retry) { 14200 ASSERT(oop == NULL); 14201 ASSERT(osp == NULL); 14202 ASSERT(lop == NULL); 14203 goto recov_retry; 14204 } 14205 break; 14206 /* 14207 * If the server won't let us reclaim, fall-back to trying to lock 14208 * the file from scratch. Code elsewhere will check the changeinfo 14209 * to ensure the file hasn't been changed. 
14210 */ 14211 case NFS4ERR_NO_GRACE: 14212 if (lock_args && lock_args->reclaim == TRUE) { 14213 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM); 14214 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14215 "nfs4frlock: reclaim: NFS4ERR_NO_GRACE")); 14216 frc_no_reclaim = 1; 14217 /* clean up before retrying */ 14218 needrecov = 0; 14219 (void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp, 14220 lock_args, locku_args, &oop, &osp, &lop, rp, vp, 14221 &recov_state, op_hint, &did_start_fop, NULL, flk); 14222 goto recov_retry; 14223 } 14224 /* FALLTHROUGH */ 14225 14226 default: 14227 nfs4frlock_results_default(resp, &ep->error); 14228 break; 14229 } 14230 out: 14231 /* 14232 * Process and cleanup from error. Make interrupted unlock 14233 * requests look successful, since they will be handled by the 14234 * client recovery code. 14235 */ 14236 nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state, 14237 needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error, 14238 lock_args, locku_args, did_start_fop, 14239 skip_get_err, cred_otw, cr); 14240 14241 if (ep->error == EINTR && flk->l_type == F_UNLCK && 14242 (cmd == F_SETLK || cmd == F_SETLKW)) 14243 ep->error = 0; 14244 } 14245 14246 /* 14247 * nfs4_safelock: 14248 * 14249 * Return non-zero if the given lock request can be handled without 14250 * violating the constraints on concurrent mapping and locking. 14251 */ 14252 14253 static int 14254 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr) 14255 { 14256 rnode4_t *rp = VTOR4(vp); 14257 struct vattr va; 14258 int error; 14259 14260 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14261 ASSERT(rp->r_mapcnt >= 0); 14262 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: " 14263 "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ? 14264 "write" : bfp->l_type == F_RDLCK ? "read" : "unlock", 14265 bfp->l_start, bfp->l_len, rp->r_mapcnt)); 14266 14267 if (rp->r_mapcnt == 0) 14268 return (1); /* always safe if not mapped */ 14269 14270 /* 14271 * If the file is already mapped and there are locks, then they 14272 * should be all safe locks. So adding or removing a lock is safe 14273 * as long as the new request is safe (i.e., whole-file, meaning 14274 * length and starting offset are both zero). 14275 */ 14276 14277 if (bfp->l_start != 0 || bfp->l_len != 0) { 14278 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14279 "cannot lock a memory mapped file unless locking the " 14280 "entire file: start %"PRIx64", len %"PRIx64, 14281 bfp->l_start, bfp->l_len)); 14282 return (0); 14283 } 14284 14285 /* mandatory locking and mapping don't mix */ 14286 va.va_mask = AT_MODE; 14287 error = VOP_GETATTR(vp, &va, 0, cr); 14288 if (error != 0) { 14289 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14290 "getattr error %d", error)); 14291 return (0); /* treat errors conservatively */ 14292 } 14293 if (MANDLOCK(vp, va.va_mode)) { 14294 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14295 "cannot mandatory lock and mmap a file")); 14296 return (0); 14297 } 14298 14299 return (1); 14300 } 14301 14302 14303 /* 14304 * Register the lock locally within Solaris. 14305 * As the client, we "or" the sysid with LM_SYSID_CLIENT when 14306 * recording locks locally. 14307 * 14308 * This should handle conflicts/cooperation with NFS v2/v3 since all locks 14309 * are registered locally. 
14310 */ 14311 void 14312 nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag, 14313 u_offset_t offset) 14314 { 14315 int oldsysid; 14316 int error; 14317 #ifdef DEBUG 14318 char *name; 14319 #endif 14320 14321 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14322 14323 #ifdef DEBUG 14324 name = fn_name(VTOSV(vp)->sv_name); 14325 NFS4_DEBUG(nfs4_client_lock_debug, 14326 (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, " 14327 "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d", 14328 name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid, 14329 flk->l_sysid)); 14330 kmem_free(name, MAXNAMELEN); 14331 #endif 14332 14333 /* register the lock with local locking */ 14334 oldsysid = flk->l_sysid; 14335 flk->l_sysid |= LM_SYSID_CLIENT; 14336 error = reclock(vp, flk, SETFLCK, flag, offset, NULL); 14337 #ifdef DEBUG 14338 if (error != 0) { 14339 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14340 "nfs4_register_lock_locally: could not register with" 14341 " local locking")); 14342 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14343 "error %d, vp 0x%p, pid %d, sysid 0x%x", 14344 error, (void *)vp, flk->l_pid, flk->l_sysid)); 14345 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14346 "type %d off 0x%" PRIx64 " len 0x%" PRIx64, 14347 flk->l_type, flk->l_start, flk->l_len)); 14348 (void) reclock(vp, flk, 0, flag, offset, NULL); 14349 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14350 "blocked by pid %d sysid 0x%x type %d " 14351 "off 0x%" PRIx64 " len 0x%" PRIx64, 14352 flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start, 14353 flk->l_len)); 14354 } 14355 #endif 14356 flk->l_sysid = oldsysid; 14357 } 14358 14359 /* 14360 * nfs4_lockrelease: 14361 * 14362 * Release any locks on the given vnode that are held by the current 14363 * process. Also removes the lock owner (if one exists) from the rnode's 14364 * list. 14365 */ 14366 static int 14367 nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 14368 { 14369 flock64_t ld; 14370 int ret, error; 14371 rnode4_t *rp; 14372 nfs4_lock_owner_t *lop; 14373 nfs4_recov_state_t recov_state; 14374 mntinfo4_t *mi; 14375 bool_t possible_orphan = FALSE; 14376 bool_t recovonly; 14377 14378 ASSERT((uintptr_t)vp > KERNELBASE); 14379 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14380 14381 rp = VTOR4(vp); 14382 mi = VTOMI4(vp); 14383 14384 /* 14385 * If we have not locked anything then we can 14386 * just return since we have no work to do. 14387 */ 14388 if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) { 14389 return (0); 14390 } 14391 14392 /* 14393 * We need to comprehend that another thread may 14394 * kick off recovery and the lock_owner we have stashed 14395 * in lop might be invalid so we should NOT cache it 14396 * locally! 14397 */ 14398 recov_state.rs_flags = 0; 14399 recov_state.rs_num_retry_despite_err = 0; 14400 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14401 &recovonly); 14402 if (error) { 14403 mutex_enter(&rp->r_statelock); 14404 rp->r_flags |= R4LODANGLERS; 14405 mutex_exit(&rp->r_statelock); 14406 return (error); 14407 } 14408 14409 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14410 14411 /* 14412 * Check if the lock owner might have a lock (request was sent but 14413 * no response was received). Also check if there are any remote 14414 * locks on the file. (In theory we shouldn't have to make this 14415 * second check if there's no lock owner, but for now we'll be 14416 * conservative and do it anyway.) 
If either condition is true, 14417 * send an unlock for the entire file to the server. 14418 * 14419 * Note that no explicit synchronization is needed here. At worst, 14420 * flk_has_remote_locks() will return a false positive, in which case 14421 * the unlock call wastes time but doesn't harm correctness. 14422 */ 14423 14424 if (lop) { 14425 mutex_enter(&lop->lo_lock); 14426 possible_orphan = lop->lo_pending_rqsts; 14427 mutex_exit(&lop->lo_lock); 14428 lock_owner_rele(lop); 14429 } 14430 14431 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14432 14433 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14434 "nfs4_lockrelease: possible orphan %d, remote locks %d, for " 14435 "lop %p.", possible_orphan, flk_has_remote_locks(vp), 14436 (void *)lop)); 14437 14438 if (possible_orphan || flk_has_remote_locks(vp)) { 14439 ld.l_type = F_UNLCK; /* set to unlock entire file */ 14440 ld.l_whence = 0; /* unlock from start of file */ 14441 ld.l_start = 0; 14442 ld.l_len = 0; /* do entire file */ 14443 14444 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, cr); 14445 14446 if (ret != 0) { 14447 /* 14448 * If VOP_FRLOCK fails, make sure we unregister 14449 * local locks before we continue. 14450 */ 14451 ld.l_pid = ttoproc(curthread)->p_pid; 14452 nfs4_register_lock_locally(vp, &ld, flag, offset); 14453 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14454 "nfs4_lockrelease: lock release error on vp" 14455 " %p: error %d.\n", (void *)vp, ret)); 14456 } 14457 } 14458 14459 recov_state.rs_flags = 0; 14460 recov_state.rs_num_retry_despite_err = 0; 14461 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14462 &recovonly); 14463 if (error) { 14464 mutex_enter(&rp->r_statelock); 14465 rp->r_flags |= R4LODANGLERS; 14466 mutex_exit(&rp->r_statelock); 14467 return (error); 14468 } 14469 14470 /* 14471 * So, here we're going to need to retrieve the lock-owner 14472 * again (in case recovery has done a switch-a-roo) and 14473 * remove it because we can. 14474 */ 14475 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14476 14477 if (lop) { 14478 nfs4_rnode_remove_lock_owner(rp, lop); 14479 lock_owner_rele(lop); 14480 } 14481 14482 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14483 return (0); 14484 } 14485 14486 /* 14487 * Wait for 'tick_delay' clock ticks. 14488 * Implement exponential backoff until hit the lease_time of this nfs4_server. 14489 * NOTE: lock_lease_time is in seconds. 14490 * 14491 * XXX For future improvements, should implement a waiting queue scheme. 
 */
static int
nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp)
{
	long milliseconds_delay;
	time_t lock_lease_time;

	/* wait tick_delay clock ticks or until interrupted by a signal */
	if (delay_sig(*tick_delay)) {
		return (EINTR);
	}
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: "
	    "reissue the lock request: blocked for %ld clock ticks: %ld "
	    "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000));

	/* get the lease time */
	lock_lease_time = r2lease_time(rp);

	/* drv_hztousec converts ticks to microseconds */
	milliseconds_delay = drv_hztousec(*tick_delay) / 1000;
	if (milliseconds_delay < lock_lease_time * 1000) {
		*tick_delay = 2 * *tick_delay;
		if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000)
			*tick_delay =
			    drv_usectohz(lock_lease_time * 1000 * 1000);
	}
	return (0);
}


void
nfs4_vnops_init(void)
{
}

void
nfs4_vnops_fini(void)
{
}

/*
 * Return a reference to the directory (parent) vnode for a given vnode,
 * using the saved pathname information and the directory file handle.  The
 * caller is responsible for disposing of the reference.
 * Returns zero or an errno value.
 *
 * Caller should set need_start_op to FALSE if it is the recovery
 * thread, or if a start_fop has already been done.  Otherwise, TRUE.
 */
int
vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op)
{
	svnode_t *svnp;
	vnode_t *dvp = NULL;
	servinfo4_t *svp;
	nfs4_fname_t *mfname;
	int error;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	if (vp->v_flag & VROOT) {
		nfs4_sharedfh_t *sfh;
		nfs_fh4 fh;
		mntinfo4_t *mi;

		ASSERT(vp->v_type == VREG);

		mi = VTOMI4(vp);
		svp = mi->mi_curr_serv;
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
		sfh = sfh4_get(&fh, VTOMI4(vp));
		nfs_rw_exit(&svp->sv_lock);
		mfname = mi->mi_fname;
		fn_hold(mfname);
		dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0);
		sfh4_rele(&sfh);

		if (dvp->v_type == VNON)
			dvp->v_type = VDIR;
		*dvpp = dvp;
		return (0);
	}

	svnp = VTOSV(vp);

	if (svnp == NULL) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "shadow node is NULL"));
		return (EINVAL);
	}

	if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "shadow node name or dfh val == NULL"));
		return (EINVAL);
	}

	error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp,
	    (int)need_start_op);
	if (error != 0) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "nfs4_make_dotdot returned %d", error));
		return (error);
	}
	if (!dvp) {
		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
		    "nfs4_make_dotdot returned a NULL dvp"));
		return (EIO);
	}
	if (dvp->v_type == VNON)
		dvp->v_type = VDIR;
	ASSERT(dvp->v_type == VDIR);
	if (VTOR4(vp)->r_flags & R4ISXATTR) {
		mutex_enter(&dvp->v_lock);
		dvp->v_flag |= V_XATTRDIR;
		mutex_exit(&dvp->v_lock);
	}
	*dvpp = dvp;
	return (0);
}
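/*
 * Illustrative sketch of a vtodv() caller (hypothetical; the real callers
 * live elsewhere in the client).  A caller that needs the parent directory
 * plus the final component name, e.g. to re-drive an over-the-wire lookup,
 * might do:
 *
 *	vnode_t *dvp;
 *	char name[MAXNAMELEN];
 *	int error;
 *
 *	error = vtodv(vp, &dvp, cr, TRUE);
 *	if (error == 0) {
 *		error = vtoname(vp, name, MAXNAMELEN);
 *		... use (dvp, name), then VN_RELE(dvp) ...
 *	}
 *
 * vtoname() is defined just below.
 */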
/*
 * Copy the (final) component name of vp to fnamep.  maxlen is the maximum
 * length that fnamep can accept, including the trailing null.
 * Returns 0 if okay, returns an errno value if there was a problem.
 */
int
vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen)
{
	char *fn;
	int err = 0;
	servinfo4_t *svp;
	svnode_t *shvp;

	/*
	 * If the file being opened has VROOT set, then this is
	 * a "file" mount.  sv_name will not be interesting, so
	 * go back to the servinfo4 to get the original mount
	 * path and strip off all but the final edge.  Otherwise
	 * just return the name from the shadow vnode.
	 */

	if (vp->v_flag & VROOT) {

		svp = VTOMI4(vp)->mi_curr_serv;
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);

		fn = strrchr(svp->sv_path, '/');
		if (fn == NULL)
			err = EINVAL;
		else
			fn++;
	} else {
		shvp = VTOSV(vp);
		fn = fn_name(shvp->sv_name);
	}

	if (err == 0) {
		if (strlen(fn) < maxlen)
			(void) strcpy(fnamep, fn);
		else
			err = ENAMETOOLONG;
	}

	if (vp->v_flag & VROOT)
		nfs_rw_exit(&svp->sv_lock);
	else
		kmem_free(fn, MAXNAMELEN);

	return (err);
}

/*
 * Bookkeeping for a close that doesn't need to go over the wire.
 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise
 * it is left at 1.
 */
void
nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp)
{
	rnode4_t *rp;
	mntinfo4_t *mi;

	mi = VTOMI4(vp);
	rp = VTOR4(vp);

	NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: "
	    "rp=%p osp=%p", (void *)rp, (void *)osp));
	ASSERT(nfs_zone() == mi->mi_zone);
	ASSERT(mutex_owned(&osp->os_sync_lock));
	ASSERT(*have_lockp);

	if (!osp->os_valid ||
	    osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
		return;
	}

	/*
	 * This removes the reference obtained at OPEN; ie,
	 * when the open stream structure was created.
	 *
	 * We don't have to worry about calling 'open_stream_rele'
	 * since we are currently holding a reference to this
	 * open stream which means the count cannot go to 0 with
	 * this decrement.
	 */
	ASSERT(osp->os_ref_count >= 2);
	osp->os_ref_count--;
	osp->os_valid = 0;
	mutex_exit(&osp->os_sync_lock);
	*have_lockp = 0;

	nfs4_dec_state_ref_count(mi);
}

/*
 * Close all remaining open streams on the rnode.  These open streams
 * could be here because:
 * - The close attempted at either close or delmap failed
 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE
 * - Someone did mknod on a regular file but never opened it
 */
int
nfs4close_all(vnode_t *vp, cred_t *cr)
{
	nfs4_open_stream_t *osp;
	int error;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	rnode4_t *rp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	error = 0;
	rp = VTOR4(vp);

	/*
	 * At this point, all we know is that the last time
	 * someone called vn_rele, the count was 1.  Since then,
	 * the vnode could have been re-activated.
We want to 14732 * loop through the open streams and close each one, but 14733 * we have to be careful since once we release the rnode 14734 * hash bucket lock, someone else is free to come in and 14735 * re-activate the rnode and add new open streams. The 14736 * strategy is take the rnode hash bucket lock, verify that 14737 * the count is still 1, grab the open stream off the 14738 * head of the list and mark it invalid, then release the 14739 * rnode hash bucket lock and proceed with that open stream. 14740 * This is ok because nfs4close_one() will acquire the proper 14741 * open/create to close/destroy synchronization for open 14742 * streams, and will ensure that if someone has reopened 14743 * the open stream after we've dropped the hash bucket lock 14744 * then we'll just simply return without destroying the 14745 * open stream. 14746 * Repeat until the list is empty. 14747 */ 14748 14749 for (;;) { 14750 14751 /* make sure vnode hasn't been reactivated */ 14752 rw_enter(&rp->r_hashq->r_lock, RW_READER); 14753 mutex_enter(&vp->v_lock); 14754 if (vp->v_count > 1) { 14755 mutex_exit(&vp->v_lock); 14756 rw_exit(&rp->r_hashq->r_lock); 14757 break; 14758 } 14759 /* 14760 * Grabbing r_os_lock before releasing v_lock prevents 14761 * a window where the rnode/open stream could get 14762 * reactivated (and os_force_close set to 0) before we 14763 * had a chance to set os_force_close to 1. 14764 */ 14765 mutex_enter(&rp->r_os_lock); 14766 mutex_exit(&vp->v_lock); 14767 14768 osp = list_head(&rp->r_open_streams); 14769 if (!osp) { 14770 /* nothing left to CLOSE OTW, so return */ 14771 mutex_exit(&rp->r_os_lock); 14772 rw_exit(&rp->r_hashq->r_lock); 14773 break; 14774 } 14775 14776 mutex_enter(&rp->r_statev4_lock); 14777 /* the file can't still be mem mapped */ 14778 ASSERT(rp->r_mapcnt == 0); 14779 if (rp->created_v4) 14780 rp->created_v4 = 0; 14781 mutex_exit(&rp->r_statev4_lock); 14782 14783 /* 14784 * Grab a ref on this open stream; nfs4close_one 14785 * will mark it as invalid 14786 */ 14787 mutex_enter(&osp->os_sync_lock); 14788 osp->os_ref_count++; 14789 osp->os_force_close = 1; 14790 mutex_exit(&osp->os_sync_lock); 14791 mutex_exit(&rp->r_os_lock); 14792 rw_exit(&rp->r_hashq->r_lock); 14793 14794 nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0); 14795 14796 /* Update error if it isn't already non-zero */ 14797 if (error == 0) { 14798 if (e.error) 14799 error = e.error; 14800 else if (e.stat) 14801 error = geterrno4(e.stat); 14802 } 14803 14804 #ifdef DEBUG 14805 nfs4close_all_cnt++; 14806 #endif 14807 /* Release the ref on osp acquired above. */ 14808 open_stream_rele(osp, rp); 14809 14810 /* Proceed to the next open stream, if any */ 14811 } 14812 return (error); 14813 } 14814 14815 /* 14816 * nfs4close_one - close one open stream for a file if needed. 14817 * 14818 * "close_type" indicates which close path this is: 14819 * CLOSE_NORM: close initiated via VOP_CLOSE. 14820 * CLOSE_DELMAP: close initiated via VOP_DELMAP. 14821 * CLOSE_FORCE: close initiated via VOP_INACTIVE. This path forces 14822 * the close and release of client state for this open stream 14823 * (unless someone else has the open stream open). 14824 * CLOSE_RESEND: indicates the request is a replay of an earlier request 14825 * (e.g., due to abort because of a signal). 14826 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN. 14827 * 14828 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client 14829 * recovery. 
Instead, the caller is expected to deal with retries.
 *
 * The caller can either pass in the osp ('provided_osp') or not.
 *
 * 'access_bits' represents the access we are closing/downgrading.
 *
 * 'len', 'prot', and 'mmap_flags' are used for CLOSE_DELMAP.  'len' is the
 * number of bytes we are unmapping, 'maxprot' is the mmap protection, and
 * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED).
 *
 * Errors are returned via the nfs4_error_t.
 */
void
nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr,
    int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
    nfs4_close_type_t close_type, size_t len, uint_t maxprot,
    uint_t mmap_flags)
{
	nfs4_open_owner_t *oop;
	nfs4_open_stream_t *osp = NULL;
	int retry = 0;
	int num_retries = NFS4_NUM_RECOV_RETRIES;
	rnode4_t *rp;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	cred_t *cred_otw = NULL;
	bool_t recovonly = FALSE;
	int isrecov;
	int force_close;
	int close_failed = 0;
	int did_dec_count = 0;
	int did_start_op = 0;
	int did_force_recovlock = 0;
	int did_start_seqid_sync = 0;
	int have_sync_lock = 0;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, "
	    "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x",
	    (void *)vp, (void *)provided_osp, (void *)lrp, close_type,
	    len, maxprot, mmap_flags, access_bits));

	nfs4_error_zinit(ep);
	rp = VTOR4(vp);
	mi = VTOMI4(vp);
	isrecov = (close_type == CLOSE_RESEND ||
	    close_type == CLOSE_AFTER_RESEND);

	/*
	 * First, get the open owner.
	 */
	if (!provided_osp) {
		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
	} else {
		oop = provided_osp->os_open_owner;
		ASSERT(oop != NULL);
		open_owner_hold(oop);
	}

	if (!oop) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, "
		    "close type %d", (void *)rp, (void *)mi, (void *)cr,
		    (void *)provided_osp, close_type));
		ep->error = EIO;
		goto out;
	}

	cred_otw = nfs4_get_otw_cred(cr, mi, oop);
recov_retry:
	osp = NULL;
	close_failed = 0;
	force_close = (close_type == CLOSE_FORCE);
	retry = 0;
	did_start_op = 0;
	did_force_recovlock = 0;
	did_start_seqid_sync = 0;
	have_sync_lock = 0;
	recovonly = FALSE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/*
	 * Second, synchronize with recovery.
	 */
	if (!isrecov) {
		ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE,
		    &recov_state, &recovonly);
		if (!ep->error) {
			did_start_op = 1;
		} else {
			close_failed = 1;
			/*
			 * If we couldn't get start_fop, but have to
			 * cleanup state, then at least acquire the
			 * mi_recovlock so we can synchronize with
			 * recovery.
			 */
			if (close_type == CLOSE_FORCE) {
				(void) nfs_rw_enter_sig(&mi->mi_recovlock,
				    RW_READER, FALSE);
				did_force_recovlock = 1;
			} else
				goto out;
		}
	}

	/*
	 * We cannot attempt to get the open seqid sync if nfs4_start_fop
	 * set 'recovonly' to TRUE since most likely this is due to
	 * recovery being active (MI4_RECOV_ACTIV).  If recovery is active,
 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us
	 * to retry, causing us to loop until recovery finishes.  Plus we
	 * don't need protection over the open seqid since we're not going
	 * OTW, hence don't need to use the seqid.
	 */
	if (recovonly == FALSE) {
		/* need to grab the open owner sync before 'os_sync_lock' */
		ep->error = nfs4_start_open_seqid_sync(oop, mi);
		if (ep->error == EAGAIN) {
			ASSERT(!isrecov);
			if (did_start_op)
				nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
				    &recov_state, TRUE);
			if (did_force_recovlock)
				nfs_rw_exit(&mi->mi_recovlock);
			goto recov_retry;
		}
		did_start_seqid_sync = 1;
	}

	/*
	 * Third, get an open stream and acquire 'os_sync_lock' to
	 * synchronize the opening/creating of an open stream with the
	 * closing/destroying of an open stream.
	 */
	if (!provided_osp) {
		/* returns with 'os_sync_lock' held */
		osp = find_open_stream(oop, rp);
		if (!osp) {
			ep->error = EIO;
			goto out;
		}
	} else {
		osp = provided_osp;
		open_stream_hold(osp);
		mutex_enter(&osp->os_sync_lock);
	}
	have_sync_lock = 1;

	ASSERT(oop == osp->os_open_owner);

	/*
	 * Fourth, do any special pre-OTW CLOSE processing
	 * based on the specific close type.
	 */
	if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) &&
	    !did_dec_count) {
		ASSERT(osp->os_open_ref_count > 0);
		osp->os_open_ref_count--;
		did_dec_count = 1;
		if (osp->os_open_ref_count == 0)
			osp->os_final_close = 1;
	}

	if (close_type == CLOSE_FORCE) {
		/* see if somebody reopened the open stream. */
		if (!osp->os_force_close) {
			NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
			    "nfs4close_one: skip CLOSE_FORCE as osp %p "
			    "was reopened, vp %p", (void *)osp, (void *)vp));
			ep->error = 0;
			ep->stat = NFS4_OK;
			goto out;
		}

		if (!osp->os_final_close && !did_dec_count) {
			osp->os_open_ref_count--;
			did_dec_count = 1;
		}

		/*
		 * We can't depend on os_open_ref_count being 0 due to the
		 * way executables are opened (VN_RELE to match a VOP_OPEN).
		 */
#ifdef NOTYET
		ASSERT(osp->os_open_ref_count == 0);
#endif
		if (osp->os_open_ref_count != 0) {
			NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
			    "nfs4close_one: should panic here on an "
			    "ASSERT(osp->os_open_ref_count == 0). Ignoring "
			    "since this is probably the exec problem."));

			osp->os_open_ref_count = 0;
		}

		/*
		 * There is the possibility that nfs4close_one()
		 * for close_type == CLOSE_DELMAP couldn't find the
		 * open stream, thus couldn't decrement its os_mapcnt;
		 * therefore we can't use this ASSERT yet.
15032 */ 15033 #ifdef NOTYET 15034 ASSERT(osp->os_mapcnt == 0); 15035 #endif 15036 osp->os_mapcnt = 0; 15037 } 15038 15039 if (close_type == CLOSE_DELMAP && !did_dec_count) { 15040 ASSERT(osp->os_mapcnt >= btopr(len)); 15041 15042 if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 15043 osp->os_mmap_write -= btopr(len); 15044 if (maxprot & PROT_READ) 15045 osp->os_mmap_read -= btopr(len); 15046 if (maxprot & PROT_EXEC) 15047 osp->os_mmap_read -= btopr(len); 15048 /* mirror the PROT_NONE check in nfs4_addmap() */ 15049 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 15050 !(maxprot & PROT_EXEC)) 15051 osp->os_mmap_read -= btopr(len); 15052 osp->os_mapcnt -= btopr(len); 15053 did_dec_count = 1; 15054 } 15055 15056 if (recovonly) { 15057 nfs4_lost_rqst_t lost_rqst; 15058 15059 /* request should not already be in recovery queue */ 15060 ASSERT(lrp == NULL); 15061 nfs4_error_init(ep, EINTR); 15062 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop, 15063 osp, cred_otw, vp); 15064 mutex_exit(&osp->os_sync_lock); 15065 have_sync_lock = 0; 15066 (void) nfs4_start_recovery(ep, mi, vp, NULL, NULL, 15067 lost_rqst.lr_op == OP_CLOSE ? 15068 &lost_rqst : NULL, OP_CLOSE, NULL); 15069 close_failed = 1; 15070 force_close = 0; 15071 goto close_cleanup; 15072 } 15073 15074 /* 15075 * If a previous OTW call got NFS4ERR_BAD_SEQID, then 15076 * we stopped operating on the open owner's <old oo_name, old seqid> 15077 * space, which means we stopped operating on the open stream 15078 * too. So don't go OTW (as the seqid is likely bad, and the 15079 * stateid could be stale, potentially triggering a false 15080 * setclientid), and just clean up the client's internal state. 15081 */ 15082 if (osp->os_orig_oo_name != oop->oo_name) { 15083 NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug, 15084 (CE_NOTE, "nfs4close_one: skip OTW close for osp %p " 15085 "oop %p due to bad seqid (orig oo_name %" PRIx64 " current " 15086 "oo_name %" PRIx64")", 15087 (void *)osp, (void *)oop, osp->os_orig_oo_name, 15088 oop->oo_name)); 15089 close_failed = 1; 15090 } 15091 15092 /* If the file failed recovery, just quit. */ 15093 mutex_enter(&rp->r_statelock); 15094 if (rp->r_flags & R4RECOVERR) { 15095 close_failed = 1; 15096 } 15097 mutex_exit(&rp->r_statelock); 15098 15099 /* 15100 * If the force close path failed to obtain start_fop 15101 * then skip the OTW close and just remove the state. 15102 */ 15103 if (close_failed) 15104 goto close_cleanup; 15105 15106 /* 15107 * Fifth, check to see if there are still mapped pages or other 15108 * opens using this open stream. If there are then we can't 15109 * close yet but we can see if an OPEN_DOWNGRADE is necessary. 15110 */ 15111 if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) { 15112 nfs4_lost_rqst_t new_lost_rqst; 15113 bool_t needrecov = FALSE; 15114 cred_t *odg_cred_otw = NULL; 15115 seqid4 open_dg_seqid = 0; 15116 15117 if (osp->os_delegation) { 15118 /* 15119 * If this open stream was never OPENed OTW then we 15120 * surely can't DOWNGRADE it (especially since the 15121 * osp->open_stateid is really a delegation stateid 15122 * when os_delegation is 1). 
 */
			if (access_bits & FREAD)
				osp->os_share_acc_read--;
			if (access_bits & FWRITE)
				osp->os_share_acc_write--;
			osp->os_share_deny_none--;
			nfs4_error_zinit(ep);
			goto out;
		}
		nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr,
		    lrp, ep, &odg_cred_otw, &open_dg_seqid);
		needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
		if (needrecov && !isrecov) {
			bool_t abort;
			nfs4_bseqid_entry_t *bsep = NULL;

			if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID)
				bsep = nfs4_create_bseqid_entry(oop, NULL,
				    vp, 0,
				    lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG,
				    open_dg_seqid);

			nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst,
			    oop, osp, odg_cred_otw, vp, access_bits, 0);
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
			abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL,
			    new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ?
			    &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE,
			    bsep);
			if (odg_cred_otw)
				crfree(odg_cred_otw);
			if (bsep)
				kmem_free(bsep, sizeof (*bsep));

			if (abort == TRUE)
				goto out;

			if (did_start_seqid_sync) {
				nfs4_end_open_seqid_sync(oop);
				did_start_seqid_sync = 0;
			}
			open_stream_rele(osp, rp);

			if (did_start_op)
				nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
				    &recov_state, FALSE);
			if (did_force_recovlock)
				nfs_rw_exit(&mi->mi_recovlock);

			goto recov_retry;
		} else {
			if (odg_cred_otw)
				crfree(odg_cred_otw);
		}
		goto out;
	}

	/*
	 * If this open stream was created as the result of an open
	 * while holding a delegation, then just release it; no need
	 * to do an OTW close.  Otherwise do a "normal" OTW close.
	 */
	if (osp->os_delegation) {
		nfs4close_notw(vp, osp, &have_sync_lock);
		nfs4_error_zinit(ep);
		goto out;
	}

	/*
	 * If this stream is not valid, we're done.
	 */
	if (!osp->os_valid) {
		nfs4_error_zinit(ep);
		goto out;
	}

	/*
	 * Last open or mmap ref has vanished, need to do an OTW close.
	 * First check to see if a close is still necessary.
	 */
	if (osp->os_failed_reopen) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "don't close OTW osp %p since reopen failed.",
		    (void *)osp));
		/*
		 * Reopen of the open stream failed, hence the
		 * stateid of the open stream is invalid/stale, and
		 * sending this OTW would incorrectly cause another
		 * round of recovery.  In this case, we need to set
		 * the 'os_valid' bit to 0 so another thread doesn't
		 * come in and re-open this open stream before
		 * this "closing" thread cleans up state (decrementing
		 * the nfs4_server_t's state_ref_count and decrementing
		 * the os_ref_count).
		 */
		osp->os_valid = 0;
		/*
		 * This removes the reference obtained at OPEN; ie,
		 * when the open stream structure was created.
		 *
		 * We don't have to worry about calling 'open_stream_rele'
		 * since we are currently holding a reference to this
		 * open stream which means the count cannot go to 0 with
		 * this decrement.
		 */
		ASSERT(osp->os_ref_count >= 2);
		osp->os_ref_count--;
		nfs4_error_zinit(ep);
		close_failed = 0;
		goto close_cleanup;
	}

	ASSERT(osp->os_ref_count > 1);

	/*
	 * Sixth, try the CLOSE OTW.
	 */
	nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync,
	    close_type, ep, &have_sync_lock);

	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) {
		/*
		 * Let the recovery thread be responsible for
		 * removing the state for CLOSE.
		 */
		close_failed = 1;
		force_close = 0;
		retry = 0;
	}

	/* See if we need to retry with a different cred */
	if ((ep->error == EACCES ||
	    (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		crfree(cred_otw);
		cred_otw = cr;
		crhold(cred_otw);
		retry = 1;
	}

	if (ep->error || ep->stat)
		close_failed = 1;

	if (retry && !isrecov && num_retries-- > 0) {
		if (have_sync_lock) {
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
		}
		if (did_start_seqid_sync) {
			nfs4_end_open_seqid_sync(oop);
			did_start_seqid_sync = 0;
		}
		open_stream_rele(osp, rp);

		if (did_start_op)
			nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
			    &recov_state, FALSE);
		if (did_force_recovlock)
			nfs_rw_exit(&mi->mi_recovlock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_one: need to retry the close "
		    "operation"));
		goto recov_retry;
	}
close_cleanup:
	/*
	 * Seventh and lastly, process our results.
	 */
	if (close_failed && force_close) {
		/*
		 * It's ok to drop and regrab the 'os_sync_lock' since
		 * nfs4close_notw() will recheck to make sure the
		 * "close"/removal of state should happen.
		 */
		if (!have_sync_lock) {
			mutex_enter(&osp->os_sync_lock);
			have_sync_lock = 1;
		}
		/*
		 * This is the last call, remove the ref on the open
		 * stream created by open and clean everything up.
		 */
		osp->os_pending_close = 0;
		nfs4close_notw(vp, osp, &have_sync_lock);
		nfs4_error_zinit(ep);
	}

	if (!close_failed) {
		if (have_sync_lock) {
			osp->os_pending_close = 0;
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
		} else {
			mutex_enter(&osp->os_sync_lock);
			osp->os_pending_close = 0;
			mutex_exit(&osp->os_sync_lock);
		}
		if (did_start_op && recov_state.rs_sp != NULL) {
			mutex_enter(&recov_state.rs_sp->s_lock);
			nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi);
			mutex_exit(&recov_state.rs_sp->s_lock);
		} else {
			nfs4_dec_state_ref_count(mi);
		}
		nfs4_error_zinit(ep);
	}

out:
	if (have_sync_lock)
		mutex_exit(&osp->os_sync_lock);
	if (did_start_op)
		nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state,
		    recovonly ? TRUE : FALSE);
	if (did_force_recovlock)
		nfs_rw_exit(&mi->mi_recovlock);
	if (cred_otw)
		crfree(cred_otw);
	if (osp)
		open_stream_rele(osp, rp);
	if (oop) {
		if (did_start_seqid_sync)
			nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
	}
}
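/*
 * Sketch of how the close paths drive nfs4close_one() (illustrative; the
 * VOP_CLOSE-level caller lives elsewhere in this file).  A normal close
 * passes CLOSE_NORM plus the access bits being given up, while
 * nfs4close_all() above forces the state away with CLOSE_FORCE:
 *
 *	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
 *
 *	nfs4close_one(vp, NULL, cr, FREAD, NULL, &e, CLOSE_NORM, 0, 0, 0);
 *	if (e.error != 0)
 *		error = e.error;
 *	else if (e.stat != NFS4_OK)
 *		error = geterrno4(e.stat);
 *
 * matching the error mapping nfs4close_all() performs on its own
 * CLOSE_FORCE calls.
 */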
/*
 * Convert information returned by the server in the LOCK4denied
 * structure to the form required by fcntl.
 */
static void
denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args)
{
	nfs4_lo_name_t *lo;

#ifdef DEBUG
	if (denied_to_flk_debug) {
		lockt_denied_debug = lockt_denied;
		debug_enter("lockt_denied");
	}
#endif

	flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK;
	flk->l_whence = 0;	/* aka SEEK_SET */
	flk->l_start = lockt_denied->offset;
	flk->l_len = lockt_denied->length;

	/*
	 * If the blocking clientid matches our client id, then we can
	 * interpret the lockowner (since we built it).  If not, then
	 * fabricate a sysid and pid.  Note that the l_sysid field
	 * in *flk already has the local sysid.
	 */

	if (lockt_denied->owner.clientid == lockt_args->owner.clientid) {

		if (lockt_denied->owner.owner_len == sizeof (*lo)) {
			lo = (nfs4_lo_name_t *)
			    lockt_denied->owner.owner_val;

			flk->l_pid = lo->ln_pid;
		} else {
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "denied_to_flk: bad lock owner length\n"));

			flk->l_pid = lo_to_pid(&lockt_denied->owner);
		}
	} else {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "denied_to_flk: foreign clientid\n"));

		/*
		 * Construct a new sysid which should be different from
		 * sysids of other systems.
		 */

		flk->l_sysid++;
		flk->l_pid = lo_to_pid(&lockt_denied->owner);
	}
}

static pid_t
lo_to_pid(lock_owner4 *lop)
{
	pid_t pid = 0;
	uchar_t *cp;
	int i;

	cp = (uchar_t *)&lop->clientid;

	for (i = 0; i < sizeof (lop->clientid); i++)
		pid += (pid_t)*cp++;

	cp = (uchar_t *)lop->owner_val;

	for (i = 0; i < lop->owner_len; i++)
		pid += (pid_t)*cp++;

	return (pid);
}

/*
 * Given a lock pointer, return the last locked offset ("end") that the
 * lock's "l_len" covers from the start of the lock; an "l_len" of zero
 * means the lock extends to end-of-file (MAXEND).
 */
static off64_t
lock_to_end(flock64_t *lock)
{
	off64_t lock_end;

	if (lock->l_len == 0)
		lock_end = (off64_t)MAXEND;
	else
		lock_end = lock->l_start + lock->l_len - 1;

	return (lock_end);
}

/*
 * Given the start and end of a lock, return the length "l_len" for
 * that lock.
 */
static off64_t
end_to_len(off64_t start, off64_t end)
{
	off64_t lock_len;

	ASSERT(end >= start);
	if (end == MAXEND)
		lock_len = 0;
	else
		lock_len = end - start + 1;

	return (lock_len);
}

/*
 * Given the end of a lock, determine whether it is the last possible
 * locked offset (MAXEND); if so, return it unchanged, otherwise add one
 * to yield a valid start offset for the next range.
 */
static off64_t
start_check(off64_t x)
{
	if (x == MAXEND)
		return (x);
	else
		return (x + 1);
}
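/*
 * The function below is an illustrative sketch and not part of the
 * original file: it exercises the three range helpers above with
 * concrete numbers.  The guard macro is hypothetical and never defined,
 * so the sketch is not compiled into the module.
 */
#ifdef NFS4_RANGE_MATH_EXAMPLE
static void
nfs4_range_math_example(void)
{
	flock64_t fl;
	off64_t end;

	fl.l_start = 100;
	fl.l_len = 50;			/* covers offsets [100, 149] */
	end = lock_to_end(&fl);
	ASSERT(end == 149);		/* last locked offset */
	ASSERT(end_to_len(fl.l_start, end) == 50);	/* round-trips l_len */
	ASSERT(start_check(end) == 150);	/* next valid start offset */

	fl.l_len = 0;			/* a to-EOF lock */
	end = lock_to_end(&fl);
	ASSERT(end == (off64_t)MAXEND);
	ASSERT(end_to_len(fl.l_start, end) == 0);	/* still means "to EOF" */
	ASSERT(start_check(end) == (off64_t)MAXEND);	/* nothing past it */
}
#endif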
/*
 * See if these two locks overlap; if so return 1,
 * otherwise return 0.
 */
static int
locks_intersect(flock64_t *llfp, flock64_t *curfp)
{
	off64_t llfp_end, curfp_end;

	llfp_end = lock_to_end(llfp);
	curfp_end = lock_to_end(curfp);

	if (((llfp_end >= curfp->l_start) &&
	    (llfp->l_start <= curfp->l_start)) ||
	    ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start)))
		return (1);
	return (0);
}

/*
 * Determine what the intersecting lock region is, and add that to the
 * 'nl_llpp' locklist in increasing order (by l_start).
 */
static void
nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp,
    locklist_t **nl_llpp, vnode_t *vp)
{
	locklist_t *intersect_llp, *tmp_fllp, *cur_fllp;
	off64_t lost_flp_end, local_flp_end, len, start;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:"));

	if (!locks_intersect(lost_flp, local_flp))
		return;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
	    "locks intersect"));

	lost_flp_end = lock_to_end(lost_flp);
	local_flp_end = lock_to_end(local_flp);

	/* Find the starting point of the intersecting region */
	if (local_flp->l_start > lost_flp->l_start)
		start = local_flp->l_start;
	else
		start = lost_flp->l_start;

	/* Find the length of the intersecting region */
	if (lost_flp_end < local_flp_end)
		len = end_to_len(start, lost_flp_end);
	else
		len = end_to_len(start, local_flp_end);

	/*
	 * Prepare the flock structure for the intersection found and
	 * insert it into the new list in increasing l_start order.  This
	 * list contains intersections of locks registered by the client
	 * with the local host and the lost lock.
	 * The lock type of this lock is the same as that of the local_flp.
	 */
	intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP);
	intersect_llp->ll_flock.l_start = start;
	intersect_llp->ll_flock.l_len = len;
	intersect_llp->ll_flock.l_type = local_flp->l_type;
	intersect_llp->ll_flock.l_pid = local_flp->l_pid;
	intersect_llp->ll_flock.l_sysid = local_flp->l_sysid;
	intersect_llp->ll_flock.l_whence = 0;	/* aka SEEK_SET */
	intersect_llp->ll_vp = vp;

	tmp_fllp = *nl_llpp;
	cur_fllp = NULL;
	while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start <
	    intersect_llp->ll_flock.l_start) {
		cur_fllp = tmp_fllp;
		tmp_fllp = tmp_fllp->ll_next;
	}
	if (cur_fllp == NULL) {
		/* first on the list */
		intersect_llp->ll_next = *nl_llpp;
		*nl_llpp = intersect_llp;
	} else {
		intersect_llp->ll_next = cur_fllp->ll_next;
		cur_fllp->ll_next = intersect_llp;
	}

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
	    "created lock region: start %"PRIx64" end %"PRIx64" : %s\n",
	    intersect_llp->ll_flock.l_start,
	    intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len,
	    intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE"));
}
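/*
 * The function below is an illustrative sketch and not part of the
 * original file: a worked example of the intersection logic above.  A
 * lost lock covering [100, 199] and a local READ lock covering
 * [150, 299] intersect; the intersection starts at max(100, 150) = 150
 * and ends at min(199, 299) = 199, so an entry with l_start = 150 and
 * l_len = 50 is queued.  The guard macro is hypothetical and never
 * defined, so the sketch is not compiled into the module.
 */
#ifdef NFS4_LOCK_INTERSECT_EXAMPLE
static void
nfs4_lock_intersect_example(vnode_t *vp)
{
	flock64_t lost_fl = { 0 }, local_fl = { 0 };
	locklist_t *llp = NULL;

	lost_fl.l_start = 100;
	lost_fl.l_len = 100;		/* covers offsets [100, 199] */

	local_fl.l_type = F_RDLCK;
	local_fl.l_start = 150;
	local_fl.l_len = 150;		/* covers offsets [150, 299] */

	ASSERT(locks_intersect(&lost_fl, &local_fl));

	nfs4_add_lock_range(&lost_fl, &local_fl, &llp, vp);
	ASSERT(llp != NULL);
	ASSERT(llp->ll_flock.l_start == 150);	/* max of the two starts */
	ASSERT(llp->ll_flock.l_len == 50);	/* end_to_len(150, 199) */
	ASSERT(llp->ll_flock.l_type == F_RDLCK);	/* type of local_flp */

	kmem_free(llp, sizeof (locklist_t));
}
#endif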
/*
 * Our current local locking state is potentially different from what
 * the NFSv4 server thinks we have due to a lost lock that was resent
 * and then received.  We need to reset our "NFSv4" locking state to
 * match the current local locking state for this pid, since that is
 * what the user/application sees as what the world is.
 *
 * We cannot afford to drop the open/lock seqid sync since then we can
 * get confused about what the current local locking state "is" versus
 * "was".
 *
 * If we are unable to fix up the locks, we send SIGLOST to the affected
 * process.  This is not done if the filesystem has been forcibly
 * unmounted, in case the process has already exited and a new process
 * exists with the same pid.
 */
static void
nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr,
    nfs4_lock_owner_t *lop)
{
	locklist_t *locks, *llp, *ri_llp, *tmp_llp;
	mntinfo4_t *mi = VTOMI4(vp);
	const int cmd = F_SETLK;
	off64_t cur_start, llp_ll_flock_end, lost_flp_end;
	flock64_t ul_fl;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
	    "nfs4_reinstitute_local_lock_state"));

	/*
	 * Find active locks for this vp from the local locking code.
	 * Scan through this list and find out the locks that intersect
	 * with the lost lock.  Once we find a lock that intersects, add
	 * the intersection area as a new lock to a new list "ri_llp".
	 * The lock type of the intersection region lock added to ri_llp
	 * is the same as that found in the active lock list "locks".
	 * The intersecting region locks are added to ri_llp in
	 * increasing l_start order.
	 */
	ASSERT(nfs_zone() == mi->mi_zone);

	locks = flk_active_locks_for_vp(vp);
	ri_llp = NULL;

	for (llp = locks; llp != NULL; llp = llp->ll_next) {
		ASSERT(llp->ll_vp == vp);
		/*
		 * Pick locks that belong to this pid/lockowner
		 */
		if (llp->ll_flock.l_pid != lost_flp->l_pid)
			continue;

		nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp);
	}

	/*
	 * Now we have the list of intersections with the lost lock.
	 * These are the locks that were/are active before the server
	 * replied to the last/lost lock.  Issue these locks to the server
	 * here.  Playing these locks to the server will re-establish our
	 * current local locking state with the v4 server.
	 * If we get an error, send SIGLOST to the application for that
	 * lock.
	 */

	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: need to issue "
		    "flock: [%"PRIx64" - %"PRIx64"] : %s",
		    llp->ll_flock.l_start,
		    llp->ll_flock.l_start + llp->ll_flock.l_len,
		    llp->ll_flock.l_type == F_RDLCK ? "READ" :
		    llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID"));
		/*
		 * No need to relock what we already have
		 */
		if (llp->ll_flock.l_type == lost_flp->l_type)
			continue;

		push_reinstate(vp, cmd, &llp->ll_flock, cr, lop);
	}

	/*
	 * Now, keeping the start of the lost lock as our reference, parse
	 * the newly created ri_llp locklist to find the ranges that we
	 * have locked with the v4 server but that are not in the current
	 * local locking; we need to unlock these ranges.
	 * In other words, these are the ranges where the lost lock does
	 * not overlap with the locks in ri_llp but which have been locked
	 * since the server replied to the lost lock.
	 */
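	/*
	 * Worked example (illustrative, not from the original file):
	 * suppose the lost lock covers [0x000, 0x1ff] and ri_llp holds
	 * the intersections [0x040, 0x07f] and [0x100, 0x17f].  The scan
	 * below starts cur_start at 0x000 and issues unlocks for the
	 * gaps [0x000, 0x03f] and [0x080, 0x0ff]; it leaves cur_start at
	 * 0x180, so the post-loop check unlocks the tail [0x180, 0x1ff].
	 */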
	cur_start = lost_flp->l_start;
	lost_flp_end = lock_to_end(lost_flp);

	ul_fl.l_type = F_UNLCK;
	ul_fl.l_whence = 0;	/* aka SEEK_SET */
	ul_fl.l_sysid = lost_flp->l_sysid;
	ul_fl.l_pid = lost_flp->l_pid;

	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
		llp_ll_flock_end = lock_to_end(&llp->ll_flock);

		if (llp->ll_flock.l_start <= cur_start) {
			cur_start = start_check(llp_ll_flock_end);
			continue;
		}
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: "
		    "UNLOCK [%"PRIx64" - %"PRIx64"]",
		    cur_start, llp->ll_flock.l_start));

		ul_fl.l_start = cur_start;
		ul_fl.l_len = end_to_len(cur_start,
		    (llp->ll_flock.l_start - 1));

		push_reinstate(vp, cmd, &ul_fl, cr, lop);
		cur_start = start_check(llp_ll_flock_end);
	}

	/*
	 * In the case where the lost lock ends after all intersecting
	 * locks, unlock the last part of the lost lock range.
	 */
	if (cur_start != start_check(lost_flp_end)) {
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: UNLOCK end of the "
		    "lost lock region [%"PRIx64" - %"PRIx64"]",
		    cur_start, lost_flp->l_start + lost_flp->l_len));

		ul_fl.l_start = cur_start;
		/*
		 * Is it a to-EOF lock?  If so, unlock till the end.
		 */
		if (lost_flp->l_len == 0)
			ul_fl.l_len = 0;
		else
			ul_fl.l_len = start_check(lost_flp_end) - cur_start;

		push_reinstate(vp, cmd, &ul_fl, cr, lop);
	}

	if (locks != NULL)
		flk_free_locklist(locks);

	/* Free up our newly created locklist */
	for (llp = ri_llp; llp != NULL; ) {
		tmp_llp = llp->ll_next;
		kmem_free(llp, sizeof (locklist_t));
		llp = tmp_llp;
	}

	/*
	 * Now return to the original caller, nfs4frlock(), and let our
	 * seqid syncs drop naturally.
	 */
}

/*
 * Create a lost state record for the given lock reinstatement request
 * and push it onto the lost state queue.
 */
static void
push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr,
    nfs4_lock_owner_t *lop)
{
	nfs4_lost_rqst_t req;
	nfs_lock_type4 locktype;
	nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	locktype = flk_to_locktype(cmd, flk->l_type);
	nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype,
	    NULL, NULL, lop, flk, &req, cr, vp);
	(void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
	    (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ?
	    &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK,
	    NULL);
}