1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T. 
28 * All Rights Reserved 29 */ 30 31 #include <sys/param.h> 32 #include <sys/types.h> 33 #include <sys/systm.h> 34 #include <sys/cred.h> 35 #include <sys/time.h> 36 #include <sys/vnode.h> 37 #include <sys/vfs.h> 38 #include <sys/vfs_opreg.h> 39 #include <sys/file.h> 40 #include <sys/filio.h> 41 #include <sys/uio.h> 42 #include <sys/buf.h> 43 #include <sys/mman.h> 44 #include <sys/pathname.h> 45 #include <sys/dirent.h> 46 #include <sys/debug.h> 47 #include <sys/vmsystm.h> 48 #include <sys/fcntl.h> 49 #include <sys/flock.h> 50 #include <sys/swap.h> 51 #include <sys/errno.h> 52 #include <sys/strsubr.h> 53 #include <sys/sysmacros.h> 54 #include <sys/kmem.h> 55 #include <sys/cmn_err.h> 56 #include <sys/pathconf.h> 57 #include <sys/utsname.h> 58 #include <sys/dnlc.h> 59 #include <sys/acl.h> 60 #include <sys/systeminfo.h> 61 #include <sys/policy.h> 62 #include <sys/sdt.h> 63 #include <sys/list.h> 64 #include <sys/stat.h> 65 #include <sys/zone.h> 66 67 #include <rpc/types.h> 68 #include <rpc/auth.h> 69 #include <rpc/clnt.h> 70 71 #include <nfs/nfs.h> 72 #include <nfs/nfs_clnt.h> 73 #include <nfs/nfs_acl.h> 74 #include <nfs/lm.h> 75 #include <nfs/nfs4.h> 76 #include <nfs/nfs4_kprot.h> 77 #include <nfs/rnode4.h> 78 #include <nfs/nfs4_clnt.h> 79 80 #include <vm/hat.h> 81 #include <vm/as.h> 82 #include <vm/page.h> 83 #include <vm/pvn.h> 84 #include <vm/seg.h> 85 #include <vm/seg_map.h> 86 #include <vm/seg_kpm.h> 87 #include <vm/seg_vn.h> 88 89 #include <fs/fs_subr.h> 90 91 #include <sys/ddi.h> 92 #include <sys/int_fmtio.h> 93 94 typedef struct { 95 nfs4_ga_res_t *di_garp; 96 cred_t *di_cred; 97 hrtime_t di_time_call; 98 } dirattr_info_t; 99 100 typedef enum nfs4_acl_op { 101 NFS4_ACL_GET, 102 NFS4_ACL_SET 103 } nfs4_acl_op_t; 104 105 static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *mi); 106 107 static void nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *, 108 char *, dirattr_info_t *); 109 110 static void nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *, 
111 nfs4_open_stream_t *, int *, int *, nfs4_close_type_t, 112 nfs4_error_t *, int *); 113 static int nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int, 114 cred_t *); 115 static int nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *, 116 stable_how4 *); 117 static int nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *, 118 cred_t *, bool_t, struct uio *); 119 static int nfs4setattr(vnode_t *, struct vattr *, int, cred_t *, 120 vsecattr_t *); 121 static int nfs4openattr(vnode_t *, vnode_t **, int, cred_t *); 122 static int nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int); 123 static int nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *); 124 static int nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *); 125 static int nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *); 126 static int nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl, 127 int, vnode_t **, cred_t *); 128 static int nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **, 129 cred_t *, int, int, enum createmode4, int); 130 static int nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *, 131 caller_context_t *); 132 static int nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *, 133 vnode_t *, char *, cred_t *, nfsstat4 *); 134 static int nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *, 135 vnode_t *, char *, cred_t *, nfsstat4 *); 136 static int do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 137 static void nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 138 static int nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t); 139 static int nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *, 140 page_t *[], size_t, struct seg *, caddr_t, 141 enum seg_rw, cred_t *); 142 static void nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *, 143 cred_t *); 144 static int nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t, 145 int, cred_t *); 146 static int nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, 
size_t, 147 int, cred_t *); 148 static int nfs4_commit(vnode_t *, offset4, count4, cred_t *); 149 static void nfs4_set_mod(vnode_t *); 150 static void nfs4_get_commit(vnode_t *); 151 static void nfs4_get_commit_range(vnode_t *, u_offset_t, size_t); 152 static int nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *); 153 static int nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int); 154 static int nfs4_sync_commit(vnode_t *, page_t *, offset3, count3, 155 cred_t *); 156 static void do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3, 157 cred_t *); 158 static int nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *, 159 hrtime_t, vnode_t *, cred_t *); 160 static int nfs4_open_non_reg_file(vnode_t **, int, cred_t *); 161 static int nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *); 162 static void nfs4_register_lock_locally(vnode_t *, struct flock64 *, int, 163 u_offset_t); 164 static int nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *); 165 static int nfs4_block_and_wait(clock_t *, rnode4_t *); 166 static cred_t *state_to_cred(nfs4_open_stream_t *); 167 static int vtoname(vnode_t *, char *, ssize_t); 168 static void denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *); 169 static pid_t lo_to_pid(lock_owner4 *); 170 static void nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *, 171 cred_t *, nfs4_lock_owner_t *); 172 static void push_reinstate(vnode_t *, int, flock64_t *, cred_t *, 173 nfs4_lock_owner_t *); 174 static int open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **); 175 static void nfs4_delmap_callback(struct as *, void *, uint_t); 176 static void nfs4_free_delmapcall(nfs4_delmapcall_t *); 177 static nfs4_delmapcall_t *nfs4_init_delmapcall(); 178 static int nfs4_find_and_delete_delmapcall(rnode4_t *, int *); 179 static int nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t); 180 static int nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *, 181 uid_t, gid_t, int); 182 183 /* 184 * Routines that implement the 
setting of v4 args for the misc. ops 185 */ 186 static void nfs4args_lock_free(nfs_argop4 *); 187 static void nfs4args_lockt_free(nfs_argop4 *); 188 static void nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *, 189 int, rnode4_t *, cred_t *, bitmap4, int *, 190 nfs4_stateid_types_t *); 191 static void nfs4args_setattr_free(nfs_argop4 *); 192 static int nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4, 193 bitmap4); 194 static void nfs4args_verify_free(nfs_argop4 *); 195 static void nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *, 196 WRITE4args **, nfs4_stateid_types_t *); 197 198 /* 199 * These are the vnode ops functions that implement the vnode interface to 200 * the networked file system. See more comments below at nfs4_vnodeops. 201 */ 202 static int nfs4_open(vnode_t **, int, cred_t *, caller_context_t *); 203 static int nfs4_close(vnode_t *, int, int, offset_t, cred_t *, 204 caller_context_t *); 205 static int nfs4_read(vnode_t *, struct uio *, int, cred_t *, 206 caller_context_t *); 207 static int nfs4_write(vnode_t *, struct uio *, int, cred_t *, 208 caller_context_t *); 209 static int nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *, 210 caller_context_t *); 211 static int nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *, 212 caller_context_t *); 213 static int nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *); 214 static int nfs4_readlink(vnode_t *, struct uio *, cred_t *, 215 caller_context_t *); 216 static int nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *); 217 static int nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl, 218 int, vnode_t **, cred_t *, int, caller_context_t *, 219 vsecattr_t *); 220 static int nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *, 221 int); 222 static int nfs4_link(vnode_t *, vnode_t *, char *, cred_t *, 223 caller_context_t *, int); 224 static int nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *, 225 caller_context_t *, int); 
226 static int nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **, 227 cred_t *, caller_context_t *, int, vsecattr_t *); 228 static int nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *, 229 caller_context_t *, int); 230 static int nfs4_symlink(vnode_t *, char *, struct vattr *, char *, 231 cred_t *, caller_context_t *, int); 232 static int nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *, 233 caller_context_t *, int); 234 static int nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *); 235 static int nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *, 236 page_t *[], size_t, struct seg *, caddr_t, 237 enum seg_rw, cred_t *, caller_context_t *); 238 static int nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *, 239 caller_context_t *); 240 static int nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t, 241 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); 242 static int nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, 243 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); 244 static int nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *); 245 static int nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t, 246 struct flk_callback *, cred_t *, caller_context_t *); 247 static int nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t, 248 cred_t *, caller_context_t *); 249 static int nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, 250 uint_t, uint_t, uint_t, cred_t *, caller_context_t *); 251 static int nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int, 252 cred_t *, caller_context_t *); 253 static void nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *, 254 caller_context_t *); 255 static int nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 256 caller_context_t *); 257 /* 258 * These vnode ops are required to be called from outside this source file, 259 * e.g. by ephemeral mount stub vnode ops, and so may not be declared 260 * as static. 
261 */ 262 int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *, 263 caller_context_t *); 264 void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *); 265 int nfs4_lookup(vnode_t *, char *, vnode_t **, 266 struct pathname *, int, vnode_t *, cred_t *, 267 caller_context_t *, int *, pathname_t *); 268 int nfs4_fid(vnode_t *, fid_t *, caller_context_t *); 269 int nfs4_rwlock(vnode_t *, int, caller_context_t *); 270 void nfs4_rwunlock(vnode_t *, int, caller_context_t *); 271 int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *); 272 int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *, 273 caller_context_t *); 274 int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 275 caller_context_t *); 276 int nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *, 277 caller_context_t *); 278 279 /* 280 * Used for nfs4_commit_vp() to indicate if we should 281 * wait on pending writes. 282 */ 283 #define NFS4_WRITE_NOWAIT 0 284 #define NFS4_WRITE_WAIT 1 285 286 #define NFS4_BASE_WAIT_TIME 1 /* 1 second */ 287 288 /* 289 * Error flags used to pass information about certain special errors 290 * which need to be handled specially. 291 */ 292 #define NFS_EOF -98 293 #define NFS_VERF_MISMATCH -97 294 295 /* 296 * Flags used to differentiate between which operation drove the 297 * potential CLOSE OTW. 
(see nfs4_close_otw_if_necessary) 298 */ 299 #define NFS4_CLOSE_OP 0x1 300 #define NFS4_DELMAP_OP 0x2 301 #define NFS4_INACTIVE_OP 0x3 302 303 #define ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO)) 304 305 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */ 306 #define ALIGN64(x, ptr, sz) \ 307 x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1); \ 308 if (x) { \ 309 x = sizeof (uint64_t) - (x); \ 310 sz -= (x); \ 311 ptr += (x); \ 312 } 313 314 #ifdef DEBUG 315 int nfs4_client_attr_debug = 0; 316 int nfs4_client_state_debug = 0; 317 int nfs4_client_shadow_debug = 0; 318 int nfs4_client_lock_debug = 0; 319 int nfs4_seqid_sync = 0; 320 int nfs4_client_map_debug = 0; 321 static int nfs4_pageio_debug = 0; 322 int nfs4_client_inactive_debug = 0; 323 int nfs4_client_recov_debug = 0; 324 int nfs4_client_failover_debug = 0; 325 int nfs4_client_call_debug = 0; 326 int nfs4_client_lookup_debug = 0; 327 int nfs4_client_zone_debug = 0; 328 int nfs4_lost_rqst_debug = 0; 329 int nfs4_rdattrerr_debug = 0; 330 int nfs4_open_stream_debug = 0; 331 332 int nfs4read_error_inject; 333 334 static int nfs4_create_misses = 0; 335 336 static int nfs4_readdir_cache_shorts = 0; 337 static int nfs4_readdir_readahead = 0; 338 339 static int nfs4_bio_do_stop = 0; 340 341 static int nfs4_lostpage = 0; /* number of times we lost original page */ 342 343 int nfs4_mmap_debug = 0; 344 345 static int nfs4_pathconf_cache_hits = 0; 346 static int nfs4_pathconf_cache_misses = 0; 347 348 int nfs4close_all_cnt; 349 int nfs4close_one_debug = 0; 350 int nfs4close_notw_debug = 0; 351 352 int denied_to_flk_debug = 0; 353 void *lockt_denied_debug; 354 355 #endif 356 357 /* 358 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT 359 * or NFS4ERR_RESOURCE. 360 */ 361 static int confirm_retry_sec = 30; 362 363 static int nfs4_lookup_neg_cache = 1; 364 365 /* 366 * number of pages to read ahead 367 * optimized for 100 base-T. 
368 */ 369 static int nfs4_nra = 4; 370 371 static int nfs4_do_symlink_cache = 1; 372 373 static int nfs4_pathconf_disable_cache = 0; 374 375 /* 376 * These are the vnode ops routines which implement the vnode interface to 377 * the networked file system. These routines just take their parameters, 378 * make them look networkish by putting the right info into interface structs, 379 * and then calling the appropriate remote routine(s) to do the work. 380 * 381 * Note on directory name lookup cacheing: If we detect a stale fhandle, 382 * we purge the directory cache relative to that vnode. This way, the 383 * user won't get burned by the cache repeatedly. See <nfs/rnode4.h> for 384 * more details on rnode locking. 385 */ 386 387 struct vnodeops *nfs4_vnodeops; 388 389 const fs_operation_def_t nfs4_vnodeops_template[] = { 390 VOPNAME_OPEN, { .vop_open = nfs4_open }, 391 VOPNAME_CLOSE, { .vop_close = nfs4_close }, 392 VOPNAME_READ, { .vop_read = nfs4_read }, 393 VOPNAME_WRITE, { .vop_write = nfs4_write }, 394 VOPNAME_IOCTL, { .vop_ioctl = nfs4_ioctl }, 395 VOPNAME_GETATTR, { .vop_getattr = nfs4_getattr }, 396 VOPNAME_SETATTR, { .vop_setattr = nfs4_setattr }, 397 VOPNAME_ACCESS, { .vop_access = nfs4_access }, 398 VOPNAME_LOOKUP, { .vop_lookup = nfs4_lookup }, 399 VOPNAME_CREATE, { .vop_create = nfs4_create }, 400 VOPNAME_REMOVE, { .vop_remove = nfs4_remove }, 401 VOPNAME_LINK, { .vop_link = nfs4_link }, 402 VOPNAME_RENAME, { .vop_rename = nfs4_rename }, 403 VOPNAME_MKDIR, { .vop_mkdir = nfs4_mkdir }, 404 VOPNAME_RMDIR, { .vop_rmdir = nfs4_rmdir }, 405 VOPNAME_READDIR, { .vop_readdir = nfs4_readdir }, 406 VOPNAME_SYMLINK, { .vop_symlink = nfs4_symlink }, 407 VOPNAME_READLINK, { .vop_readlink = nfs4_readlink }, 408 VOPNAME_FSYNC, { .vop_fsync = nfs4_fsync }, 409 VOPNAME_INACTIVE, { .vop_inactive = nfs4_inactive }, 410 VOPNAME_FID, { .vop_fid = nfs4_fid }, 411 VOPNAME_RWLOCK, { .vop_rwlock = nfs4_rwlock }, 412 VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs4_rwunlock }, 413 
VOPNAME_SEEK, { .vop_seek = nfs4_seek }, 414 VOPNAME_FRLOCK, { .vop_frlock = nfs4_frlock }, 415 VOPNAME_SPACE, { .vop_space = nfs4_space }, 416 VOPNAME_REALVP, { .vop_realvp = nfs4_realvp }, 417 VOPNAME_GETPAGE, { .vop_getpage = nfs4_getpage }, 418 VOPNAME_PUTPAGE, { .vop_putpage = nfs4_putpage }, 419 VOPNAME_MAP, { .vop_map = nfs4_map }, 420 VOPNAME_ADDMAP, { .vop_addmap = nfs4_addmap }, 421 VOPNAME_DELMAP, { .vop_delmap = nfs4_delmap }, 422 /* no separate nfs4_dump */ 423 VOPNAME_DUMP, { .vop_dump = nfs_dump }, 424 VOPNAME_PATHCONF, { .vop_pathconf = nfs4_pathconf }, 425 VOPNAME_PAGEIO, { .vop_pageio = nfs4_pageio }, 426 VOPNAME_DISPOSE, { .vop_dispose = nfs4_dispose }, 427 VOPNAME_SETSECATTR, { .vop_setsecattr = nfs4_setsecattr }, 428 VOPNAME_GETSECATTR, { .vop_getsecattr = nfs4_getsecattr }, 429 VOPNAME_SHRLOCK, { .vop_shrlock = nfs4_shrlock }, 430 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 431 NULL, NULL 432 }; 433 434 /* 435 * The following are subroutines and definitions to set args or get res 436 * for the different nfsv4 ops 437 */ 438 439 void 440 nfs4args_lookup_free(nfs_argop4 *argop, int arglen) 441 { 442 int i; 443 444 for (i = 0; i < arglen; i++) { 445 if (argop[i].argop == OP_LOOKUP) { 446 kmem_free( 447 argop[i].nfs_argop4_u.oplookup. 448 objname.utf8string_val, 449 argop[i].nfs_argop4_u.oplookup. 
450 objname.utf8string_len); 451 } 452 } 453 } 454 455 static void 456 nfs4args_lock_free(nfs_argop4 *argop) 457 { 458 locker4 *locker = &argop->nfs_argop4_u.oplock.locker; 459 460 if (locker->new_lock_owner == TRUE) { 461 open_to_lock_owner4 *open_owner; 462 463 open_owner = &locker->locker4_u.open_owner; 464 if (open_owner->lock_owner.owner_val != NULL) { 465 kmem_free(open_owner->lock_owner.owner_val, 466 open_owner->lock_owner.owner_len); 467 } 468 } 469 } 470 471 static void 472 nfs4args_lockt_free(nfs_argop4 *argop) 473 { 474 lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner; 475 476 if (lowner->owner_val != NULL) { 477 kmem_free(lowner->owner_val, lowner->owner_len); 478 } 479 } 480 481 static void 482 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags, 483 rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error, 484 nfs4_stateid_types_t *sid_types) 485 { 486 fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes; 487 mntinfo4_t *mi; 488 489 argop->argop = OP_SETATTR; 490 /* 491 * The stateid is set to 0 if client is not modifying the size 492 * and otherwise to whatever nfs4_get_stateid() returns. 493 * 494 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no 495 * state struct could be found for the process/file pair. We may 496 * want to change this in the future (by OPENing the file). See 497 * bug # 4474852. 
498 */ 499 if (vap->va_mask & AT_SIZE) { 500 501 ASSERT(rp != NULL); 502 mi = VTOMI4(RTOV4(rp)); 503 504 argop->nfs_argop4_u.opsetattr.stateid = 505 nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 506 OP_SETATTR, sid_types, FALSE); 507 } else { 508 bzero(&argop->nfs_argop4_u.opsetattr.stateid, 509 sizeof (stateid4)); 510 } 511 512 *error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp); 513 if (*error) 514 bzero(attr, sizeof (*attr)); 515 } 516 517 static void 518 nfs4args_setattr_free(nfs_argop4 *argop) 519 { 520 nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes); 521 } 522 523 static int 524 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op, 525 bitmap4 supp) 526 { 527 fattr4 *attr; 528 int error = 0; 529 530 argop->argop = op; 531 switch (op) { 532 case OP_VERIFY: 533 attr = &argop->nfs_argop4_u.opverify.obj_attributes; 534 break; 535 case OP_NVERIFY: 536 attr = &argop->nfs_argop4_u.opnverify.obj_attributes; 537 break; 538 default: 539 return (EINVAL); 540 } 541 if (!error) 542 error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp); 543 if (error) 544 bzero(attr, sizeof (*attr)); 545 return (error); 546 } 547 548 static void 549 nfs4args_verify_free(nfs_argop4 *argop) 550 { 551 switch (argop->argop) { 552 case OP_VERIFY: 553 nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes); 554 break; 555 case OP_NVERIFY: 556 nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes); 557 break; 558 default: 559 break; 560 } 561 } 562 563 static void 564 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr, 565 WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp) 566 { 567 WRITE4args *wargs = &argop->nfs_argop4_u.opwrite; 568 mntinfo4_t *mi = VTOMI4(RTOV4(rp)); 569 570 argop->argop = OP_WRITE; 571 wargs->stable = stable; 572 wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id, 573 mi, OP_WRITE, sid_tp); 574 wargs->mblk = NULL; 575 *wargs_pp = wargs; 576 } 577 578 void 579 
nfs4args_copen_free(OPEN4cargs *open_args) 580 { 581 if (open_args->owner.owner_val) { 582 kmem_free(open_args->owner.owner_val, 583 open_args->owner.owner_len); 584 } 585 if ((open_args->opentype == OPEN4_CREATE) && 586 (open_args->mode != EXCLUSIVE4)) { 587 nfs4_fattr4_free(&open_args->createhow4_u.createattrs); 588 } 589 } 590 591 /* 592 * XXX: This is referenced in modstubs.s 593 */ 594 struct vnodeops * 595 nfs4_getvnodeops(void) 596 { 597 return (nfs4_vnodeops); 598 } 599 600 /* 601 * The OPEN operation opens a regular file. 602 */ 603 /*ARGSUSED3*/ 604 static int 605 nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 606 { 607 vnode_t *dvp = NULL; 608 rnode4_t *rp, *drp; 609 int error; 610 int just_been_created; 611 char fn[MAXNAMELEN]; 612 613 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: ")); 614 if (nfs_zone() != VTOMI4(*vpp)->mi_zone) 615 return (EIO); 616 rp = VTOR4(*vpp); 617 618 /* 619 * Check to see if opening something besides a regular file; 620 * if so skip the OTW call 621 */ 622 if ((*vpp)->v_type != VREG) { 623 error = nfs4_open_non_reg_file(vpp, flag, cr); 624 return (error); 625 } 626 627 /* 628 * XXX - would like a check right here to know if the file is 629 * executable or not, so as to skip OTW 630 */ 631 632 if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0) 633 return (error); 634 635 drp = VTOR4(dvp); 636 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 637 return (EINTR); 638 639 if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) { 640 nfs_rw_exit(&drp->r_rwlock); 641 return (error); 642 } 643 644 /* 645 * See if this file has just been CREATEd. 646 * If so, clear the flag and update the dnlc, which was previously 647 * skipped in nfs4_create. 648 * XXX need better serilization on this. 649 * XXX move this into the nf4open_otw call, after we have 650 * XXX acquired the open owner seqid sync. 
651 */ 652 mutex_enter(&rp->r_statev4_lock); 653 if (rp->created_v4) { 654 rp->created_v4 = 0; 655 mutex_exit(&rp->r_statev4_lock); 656 657 dnlc_update(dvp, fn, *vpp); 658 /* This is needed so we don't bump the open ref count */ 659 just_been_created = 1; 660 } else { 661 mutex_exit(&rp->r_statev4_lock); 662 just_been_created = 0; 663 } 664 665 /* 666 * If caller specified O_TRUNC/FTRUNC, then be sure to set 667 * FWRITE (to drive successful setattr(size=0) after open) 668 */ 669 if (flag & FTRUNC) 670 flag |= FWRITE; 671 672 error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0, 673 just_been_created); 674 675 if (!error && !((*vpp)->v_flag & VROOT)) 676 dnlc_update(dvp, fn, *vpp); 677 678 nfs_rw_exit(&drp->r_rwlock); 679 680 /* release the hold from vtodv */ 681 VN_RELE(dvp); 682 683 /* exchange the shadow for the master vnode, if needed */ 684 685 if (error == 0 && IS_SHADOW(*vpp, rp)) 686 sv_exchange(vpp); 687 688 return (error); 689 } 690 691 /* 692 * See if there's a "lost open" request to be saved and recovered. 693 */ 694 static void 695 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 696 nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp, 697 vnode_t *dvp, OPEN4cargs *open_args) 698 { 699 vfs_t *vfsp; 700 char *srccfp; 701 702 vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp); 703 704 if (error != ETIMEDOUT && error != EINTR && 705 !NFS4_FRC_UNMT_ERR(error, vfsp)) { 706 lost_rqstp->lr_op = 0; 707 return; 708 } 709 710 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 711 "nfs4open_save_lost_rqst: error %d", error)); 712 713 lost_rqstp->lr_op = OP_OPEN; 714 715 /* 716 * The vp (if it is not NULL) and dvp are held and rele'd via 717 * the recovery code. See nfs4_save_lost_rqst. 
718 */ 719 lost_rqstp->lr_vp = vp; 720 lost_rqstp->lr_dvp = dvp; 721 lost_rqstp->lr_oop = oop; 722 lost_rqstp->lr_osp = NULL; 723 lost_rqstp->lr_lop = NULL; 724 lost_rqstp->lr_cr = cr; 725 lost_rqstp->lr_flk = NULL; 726 lost_rqstp->lr_oacc = open_args->share_access; 727 lost_rqstp->lr_odeny = open_args->share_deny; 728 lost_rqstp->lr_oclaim = open_args->claim; 729 if (open_args->claim == CLAIM_DELEGATE_CUR) { 730 lost_rqstp->lr_ostateid = 731 open_args->open_claim4_u.delegate_cur_info.delegate_stateid; 732 srccfp = open_args->open_claim4_u.delegate_cur_info.cfile; 733 } else { 734 srccfp = open_args->open_claim4_u.cfile; 735 } 736 lost_rqstp->lr_ofile.utf8string_len = 0; 737 lost_rqstp->lr_ofile.utf8string_val = NULL; 738 (void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile); 739 lost_rqstp->lr_putfirst = FALSE; 740 } 741 742 struct nfs4_excl_time { 743 uint32 seconds; 744 uint32 nseconds; 745 }; 746 747 /* 748 * The OPEN operation creates and/or opens a regular file 749 * 750 * ARGSUSED 751 */ 752 static int 753 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va, 754 vnode_t **vpp, cred_t *cr, int create_flag, int open_flag, 755 enum createmode4 createmode, int file_just_been_created) 756 { 757 rnode4_t *rp; 758 rnode4_t *drp = VTOR4(dvp); 759 vnode_t *vp = NULL; 760 vnode_t *vpi = *vpp; 761 bool_t needrecov = FALSE; 762 763 int doqueue = 1; 764 765 COMPOUND4args_clnt args; 766 COMPOUND4res_clnt res; 767 nfs_argop4 *argop; 768 nfs_resop4 *resop; 769 int argoplist_size; 770 int idx_open, idx_fattr; 771 772 GETFH4res *gf_res = NULL; 773 OPEN4res *op_res = NULL; 774 nfs4_ga_res_t *garp; 775 fattr4 *attr = NULL; 776 struct nfs4_excl_time verf; 777 bool_t did_excl_setup = FALSE; 778 int created_osp; 779 780 OPEN4cargs *open_args; 781 nfs4_open_owner_t *oop = NULL; 782 nfs4_open_stream_t *osp = NULL; 783 seqid4 seqid = 0; 784 bool_t retry_open = FALSE; 785 nfs4_recov_state_t recov_state; 786 nfs4_lost_rqst_t lost_rqst; 787 nfs4_error_t e = { 0, NFS4_OK, 
RPC_SUCCESS }; 788 hrtime_t t; 789 int acc = 0; 790 cred_t *cred_otw = NULL; /* cred used to do the RPC call */ 791 cred_t *ncr = NULL; 792 793 nfs4_sharedfh_t *otw_sfh; 794 nfs4_sharedfh_t *orig_sfh; 795 int fh_differs = 0; 796 int numops, setgid_flag; 797 int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1; 798 799 /* 800 * Make sure we properly deal with setting the right gid on 801 * a newly created file to reflect the parent's setgid bit 802 */ 803 setgid_flag = 0; 804 if (create_flag && in_va) { 805 806 /* 807 * If the parent's directory has the setgid bit set 808 * _and_ the client was able to get a valid mapping 809 * for the parent dir's owner_group, we want to 810 * append NVERIFY(owner_group == dva.va_gid) and 811 * SETATTR to the CREATE compound. 812 */ 813 mutex_enter(&drp->r_statelock); 814 if (drp->r_attr.va_mode & VSGID && 815 drp->r_attr.va_gid != GID_NOBODY) { 816 in_va->va_gid = drp->r_attr.va_gid; 817 setgid_flag = 1; 818 } 819 mutex_exit(&drp->r_statelock); 820 } 821 822 /* 823 * Normal/non-create compound: 824 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) 825 * 826 * Open(create) compound no setgid: 827 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) + 828 * RESTOREFH + GETATTR 829 * 830 * Open(create) setgid: 831 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) + 832 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH + 833 * NVERIFY(grp) + SETATTR 834 */ 835 if (setgid_flag) { 836 numops = 10; 837 idx_open = 1; 838 idx_fattr = 3; 839 } else if (create_flag) { 840 numops = 7; 841 idx_open = 2; 842 idx_fattr = 4; 843 } else { 844 numops = 4; 845 idx_open = 1; 846 idx_fattr = 3; 847 } 848 849 args.array_len = numops; 850 argoplist_size = numops * sizeof (nfs_argop4); 851 argop = kmem_alloc(argoplist_size, KM_SLEEP); 852 853 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: " 854 "open %s open flag 0x%x cred %p", file_name, open_flag, 855 (void *)cr)); 856 857 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 858 if 
(create_flag) { 859 /* 860 * We are to create a file. Initialize the passed in vnode 861 * pointer. 862 */ 863 vpi = NULL; 864 } else { 865 /* 866 * Check to see if the client owns a read delegation and is 867 * trying to open for write. If so, then return the delegation 868 * to avoid the server doing a cb_recall and returning DELAY. 869 * NB - we don't use the statev4_lock here because we'd have 870 * to drop the lock anyway and the result would be stale. 871 */ 872 if ((open_flag & FWRITE) && 873 VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ) 874 (void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN); 875 876 /* 877 * If the file has a delegation, then do an access check up 878 * front. This avoids having to an access check later after 879 * we've already done start_op, which could deadlock. 880 */ 881 if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) { 882 if (open_flag & FREAD && 883 nfs4_access(vpi, VREAD, 0, cr, NULL) == 0) 884 acc |= VREAD; 885 if (open_flag & FWRITE && 886 nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0) 887 acc |= VWRITE; 888 } 889 } 890 891 drp = VTOR4(dvp); 892 893 recov_state.rs_flags = 0; 894 recov_state.rs_num_retry_despite_err = 0; 895 cred_otw = cr; 896 897 recov_retry: 898 fh_differs = 0; 899 nfs4_error_zinit(&e); 900 901 e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state); 902 if (e.error) { 903 if (ncr != NULL) 904 crfree(ncr); 905 kmem_free(argop, argoplist_size); 906 return (e.error); 907 } 908 909 args.ctag = TAG_OPEN; 910 args.array_len = numops; 911 args.array = argop; 912 913 /* putfh directory fh */ 914 argop[0].argop = OP_CPUTFH; 915 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 916 917 /* OPEN: either op 1 or op 2 depending upon create/setgid flags */ 918 argop[idx_open].argop = OP_COPEN; 919 open_args = &argop[idx_open].nfs_argop4_u.opcopen; 920 open_args->claim = CLAIM_NULL; 921 922 /* name of file */ 923 open_args->open_claim4_u.cfile = file_name; 924 open_args->owner.owner_len = 0; 925 
open_args->owner.owner_val = NULL; 926 927 if (create_flag) { 928 /* CREATE a file */ 929 open_args->opentype = OPEN4_CREATE; 930 open_args->mode = createmode; 931 if (createmode == EXCLUSIVE4) { 932 if (did_excl_setup == FALSE) { 933 verf.seconds = zone_get_hostid(NULL); 934 if (verf.seconds != 0) 935 verf.nseconds = newnum(); 936 else { 937 timestruc_t now; 938 939 gethrestime(&now); 940 verf.seconds = now.tv_sec; 941 verf.nseconds = now.tv_nsec; 942 } 943 /* 944 * Since the server will use this value for the 945 * mtime, make sure that it can't overflow. Zero 946 * out the MSB. The actual value does not matter 947 * here, only its uniqeness. 948 */ 949 verf.seconds &= INT32_MAX; 950 did_excl_setup = TRUE; 951 } 952 953 /* Now copy over verifier to OPEN4args. */ 954 open_args->createhow4_u.createverf = *(uint64_t *)&verf; 955 } else { 956 int v_error; 957 bitmap4 supp_attrs; 958 servinfo4_t *svp; 959 960 attr = &open_args->createhow4_u.createattrs; 961 962 svp = drp->r_server; 963 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 964 supp_attrs = svp->sv_supp_attrs; 965 nfs_rw_exit(&svp->sv_lock); 966 967 /* GUARDED4 or UNCHECKED4 */ 968 v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN, 969 supp_attrs); 970 if (v_error) { 971 bzero(attr, sizeof (*attr)); 972 nfs4args_copen_free(open_args); 973 nfs4_end_op(VTOMI4(dvp), dvp, vpi, 974 &recov_state, FALSE); 975 if (ncr != NULL) 976 crfree(ncr); 977 kmem_free(argop, argoplist_size); 978 return (v_error); 979 } 980 } 981 } else { 982 /* NO CREATE */ 983 open_args->opentype = OPEN4_NOCREATE; 984 } 985 986 if (recov_state.rs_sp != NULL) { 987 mutex_enter(&recov_state.rs_sp->s_lock); 988 open_args->owner.clientid = recov_state.rs_sp->clientid; 989 mutex_exit(&recov_state.rs_sp->s_lock); 990 } else { 991 /* XXX should we just fail here? 
*/ 992 open_args->owner.clientid = 0; 993 } 994 995 /* 996 * This increments oop's ref count or creates a temporary 'just_created' 997 * open owner that will become valid when this OPEN/OPEN_CONFIRM call 998 * completes. 999 */ 1000 mutex_enter(&VTOMI4(dvp)->mi_lock); 1001 1002 /* See if a permanent or just created open owner exists */ 1003 oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp)); 1004 if (!oop) { 1005 /* 1006 * This open owner does not exist so create a temporary 1007 * just created one. 1008 */ 1009 oop = create_open_owner(cr, VTOMI4(dvp)); 1010 ASSERT(oop != NULL); 1011 } 1012 mutex_exit(&VTOMI4(dvp)->mi_lock); 1013 1014 /* this length never changes, do alloc before seqid sync */ 1015 open_args->owner.owner_len = sizeof (oop->oo_name); 1016 open_args->owner.owner_val = 1017 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1018 1019 e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp)); 1020 if (e.error == EAGAIN) { 1021 open_owner_rele(oop); 1022 nfs4args_copen_free(open_args); 1023 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 1024 if (ncr != NULL) { 1025 crfree(ncr); 1026 ncr = NULL; 1027 } 1028 goto recov_retry; 1029 } 1030 1031 /* Check to see if we need to do the OTW call */ 1032 if (!create_flag) { 1033 if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi, 1034 file_just_been_created, &e.error, acc, &recov_state)) { 1035 1036 /* 1037 * The OTW open is not necessary. Either 1038 * the open can succeed without it (eg. 1039 * delegation, error == 0) or the open 1040 * must fail due to an access failure 1041 * (error != 0). In either case, tidy 1042 * up and return. 
1043 */ 1044 1045 nfs4_end_open_seqid_sync(oop); 1046 open_owner_rele(oop); 1047 nfs4args_copen_free(open_args); 1048 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE); 1049 if (ncr != NULL) 1050 crfree(ncr); 1051 kmem_free(argop, argoplist_size); 1052 return (e.error); 1053 } 1054 } 1055 1056 bcopy(&oop->oo_name, open_args->owner.owner_val, 1057 open_args->owner.owner_len); 1058 1059 seqid = nfs4_get_open_seqid(oop) + 1; 1060 open_args->seqid = seqid; 1061 open_args->share_access = 0; 1062 if (open_flag & FREAD) 1063 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1064 if (open_flag & FWRITE) 1065 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1066 open_args->share_deny = OPEN4_SHARE_DENY_NONE; 1067 1068 1069 1070 /* 1071 * getfh w/sanity check for idx_open/idx_fattr 1072 */ 1073 ASSERT((idx_open + 1) == (idx_fattr - 1)); 1074 argop[idx_open + 1].argop = OP_GETFH; 1075 1076 /* getattr */ 1077 argop[idx_fattr].argop = OP_GETATTR; 1078 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1079 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1080 1081 if (setgid_flag) { 1082 vattr_t _v; 1083 servinfo4_t *svp; 1084 bitmap4 supp_attrs; 1085 1086 svp = drp->r_server; 1087 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1088 supp_attrs = svp->sv_supp_attrs; 1089 nfs_rw_exit(&svp->sv_lock); 1090 1091 /* 1092 * For setgid case, we need to: 1093 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 1094 */ 1095 argop[4].argop = OP_SAVEFH; 1096 1097 argop[5].argop = OP_CPUTFH; 1098 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 1099 1100 argop[6].argop = OP_GETATTR; 1101 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1102 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1103 1104 argop[7].argop = OP_RESTOREFH; 1105 1106 /* 1107 * nverify 1108 */ 1109 _v.va_mask = AT_GID; 1110 _v.va_gid = in_va->va_gid; 1111 if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 1112 supp_attrs))) { 1113 1114 /* 
1115 * setattr 1116 * 1117 * We _know_ we're not messing with AT_SIZE or 1118 * AT_XTIME, so no need for stateid or flags. 1119 * Also we specify NULL rp since we're only 1120 * interested in setting owner_group attributes. 1121 */ 1122 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, 1123 supp_attrs, &e.error, 0); 1124 if (e.error) 1125 nfs4args_verify_free(&argop[8]); 1126 } 1127 1128 if (e.error) { 1129 /* 1130 * XXX - Revisit the last argument to nfs4_end_op() 1131 * once 5020486 is fixed. 1132 */ 1133 nfs4_end_open_seqid_sync(oop); 1134 open_owner_rele(oop); 1135 nfs4args_copen_free(open_args); 1136 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 1137 if (ncr != NULL) 1138 crfree(ncr); 1139 kmem_free(argop, argoplist_size); 1140 return (e.error); 1141 } 1142 } else if (create_flag) { 1143 /* 1144 * For setgid case, we need to: 1145 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 1146 */ 1147 argop[1].argop = OP_SAVEFH; 1148 1149 argop[5].argop = OP_RESTOREFH; 1150 1151 argop[6].argop = OP_GETATTR; 1152 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1153 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1154 } 1155 1156 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 1157 "nfs4open_otw: %s call, nm %s, rp %s", 1158 needrecov ? 
"recov" : "first", file_name, 1159 rnode4info(VTOR4(dvp)))); 1160 1161 t = gethrtime(); 1162 1163 rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e); 1164 1165 if (!e.error && nfs4_need_to_bump_seqid(&res)) 1166 nfs4_set_open_seqid(seqid, oop, args.ctag); 1167 1168 needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp); 1169 1170 if (e.error || needrecov) { 1171 bool_t abort = FALSE; 1172 1173 if (needrecov) { 1174 nfs4_bseqid_entry_t *bsep = NULL; 1175 1176 nfs4open_save_lost_rqst(e.error, &lost_rqst, oop, 1177 cred_otw, vpi, dvp, open_args); 1178 1179 if (!e.error && res.status == NFS4ERR_BAD_SEQID) { 1180 bsep = nfs4_create_bseqid_entry(oop, NULL, 1181 vpi, 0, args.ctag, open_args->seqid); 1182 num_bseqid_retry--; 1183 } 1184 1185 abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi, 1186 NULL, lost_rqst.lr_op == OP_OPEN ? 1187 &lost_rqst : NULL, OP_OPEN, bsep); 1188 1189 if (bsep) 1190 kmem_free(bsep, sizeof (*bsep)); 1191 /* give up if we keep getting BAD_SEQID */ 1192 if (num_bseqid_retry == 0) 1193 abort = TRUE; 1194 if (abort == TRUE && e.error == 0) 1195 e.error = geterrno4(res.status); 1196 } 1197 nfs4_end_open_seqid_sync(oop); 1198 open_owner_rele(oop); 1199 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1200 nfs4args_copen_free(open_args); 1201 if (setgid_flag) { 1202 nfs4args_verify_free(&argop[8]); 1203 nfs4args_setattr_free(&argop[9]); 1204 } 1205 if (!e.error) 1206 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1207 if (ncr != NULL) { 1208 crfree(ncr); 1209 ncr = NULL; 1210 } 1211 if (!needrecov || abort == TRUE || e.error == EINTR || 1212 NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) { 1213 kmem_free(argop, argoplist_size); 1214 return (e.error); 1215 } 1216 goto recov_retry; 1217 } 1218 1219 /* 1220 * Will check and update lease after checking the rflag for 1221 * OPEN_CONFIRM in the successful OPEN call. 
1222 */ 1223 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 1224 1225 /* 1226 * XXX what if we're crossing mount points from server1:/drp 1227 * to server2:/drp/rp. 1228 */ 1229 1230 /* Signal our end of use of the open seqid */ 1231 nfs4_end_open_seqid_sync(oop); 1232 1233 /* 1234 * This will destroy the open owner if it was just created, 1235 * and no one else has put a reference on it. 1236 */ 1237 open_owner_rele(oop); 1238 if (create_flag && (createmode != EXCLUSIVE4) && 1239 res.status == NFS4ERR_BADOWNER) 1240 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1241 1242 e.error = geterrno4(res.status); 1243 nfs4args_copen_free(open_args); 1244 if (setgid_flag) { 1245 nfs4args_verify_free(&argop[8]); 1246 nfs4args_setattr_free(&argop[9]); 1247 } 1248 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1249 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1250 /* 1251 * If the reply is NFS4ERR_ACCESS, it may be because 1252 * we are root (no root net access). If the real uid 1253 * is not root, then retry with the real uid instead. 
*/ 1255 if (ncr != NULL) { 1256 crfree(ncr); 1257 ncr = NULL; 1258 } 1259 if (res.status == NFS4ERR_ACCESS && 1260 (ncr = crnetadjust(cred_otw)) != NULL) { 1261 cred_otw = ncr; 1262 goto recov_retry; 1263 } 1264 kmem_free(argop, argoplist_size); 1265 return (e.error); 1266 } 1267 1268 resop = &res.array[idx_open]; /* open res */ 1269 op_res = &resop->nfs_resop4_u.opopen; 1270 1271 #ifdef DEBUG 1272 /* 1273 * verify attrset bitmap 1274 */ 1275 if (create_flag && 1276 (createmode == UNCHECKED4 || createmode == GUARDED4)) { 1277 /* make sure attrset returned is what we asked for */ 1278 /* XXX Ignore this 'error' for now */ 1279 if (attr->attrmask != op_res->attrset) 1280 /* EMPTY */; 1281 } 1282 #endif 1283 1284 if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) { 1285 mutex_enter(&VTOMI4(dvp)->mi_lock); 1286 VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK; 1287 mutex_exit(&VTOMI4(dvp)->mi_lock); 1288 } 1289 1290 resop = &res.array[idx_open + 1]; /* getfh res */ 1291 gf_res = &resop->nfs_resop4_u.opgetfh; 1292 1293 otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp)); 1294 1295 /* 1296 * The open stateid has been updated on the server but not 1297 * on the client yet. There is a path: makenfs4node->nfs4_attr_cache-> 1298 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW 1299 * WRITE call. That, however, will use the old stateid, so go ahead 1300 * and update the open stateid now, before any call to makenfs4node. 1301 */ 1302 if (vpi) { 1303 nfs4_open_stream_t *tmp_osp; 1304 rnode4_t *tmp_rp = VTOR4(vpi); 1305 1306 tmp_osp = find_open_stream(oop, tmp_rp); 1307 if (tmp_osp) { 1308 tmp_osp->open_stateid = op_res->stateid; 1309 mutex_exit(&tmp_osp->os_sync_lock); 1310 open_stream_rele(tmp_osp, tmp_rp); 1311 } 1312 1313 /* 1314 * We must determine if the file handle given by the otw open 1315 * is the same as the file handle which was passed in with 1316 * *vpp.
This case can be reached if the file we are trying 1317 * to open has been removed and another file has been created 1318 * having the same file name. The passed in vnode is released 1319 * later. 1320 */ 1321 orig_sfh = VTOR4(vpi)->r_fh; 1322 fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh); 1323 } 1324 1325 garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res; 1326 1327 if (create_flag || fh_differs) { 1328 int rnode_err = 0; 1329 1330 vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr, 1331 dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh)); 1332 1333 if (e.error) 1334 PURGE_ATTRCACHE4(vp); 1335 /* 1336 * For the newly created vp case, make sure the rnode 1337 * isn't bad before using it. 1338 */ 1339 mutex_enter(&(VTOR4(vp))->r_statelock); 1340 if (VTOR4(vp)->r_flags & R4RECOVERR) 1341 rnode_err = EIO; 1342 mutex_exit(&(VTOR4(vp))->r_statelock); 1343 1344 if (rnode_err) { 1345 nfs4_end_open_seqid_sync(oop); 1346 nfs4args_copen_free(open_args); 1347 if (setgid_flag) { 1348 nfs4args_verify_free(&argop[8]); 1349 nfs4args_setattr_free(&argop[9]); 1350 } 1351 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1352 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1353 needrecov); 1354 open_owner_rele(oop); 1355 VN_RELE(vp); 1356 if (ncr != NULL) 1357 crfree(ncr); 1358 sfh4_rele(&otw_sfh); 1359 kmem_free(argop, argoplist_size); 1360 return (EIO); 1361 } 1362 } else { 1363 vp = vpi; 1364 } 1365 sfh4_rele(&otw_sfh); 1366 1367 /* 1368 * It seems odd to get a full set of attrs and then not update 1369 * the object's attrcache in the non-create case. Create case uses 1370 * the attrs since makenfs4node checks to see if the attrs need to 1371 * be updated (and then updates them). The non-create case should 1372 * update attrs also. 1373 */ 1374 if (! create_flag && ! 
fh_differs && !e.error) { 1375 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 1376 } 1377 1378 nfs4_error_zinit(&e); 1379 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 1380 /* This does not do recovery for vp explicitly. */ 1381 nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE, 1382 &retry_open, oop, FALSE, &e, &num_bseqid_retry); 1383 1384 if (e.error || e.stat) { 1385 nfs4_end_open_seqid_sync(oop); 1386 nfs4args_copen_free(open_args); 1387 if (setgid_flag) { 1388 nfs4args_verify_free(&argop[8]); 1389 nfs4args_setattr_free(&argop[9]); 1390 } 1391 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1392 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1393 needrecov); 1394 open_owner_rele(oop); 1395 if (create_flag || fh_differs) { 1396 /* rele the makenfs4node */ 1397 VN_RELE(vp); 1398 } 1399 if (ncr != NULL) { 1400 crfree(ncr); 1401 ncr = NULL; 1402 } 1403 if (retry_open == TRUE) { 1404 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1405 "nfs4open_otw: retry the open since OPEN " 1406 "CONFIRM failed with error %d stat %d", 1407 e.error, e.stat)); 1408 if (create_flag && createmode == GUARDED4) { 1409 NFS4_DEBUG(nfs4_client_recov_debug, 1410 (CE_NOTE, "nfs4open_otw: switch " 1411 "createmode from GUARDED4 to " 1412 "UNCHECKED4")); 1413 createmode = UNCHECKED4; 1414 } 1415 goto recov_retry; 1416 } 1417 if (!e.error) { 1418 if (create_flag && (createmode != EXCLUSIVE4) && 1419 e.stat == NFS4ERR_BADOWNER) 1420 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1421 1422 e.error = geterrno4(e.stat); 1423 } 1424 kmem_free(argop, argoplist_size); 1425 return (e.error); 1426 } 1427 } 1428 1429 rp = VTOR4(vp); 1430 1431 mutex_enter(&rp->r_statev4_lock); 1432 if (create_flag) 1433 rp->created_v4 = 1; 1434 mutex_exit(&rp->r_statev4_lock); 1435 1436 mutex_enter(&oop->oo_lock); 1437 /* Doesn't matter if 'oo_just_created' already was set as this */ 1438 oop->oo_just_created = NFS4_PERM_CREATED; 1439 if (oop->oo_cred_otw) 1440 crfree(oop->oo_cred_otw); 1441 
oop->oo_cred_otw = cred_otw; 1442 crhold(oop->oo_cred_otw); 1443 mutex_exit(&oop->oo_lock); 1444 1445 /* returns with 'os_sync_lock' held */ 1446 osp = find_or_create_open_stream(oop, rp, &created_osp); 1447 if (!osp) { 1448 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1449 "nfs4open_otw: failed to create an open stream")); 1450 NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: " 1451 "signal our end of use of the open seqid")); 1452 1453 nfs4_end_open_seqid_sync(oop); 1454 open_owner_rele(oop); 1455 nfs4args_copen_free(open_args); 1456 if (setgid_flag) { 1457 nfs4args_verify_free(&argop[8]); 1458 nfs4args_setattr_free(&argop[9]); 1459 } 1460 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1461 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1462 if (create_flag || fh_differs) 1463 VN_RELE(vp); 1464 if (ncr != NULL) 1465 crfree(ncr); 1466 1467 kmem_free(argop, argoplist_size); 1468 return (EINVAL); 1469 1470 } 1471 1472 osp->open_stateid = op_res->stateid; 1473 1474 if (open_flag & FREAD) 1475 osp->os_share_acc_read++; 1476 if (open_flag & FWRITE) 1477 osp->os_share_acc_write++; 1478 osp->os_share_deny_none++; 1479 1480 /* 1481 * Need to reset this bitfield for the possible case where we were 1482 * going to OTW CLOSE the file, got a non-recoverable error, and before 1483 * we could retry the CLOSE, OPENed the file again. 
1484 */ 1485 ASSERT(osp->os_open_owner->oo_seqid_inuse); 1486 osp->os_final_close = 0; 1487 osp->os_force_close = 0; 1488 #ifdef DEBUG 1489 if (osp->os_failed_reopen) 1490 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:" 1491 " clearing os_failed_reopen for osp %p, cr %p, rp %s", 1492 (void *)osp, (void *)cr, rnode4info(rp))); 1493 #endif 1494 osp->os_failed_reopen = 0; 1495 1496 mutex_exit(&osp->os_sync_lock); 1497 1498 nfs4_end_open_seqid_sync(oop); 1499 1500 if (created_osp && recov_state.rs_sp != NULL) { 1501 mutex_enter(&recov_state.rs_sp->s_lock); 1502 nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp)); 1503 mutex_exit(&recov_state.rs_sp->s_lock); 1504 } 1505 1506 /* get rid of our reference to find oop */ 1507 open_owner_rele(oop); 1508 1509 open_stream_rele(osp, rp); 1510 1511 /* accept delegation, if any */ 1512 nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw); 1513 1514 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1515 1516 if (createmode == EXCLUSIVE4 && 1517 (in_va->va_mask & ~(AT_GID | AT_SIZE))) { 1518 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:" 1519 " EXCLUSIVE4: sending a SETATTR")); 1520 /* 1521 * If doing an exclusive create, then generate 1522 * a SETATTR to set the initial attributes. 1523 * Try to set the mtime and the atime to the 1524 * server's current time. It is somewhat 1525 * expected that these fields will be used to 1526 * store the exclusive create cookie. If not, 1527 * server implementors will need to know that 1528 * a SETATTR will follow an exclusive create 1529 * and the cookie should be destroyed if 1530 * appropriate. 1531 * 1532 * The AT_GID and AT_SIZE bits are turned off 1533 * so that the SETATTR request will not attempt 1534 * to process these. The gid will be set 1535 * separately if appropriate. 
The size is turned 1536 * off because it is assumed that a new file will 1537 * be created empty and if the file wasn't empty, 1538 * then the exclusive create will have failed 1539 * because the file must have existed already. 1540 * Therefore, no truncate operation is needed. 1541 */ 1542 in_va->va_mask &= ~(AT_GID | AT_SIZE); 1543 in_va->va_mask |= (AT_MTIME | AT_ATIME); 1544 1545 e.error = nfs4setattr(vp, in_va, 0, cr, NULL); 1546 if (e.error) { 1547 /* 1548 * Couldn't correct the attributes of 1549 * the newly created file and the 1550 * attributes are wrong. Remove the 1551 * file and return an error to the 1552 * application. 1553 */ 1554 /* XXX will this take care of client state ? */ 1555 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1556 "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:" 1557 " remove file", e.error)); 1558 VN_RELE(vp); 1559 (void) nfs4_remove(dvp, file_name, cr, NULL, 0); 1560 /* 1561 * Since we've reled the vnode and removed 1562 * the file we now need to return the error. 1563 * At this point we don't want to update the 1564 * dircaches, call nfs4_waitfor_purge_complete 1565 * or set vpp to vp so we need to skip these 1566 * as well. 1567 */ 1568 goto skip_update_dircaches; 1569 } 1570 } 1571 1572 /* 1573 * If we created or found the correct vnode, due to create_flag or 1574 * fh_differs being set, then update directory cache attribute, readdir 1575 * and dnlc caches. 1576 */ 1577 if (create_flag || fh_differs) { 1578 dirattr_info_t dinfo, *dinfop; 1579 1580 /* 1581 * Make sure getattr succeeded before using results. 1582 * note: op 7 is getattr(dir) for both flavors of 1583 * open(create). 
1584 */ 1585 if (create_flag && res.status == NFS4_OK) { 1586 dinfo.di_time_call = t; 1587 dinfo.di_cred = cr; 1588 dinfo.di_garp = 1589 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 1590 dinfop = &dinfo; 1591 } else { 1592 dinfop = NULL; 1593 } 1594 1595 nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name, 1596 dinfop); 1597 } 1598 1599 /* 1600 * If the page cache for this file was flushed from actions 1601 * above, it was done asynchronously and if that is true, 1602 * there is a need to wait here for it to complete. This must 1603 * be done outside of start_fop/end_fop. 1604 */ 1605 (void) nfs4_waitfor_purge_complete(vp); 1606 1607 /* 1608 * It is implicit that we are in the open case (create_flag == 0) since 1609 * fh_differs can only be set to a non-zero value in the open case. 1610 */ 1611 if (fh_differs != 0 && vpi != NULL) 1612 VN_RELE(vpi); 1613 1614 /* 1615 * Be sure to set *vpp to the correct value before returning. 1616 */ 1617 *vpp = vp; 1618 1619 skip_update_dircaches: 1620 1621 nfs4args_copen_free(open_args); 1622 if (setgid_flag) { 1623 nfs4args_verify_free(&argop[8]); 1624 nfs4args_setattr_free(&argop[9]); 1625 } 1626 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1627 1628 if (ncr) 1629 crfree(ncr); 1630 kmem_free(argop, argoplist_size); 1631 return (e.error); 1632 } 1633 1634 /* 1635 * Reopen an open instance. cf. nfs4open_otw(). 1636 * 1637 * Errors are returned by the nfs4_error_t parameter. 1638 * - ep->error contains an errno value or zero. 1639 * - if it is zero, ep->stat is set to an NFS status code, if any. 1640 * If the file could not be reopened, but the caller should continue, the 1641 * file is marked dead and no error values are returned. If the caller 1642 * should stop recovering open files and start over, either the ep->error 1643 * value or ep->stat will indicate an error (either something that requires 1644 * recovery or EAGAIN). 
Note that some recovery (e.g., expired volatile 1645 * filehandles) may be handled silently by this routine. 1646 * - if ep->error is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state 1647 * will be started, so the caller should not do it. 1648 * 1649 * Gotos: 1650 * - kill_file : reopen failed in a way that warrants marking the 1651 * file dead and setting the open stream's 'os_failed_reopen' to 1. This 1652 * is for cases where recovery is not possible. 1653 * - failed_reopen : same as above, except that the file has already been 1654 * marked dead, so no need to do it again. 1655 * - bailout : reopen failed but we are able to recover and retry the reopen - 1656 * either within this function immediately or via the calling function. 1657 */ 1658 1659 void 1660 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep, 1661 open_claim_type4 claim, bool_t frc_use_claim_previous, 1662 bool_t is_recov) 1663 { 1664 COMPOUND4args_clnt args; 1665 COMPOUND4res_clnt res; 1666 nfs_argop4 argop[4]; 1667 nfs_resop4 *resop; 1668 OPEN4res *op_res = NULL; 1669 OPEN4cargs *open_args; 1670 GETFH4res *gf_res; 1671 rnode4_t *rp = VTOR4(vp); 1672 int doqueue = 1; 1673 cred_t *cr = NULL, *cred_otw = NULL; 1674 nfs4_open_owner_t *oop = NULL; 1675 seqid4 seqid; 1676 nfs4_ga_res_t *garp; 1677 char fn[MAXNAMELEN]; 1678 nfs4_recov_state_t recov = {NULL, 0}; 1679 nfs4_lost_rqst_t lost_rqst; 1680 mntinfo4_t *mi = VTOMI4(vp); 1681 bool_t abort; 1682 char *failed_msg = ""; 1683 int fh_different; 1684 hrtime_t t; 1685 nfs4_bseqid_entry_t *bsep = NULL; 1686 1687 ASSERT(nfs4_consistent_type(vp)); 1688 ASSERT(nfs_zone() == mi->mi_zone); 1689 1690 nfs4_error_zinit(ep); 1691 1692 /* this is the cred used to find the open owner */ 1693 cr = state_to_cred(osp); 1694 if (cr == NULL) { 1695 failed_msg = "Couldn't reopen: no cred"; 1696 goto kill_file; 1697 } 1698 /* use this cred for OTW operations */ 1699 cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner); 1700 1701 top:
1702 nfs4_error_zinit(ep); 1703 1704 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) { 1705 /* File system has been unmounted, quit */ 1706 ep->error = EIO; 1707 failed_msg = "Couldn't reopen: file system has been unmounted"; 1708 goto kill_file; 1709 } 1710 1711 oop = osp->os_open_owner; 1712 1713 ASSERT(oop != NULL); 1714 if (oop == NULL) { /* be defensive in non-DEBUG */ 1715 failed_msg = "can't reopen: no open owner"; 1716 goto kill_file; 1717 } 1718 open_owner_hold(oop); 1719 1720 ep->error = nfs4_start_open_seqid_sync(oop, mi); 1721 if (ep->error) { 1722 open_owner_rele(oop); 1723 oop = NULL; 1724 goto bailout; 1725 } 1726 1727 /* 1728 * If the rnode has a delegation and the delegation has been 1729 * recovered and the server didn't request a recall and the caller 1730 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during 1731 * recovery) and the rnode hasn't been marked dead, then install 1732 * the delegation stateid in the open stream. Otherwise, proceed 1733 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN. 1734 */ 1735 mutex_enter(&rp->r_statev4_lock); 1736 if (rp->r_deleg_type != OPEN_DELEGATE_NONE && 1737 !rp->r_deleg_return_pending && 1738 (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) && 1739 !rp->r_deleg_needs_recall && 1740 claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous && 1741 !(rp->r_flags & R4RECOVERR)) { 1742 mutex_enter(&osp->os_sync_lock); 1743 osp->os_delegation = 1; 1744 osp->open_stateid = rp->r_deleg_stateid; 1745 mutex_exit(&osp->os_sync_lock); 1746 mutex_exit(&rp->r_statev4_lock); 1747 goto bailout; 1748 } 1749 mutex_exit(&rp->r_statev4_lock); 1750 1751 /* 1752 * If the file failed recovery, just quit. This failure need not 1753 * affect other reopens, so don't return an error. 
1754 */ 1755 mutex_enter(&rp->r_statelock); 1756 if (rp->r_flags & R4RECOVERR) { 1757 mutex_exit(&rp->r_statelock); 1758 ep->error = 0; 1759 goto failed_reopen; 1760 } 1761 mutex_exit(&rp->r_statelock); 1762 1763 /* 1764 * argop is empty here 1765 * 1766 * PUTFH, OPEN, GETATTR 1767 */ 1768 args.ctag = TAG_REOPEN; 1769 args.array_len = 4; 1770 args.array = argop; 1771 1772 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 1773 "nfs4_reopen: file is type %d, id %s", 1774 vp->v_type, rnode4info(VTOR4(vp)))); 1775 1776 argop[0].argop = OP_CPUTFH; 1777 1778 if (claim != CLAIM_PREVIOUS) { 1779 /* 1780 * if this is a file mount then 1781 * use the mntinfo parentfh 1782 */ 1783 argop[0].nfs_argop4_u.opcputfh.sfh = 1784 (vp->v_flag & VROOT) ? mi->mi_srvparentfh : 1785 VTOSV(vp)->sv_dfh; 1786 } else { 1787 /* putfh fh to reopen */ 1788 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 1789 } 1790 1791 argop[1].argop = OP_COPEN; 1792 open_args = &argop[1].nfs_argop4_u.opcopen; 1793 open_args->claim = claim; 1794 1795 if (claim == CLAIM_NULL) { 1796 1797 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1798 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1799 "failed for vp 0x%p for CLAIM_NULL with %m", 1800 (void *)vp); 1801 failed_msg = "Couldn't reopen: vtoname failed for " 1802 "CLAIM_NULL"; 1803 /* nothing allocated yet */ 1804 goto kill_file; 1805 } 1806 1807 open_args->open_claim4_u.cfile = fn; 1808 } else if (claim == CLAIM_PREVIOUS) { 1809 1810 /* 1811 * We have two cases to deal with here: 1812 * 1) We're being called to reopen files in order to satisfy 1813 * a lock operation request which requires us to explicitly 1814 * reopen files which were opened under a delegation. If 1815 * we're in recovery, we *must* use CLAIM_PREVIOUS. In 1816 * that case, frc_use_claim_previous is TRUE and we must 1817 * use the rnode's current delegation type (r_deleg_type). 1818 * 2) We're reopening files during some form of recovery. 
1819 * In this case, frc_use_claim_previous is FALSE and we 1820 * use the delegation type appropriate for recovery 1821 * (r_deleg_needs_recovery). 1822 */ 1823 mutex_enter(&rp->r_statev4_lock); 1824 open_args->open_claim4_u.delegate_type = 1825 frc_use_claim_previous ? 1826 rp->r_deleg_type : 1827 rp->r_deleg_needs_recovery; 1828 mutex_exit(&rp->r_statev4_lock); 1829 1830 } else if (claim == CLAIM_DELEGATE_CUR) { 1831 1832 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1833 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1834 "failed for vp 0x%p for CLAIM_DELEGATE_CUR " 1835 "with %m", (void *)vp); 1836 failed_msg = "Couldn't reopen: vtoname failed for " 1837 "CLAIM_DELEGATE_CUR"; 1838 /* nothing allocated yet */ 1839 goto kill_file; 1840 } 1841 1842 mutex_enter(&rp->r_statev4_lock); 1843 open_args->open_claim4_u.delegate_cur_info.delegate_stateid = 1844 rp->r_deleg_stateid; 1845 mutex_exit(&rp->r_statev4_lock); 1846 1847 open_args->open_claim4_u.delegate_cur_info.cfile = fn; 1848 } 1849 open_args->opentype = OPEN4_NOCREATE; 1850 open_args->owner.clientid = mi2clientid(mi); 1851 open_args->owner.owner_len = sizeof (oop->oo_name); 1852 open_args->owner.owner_val = 1853 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1854 bcopy(&oop->oo_name, open_args->owner.owner_val, 1855 open_args->owner.owner_len); 1856 open_args->share_access = 0; 1857 open_args->share_deny = 0; 1858 1859 mutex_enter(&osp->os_sync_lock); 1860 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp " 1861 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: " 1862 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ", 1863 (void *)osp, (void *)rp, osp->os_share_acc_read, 1864 osp->os_share_acc_write, osp->os_open_ref_count, 1865 osp->os_mmap_read, osp->os_mmap_write, claim)); 1866 1867 if (osp->os_share_acc_read || osp->os_mmap_read) 1868 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1869 if (osp->os_share_acc_write || osp->os_mmap_write) 1870 
open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1871 if (osp->os_share_deny_read) 1872 open_args->share_deny |= OPEN4_SHARE_DENY_READ; 1873 if (osp->os_share_deny_write) 1874 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE; 1875 mutex_exit(&osp->os_sync_lock); 1876 1877 seqid = nfs4_get_open_seqid(oop) + 1; 1878 open_args->seqid = seqid; 1879 1880 /* Construct the getfh part of the compound */ 1881 argop[2].argop = OP_GETFH; 1882 1883 /* Construct the getattr part of the compound */ 1884 argop[3].argop = OP_GETATTR; 1885 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1886 argop[3].nfs_argop4_u.opgetattr.mi = mi; 1887 1888 t = gethrtime(); 1889 1890 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 1891 1892 if (ep->error) { 1893 if (!is_recov && !frc_use_claim_previous && 1894 (ep->error == EINTR || ep->error == ETIMEDOUT || 1895 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) { 1896 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop, 1897 cred_otw, vp, NULL, open_args); 1898 abort = nfs4_start_recovery(ep, 1899 VTOMI4(vp), vp, NULL, NULL, 1900 lost_rqst.lr_op == OP_OPEN ? 
1901 &lost_rqst : NULL, OP_OPEN, NULL); 1902 nfs4args_copen_free(open_args); 1903 goto bailout; 1904 } 1905 1906 nfs4args_copen_free(open_args); 1907 1908 if (ep->error == EACCES && cred_otw != cr) { 1909 crfree(cred_otw); 1910 cred_otw = cr; 1911 crhold(cred_otw); 1912 nfs4_end_open_seqid_sync(oop); 1913 open_owner_rele(oop); 1914 oop = NULL; 1915 goto top; 1916 } 1917 if (ep->error == ETIMEDOUT) 1918 goto bailout; 1919 failed_msg = "Couldn't reopen: rpc error"; 1920 goto kill_file; 1921 } 1922 1923 if (nfs4_need_to_bump_seqid(&res)) 1924 nfs4_set_open_seqid(seqid, oop, args.ctag); 1925 1926 switch (res.status) { 1927 case NFS4_OK: 1928 if (recov.rs_flags & NFS4_RS_DELAY_MSG) { 1929 mutex_enter(&rp->r_statelock); 1930 rp->r_delay_interval = 0; 1931 mutex_exit(&rp->r_statelock); 1932 } 1933 break; 1934 case NFS4ERR_BAD_SEQID: 1935 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0, 1936 args.ctag, open_args->seqid); 1937 1938 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, 1939 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst : 1940 NULL, OP_OPEN, bsep); 1941 1942 nfs4args_copen_free(open_args); 1943 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1944 nfs4_end_open_seqid_sync(oop); 1945 open_owner_rele(oop); 1946 oop = NULL; 1947 kmem_free(bsep, sizeof (*bsep)); 1948 1949 goto kill_file; 1950 case NFS4ERR_NO_GRACE: 1951 nfs4args_copen_free(open_args); 1952 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1953 nfs4_end_open_seqid_sync(oop); 1954 open_owner_rele(oop); 1955 oop = NULL; 1956 if (claim == CLAIM_PREVIOUS) { 1957 /* 1958 * Retry as a plain open. We don't need to worry about 1959 * checking the changeinfo: it is acceptable for a 1960 * client to re-open a file and continue processing 1961 * (in the absence of locks). 
1962 */ 1963 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1964 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; " 1965 "will retry as CLAIM_NULL")); 1966 claim = CLAIM_NULL; 1967 nfs4_mi_kstat_inc_no_grace(mi); 1968 goto top; 1969 } 1970 failed_msg = 1971 "Couldn't reopen: tried reclaim outside grace period. "; 1972 goto kill_file; 1973 case NFS4ERR_GRACE: 1974 nfs4_set_grace_wait(mi); 1975 nfs4args_copen_free(open_args); 1976 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1977 nfs4_end_open_seqid_sync(oop); 1978 open_owner_rele(oop); 1979 oop = NULL; 1980 ep->error = nfs4_wait_for_grace(mi, &recov); 1981 if (ep->error != 0) 1982 goto bailout; 1983 goto top; 1984 case NFS4ERR_DELAY: 1985 nfs4_set_delay_wait(vp); 1986 nfs4args_copen_free(open_args); 1987 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1988 nfs4_end_open_seqid_sync(oop); 1989 open_owner_rele(oop); 1990 oop = NULL; 1991 ep->error = nfs4_wait_for_delay(vp, &recov); 1992 nfs4_mi_kstat_inc_delay(mi); 1993 if (ep->error != 0) 1994 goto bailout; 1995 goto top; 1996 case NFS4ERR_FHEXPIRED: 1997 /* recover filehandle and retry */ 1998 abort = nfs4_start_recovery(ep, 1999 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL); 2000 nfs4args_copen_free(open_args); 2001 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2002 nfs4_end_open_seqid_sync(oop); 2003 open_owner_rele(oop); 2004 oop = NULL; 2005 if (abort == FALSE) 2006 goto top; 2007 failed_msg = "Couldn't reopen: recovery aborted"; 2008 goto kill_file; 2009 case NFS4ERR_RESOURCE: 2010 case NFS4ERR_STALE_CLIENTID: 2011 case NFS4ERR_WRONGSEC: 2012 case NFS4ERR_EXPIRED: 2013 /* 2014 * Do not mark the file dead and let the calling 2015 * function initiate recovery. 
2016 */ 2017 nfs4args_copen_free(open_args); 2018 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2019 nfs4_end_open_seqid_sync(oop); 2020 open_owner_rele(oop); 2021 oop = NULL; 2022 goto bailout; 2023 case NFS4ERR_ACCESS: 2024 if (cred_otw != cr) { 2025 crfree(cred_otw); 2026 cred_otw = cr; 2027 crhold(cred_otw); 2028 nfs4args_copen_free(open_args); 2029 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2030 nfs4_end_open_seqid_sync(oop); 2031 open_owner_rele(oop); 2032 oop = NULL; 2033 goto top; 2034 } 2035 /* fall through */ 2036 default: 2037 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 2038 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s", 2039 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv, 2040 rnode4info(VTOR4(vp)))); 2041 failed_msg = "Couldn't reopen: NFSv4 error"; 2042 nfs4args_copen_free(open_args); 2043 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2044 goto kill_file; 2045 } 2046 2047 resop = &res.array[1]; /* open res */ 2048 op_res = &resop->nfs_resop4_u.opopen; 2049 2050 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res; 2051 2052 /* 2053 * Check if the path we reopened really is the same 2054 * file. We could end up in a situation where the file 2055 * was removed and a new file created with the same name. 
2056 */ 2057 resop = &res.array[2]; 2058 gf_res = &resop->nfs_resop4_u.opgetfh; 2059 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 2060 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0); 2061 if (fh_different) { 2062 if (mi->mi_fh_expire_type == FH4_PERSISTENT || 2063 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) { 2064 /* Oops, we don't have the same file */ 2065 if (mi->mi_fh_expire_type == FH4_PERSISTENT) 2066 failed_msg = "Couldn't reopen: Persistent " 2067 "file handle changed"; 2068 else 2069 failed_msg = "Couldn't reopen: Volatile " 2070 "(no expire on open) file handle changed"; 2071 2072 nfs4args_copen_free(open_args); 2073 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2074 nfs_rw_exit(&mi->mi_fh_lock); 2075 goto kill_file; 2076 2077 } else { 2078 /* 2079 * We have volatile file handles that don't compare. 2080 * If the fids are the same then we assume that the 2081 * file handle expired but the rnode still refers to 2082 * the same file object. 2083 * 2084 * First check that we have fids or not. 2085 * If we don't we have a dumb server so we will 2086 * just assume every thing is ok for now. 2087 */ 2088 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID && 2089 rp->r_attr.va_mask & AT_NODEID && 2090 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) { 2091 /* 2092 * We have fids, but they don't 2093 * compare. So kill the file. 2094 */ 2095 failed_msg = 2096 "Couldn't reopen: file handle changed" 2097 " due to mismatched fids"; 2098 nfs4args_copen_free(open_args); 2099 (void) xdr_free(xdr_COMPOUND4res_clnt, 2100 (caddr_t)&res); 2101 nfs_rw_exit(&mi->mi_fh_lock); 2102 goto kill_file; 2103 } else { 2104 /* 2105 * We have volatile file handles that refers 2106 * to the same file (at least they have the 2107 * same fid) or we don't have fids so we 2108 * can't tell. :(. We'll be a kind and accepting 2109 * client so we'll update the rnode's file 2110 * handle with the otw handle. 
2111 * 2112 * We need to drop mi->mi_fh_lock since 2113 * sh4_update acquires it. Since there is 2114 * only one recovery thread there is no 2115 * race. 2116 */ 2117 nfs_rw_exit(&mi->mi_fh_lock); 2118 sfh4_update(rp->r_fh, &gf_res->object); 2119 } 2120 } 2121 } else { 2122 nfs_rw_exit(&mi->mi_fh_lock); 2123 } 2124 2125 ASSERT(nfs4_consistent_type(vp)); 2126 2127 /* 2128 * If the server wanted an OPEN_CONFIRM but that fails, just start 2129 * over. Presumably if there is a persistent error it will show up 2130 * when we resend the OPEN. 2131 */ 2132 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 2133 bool_t retry_open = FALSE; 2134 2135 nfs4open_confirm(vp, &seqid, &op_res->stateid, 2136 cred_otw, is_recov, &retry_open, 2137 oop, FALSE, ep, NULL); 2138 if (ep->error || ep->stat) { 2139 nfs4args_copen_free(open_args); 2140 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2141 nfs4_end_open_seqid_sync(oop); 2142 open_owner_rele(oop); 2143 oop = NULL; 2144 goto top; 2145 } 2146 } 2147 2148 mutex_enter(&osp->os_sync_lock); 2149 osp->open_stateid = op_res->stateid; 2150 osp->os_delegation = 0; 2151 /* 2152 * Need to reset this bitfield for the possible case where we were 2153 * going to OTW CLOSE the file, got a non-recoverable error, and before 2154 * we could retry the CLOSE, OPENed the file again. 
2155 */ 2156 ASSERT(osp->os_open_owner->oo_seqid_inuse); 2157 osp->os_final_close = 0; 2158 osp->os_force_close = 0; 2159 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS) 2160 osp->os_dc_openacc = open_args->share_access; 2161 mutex_exit(&osp->os_sync_lock); 2162 2163 nfs4_end_open_seqid_sync(oop); 2164 2165 /* accept delegation, if any */ 2166 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw); 2167 2168 nfs4args_copen_free(open_args); 2169 2170 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 2171 2172 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2173 2174 ASSERT(nfs4_consistent_type(vp)); 2175 2176 open_owner_rele(oop); 2177 crfree(cr); 2178 crfree(cred_otw); 2179 return; 2180 2181 kill_file: 2182 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat); 2183 failed_reopen: 2184 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 2185 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s", 2186 (void *)osp, (void *)cr, rnode4info(rp))); 2187 mutex_enter(&osp->os_sync_lock); 2188 osp->os_failed_reopen = 1; 2189 mutex_exit(&osp->os_sync_lock); 2190 bailout: 2191 if (oop != NULL) { 2192 nfs4_end_open_seqid_sync(oop); 2193 open_owner_rele(oop); 2194 } 2195 if (cr != NULL) 2196 crfree(cr); 2197 if (cred_otw != NULL) 2198 crfree(cred_otw); 2199 } 2200 2201 /* for . and .. OPENs */ 2202 /* ARGSUSED */ 2203 static int 2204 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr) 2205 { 2206 rnode4_t *rp; 2207 nfs4_ga_res_t gar; 2208 2209 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone); 2210 2211 /* 2212 * If close-to-open consistency checking is turned off or 2213 * if there is no cached data, we can avoid 2214 * the over the wire getattr. Otherwise, force a 2215 * call to the server to get fresh attributes and to 2216 * check caches. This is required for close-to-open 2217 * consistency. 
2218 */ 2219 rp = VTOR4(*vpp); 2220 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO || 2221 (rp->r_dir == NULL && !nfs4_has_pages(*vpp))) 2222 return (0); 2223 2224 gar.n4g_va.va_mask = AT_ALL; 2225 return (nfs4_getattr_otw(*vpp, &gar, cr, 0)); 2226 } 2227 2228 /* 2229 * CLOSE a file 2230 */ 2231 /* ARGSUSED */ 2232 static int 2233 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 2234 caller_context_t *ct) 2235 { 2236 rnode4_t *rp; 2237 int error = 0; 2238 int r_error = 0; 2239 int n4error = 0; 2240 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 2241 2242 /* 2243 * Remove client state for this (lockowner, file) pair. 2244 * Issue otw v4 call to have the server do the same. 2245 */ 2246 2247 rp = VTOR4(vp); 2248 2249 /* 2250 * zone_enter(2) prevents processes from changing zones with NFS files 2251 * open; if we happen to get here from the wrong zone we can't do 2252 * anything over the wire. 2253 */ 2254 if (VTOMI4(vp)->mi_zone != nfs_zone()) { 2255 /* 2256 * We could attempt to clean up locks, except we're sure 2257 * that the current process didn't acquire any locks on 2258 * the file: any attempt to lock a file belong to another zone 2259 * will fail, and one can't lock an NFS file and then change 2260 * zones, as that fails too. 2261 * 2262 * Returning an error here is the sane thing to do. A 2263 * subsequent call to VN_RELE() which translates to a 2264 * nfs4_inactive() will clean up state: if the zone of the 2265 * vnode's origin is still alive and kicking, the inactive 2266 * thread will handle the request (from the correct zone), and 2267 * everything (minus the OTW close call) should be OK. If the 2268 * zone is going away nfs4_async_inactive() will throw away 2269 * delegations, open streams and cached pages inline. 2270 */ 2271 return (EIO); 2272 } 2273 2274 /* 2275 * If we are using local locking for this filesystem, then 2276 * release all of the SYSV style record locks. 
Otherwise, 2277 * we are doing network locking and we need to release all 2278 * of the network locks. All of the locks held by this 2279 * process on this file are released no matter what the 2280 * incoming reference count is. 2281 */ 2282 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) { 2283 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 2284 cleanshares(vp, ttoproc(curthread)->p_pid); 2285 } else 2286 e.error = nfs4_lockrelease(vp, flag, offset, cr); 2287 2288 if (e.error) { 2289 struct lm_sysid *lmsid; 2290 lmsid = nfs4_find_sysid(VTOMI4(vp)); 2291 if (lmsid == NULL) { 2292 DTRACE_PROBE2(unknown__sysid, int, e.error, 2293 vnode_t *, vp); 2294 } else { 2295 cleanlocks(vp, ttoproc(curthread)->p_pid, 2296 (lm_sysidt(lmsid) | LM_SYSID_CLIENT)); 2297 } 2298 return (e.error); 2299 } 2300 2301 if (count > 1) 2302 return (0); 2303 2304 /* 2305 * If the file has been `unlinked', then purge the 2306 * DNLC so that this vnode will get reycled quicker 2307 * and the .nfs* file on the server will get removed. 2308 */ 2309 if (rp->r_unldvp != NULL) 2310 dnlc_purge_vp(vp); 2311 2312 /* 2313 * If the file was open for write and there are pages, 2314 * do a synchronous flush and commit of all of the 2315 * dirty and uncommitted pages. 2316 */ 2317 ASSERT(!e.error); 2318 if ((flag & FWRITE) && nfs4_has_pages(vp)) 2319 error = nfs4_putpage_commit(vp, 0, 0, cr); 2320 2321 mutex_enter(&rp->r_statelock); 2322 r_error = rp->r_error; 2323 rp->r_error = 0; 2324 mutex_exit(&rp->r_statelock); 2325 2326 /* 2327 * If this file type is one for which no explicit 'open' was 2328 * done, then bail now (ie. no need for protocol 'close'). If 2329 * there was an error w/the vm subsystem, return _that_ error, 2330 * otherwise, return any errors that may've been reported via 2331 * the rnode. 2332 */ 2333 if (vp->v_type != VREG) 2334 return (error ? 
error : r_error); 2335 2336 /* 2337 * The sync putpage commit may have failed above, but since 2338 * we're working w/a regular file, we need to do the protocol 2339 * 'close' (nfs4close_one will figure out if an otw close is 2340 * needed or not). Report any errors _after_ doing the protocol 2341 * 'close'. 2342 */ 2343 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0); 2344 n4error = e.error ? e.error : geterrno4(e.stat); 2345 2346 /* 2347 * Error reporting prio (Hi -> Lo) 2348 * 2349 * i) nfs4_putpage_commit (error) 2350 * ii) rnode's (r_error) 2351 * iii) nfs4close_one (n4error) 2352 */ 2353 return (error ? error : (r_error ? r_error : n4error)); 2354 } 2355 2356 /* 2357 * Initialize *lost_rqstp. 2358 */ 2359 2360 static void 2361 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 2362 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr, 2363 vnode_t *vp) 2364 { 2365 if (error != ETIMEDOUT && error != EINTR && 2366 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 2367 lost_rqstp->lr_op = 0; 2368 return; 2369 } 2370 2371 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 2372 "nfs4close_save_lost_rqst: error %d", error)); 2373 2374 lost_rqstp->lr_op = OP_CLOSE; 2375 /* 2376 * The vp is held and rele'd via the recovery code. 2377 * See nfs4_save_lost_rqst. 2378 */ 2379 lost_rqstp->lr_vp = vp; 2380 lost_rqstp->lr_dvp = NULL; 2381 lost_rqstp->lr_oop = oop; 2382 lost_rqstp->lr_osp = osp; 2383 ASSERT(osp != NULL); 2384 ASSERT(mutex_owned(&osp->os_sync_lock)); 2385 osp->os_pending_close = 1; 2386 lost_rqstp->lr_lop = NULL; 2387 lost_rqstp->lr_cr = cr; 2388 lost_rqstp->lr_flk = NULL; 2389 lost_rqstp->lr_putfirst = FALSE; 2390 } 2391 2392 /* 2393 * Assumes you already have the open seqid sync grabbed as well as the 2394 * 'os_sync_lock'. Note: this will release the open seqid sync and 2395 * 'os_sync_lock' if client recovery starts. Calling functions have to 2396 * be prepared to handle this. 
/*
 * Send the over-the-wire CLOSE for an open stream as a three-op compound:
 * PUTFH, GETATTR, CLOSE.
 *
 * Assumes you already have the open seqid sync grabbed as well as the
 * 'os_sync_lock'.  Note: this will release the open seqid sync and
 * 'os_sync_lock' if client recovery starts; in that case
 * '*did_start_seqid_syncp' and '*have_sync_lockp' are both set to 0.
 * Calling functions have to be prepared to handle this.
 *
 * 'recov' is returned as 1 if the CLOSE operation detected client recovery
 * was needed and was started, and that the calling function should retry
 * this function; otherwise it is returned as 0.
 *
 * 'close_type' selects the compound tag and, for CLOSE_RESEND, suppresses
 * saving a lost request.  Neither CLOSE_RESEND nor CLOSE_AFTER_RESEND
 * ever asks the caller to retry.
 *
 * On success the open stream is marked invalid (os_valid = 0), the
 * reference taken at OPEN is dropped and the returned attributes are
 * cached.
 *
 * Errors are returned via the nfs4_error_t parameter.
 */
static void
nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
    nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
    nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	CLOSE4args *close_args;
	nfs_resop4 *resop;
	nfs_argop4 argop[3];
	int doqueue = 1;
	mntinfo4_t *mi;
	seqid4 seqid;
	vnode_t *vp;
	bool_t needrecov = FALSE;
	nfs4_lost_rqst_t lost_rqst;
	hrtime_t t;

	ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);

	ASSERT(MUTEX_HELD(&osp->os_sync_lock));

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));

	/* Only set this to 1 if recovery is started */
	*recov = 0;

	/* do the OTW call to close the file */

	/* The tag records why this CLOSE is being sent. */
	if (close_type == CLOSE_RESEND)
		args.ctag = TAG_CLOSE_LOST;
	else if (close_type == CLOSE_AFTER_RESEND)
		args.ctag = TAG_CLOSE_UNDO;
	else
		args.ctag = TAG_CLOSE;

	args.array_len = 3;
	args.array = argop;

	vp = RTOV4(rp);

	mi = VTOMI4(vp);

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* getattr, so the attribute cache can be refreshed on success */
	argop[1].argop = OP_GETATTR;
	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[1].nfs_argop4_u.opgetattr.mi = mi;

	/* close, using the next open seqid and the stream's open stateid */
	argop[2].argop = OP_CLOSE;
	close_args = &argop[2].nfs_argop4_u.opclose;

	seqid = nfs4_get_open_seqid(oop) + 1;

	close_args->seqid = seqid;
	close_args->open_stateid = osp->open_stateid;

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(rp)));

	/* Timestamp taken before the call; handed to nfs4_attr_cache below. */
	t = gethrtime();

	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);

	/* Record the seqid we used only when the reply says it was consumed. */
	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
		nfs4_set_open_seqid(seqid, oop, args.ctag);
	}

	needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
	if (ep->error && !needrecov) {
		/*
		 * if there was an error and no recovery is to be done
		 * then set up the file to flush its cache if
		 * needed for the next caller.
		 */
		mutex_enter(&rp->r_statelock);
		PURGE_ATTRCACHE4_LOCKED(rp);
		rp->r_flags &= ~R4WRITEMODIFIED;
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (needrecov) {
		bool_t abort;
		nfs4_bseqid_entry_t *bsep = NULL;

		/*
		 * A resend is itself the replay of a lost request, so it
		 * is not saved again.
		 */
		if (close_type != CLOSE_RESEND)
			nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
			    osp, cred_otw, vp);

		if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
			bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
			    0, args.ctag, close_args->seqid);

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_otw: initiating recovery. error %d "
		    "res.status %d", ep->error, res.status));

		/*
		 * Drop the 'os_sync_lock' here so we don't hit
		 * a potential recursive mutex_enter via an
		 * 'open_stream_hold()'.
		 */
		mutex_exit(&osp->os_sync_lock);
		*have_sync_lockp = 0;
		/*
		 * lost_rqst is only initialized when close_type is not
		 * CLOSE_RESEND, hence the close_type test before lr_op
		 * is examined.
		 */
		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
		    (close_type != CLOSE_RESEND &&
		    lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
		    OP_CLOSE, bsep);

		/* drop open seq sync, and let the calling function regrab it */
		nfs4_end_open_seqid_sync(oop);
		*did_start_seqid_syncp = 0;

		if (bsep)
			kmem_free(bsep, sizeof (*bsep));
		/*
		 * For signals, the caller wants to quit, so don't say to
		 * retry.  For forced unmount, if it's a user thread, it
		 * wants to quit.  If it's a recovery thread, the retry
		 * will happen higher-up on the call stack.  Either way,
		 * don't say to retry.
		 */
		if (abort == FALSE && ep->error != EINTR &&
		    !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
		    close_type != CLOSE_RESEND &&
		    close_type != CLOSE_AFTER_RESEND)
			*recov = 1;
		else
			*recov = 0;

		/* Results are only freed when the call itself succeeded. */
		if (!ep->error)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	/* The call went through but the server returned an NFSv4 error. */
	if (res.status) {
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	mutex_enter(&rp->r_statev4_lock);
	rp->created_v4 = 0;
	mutex_exit(&rp->r_statev4_lock);

	/* Save the stateid from the CLOSE result and invalidate the stream. */
	resop = &res.array[2];
	osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
	osp->os_valid = 0;

	/*
	 * This removes the reference obtained at OPEN; ie, when the
	 * open stream structure was created.
	 *
	 * We don't have to worry about calling 'open_stream_rele'
	 * since we are currently holding a reference to the open
	 * stream which means the count cannot go to 0 with this
	 * decrement.
	 */
	ASSERT(osp->os_ref_count >= 2);
	osp->os_ref_count--;

	/* Cache the attributes returned by the GETATTR (result slot 1). */
	if (!ep->error)
		nfs4_attr_cache(vp,
		    &res.array[1].nfs_resop4_u.opgetattr.ga_res,
		    t, cred_otw, TRUE, NULL);

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
	    " returning %d", ep->error));

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}
2564 */ 2565 ASSERT(osp->os_ref_count >= 2); 2566 osp->os_ref_count--; 2567 2568 if (!ep->error) 2569 nfs4_attr_cache(vp, 2570 &res.array[1].nfs_resop4_u.opgetattr.ga_res, 2571 t, cred_otw, TRUE, NULL); 2572 2573 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:" 2574 " returning %d", ep->error)); 2575 2576 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2577 } 2578 2579 /* ARGSUSED */ 2580 static int 2581 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2582 caller_context_t *ct) 2583 { 2584 rnode4_t *rp; 2585 u_offset_t off; 2586 offset_t diff; 2587 uint_t on; 2588 uint_t n; 2589 caddr_t base; 2590 uint_t flags; 2591 int error; 2592 mntinfo4_t *mi; 2593 2594 rp = VTOR4(vp); 2595 2596 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 2597 2598 if (IS_SHADOW(vp, rp)) 2599 vp = RTOV4(rp); 2600 2601 if (vp->v_type != VREG) 2602 return (EISDIR); 2603 2604 mi = VTOMI4(vp); 2605 2606 if (nfs_zone() != mi->mi_zone) 2607 return (EIO); 2608 2609 if (uiop->uio_resid == 0) 2610 return (0); 2611 2612 if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0) 2613 return (EINVAL); 2614 2615 mutex_enter(&rp->r_statelock); 2616 if (rp->r_flags & R4RECOVERRP) 2617 error = (rp->r_error ? rp->r_error : EIO); 2618 else 2619 error = 0; 2620 mutex_exit(&rp->r_statelock); 2621 if (error) 2622 return (error); 2623 2624 /* 2625 * Bypass VM if caching has been disabled (e.g., locking) or if 2626 * using client-side direct I/O and the file is not mmap'd and 2627 * there are no cached pages. 
2628 */ 2629 if ((vp->v_flag & VNOCACHE) || 2630 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2631 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) { 2632 size_t resid = 0; 2633 2634 return (nfs4read(vp, NULL, uiop->uio_loffset, 2635 uiop->uio_resid, &resid, cr, FALSE, uiop)); 2636 } 2637 2638 error = 0; 2639 2640 do { 2641 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2642 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2643 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2644 2645 if (error = nfs4_validate_caches(vp, cr)) 2646 break; 2647 2648 mutex_enter(&rp->r_statelock); 2649 while (rp->r_flags & R4INCACHEPURGE) { 2650 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 2651 mutex_exit(&rp->r_statelock); 2652 return (EINTR); 2653 } 2654 } 2655 diff = rp->r_size - uiop->uio_loffset; 2656 mutex_exit(&rp->r_statelock); 2657 if (diff <= 0) 2658 break; 2659 if (diff < n) 2660 n = (uint_t)diff; 2661 2662 if (vpm_enable) { 2663 /* 2664 * Copy data. 2665 */ 2666 error = vpm_data_copy(vp, off + on, n, uiop, 2667 1, NULL, 0, S_READ); 2668 } else { 2669 base = segmap_getmapflt(segkmap, vp, off + on, n, 1, 2670 S_READ); 2671 2672 error = uiomove(base + on, n, UIO_READ, uiop); 2673 } 2674 2675 if (!error) { 2676 /* 2677 * If read a whole block or read to eof, 2678 * won't need this buffer again soon. 
2679 */ 2680 mutex_enter(&rp->r_statelock); 2681 if (n + on == MAXBSIZE || 2682 uiop->uio_loffset == rp->r_size) 2683 flags = SM_DONTNEED; 2684 else 2685 flags = 0; 2686 mutex_exit(&rp->r_statelock); 2687 if (vpm_enable) { 2688 error = vpm_sync_pages(vp, off, n, flags); 2689 } else { 2690 error = segmap_release(segkmap, base, flags); 2691 } 2692 } else { 2693 if (vpm_enable) { 2694 (void) vpm_sync_pages(vp, off, n, 0); 2695 } else { 2696 (void) segmap_release(segkmap, base, 0); 2697 } 2698 } 2699 } while (!error && uiop->uio_resid > 0); 2700 2701 return (error); 2702 } 2703 2704 /* ARGSUSED */ 2705 static int 2706 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2707 caller_context_t *ct) 2708 { 2709 rlim64_t limit = uiop->uio_llimit; 2710 rnode4_t *rp; 2711 u_offset_t off; 2712 caddr_t base; 2713 uint_t flags; 2714 int remainder; 2715 size_t n; 2716 int on; 2717 int error; 2718 int resid; 2719 u_offset_t offset; 2720 mntinfo4_t *mi; 2721 uint_t bsize; 2722 2723 rp = VTOR4(vp); 2724 2725 if (IS_SHADOW(vp, rp)) 2726 vp = RTOV4(rp); 2727 2728 if (vp->v_type != VREG) 2729 return (EISDIR); 2730 2731 mi = VTOMI4(vp); 2732 2733 if (nfs_zone() != mi->mi_zone) 2734 return (EIO); 2735 2736 if (uiop->uio_resid == 0) 2737 return (0); 2738 2739 mutex_enter(&rp->r_statelock); 2740 if (rp->r_flags & R4RECOVERRP) 2741 error = (rp->r_error ? rp->r_error : EIO); 2742 else 2743 error = 0; 2744 mutex_exit(&rp->r_statelock); 2745 if (error) 2746 return (error); 2747 2748 if (ioflag & FAPPEND) { 2749 struct vattr va; 2750 2751 /* 2752 * Must serialize if appending. 
2753 */ 2754 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) { 2755 nfs_rw_exit(&rp->r_rwlock); 2756 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, 2757 INTR(vp))) 2758 return (EINTR); 2759 } 2760 2761 va.va_mask = AT_SIZE; 2762 error = nfs4getattr(vp, &va, cr); 2763 if (error) 2764 return (error); 2765 uiop->uio_loffset = va.va_size; 2766 } 2767 2768 offset = uiop->uio_loffset + uiop->uio_resid; 2769 2770 if (uiop->uio_loffset < (offset_t)0 || offset < 0) 2771 return (EINVAL); 2772 2773 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 2774 limit = MAXOFFSET_T; 2775 2776 /* 2777 * Check to make sure that the process will not exceed 2778 * its limit on file size. It is okay to write up to 2779 * the limit, but not beyond. Thus, the write which 2780 * reaches the limit will be short and the next write 2781 * will return an error. 2782 */ 2783 remainder = 0; 2784 if (offset > uiop->uio_llimit) { 2785 remainder = offset - uiop->uio_llimit; 2786 uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset; 2787 if (uiop->uio_resid <= 0) { 2788 proc_t *p = ttoproc(curthread); 2789 2790 uiop->uio_resid += remainder; 2791 mutex_enter(&p->p_lock); 2792 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], 2793 p->p_rctls, p, RCA_UNSAFE_SIGINFO); 2794 mutex_exit(&p->p_lock); 2795 return (EFBIG); 2796 } 2797 } 2798 2799 /* update the change attribute, if we have a write delegation */ 2800 2801 mutex_enter(&rp->r_statev4_lock); 2802 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) 2803 rp->r_deleg_change++; 2804 2805 mutex_exit(&rp->r_statev4_lock); 2806 2807 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) 2808 return (EINTR); 2809 2810 /* 2811 * Bypass VM if caching has been disabled (e.g., locking) or if 2812 * using client-side direct I/O and the file is not mmap'd and 2813 * there are no cached pages. 
2814 */ 2815 if ((vp->v_flag & VNOCACHE) || 2816 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2817 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) { 2818 size_t bufsize; 2819 int count; 2820 u_offset_t org_offset; 2821 stable_how4 stab_comm; 2822 nfs4_fwrite: 2823 if (rp->r_flags & R4STALE) { 2824 resid = uiop->uio_resid; 2825 offset = uiop->uio_loffset; 2826 error = rp->r_error; 2827 goto bottom; 2828 } 2829 2830 bufsize = MIN(uiop->uio_resid, mi->mi_stsize); 2831 base = kmem_alloc(bufsize, KM_SLEEP); 2832 do { 2833 if (ioflag & FDSYNC) 2834 stab_comm = DATA_SYNC4; 2835 else 2836 stab_comm = FILE_SYNC4; 2837 resid = uiop->uio_resid; 2838 offset = uiop->uio_loffset; 2839 count = MIN(uiop->uio_resid, bufsize); 2840 org_offset = uiop->uio_loffset; 2841 error = uiomove(base, count, UIO_WRITE, uiop); 2842 if (!error) { 2843 error = nfs4write(vp, base, org_offset, 2844 count, cr, &stab_comm); 2845 if (!error) { 2846 mutex_enter(&rp->r_statelock); 2847 if (rp->r_size < uiop->uio_loffset) 2848 rp->r_size = uiop->uio_loffset; 2849 mutex_exit(&rp->r_statelock); 2850 } 2851 } 2852 } while (!error && uiop->uio_resid > 0); 2853 kmem_free(base, bufsize); 2854 goto bottom; 2855 } 2856 2857 bsize = vp->v_vfsp->vfs_bsize; 2858 2859 do { 2860 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2861 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2862 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2863 2864 resid = uiop->uio_resid; 2865 offset = uiop->uio_loffset; 2866 2867 if (rp->r_flags & R4STALE) { 2868 error = rp->r_error; 2869 break; 2870 } 2871 2872 /* 2873 * Don't create dirty pages faster than they 2874 * can be cleaned so that the system doesn't 2875 * get imbalanced. If the async queue is 2876 * maxed out, then wait for it to drain before 2877 * creating more dirty pages. Also, wait for 2878 * any threads doing pagewalks in the vop_getattr 2879 * entry points so that they don't block for 2880 * long periods. 
2881 */ 2882 mutex_enter(&rp->r_statelock); 2883 while ((mi->mi_max_threads != 0 && 2884 rp->r_awcount > 2 * mi->mi_max_threads) || 2885 rp->r_gcount > 0) 2886 cv_wait(&rp->r_cv, &rp->r_statelock); 2887 mutex_exit(&rp->r_statelock); 2888 2889 /* 2890 * Touch the page and fault it in if it is not in core 2891 * before segmap_getmapflt or vpm_data_copy can lock it. 2892 * This is to avoid the deadlock if the buffer is mapped 2893 * to the same file through mmap which we want to write. 2894 */ 2895 uio_prefaultpages((long)n, uiop); 2896 2897 if (vpm_enable) { 2898 /* 2899 * It will use kpm mappings, so no need to 2900 * pass an address. 2901 */ 2902 error = writerp4(rp, NULL, n, uiop, 0); 2903 } else { 2904 if (segmap_kpm) { 2905 int pon = uiop->uio_loffset & PAGEOFFSET; 2906 size_t pn = MIN(PAGESIZE - pon, 2907 uiop->uio_resid); 2908 int pagecreate; 2909 2910 mutex_enter(&rp->r_statelock); 2911 pagecreate = (pon == 0) && (pn == PAGESIZE || 2912 uiop->uio_loffset + pn >= rp->r_size); 2913 mutex_exit(&rp->r_statelock); 2914 2915 base = segmap_getmapflt(segkmap, vp, off + on, 2916 pn, !pagecreate, S_WRITE); 2917 2918 error = writerp4(rp, base + pon, n, uiop, 2919 pagecreate); 2920 2921 } else { 2922 base = segmap_getmapflt(segkmap, vp, off + on, 2923 n, 0, S_READ); 2924 error = writerp4(rp, base + on, n, uiop, 0); 2925 } 2926 } 2927 2928 if (!error) { 2929 if (mi->mi_flags & MI4_NOAC) 2930 flags = SM_WRITE; 2931 else if ((uiop->uio_loffset % bsize) == 0 || 2932 IS_SWAPVP(vp)) { 2933 /* 2934 * Have written a whole block. 2935 * Start an asynchronous write 2936 * and mark the buffer to 2937 * indicate that it won't be 2938 * needed again soon. 
2939 */ 2940 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 2941 } else 2942 flags = 0; 2943 if ((ioflag & (FSYNC|FDSYNC)) || 2944 (rp->r_flags & R4OUTOFSPACE)) { 2945 flags &= ~SM_ASYNC; 2946 flags |= SM_WRITE; 2947 } 2948 if (vpm_enable) { 2949 error = vpm_sync_pages(vp, off, n, flags); 2950 } else { 2951 error = segmap_release(segkmap, base, flags); 2952 } 2953 } else { 2954 if (vpm_enable) { 2955 (void) vpm_sync_pages(vp, off, n, 0); 2956 } else { 2957 (void) segmap_release(segkmap, base, 0); 2958 } 2959 /* 2960 * In the event that we got an access error while 2961 * faulting in a page for a write-only file just 2962 * force a write. 2963 */ 2964 if (error == EACCES) 2965 goto nfs4_fwrite; 2966 } 2967 } while (!error && uiop->uio_resid > 0); 2968 2969 bottom: 2970 if (error) { 2971 uiop->uio_resid = resid + remainder; 2972 uiop->uio_loffset = offset; 2973 } else { 2974 uiop->uio_resid += remainder; 2975 2976 mutex_enter(&rp->r_statev4_lock); 2977 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 2978 gethrestime(&rp->r_attr.va_mtime); 2979 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 2980 } 2981 mutex_exit(&rp->r_statev4_lock); 2982 } 2983 2984 nfs_rw_exit(&rp->r_lkserlock); 2985 2986 return (error); 2987 } 2988 2989 /* 2990 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED} 2991 */ 2992 static int 2993 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 2994 int flags, cred_t *cr) 2995 { 2996 struct buf *bp; 2997 int error; 2998 page_t *savepp; 2999 uchar_t fsdata; 3000 stable_how4 stab_comm; 3001 3002 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3003 bp = pageio_setup(pp, len, vp, flags); 3004 ASSERT(bp != NULL); 3005 3006 /* 3007 * pageio_setup should have set b_addr to 0. This 3008 * is correct since we want to do I/O on a page 3009 * boundary. bp_mapin will use this addr to calculate 3010 * an offset, and then set b_addr to the kernel virtual 3011 * address it allocated for us. 
3012 */ 3013 ASSERT(bp->b_un.b_addr == 0); 3014 3015 bp->b_edev = 0; 3016 bp->b_dev = 0; 3017 bp->b_lblkno = lbtodb(off); 3018 bp->b_file = vp; 3019 bp->b_offset = (offset_t)off; 3020 bp_mapin(bp); 3021 3022 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) && 3023 freemem > desfree) 3024 stab_comm = UNSTABLE4; 3025 else 3026 stab_comm = FILE_SYNC4; 3027 3028 error = nfs4_bio(bp, &stab_comm, cr, FALSE); 3029 3030 bp_mapout(bp); 3031 pageio_done(bp); 3032 3033 if (stab_comm == UNSTABLE4) 3034 fsdata = C_DELAYCOMMIT; 3035 else 3036 fsdata = C_NOCOMMIT; 3037 3038 savepp = pp; 3039 do { 3040 pp->p_fsdata = fsdata; 3041 } while ((pp = pp->p_next) != savepp); 3042 3043 return (error); 3044 } 3045 3046 /* 3047 */ 3048 static int 3049 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr) 3050 { 3051 nfs4_open_owner_t *oop; 3052 nfs4_open_stream_t *osp; 3053 rnode4_t *rp = VTOR4(vp); 3054 mntinfo4_t *mi = VTOMI4(vp); 3055 int reopen_needed; 3056 3057 ASSERT(nfs_zone() == mi->mi_zone); 3058 3059 3060 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 3061 if (!oop) 3062 return (EIO); 3063 3064 /* returns with 'os_sync_lock' held */ 3065 osp = find_open_stream(oop, rp); 3066 if (!osp) { 3067 open_owner_rele(oop); 3068 return (EIO); 3069 } 3070 3071 if (osp->os_failed_reopen) { 3072 mutex_exit(&osp->os_sync_lock); 3073 open_stream_rele(osp, rp); 3074 open_owner_rele(oop); 3075 return (EIO); 3076 } 3077 3078 /* 3079 * Determine whether a reopen is needed. If this 3080 * is a delegation open stream, then the os_delegation bit 3081 * should be set. 
3082 */ 3083 3084 reopen_needed = osp->os_delegation; 3085 3086 mutex_exit(&osp->os_sync_lock); 3087 open_owner_rele(oop); 3088 3089 if (reopen_needed) { 3090 nfs4_error_zinit(ep); 3091 nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE); 3092 mutex_enter(&osp->os_sync_lock); 3093 if (ep->error || ep->stat || osp->os_failed_reopen) { 3094 mutex_exit(&osp->os_sync_lock); 3095 open_stream_rele(osp, rp); 3096 return (EIO); 3097 } 3098 mutex_exit(&osp->os_sync_lock); 3099 } 3100 open_stream_rele(osp, rp); 3101 3102 return (0); 3103 } 3104 3105 /* 3106 * Write to file. Writes to remote server in largest size 3107 * chunks that the server can handle. Write is synchronous. 3108 */ 3109 static int 3110 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr, 3111 stable_how4 *stab_comm) 3112 { 3113 mntinfo4_t *mi; 3114 COMPOUND4args_clnt args; 3115 COMPOUND4res_clnt res; 3116 WRITE4args *wargs; 3117 WRITE4res *wres; 3118 nfs_argop4 argop[2]; 3119 nfs_resop4 *resop; 3120 int tsize; 3121 stable_how4 stable; 3122 rnode4_t *rp; 3123 int doqueue = 1; 3124 bool_t needrecov; 3125 nfs4_recov_state_t recov_state; 3126 nfs4_stateid_types_t sid_types; 3127 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3128 int recov; 3129 3130 rp = VTOR4(vp); 3131 mi = VTOMI4(vp); 3132 3133 ASSERT(nfs_zone() == mi->mi_zone); 3134 3135 stable = *stab_comm; 3136 *stab_comm = FILE_SYNC4; 3137 3138 needrecov = FALSE; 3139 recov_state.rs_flags = 0; 3140 recov_state.rs_num_retry_despite_err = 0; 3141 nfs4_init_stateid_types(&sid_types); 3142 3143 /* Is curthread the recovery thread? */ 3144 mutex_enter(&mi->mi_lock); 3145 recov = (mi->mi_recovthread == curthread); 3146 mutex_exit(&mi->mi_lock); 3147 3148 recov_retry: 3149 args.ctag = TAG_WRITE; 3150 args.array_len = 2; 3151 args.array = argop; 3152 3153 if (!recov) { 3154 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3155 &recov_state, NULL); 3156 if (e.error) 3157 return (e.error); 3158 } 3159 3160 /* 0. 
putfh target fh */ 3161 argop[0].argop = OP_CPUTFH; 3162 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3163 3164 /* 1. write */ 3165 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types); 3166 3167 do { 3168 3169 wargs->offset = (offset4)offset; 3170 wargs->data_val = base; 3171 3172 if (mi->mi_io_kstats) { 3173 mutex_enter(&mi->mi_lock); 3174 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3175 mutex_exit(&mi->mi_lock); 3176 } 3177 3178 if ((vp->v_flag & VNOCACHE) || 3179 (rp->r_flags & R4DIRECTIO) || 3180 (mi->mi_flags & MI4_DIRECTIO)) 3181 tsize = MIN(mi->mi_stsize, count); 3182 else 3183 tsize = MIN(mi->mi_curwrite, count); 3184 wargs->data_len = (uint_t)tsize; 3185 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3186 3187 if (mi->mi_io_kstats) { 3188 mutex_enter(&mi->mi_lock); 3189 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3190 mutex_exit(&mi->mi_lock); 3191 } 3192 3193 if (!recov) { 3194 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3195 if (e.error && !needrecov) { 3196 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3197 &recov_state, needrecov); 3198 return (e.error); 3199 } 3200 } else { 3201 if (e.error) 3202 return (e.error); 3203 } 3204 3205 /* 3206 * Do handling of OLD_STATEID outside 3207 * of the normal recovery framework. 3208 * 3209 * If write receives a BAD stateid error while using a 3210 * delegation stateid, retry using the open stateid (if it 3211 * exists). If it doesn't have an open stateid, reopen the 3212 * file first, then retry. 
3213 */ 3214 if (!e.error && res.status == NFS4ERR_OLD_STATEID && 3215 sid_types.cur_sid_type != SPEC_SID) { 3216 nfs4_save_stateid(&wargs->stateid, &sid_types); 3217 if (!recov) 3218 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3219 &recov_state, needrecov); 3220 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3221 goto recov_retry; 3222 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3223 sid_types.cur_sid_type == DEL_SID) { 3224 nfs4_save_stateid(&wargs->stateid, &sid_types); 3225 mutex_enter(&rp->r_statev4_lock); 3226 rp->r_deleg_return_pending = TRUE; 3227 mutex_exit(&rp->r_statev4_lock); 3228 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3229 if (!recov) 3230 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3231 &recov_state, needrecov); 3232 (void) xdr_free(xdr_COMPOUND4res_clnt, 3233 (caddr_t)&res); 3234 return (EIO); 3235 } 3236 if (!recov) 3237 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3238 &recov_state, needrecov); 3239 /* hold needed for nfs4delegreturn_thread */ 3240 VN_HOLD(vp); 3241 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3242 NFS4_DR_DISCARD), FALSE); 3243 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3244 goto recov_retry; 3245 } 3246 3247 if (needrecov) { 3248 bool_t abort; 3249 3250 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3251 "nfs4write: client got error %d, res.status %d" 3252 ", so start recovery", e.error, res.status)); 3253 3254 abort = nfs4_start_recovery(&e, 3255 VTOMI4(vp), vp, NULL, &wargs->stateid, 3256 NULL, OP_WRITE, NULL); 3257 if (!e.error) { 3258 e.error = geterrno4(res.status); 3259 (void) xdr_free(xdr_COMPOUND4res_clnt, 3260 (caddr_t)&res); 3261 } 3262 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3263 &recov_state, needrecov); 3264 if (abort == FALSE) 3265 goto recov_retry; 3266 return (e.error); 3267 } 3268 3269 if (res.status) { 3270 e.error = geterrno4(res.status); 3271 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3272 if (!recov) 3273 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3274 
&recov_state, needrecov); 3275 return (e.error); 3276 } 3277 3278 resop = &res.array[1]; /* write res */ 3279 wres = &resop->nfs_resop4_u.opwrite; 3280 3281 if ((int)wres->count > tsize) { 3282 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3283 3284 zcmn_err(getzoneid(), CE_WARN, 3285 "nfs4write: server wrote %u, requested was %u", 3286 (int)wres->count, tsize); 3287 if (!recov) 3288 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3289 &recov_state, needrecov); 3290 return (EIO); 3291 } 3292 if (wres->committed == UNSTABLE4) { 3293 *stab_comm = UNSTABLE4; 3294 if (wargs->stable == DATA_SYNC4 || 3295 wargs->stable == FILE_SYNC4) { 3296 (void) xdr_free(xdr_COMPOUND4res_clnt, 3297 (caddr_t)&res); 3298 zcmn_err(getzoneid(), CE_WARN, 3299 "nfs4write: server %s did not commit " 3300 "to stable storage", 3301 rp->r_server->sv_hostname); 3302 if (!recov) 3303 nfs4_end_fop(VTOMI4(vp), vp, NULL, 3304 OH_WRITE, &recov_state, needrecov); 3305 return (EIO); 3306 } 3307 } 3308 3309 tsize = (int)wres->count; 3310 count -= tsize; 3311 base += tsize; 3312 offset += tsize; 3313 if (mi->mi_io_kstats) { 3314 mutex_enter(&mi->mi_lock); 3315 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++; 3316 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten += 3317 tsize; 3318 mutex_exit(&mi->mi_lock); 3319 } 3320 lwp_stat_update(LWP_STAT_OUBLK, 1); 3321 mutex_enter(&rp->r_statelock); 3322 if (rp->r_flags & R4HAVEVERF) { 3323 if (rp->r_writeverf != wres->writeverf) { 3324 nfs4_set_mod(vp); 3325 rp->r_writeverf = wres->writeverf; 3326 } 3327 } else { 3328 rp->r_writeverf = wres->writeverf; 3329 rp->r_flags |= R4HAVEVERF; 3330 } 3331 PURGE_ATTRCACHE4_LOCKED(rp); 3332 rp->r_flags |= R4WRITEMODIFIED; 3333 gethrestime(&rp->r_attr.va_mtime); 3334 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3335 mutex_exit(&rp->r_statelock); 3336 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3337 } while (count); 3338 3339 if (!recov) 3340 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state, 3341 needrecov); 3342 3343 
return (e.error); 3344 } 3345 3346 /* 3347 * Read from a file. Reads data in largest chunks our interface can handle. 3348 */ 3349 static int 3350 nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count, 3351 size_t *residp, cred_t *cr, bool_t async, struct uio *uiop) 3352 { 3353 mntinfo4_t *mi; 3354 COMPOUND4args_clnt args; 3355 COMPOUND4res_clnt res; 3356 READ4args *rargs; 3357 nfs_argop4 argop[2]; 3358 int tsize; 3359 int doqueue; 3360 rnode4_t *rp; 3361 int data_len; 3362 bool_t is_eof; 3363 bool_t needrecov = FALSE; 3364 nfs4_recov_state_t recov_state; 3365 nfs4_stateid_types_t sid_types; 3366 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3367 3368 rp = VTOR4(vp); 3369 mi = VTOMI4(vp); 3370 doqueue = 1; 3371 3372 ASSERT(nfs_zone() == mi->mi_zone); 3373 3374 args.ctag = async ? TAG_READAHEAD : TAG_READ; 3375 3376 args.array_len = 2; 3377 args.array = argop; 3378 3379 nfs4_init_stateid_types(&sid_types); 3380 3381 recov_state.rs_flags = 0; 3382 recov_state.rs_num_retry_despite_err = 0; 3383 3384 recov_retry: 3385 e.error = nfs4_start_fop(mi, vp, NULL, OH_READ, 3386 &recov_state, NULL); 3387 if (e.error) 3388 return (e.error); 3389 3390 /* putfh target fh */ 3391 argop[0].argop = OP_CPUTFH; 3392 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3393 3394 /* read */ 3395 argop[1].argop = OP_READ; 3396 rargs = &argop[1].nfs_argop4_u.opread; 3397 rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 3398 OP_READ, &sid_types, async); 3399 3400 do { 3401 if (mi->mi_io_kstats) { 3402 mutex_enter(&mi->mi_lock); 3403 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3404 mutex_exit(&mi->mi_lock); 3405 } 3406 3407 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3408 "nfs4read: %s call, rp %s", 3409 needrecov ? 
"recov" : "first", 3410 rnode4info(rp))); 3411 3412 if ((vp->v_flag & VNOCACHE) || 3413 (rp->r_flags & R4DIRECTIO) || 3414 (mi->mi_flags & MI4_DIRECTIO)) 3415 tsize = MIN(mi->mi_tsize, count); 3416 else 3417 tsize = MIN(mi->mi_curread, count); 3418 3419 rargs->offset = (offset4)offset; 3420 rargs->count = (count4)tsize; 3421 rargs->res_data_val_alt = NULL; 3422 rargs->res_mblk = NULL; 3423 rargs->res_uiop = NULL; 3424 rargs->res_maxsize = 0; 3425 rargs->wlist = NULL; 3426 3427 if (uiop) 3428 rargs->res_uiop = uiop; 3429 else 3430 rargs->res_data_val_alt = base; 3431 rargs->res_maxsize = tsize; 3432 3433 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3434 #ifdef DEBUG 3435 if (nfs4read_error_inject) { 3436 res.status = nfs4read_error_inject; 3437 nfs4read_error_inject = 0; 3438 } 3439 #endif 3440 3441 if (mi->mi_io_kstats) { 3442 mutex_enter(&mi->mi_lock); 3443 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3444 mutex_exit(&mi->mi_lock); 3445 } 3446 3447 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3448 if (e.error != 0 && !needrecov) { 3449 nfs4_end_fop(mi, vp, NULL, OH_READ, 3450 &recov_state, needrecov); 3451 return (e.error); 3452 } 3453 3454 /* 3455 * Do proper retry for OLD and BAD stateid errors outside 3456 * of the normal recovery framework. There are two differences 3457 * between async and sync reads. The first is that we allow 3458 * retry on BAD_STATEID for async reads, but not sync reads. 3459 * The second is that we mark the file dead for a failed 3460 * attempt with a special stateid for sync reads, but just 3461 * return EIO for async reads. 3462 * 3463 * If a sync read receives a BAD stateid error while using a 3464 * delegation stateid, retry using the open stateid (if it 3465 * exists). If it doesn't have an open stateid, reopen the 3466 * file first, then retry. 
3467 */ 3468 if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID || 3469 res.status == NFS4ERR_BAD_STATEID) && async) { 3470 nfs4_end_fop(mi, vp, NULL, OH_READ, 3471 &recov_state, needrecov); 3472 if (sid_types.cur_sid_type == SPEC_SID) { 3473 (void) xdr_free(xdr_COMPOUND4res_clnt, 3474 (caddr_t)&res); 3475 return (EIO); 3476 } 3477 nfs4_save_stateid(&rargs->stateid, &sid_types); 3478 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3479 goto recov_retry; 3480 } else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3481 !async && sid_types.cur_sid_type != SPEC_SID) { 3482 nfs4_save_stateid(&rargs->stateid, &sid_types); 3483 nfs4_end_fop(mi, vp, NULL, OH_READ, 3484 &recov_state, needrecov); 3485 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3486 goto recov_retry; 3487 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3488 sid_types.cur_sid_type == DEL_SID) { 3489 nfs4_save_stateid(&rargs->stateid, &sid_types); 3490 mutex_enter(&rp->r_statev4_lock); 3491 rp->r_deleg_return_pending = TRUE; 3492 mutex_exit(&rp->r_statev4_lock); 3493 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3494 nfs4_end_fop(mi, vp, NULL, OH_READ, 3495 &recov_state, needrecov); 3496 (void) xdr_free(xdr_COMPOUND4res_clnt, 3497 (caddr_t)&res); 3498 return (EIO); 3499 } 3500 nfs4_end_fop(mi, vp, NULL, OH_READ, 3501 &recov_state, needrecov); 3502 /* hold needed for nfs4delegreturn_thread */ 3503 VN_HOLD(vp); 3504 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3505 NFS4_DR_DISCARD), FALSE); 3506 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3507 goto recov_retry; 3508 } 3509 if (needrecov) { 3510 bool_t abort; 3511 3512 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3513 "nfs4read: initiating recovery\n")); 3514 abort = nfs4_start_recovery(&e, 3515 mi, vp, NULL, &rargs->stateid, 3516 NULL, OP_READ, NULL); 3517 nfs4_end_fop(mi, vp, NULL, OH_READ, 3518 &recov_state, needrecov); 3519 /* 3520 * Do not retry if we got OLD_STATEID using a special 3521 * 
stateid. This avoids looping with a broken server. 3522 */ 3523 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3524 sid_types.cur_sid_type == SPEC_SID) 3525 abort = TRUE; 3526 3527 if (abort == FALSE) { 3528 /* 3529 * Need to retry all possible stateids in 3530 * case the recovery error wasn't stateid 3531 * related or the stateids have become 3532 * stale (server reboot). 3533 */ 3534 nfs4_init_stateid_types(&sid_types); 3535 (void) xdr_free(xdr_COMPOUND4res_clnt, 3536 (caddr_t)&res); 3537 goto recov_retry; 3538 } 3539 3540 if (!e.error) { 3541 e.error = geterrno4(res.status); 3542 (void) xdr_free(xdr_COMPOUND4res_clnt, 3543 (caddr_t)&res); 3544 } 3545 return (e.error); 3546 } 3547 3548 if (res.status) { 3549 e.error = geterrno4(res.status); 3550 nfs4_end_fop(mi, vp, NULL, OH_READ, 3551 &recov_state, needrecov); 3552 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3553 return (e.error); 3554 } 3555 3556 data_len = res.array[1].nfs_resop4_u.opread.data_len; 3557 count -= data_len; 3558 if (base) 3559 base += data_len; 3560 offset += data_len; 3561 if (mi->mi_io_kstats) { 3562 mutex_enter(&mi->mi_lock); 3563 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 3564 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len; 3565 mutex_exit(&mi->mi_lock); 3566 } 3567 lwp_stat_update(LWP_STAT_INBLK, 1); 3568 is_eof = res.array[1].nfs_resop4_u.opread.eof; 3569 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3570 3571 } while (count && !is_eof); 3572 3573 *residp = count; 3574 3575 nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov); 3576 3577 return (e.error); 3578 } 3579 3580 /* ARGSUSED */ 3581 static int 3582 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp, 3583 caller_context_t *ct) 3584 { 3585 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3586 return (EIO); 3587 switch (cmd) { 3588 case _FIODIRECTIO: 3589 return (nfs4_directio(vp, (int)arg, cr)); 3590 default: 3591 return (ENOTTY); 3592 } 3593 } 3594 3595 /* ARGSUSED */ 3596 
int 3597 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3598 caller_context_t *ct) 3599 { 3600 int error; 3601 rnode4_t *rp = VTOR4(vp); 3602 3603 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3604 return (EIO); 3605 /* 3606 * If it has been specified that the return value will 3607 * just be used as a hint, and we are only being asked 3608 * for size, fsid or rdevid, then return the client's 3609 * notion of these values without checking to make sure 3610 * that the attribute cache is up to date. 3611 * The whole point is to avoid an over the wire GETATTR 3612 * call. 3613 */ 3614 if (flags & ATTR_HINT) { 3615 if (vap->va_mask == 3616 (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) { 3617 mutex_enter(&rp->r_statelock); 3618 if (vap->va_mask | AT_SIZE) 3619 vap->va_size = rp->r_size; 3620 if (vap->va_mask | AT_FSID) 3621 vap->va_fsid = rp->r_attr.va_fsid; 3622 if (vap->va_mask | AT_RDEV) 3623 vap->va_rdev = rp->r_attr.va_rdev; 3624 mutex_exit(&rp->r_statelock); 3625 return (0); 3626 } 3627 } 3628 3629 /* 3630 * Only need to flush pages if asking for the mtime 3631 * and if there any dirty pages or any outstanding 3632 * asynchronous (write) requests for this file. 
3633 */ 3634 if (vap->va_mask & AT_MTIME) { 3635 rp = VTOR4(vp); 3636 if (nfs4_has_pages(vp)) { 3637 mutex_enter(&rp->r_statev4_lock); 3638 if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) { 3639 mutex_exit(&rp->r_statev4_lock); 3640 if (rp->r_flags & R4DIRTY || 3641 rp->r_awcount > 0) { 3642 mutex_enter(&rp->r_statelock); 3643 rp->r_gcount++; 3644 mutex_exit(&rp->r_statelock); 3645 error = 3646 nfs4_putpage(vp, (u_offset_t)0, 3647 0, 0, cr, NULL); 3648 mutex_enter(&rp->r_statelock); 3649 if (error && (error == ENOSPC || 3650 error == EDQUOT)) { 3651 if (!rp->r_error) 3652 rp->r_error = error; 3653 } 3654 if (--rp->r_gcount == 0) 3655 cv_broadcast(&rp->r_cv); 3656 mutex_exit(&rp->r_statelock); 3657 } 3658 } else { 3659 mutex_exit(&rp->r_statev4_lock); 3660 } 3661 } 3662 } 3663 return (nfs4getattr(vp, vap, cr)); 3664 } 3665 3666 int 3667 nfs4_compare_modes(mode_t from_server, mode_t on_client) 3668 { 3669 /* 3670 * If these are the only two bits cleared 3671 * on the server then return 0 (OK) else 3672 * return 1 (BAD). 3673 */ 3674 on_client &= ~(S_ISUID|S_ISGID); 3675 if (on_client == from_server) 3676 return (0); 3677 else 3678 return (1); 3679 } 3680 3681 /*ARGSUSED4*/ 3682 static int 3683 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3684 caller_context_t *ct) 3685 { 3686 if (vap->va_mask & AT_NOSET) 3687 return (EINVAL); 3688 3689 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3690 return (EIO); 3691 3692 /* 3693 * Don't call secpolicy_vnode_setattr, the client cannot 3694 * use its cached attributes to make security decisions 3695 * as the server may be faking mode bits or mapping uid/gid. 3696 * Always just let the server to the checking. 3697 * If we provide the ability to remove basic priviledges 3698 * to setattr (e.g. basic without chmod) then we will 3699 * need to add a check here before calling the server. 
3700 */ 3701 3702 return (nfs4setattr(vp, vap, flags, cr, NULL)); 3703 } 3704 3705 /* 3706 * To replace the "guarded" version 3 setattr, we use two types of compound 3707 * setattr requests: 3708 * 1. The "normal" setattr, used when the size of the file isn't being 3709 * changed - { Putfh <fh>; Setattr; Getattr }/ 3710 * 2. If the size is changed, precede Setattr with: Getattr; Verify 3711 * with only ctime as the argument. If the server ctime differs from 3712 * what is cached on the client, the verify will fail, but we would 3713 * already have the ctime from the preceding getattr, so just set it 3714 * and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify; 3715 * Setattr; Getattr }. 3716 * 3717 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in 3718 * this setattr and NULL if they are not. 3719 */ 3720 static int 3721 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3722 vsecattr_t *vsap) 3723 { 3724 COMPOUND4args_clnt args; 3725 COMPOUND4res_clnt res, *resp = NULL; 3726 nfs4_ga_res_t *garp = NULL; 3727 int numops = 3; /* { Putfh; Setattr; Getattr } */ 3728 nfs_argop4 argop[5]; 3729 int verify_argop = -1; 3730 int setattr_argop = 1; 3731 nfs_resop4 *resop; 3732 vattr_t va; 3733 rnode4_t *rp; 3734 int doqueue = 1; 3735 uint_t mask = vap->va_mask; 3736 mode_t omode; 3737 vsecattr_t *vsp; 3738 timestruc_t ctime; 3739 bool_t needrecov = FALSE; 3740 nfs4_recov_state_t recov_state; 3741 nfs4_stateid_types_t sid_types; 3742 stateid4 stateid; 3743 hrtime_t t; 3744 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3745 servinfo4_t *svp; 3746 bitmap4 supp_attrs; 3747 3748 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3749 rp = VTOR4(vp); 3750 nfs4_init_stateid_types(&sid_types); 3751 3752 /* 3753 * Only need to flush pages if there are any pages and 3754 * if the file is marked as dirty in some fashion. 
The 3755 * file must be flushed so that we can accurately 3756 * determine the size of the file and the cached data 3757 * after the SETATTR returns. A file is considered to 3758 * be dirty if it is either marked with R4DIRTY, has 3759 * outstanding i/o's active, or is mmap'd. In this 3760 * last case, we can't tell whether there are dirty 3761 * pages, so we flush just to be sure. 3762 */ 3763 if (nfs4_has_pages(vp) && 3764 ((rp->r_flags & R4DIRTY) || 3765 rp->r_count > 0 || 3766 rp->r_mapcnt > 0)) { 3767 ASSERT(vp->v_type != VCHR); 3768 e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL); 3769 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 3770 mutex_enter(&rp->r_statelock); 3771 if (!rp->r_error) 3772 rp->r_error = e.error; 3773 mutex_exit(&rp->r_statelock); 3774 } 3775 } 3776 3777 if (mask & AT_SIZE) { 3778 /* 3779 * Verification setattr compound for non-deleg AT_SIZE: 3780 * { Putfh; Getattr; Verify; Setattr; Getattr } 3781 * Set ctime local here (outside the do_again label) 3782 * so that subsequent retries (after failed VERIFY) 3783 * will use ctime from GETATTR results (from failed 3784 * verify compound) as VERIFY arg. 3785 * If file has delegation, then VERIFY(time_metadata) 3786 * is of little added value, so don't bother. 
3787 */ 3788 mutex_enter(&rp->r_statev4_lock); 3789 if (rp->r_deleg_type == OPEN_DELEGATE_NONE || 3790 rp->r_deleg_return_pending) { 3791 numops = 5; 3792 ctime = rp->r_attr.va_ctime; 3793 } 3794 mutex_exit(&rp->r_statev4_lock); 3795 } 3796 3797 recov_state.rs_flags = 0; 3798 recov_state.rs_num_retry_despite_err = 0; 3799 3800 args.ctag = TAG_SETATTR; 3801 do_again: 3802 recov_retry: 3803 setattr_argop = numops - 2; 3804 3805 args.array = argop; 3806 args.array_len = numops; 3807 3808 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 3809 if (e.error) 3810 return (e.error); 3811 3812 3813 /* putfh target fh */ 3814 argop[0].argop = OP_CPUTFH; 3815 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3816 3817 if (numops == 5) { 3818 /* 3819 * We only care about the ctime, but need to get mtime 3820 * and size for proper cache update. 3821 */ 3822 /* getattr */ 3823 argop[1].argop = OP_GETATTR; 3824 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3825 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3826 3827 /* verify - set later in loop */ 3828 verify_argop = 2; 3829 } 3830 3831 /* setattr */ 3832 svp = rp->r_server; 3833 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3834 supp_attrs = svp->sv_supp_attrs; 3835 nfs_rw_exit(&svp->sv_lock); 3836 3837 nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr, 3838 supp_attrs, &e.error, &sid_types); 3839 stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid; 3840 if (e.error) { 3841 /* req time field(s) overflow - return immediately */ 3842 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 3843 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 
3844 opsetattr.obj_attributes); 3845 return (e.error); 3846 } 3847 omode = rp->r_attr.va_mode; 3848 3849 /* getattr */ 3850 argop[numops-1].argop = OP_GETATTR; 3851 argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3852 /* 3853 * If we are setting the ACL (indicated only by vsap != NULL), request 3854 * the ACL in this getattr. The ACL returned from this getattr will be 3855 * used in updating the ACL cache. 3856 */ 3857 if (vsap != NULL) 3858 argop[numops-1].nfs_argop4_u.opgetattr.attr_request |= 3859 FATTR4_ACL_MASK; 3860 argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3861 3862 /* 3863 * setattr iterates if the object size is set and the cached ctime 3864 * does not match the file ctime. In that case, verify the ctime first. 3865 */ 3866 3867 do { 3868 if (verify_argop != -1) { 3869 /* 3870 * Verify that the ctime match before doing setattr. 3871 */ 3872 va.va_mask = AT_CTIME; 3873 va.va_ctime = ctime; 3874 svp = rp->r_server; 3875 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3876 supp_attrs = svp->sv_supp_attrs; 3877 nfs_rw_exit(&svp->sv_lock); 3878 e.error = nfs4args_verify(&argop[verify_argop], &va, 3879 OP_VERIFY, supp_attrs); 3880 if (e.error) { 3881 /* req time field(s) overflow - return */ 3882 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3883 needrecov); 3884 break; 3885 } 3886 } 3887 3888 doqueue = 1; 3889 3890 t = gethrtime(); 3891 3892 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 3893 3894 /* 3895 * Purge the access cache and ACL cache if changing either the 3896 * owner of the file, the group owner, or the mode. These may 3897 * change the access permissions of the file, so purge old 3898 * information and start over again. 
3899 */ 3900 if (mask & (AT_UID | AT_GID | AT_MODE)) { 3901 (void) nfs4_access_purge_rp(rp); 3902 if (rp->r_secattr != NULL) { 3903 mutex_enter(&rp->r_statelock); 3904 vsp = rp->r_secattr; 3905 rp->r_secattr = NULL; 3906 mutex_exit(&rp->r_statelock); 3907 if (vsp != NULL) 3908 nfs4_acl_free_cache(vsp); 3909 } 3910 } 3911 3912 /* 3913 * If res.array_len == numops, then everything succeeded, 3914 * except for possibly the final getattr. If only the 3915 * last getattr failed, give up, and don't try recovery. 3916 */ 3917 if (res.array_len == numops) { 3918 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3919 needrecov); 3920 if (! e.error) 3921 resp = &res; 3922 break; 3923 } 3924 3925 /* 3926 * if either rpc call failed or completely succeeded - done 3927 */ 3928 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 3929 if (e.error) { 3930 PURGE_ATTRCACHE4(vp); 3931 if (!needrecov) { 3932 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3933 needrecov); 3934 break; 3935 } 3936 } 3937 3938 /* 3939 * Do proper retry for OLD_STATEID outside of the normal 3940 * recovery framework. 3941 */ 3942 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3943 sid_types.cur_sid_type != SPEC_SID && 3944 sid_types.cur_sid_type != NO_SID) { 3945 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3946 needrecov); 3947 nfs4_save_stateid(&stateid, &sid_types); 3948 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 3949 opsetattr.obj_attributes); 3950 if (verify_argop != -1) { 3951 nfs4args_verify_free(&argop[verify_argop]); 3952 verify_argop = -1; 3953 } 3954 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3955 goto recov_retry; 3956 } 3957 3958 if (needrecov) { 3959 bool_t abort; 3960 3961 abort = nfs4_start_recovery(&e, 3962 VTOMI4(vp), vp, NULL, NULL, NULL, 3963 OP_SETATTR, NULL); 3964 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3965 needrecov); 3966 /* 3967 * Do not retry if we failed with OLD_STATEID using 3968 * a special stateid. 
This is done to avoid looping 3969 * with a broken server. 3970 */ 3971 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3972 (sid_types.cur_sid_type == SPEC_SID || 3973 sid_types.cur_sid_type == NO_SID)) 3974 abort = TRUE; 3975 if (!e.error) { 3976 if (res.status == NFS4ERR_BADOWNER) 3977 nfs4_log_badowner(VTOMI4(vp), 3978 OP_SETATTR); 3979 3980 e.error = geterrno4(res.status); 3981 (void) xdr_free(xdr_COMPOUND4res_clnt, 3982 (caddr_t)&res); 3983 } 3984 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 3985 opsetattr.obj_attributes); 3986 if (verify_argop != -1) { 3987 nfs4args_verify_free(&argop[verify_argop]); 3988 verify_argop = -1; 3989 } 3990 if (abort == FALSE) { 3991 /* 3992 * Need to retry all possible stateids in 3993 * case the recovery error wasn't stateid 3994 * related or the stateids have become 3995 * stale (server reboot). 3996 */ 3997 nfs4_init_stateid_types(&sid_types); 3998 goto recov_retry; 3999 } 4000 return (e.error); 4001 } 4002 4003 /* 4004 * Need to call nfs4_end_op before nfs4getattr to 4005 * avoid potential nfs4_start_op deadlock. See RFE 4006 * 4777612. Calls to nfs4_invalidate_pages() and 4007 * nfs4_purge_stale_fh() might also generate over the 4008 * wire calls which my cause nfs4_start_op() deadlock. 4009 */ 4010 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 4011 4012 /* 4013 * Check to update lease. 4014 */ 4015 resp = &res; 4016 if (res.status == NFS4_OK) { 4017 break; 4018 } 4019 4020 /* 4021 * Check if verify failed to see if try again 4022 */ 4023 if ((verify_argop == -1) || (res.array_len != 3)) { 4024 /* 4025 * can't continue... 4026 */ 4027 if (res.status == NFS4ERR_BADOWNER) 4028 nfs4_log_badowner(VTOMI4(vp), OP_SETATTR); 4029 4030 e.error = geterrno4(res.status); 4031 } else { 4032 /* 4033 * When the verify request fails, the client ctime is 4034 * not in sync with the server. 
This is the same as 4035 * the version 3 "not synchronized" error, and we 4036 * handle it in a similar manner (XXX do we need to???). 4037 * Use the ctime returned in the first getattr for 4038 * the input to the next verify. 4039 * If we couldn't get the attributes, then we give up 4040 * because we can't complete the operation as required. 4041 */ 4042 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res; 4043 } 4044 if (e.error) { 4045 PURGE_ATTRCACHE4(vp); 4046 nfs4_purge_stale_fh(e.error, vp, cr); 4047 } else { 4048 /* 4049 * retry with a new verify value 4050 */ 4051 ctime = garp->n4g_va.va_ctime; 4052 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4053 resp = NULL; 4054 } 4055 if (!e.error) { 4056 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4057 opsetattr.obj_attributes); 4058 if (verify_argop != -1) { 4059 nfs4args_verify_free(&argop[verify_argop]); 4060 verify_argop = -1; 4061 } 4062 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4063 goto do_again; 4064 } 4065 } while (!e.error); 4066 4067 if (e.error) { 4068 /* 4069 * If we are here, rfs4call has an irrecoverable error - return 4070 */ 4071 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4072 opsetattr.obj_attributes); 4073 if (verify_argop != -1) { 4074 nfs4args_verify_free(&argop[verify_argop]); 4075 verify_argop = -1; 4076 } 4077 if (resp) 4078 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4079 return (e.error); 4080 } 4081 4082 4083 4084 /* 4085 * If changing the size of the file, invalidate 4086 * any local cached data which is no longer part 4087 * of the file. We also possibly invalidate the 4088 * last page in the file. We could use 4089 * pvn_vpzero(), but this would mark the page as 4090 * modified and require it to be written back to 4091 * the server for no particularly good reason. 4092 * This way, if we access it, then we bring it 4093 * back in. A read should be cheaper than a 4094 * write. 
4095 */ 4096 if (mask & AT_SIZE) { 4097 nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr); 4098 } 4099 4100 /* either no error or one of the postop getattr failed */ 4101 4102 /* 4103 * XXX Perform a simplified version of wcc checking. Instead of 4104 * have another getattr to get pre-op, just purge cache if 4105 * any of the ops prior to and including the getattr failed. 4106 * If the getattr succeeded then update the attrcache accordingly. 4107 */ 4108 4109 garp = NULL; 4110 if (res.status == NFS4_OK) { 4111 /* 4112 * Last getattr 4113 */ 4114 resop = &res.array[numops - 1]; 4115 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4116 } 4117 /* 4118 * In certain cases, nfs4_update_attrcache() will purge the attrcache, 4119 * rather than filling it. See the function itself for details. 4120 */ 4121 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4122 if (garp != NULL) { 4123 if (garp->n4g_resbmap & FATTR4_ACL_MASK) { 4124 nfs4_acl_fill_cache(rp, &garp->n4g_vsa); 4125 vs_ace4_destroy(&garp->n4g_vsa); 4126 } else { 4127 if (vsap != NULL) { 4128 /* 4129 * The ACL was supposed to be set and to be 4130 * returned in the last getattr of this 4131 * compound, but for some reason the getattr 4132 * result doesn't contain the ACL. In this 4133 * case, purge the ACL cache. 4134 */ 4135 if (rp->r_secattr != NULL) { 4136 mutex_enter(&rp->r_statelock); 4137 vsp = rp->r_secattr; 4138 rp->r_secattr = NULL; 4139 mutex_exit(&rp->r_statelock); 4140 if (vsp != NULL) 4141 nfs4_acl_free_cache(vsp); 4142 } 4143 } 4144 } 4145 } 4146 4147 if (res.status == NFS4_OK && (mask & AT_SIZE)) { 4148 /* 4149 * Set the size, rather than relying on getting it updated 4150 * via a GETATTR. With delegations the client tries to 4151 * suppress GETATTR calls. 
4152 */ 4153 mutex_enter(&rp->r_statelock); 4154 rp->r_size = vap->va_size; 4155 mutex_exit(&rp->r_statelock); 4156 } 4157 4158 /* 4159 * Can free up request args and res 4160 */ 4161 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4162 opsetattr.obj_attributes); 4163 if (verify_argop != -1) { 4164 nfs4args_verify_free(&argop[verify_argop]); 4165 verify_argop = -1; 4166 } 4167 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4168 4169 /* 4170 * Some servers will change the mode to clear the setuid 4171 * and setgid bits when changing the uid or gid. The 4172 * client needs to compensate appropriately. 4173 */ 4174 if (mask & (AT_UID | AT_GID)) { 4175 int terror, do_setattr; 4176 4177 do_setattr = 0; 4178 va.va_mask = AT_MODE; 4179 terror = nfs4getattr(vp, &va, cr); 4180 if (!terror && 4181 (((mask & AT_MODE) && va.va_mode != vap->va_mode) || 4182 (!(mask & AT_MODE) && va.va_mode != omode))) { 4183 va.va_mask = AT_MODE; 4184 if (mask & AT_MODE) { 4185 /* 4186 * We asked the mode to be changed and what 4187 * we just got from the server in getattr is 4188 * not what we wanted it to be, so set it now. 4189 */ 4190 va.va_mode = vap->va_mode; 4191 do_setattr = 1; 4192 } else { 4193 /* 4194 * We did not ask the mode to be changed, 4195 * Check to see that the server just cleared 4196 * I_SUID and I_GUID from it. If not then 4197 * set mode to omode with UID/GID cleared. 
4198 */ 4199 if (nfs4_compare_modes(va.va_mode, omode)) { 4200 omode &= ~(S_ISUID|S_ISGID); 4201 va.va_mode = omode; 4202 do_setattr = 1; 4203 } 4204 } 4205 4206 if (do_setattr) 4207 (void) nfs4setattr(vp, &va, 0, cr, NULL); 4208 } 4209 } 4210 4211 return (e.error); 4212 } 4213 4214 /* ARGSUSED */ 4215 static int 4216 nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) 4217 { 4218 COMPOUND4args_clnt args; 4219 COMPOUND4res_clnt res; 4220 int doqueue; 4221 uint32_t acc, resacc, argacc; 4222 rnode4_t *rp; 4223 cred_t *cred, *ncr, *ncrfree = NULL; 4224 nfs4_access_type_t cacc; 4225 int num_ops; 4226 nfs_argop4 argop[3]; 4227 nfs_resop4 *resop; 4228 bool_t needrecov = FALSE, do_getattr; 4229 nfs4_recov_state_t recov_state; 4230 int rpc_error; 4231 hrtime_t t; 4232 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4233 mntinfo4_t *mi = VTOMI4(vp); 4234 4235 if (nfs_zone() != mi->mi_zone) 4236 return (EIO); 4237 4238 acc = 0; 4239 if (mode & VREAD) 4240 acc |= ACCESS4_READ; 4241 if (mode & VWRITE) { 4242 if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type)) 4243 return (EROFS); 4244 if (vp->v_type == VDIR) 4245 acc |= ACCESS4_DELETE; 4246 acc |= ACCESS4_MODIFY | ACCESS4_EXTEND; 4247 } 4248 if (mode & VEXEC) { 4249 if (vp->v_type == VDIR) 4250 acc |= ACCESS4_LOOKUP; 4251 else 4252 acc |= ACCESS4_EXECUTE; 4253 } 4254 4255 if (VTOR4(vp)->r_acache != NULL) { 4256 e.error = nfs4_validate_caches(vp, cr); 4257 if (e.error) 4258 return (e.error); 4259 } 4260 4261 rp = VTOR4(vp); 4262 if (vp->v_type == VDIR) 4263 argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY | 4264 ACCESS4_EXTEND | ACCESS4_LOOKUP; 4265 else 4266 argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND | 4267 ACCESS4_EXECUTE; 4268 recov_state.rs_flags = 0; 4269 recov_state.rs_num_retry_despite_err = 0; 4270 4271 cred = cr; 4272 /* 4273 * ncr and ncrfree both initially 4274 * point to the memory area returned 4275 * by crnetadjust(); 4276 * ncrfree not NULL when exiting 
means 4277 * that we need to release it 4278 */ 4279 ncr = crnetadjust(cred); 4280 ncrfree = ncr; 4281 4282 tryagain: 4283 cacc = nfs4_access_check(rp, acc, cred); 4284 if (cacc == NFS4_ACCESS_ALLOWED) { 4285 if (ncrfree != NULL) 4286 crfree(ncrfree); 4287 return (0); 4288 } 4289 if (cacc == NFS4_ACCESS_DENIED) { 4290 /* 4291 * If the cred can be adjusted, try again 4292 * with the new cred. 4293 */ 4294 if (ncr != NULL) { 4295 cred = ncr; 4296 ncr = NULL; 4297 goto tryagain; 4298 } 4299 if (ncrfree != NULL) 4300 crfree(ncrfree); 4301 return (EACCES); 4302 } 4303 4304 recov_retry: 4305 /* 4306 * Don't take with r_statev4_lock here. r_deleg_type could 4307 * change as soon as lock is released. Since it is an int, 4308 * there is no atomicity issue. 4309 */ 4310 do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE); 4311 num_ops = do_getattr ? 3 : 2; 4312 4313 args.ctag = TAG_ACCESS; 4314 4315 args.array_len = num_ops; 4316 args.array = argop; 4317 4318 if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS, 4319 &recov_state, NULL)) { 4320 if (ncrfree != NULL) 4321 crfree(ncrfree); 4322 return (e.error); 4323 } 4324 4325 /* putfh target fh */ 4326 argop[0].argop = OP_CPUTFH; 4327 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4328 4329 /* access */ 4330 argop[1].argop = OP_ACCESS; 4331 argop[1].nfs_argop4_u.opaccess.access = argacc; 4332 4333 /* getattr */ 4334 if (do_getattr) { 4335 argop[2].argop = OP_GETATTR; 4336 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4337 argop[2].nfs_argop4_u.opgetattr.mi = mi; 4338 } 4339 4340 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4341 "nfs4_access: %s call, rp %s", needrecov ? 
"recov" : "first", 4342 rnode4info(VTOR4(vp)))); 4343 4344 doqueue = 1; 4345 t = gethrtime(); 4346 rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e); 4347 rpc_error = e.error; 4348 4349 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4350 if (needrecov) { 4351 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4352 "nfs4_access: initiating recovery\n")); 4353 4354 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4355 NULL, OP_ACCESS, NULL) == FALSE) { 4356 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS, 4357 &recov_state, needrecov); 4358 if (!e.error) 4359 (void) xdr_free(xdr_COMPOUND4res_clnt, 4360 (caddr_t)&res); 4361 goto recov_retry; 4362 } 4363 } 4364 nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov); 4365 4366 if (e.error) 4367 goto out; 4368 4369 if (res.status) { 4370 e.error = geterrno4(res.status); 4371 /* 4372 * This might generate over the wire calls throught 4373 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4374 * here to avoid a deadlock. 4375 */ 4376 nfs4_purge_stale_fh(e.error, vp, cr); 4377 goto out; 4378 } 4379 resop = &res.array[1]; /* access res */ 4380 4381 resacc = resop->nfs_resop4_u.opaccess.access; 4382 4383 if (do_getattr) { 4384 resop++; /* getattr res */ 4385 nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res, 4386 t, cr, FALSE, NULL); 4387 } 4388 4389 if (!e.error) { 4390 nfs4_access_cache(rp, argacc, resacc, cred); 4391 /* 4392 * we just cached results with cred; if cred is the 4393 * adjusted credentials from crnetadjust, we do not want 4394 * to release them before exiting: hence setting ncrfree 4395 * to NULL 4396 */ 4397 if (cred != cr) 4398 ncrfree = NULL; 4399 /* XXX check the supported bits too? */ 4400 if ((acc & resacc) != acc) { 4401 /* 4402 * The following code implements the semantic 4403 * that a setuid root program has *at least* the 4404 * permissions of the user that is running the 4405 * program. 
See rfs3call() for more portions 4406 * of the implementation of this functionality. 4407 */ 4408 /* XXX-LP */ 4409 if (ncr != NULL) { 4410 (void) xdr_free(xdr_COMPOUND4res_clnt, 4411 (caddr_t)&res); 4412 cred = ncr; 4413 ncr = NULL; 4414 goto tryagain; 4415 } 4416 e.error = EACCES; 4417 } 4418 } 4419 4420 out: 4421 if (!rpc_error) 4422 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4423 4424 if (ncrfree != NULL) 4425 crfree(ncrfree); 4426 4427 return (e.error); 4428 } 4429 4430 /* ARGSUSED */ 4431 static int 4432 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct) 4433 { 4434 COMPOUND4args_clnt args; 4435 COMPOUND4res_clnt res; 4436 int doqueue; 4437 rnode4_t *rp; 4438 nfs_argop4 argop[3]; 4439 nfs_resop4 *resop; 4440 READLINK4res *lr_res; 4441 nfs4_ga_res_t *garp; 4442 uint_t len; 4443 char *linkdata; 4444 bool_t needrecov = FALSE; 4445 nfs4_recov_state_t recov_state; 4446 hrtime_t t; 4447 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4448 4449 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4450 return (EIO); 4451 /* 4452 * Can't readlink anything other than a symbolic link. 4453 */ 4454 if (vp->v_type != VLNK) 4455 return (EINVAL); 4456 4457 rp = VTOR4(vp); 4458 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) { 4459 e.error = nfs4_validate_caches(vp, cr); 4460 if (e.error) 4461 return (e.error); 4462 mutex_enter(&rp->r_statelock); 4463 if (rp->r_symlink.contents != NULL) { 4464 e.error = uiomove(rp->r_symlink.contents, 4465 rp->r_symlink.len, UIO_READ, uiop); 4466 mutex_exit(&rp->r_statelock); 4467 return (e.error); 4468 } 4469 mutex_exit(&rp->r_statelock); 4470 } 4471 recov_state.rs_flags = 0; 4472 recov_state.rs_num_retry_despite_err = 0; 4473 4474 recov_retry: 4475 args.array_len = 3; 4476 args.array = argop; 4477 args.ctag = TAG_READLINK; 4478 4479 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 4480 if (e.error) { 4481 return (e.error); 4482 } 4483 4484 /* 0. 
putfh symlink fh */ 4485 argop[0].argop = OP_CPUTFH; 4486 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4487 4488 /* 1. readlink */ 4489 argop[1].argop = OP_READLINK; 4490 4491 /* 2. getattr */ 4492 argop[2].argop = OP_GETATTR; 4493 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4494 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 4495 4496 doqueue = 1; 4497 4498 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4499 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first", 4500 rnode4info(VTOR4(vp)))); 4501 4502 t = gethrtime(); 4503 4504 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 4505 4506 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4507 if (needrecov) { 4508 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4509 "nfs4_readlink: initiating recovery\n")); 4510 4511 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4512 NULL, OP_READLINK, NULL) == FALSE) { 4513 if (!e.error) 4514 (void) xdr_free(xdr_COMPOUND4res_clnt, 4515 (caddr_t)&res); 4516 4517 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 4518 needrecov); 4519 goto recov_retry; 4520 } 4521 } 4522 4523 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 4524 4525 if (e.error) 4526 return (e.error); 4527 4528 /* 4529 * There is an path in the code below which calls 4530 * nfs4_purge_stale_fh(), which may generate otw calls through 4531 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4532 * here to avoid nfs4_start_op() deadlock. 
4533 */ 4534 4535 if (res.status && (res.array_len < args.array_len)) { 4536 /* 4537 * either Putfh or Link failed 4538 */ 4539 e.error = geterrno4(res.status); 4540 nfs4_purge_stale_fh(e.error, vp, cr); 4541 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4542 return (e.error); 4543 } 4544 4545 resop = &res.array[1]; /* readlink res */ 4546 lr_res = &resop->nfs_resop4_u.opreadlink; 4547 4548 /* 4549 * treat symlink names as data 4550 */ 4551 linkdata = utf8_to_str(&lr_res->link, &len, NULL); 4552 if (linkdata != NULL) { 4553 int uio_len = len - 1; 4554 /* len includes null byte, which we won't uiomove */ 4555 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop); 4556 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 4557 mutex_enter(&rp->r_statelock); 4558 if (rp->r_symlink.contents == NULL) { 4559 rp->r_symlink.contents = linkdata; 4560 rp->r_symlink.len = uio_len; 4561 rp->r_symlink.size = len; 4562 mutex_exit(&rp->r_statelock); 4563 } else { 4564 mutex_exit(&rp->r_statelock); 4565 kmem_free(linkdata, len); 4566 } 4567 } else { 4568 kmem_free(linkdata, len); 4569 } 4570 } 4571 if (res.status == NFS4_OK) { 4572 resop++; /* getattr res */ 4573 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4574 } 4575 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4576 4577 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4578 4579 /* 4580 * The over the wire error for attempting to readlink something 4581 * other than a symbolic link is ENXIO. However, we need to 4582 * return EINVAL instead of ENXIO, so we map it here. 4583 */ 4584 return (e.error == ENXIO ? EINVAL : e.error); 4585 } 4586 4587 /* 4588 * Flush local dirty pages to stable storage on the server. 4589 * 4590 * If FNODSYNC is specified, then there is nothing to do because 4591 * metadata changes are not cached on the client before being 4592 * sent to the server. 
4593 */ 4594 /* ARGSUSED */ 4595 static int 4596 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 4597 { 4598 int error; 4599 4600 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp)) 4601 return (0); 4602 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4603 return (EIO); 4604 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr); 4605 if (!error) 4606 error = VTOR4(vp)->r_error; 4607 return (error); 4608 } 4609 4610 /* 4611 * Weirdness: if the file was removed or the target of a rename 4612 * operation while it was open, it got renamed instead. Here we 4613 * remove the renamed file. 4614 */ 4615 /* ARGSUSED */ 4616 void 4617 nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4618 { 4619 rnode4_t *rp; 4620 4621 ASSERT(vp != DNLC_NO_VNODE); 4622 4623 rp = VTOR4(vp); 4624 4625 if (IS_SHADOW(vp, rp)) { 4626 sv_inactive(vp); 4627 return; 4628 } 4629 4630 /* 4631 * If this is coming from the wrong zone, we let someone in the right 4632 * zone take care of it asynchronously. We can get here due to 4633 * VN_RELE() being called from pageout() or fsflush(). This call may 4634 * potentially turn into an expensive no-op if, for instance, v_count 4635 * gets incremented in the meantime, but it's still correct. 4636 */ 4637 if (nfs_zone() != VTOMI4(vp)->mi_zone) { 4638 nfs4_async_inactive(vp, cr); 4639 return; 4640 } 4641 4642 /* 4643 * Some of the cleanup steps might require over-the-wire 4644 * operations. Since VOP_INACTIVE can get called as a result of 4645 * other over-the-wire operations (e.g., an attribute cache update 4646 * can lead to a DNLC purge), doing those steps now would lead to a 4647 * nested call to the recovery framework, which can deadlock. So 4648 * do any over-the-wire cleanups asynchronously, in a separate 4649 * thread. 
4650 */ 4651 4652 mutex_enter(&rp->r_os_lock); 4653 mutex_enter(&rp->r_statelock); 4654 mutex_enter(&rp->r_statev4_lock); 4655 4656 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) { 4657 mutex_exit(&rp->r_statev4_lock); 4658 mutex_exit(&rp->r_statelock); 4659 mutex_exit(&rp->r_os_lock); 4660 nfs4_async_inactive(vp, cr); 4661 return; 4662 } 4663 4664 if (rp->r_deleg_type == OPEN_DELEGATE_READ || 4665 rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 4666 mutex_exit(&rp->r_statev4_lock); 4667 mutex_exit(&rp->r_statelock); 4668 mutex_exit(&rp->r_os_lock); 4669 nfs4_async_inactive(vp, cr); 4670 return; 4671 } 4672 4673 if (rp->r_unldvp != NULL) { 4674 mutex_exit(&rp->r_statev4_lock); 4675 mutex_exit(&rp->r_statelock); 4676 mutex_exit(&rp->r_os_lock); 4677 nfs4_async_inactive(vp, cr); 4678 return; 4679 } 4680 mutex_exit(&rp->r_statev4_lock); 4681 mutex_exit(&rp->r_statelock); 4682 mutex_exit(&rp->r_os_lock); 4683 4684 rp4_addfree(rp, cr); 4685 } 4686 4687 /* 4688 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up 4689 * various bits of state. The caller must not refer to vp after this call. 
4690 */ 4691 4692 void 4693 nfs4_inactive_otw(vnode_t *vp, cred_t *cr) 4694 { 4695 rnode4_t *rp = VTOR4(vp); 4696 nfs4_recov_state_t recov_state; 4697 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4698 vnode_t *unldvp; 4699 char *unlname; 4700 cred_t *unlcred; 4701 COMPOUND4args_clnt args; 4702 COMPOUND4res_clnt res, *resp; 4703 nfs_argop4 argop[2]; 4704 int doqueue; 4705 #ifdef DEBUG 4706 char *name; 4707 #endif 4708 4709 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 4710 ASSERT(!IS_SHADOW(vp, rp)); 4711 4712 #ifdef DEBUG 4713 name = fn_name(VTOSV(vp)->sv_name); 4714 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: " 4715 "release vnode %s", name)); 4716 kmem_free(name, MAXNAMELEN); 4717 #endif 4718 4719 if (vp->v_type == VREG) { 4720 bool_t recov_failed = FALSE; 4721 4722 e.error = nfs4close_all(vp, cr); 4723 if (e.error) { 4724 /* Check to see if recovery failed */ 4725 mutex_enter(&(VTOMI4(vp)->mi_lock)); 4726 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL) 4727 recov_failed = TRUE; 4728 mutex_exit(&(VTOMI4(vp)->mi_lock)); 4729 if (!recov_failed) { 4730 mutex_enter(&rp->r_statelock); 4731 if (rp->r_flags & R4RECOVERR) 4732 recov_failed = TRUE; 4733 mutex_exit(&rp->r_statelock); 4734 } 4735 if (recov_failed) { 4736 NFS4_DEBUG(nfs4_client_recov_debug, 4737 (CE_NOTE, "nfs4_inactive_otw: " 4738 "close failed (recovery failure)")); 4739 } 4740 } 4741 } 4742 4743 redo: 4744 if (rp->r_unldvp == NULL) { 4745 rp4_addfree(rp, cr); 4746 return; 4747 } 4748 4749 /* 4750 * Save the vnode pointer for the directory where the 4751 * unlinked-open file got renamed, then set it to NULL 4752 * to prevent another thread from getting here before 4753 * we're done with the remove. While we have the 4754 * statelock, make local copies of the pertinent rnode 4755 * fields. If we weren't to do this in an atomic way, the 4756 * the unl* fields could become inconsistent with respect 4757 * to each other due to a race condition between this 4758 * code and nfs_remove(). 
See bug report 1034328. 4759 */ 4760 mutex_enter(&rp->r_statelock); 4761 if (rp->r_unldvp == NULL) { 4762 mutex_exit(&rp->r_statelock); 4763 rp4_addfree(rp, cr); 4764 return; 4765 } 4766 4767 unldvp = rp->r_unldvp; 4768 rp->r_unldvp = NULL; 4769 unlname = rp->r_unlname; 4770 rp->r_unlname = NULL; 4771 unlcred = rp->r_unlcred; 4772 rp->r_unlcred = NULL; 4773 mutex_exit(&rp->r_statelock); 4774 4775 /* 4776 * If there are any dirty pages left, then flush 4777 * them. This is unfortunate because they just 4778 * may get thrown away during the remove operation, 4779 * but we have to do this for correctness. 4780 */ 4781 if (nfs4_has_pages(vp) && 4782 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 4783 ASSERT(vp->v_type != VCHR); 4784 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL); 4785 if (e.error) { 4786 mutex_enter(&rp->r_statelock); 4787 if (!rp->r_error) 4788 rp->r_error = e.error; 4789 mutex_exit(&rp->r_statelock); 4790 } 4791 } 4792 4793 recov_state.rs_flags = 0; 4794 recov_state.rs_num_retry_despite_err = 0; 4795 recov_retry_remove: 4796 /* 4797 * Do the remove operation on the renamed file 4798 */ 4799 args.ctag = TAG_INACTIVE; 4800 4801 /* 4802 * Remove ops: putfh dir; remove 4803 */ 4804 args.array_len = 2; 4805 args.array = argop; 4806 4807 e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state); 4808 if (e.error) { 4809 kmem_free(unlname, MAXNAMELEN); 4810 crfree(unlcred); 4811 VN_RELE(unldvp); 4812 /* 4813 * Try again; this time around r_unldvp will be NULL, so we'll 4814 * just call rp4_addfree() and return. 4815 */ 4816 goto redo; 4817 } 4818 4819 /* putfh directory */ 4820 argop[0].argop = OP_CPUTFH; 4821 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh; 4822 4823 /* remove */ 4824 argop[1].argop = OP_CREMOVE; 4825 argop[1].nfs_argop4_u.opcremove.ctarget = unlname; 4826 4827 doqueue = 1; 4828 resp = &res; 4829 4830 #if 0 /* notyet */ 4831 /* 4832 * Can't do this yet. 
We may be being called from 4833 * dnlc_purge_XXX while that routine is holding a 4834 * mutex lock to the nc_rele list. The calls to 4835 * nfs3_cache_wcc_data may result in calls to 4836 * dnlc_purge_XXX. This will result in a deadlock. 4837 */ 4838 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4839 if (e.error) { 4840 PURGE_ATTRCACHE4(unldvp); 4841 resp = NULL; 4842 } else if (res.status) { 4843 e.error = geterrno4(res.status); 4844 PURGE_ATTRCACHE4(unldvp); 4845 /* 4846 * This code is inactive right now 4847 * but if made active there should 4848 * be a nfs4_end_op() call before 4849 * nfs4_purge_stale_fh to avoid start_op() 4850 * deadlock. See BugId: 4948726 4851 */ 4852 nfs4_purge_stale_fh(error, unldvp, cr); 4853 } else { 4854 nfs_resop4 *resop; 4855 REMOVE4res *rm_res; 4856 4857 resop = &res.array[1]; 4858 rm_res = &resop->nfs_resop4_u.opremove; 4859 /* 4860 * Update directory cache attribute, 4861 * readdir and dnlc caches. 4862 */ 4863 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL); 4864 } 4865 #else 4866 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4867 4868 PURGE_ATTRCACHE4(unldvp); 4869 #endif 4870 4871 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) { 4872 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL, 4873 NULL, NULL, OP_REMOVE, NULL) == FALSE) { 4874 if (!e.error) 4875 (void) xdr_free(xdr_COMPOUND4res_clnt, 4876 (caddr_t)&res); 4877 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, 4878 &recov_state, TRUE); 4879 goto recov_retry_remove; 4880 } 4881 } 4882 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE); 4883 4884 /* 4885 * Release stuff held for the remove 4886 */ 4887 VN_RELE(unldvp); 4888 if (!e.error && resp) 4889 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4890 4891 kmem_free(unlname, MAXNAMELEN); 4892 crfree(unlcred); 4893 goto redo; 4894 } 4895 4896 /* 4897 * Remote file system operations having to do with directory manipulation. 
4898 */ 4899 /* ARGSUSED3 */ 4900 int 4901 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 4902 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, 4903 int *direntflags, pathname_t *realpnp) 4904 { 4905 int error; 4906 vnode_t *vp, *avp = NULL; 4907 rnode4_t *drp; 4908 4909 *vpp = NULL; 4910 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 4911 return (EPERM); 4912 /* 4913 * if LOOKUP_XATTR, must replace dvp (object) with 4914 * object's attrdir before continuing with lookup 4915 */ 4916 if (flags & LOOKUP_XATTR) { 4917 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr); 4918 if (error) 4919 return (error); 4920 4921 dvp = avp; 4922 4923 /* 4924 * If lookup is for "", just return dvp now. The attrdir 4925 * has already been activated (from nfs4lookup_xattr), and 4926 * the caller will RELE the original dvp -- not 4927 * the attrdir. So, set vpp and return. 4928 * Currently, when the LOOKUP_XATTR flag is 4929 * passed to VOP_LOOKUP, the name is always empty, and 4930 * shortcircuiting here avoids 3 unneeded lock/unlock 4931 * pairs. 4932 * 4933 * If a non-empty name was provided, then it is the 4934 * attribute name, and it will be looked up below. 4935 */ 4936 if (*nm == '\0') { 4937 *vpp = dvp; 4938 return (0); 4939 } 4940 4941 /* 4942 * The vfs layer never sends a name when asking for the 4943 * attrdir, so we should never get here (unless of course 4944 * name is passed at some time in future -- at which time 4945 * we'll blow up here). 4946 */ 4947 ASSERT(0); 4948 } 4949 4950 drp = VTOR4(dvp); 4951 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 4952 return (EINTR); 4953 4954 error = nfs4lookup(dvp, nm, vpp, cr, 0); 4955 nfs_rw_exit(&drp->r_rwlock); 4956 4957 /* 4958 * If vnode is a device, create special vnode. 
4959 */ 4960 if (!error && ISVDEV((*vpp)->v_type)) { 4961 vp = *vpp; 4962 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 4963 VN_RELE(vp); 4964 } 4965 4966 return (error); 4967 } 4968 4969 /* ARGSUSED */ 4970 static int 4971 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr) 4972 { 4973 int error; 4974 rnode4_t *drp; 4975 int cflag = ((flags & CREATE_XATTR_DIR) != 0); 4976 mntinfo4_t *mi; 4977 4978 mi = VTOMI4(dvp); 4979 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) && 4980 !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS)) 4981 return (EINVAL); 4982 4983 drp = VTOR4(dvp); 4984 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 4985 return (EINTR); 4986 4987 mutex_enter(&drp->r_statelock); 4988 /* 4989 * If the server doesn't support xattrs just return EINVAL 4990 */ 4991 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) { 4992 mutex_exit(&drp->r_statelock); 4993 nfs_rw_exit(&drp->r_rwlock); 4994 return (EINVAL); 4995 } 4996 4997 /* 4998 * If there is a cached xattr directory entry, 4999 * use it as long as the attributes are valid. If the 5000 * attributes are not valid, take the simple approach and 5001 * free the cached value and re-fetch a new value. 5002 * 5003 * We don't negative entry cache for now, if we did we 5004 * would need to check if the file has changed on every 5005 * lookup. But xattrs don't exist very often and failing 5006 * an openattr is not much more expensive than and NVERIFY or GETATTR 5007 * so do an openattr over the wire for now. 
5008 */ 5009 if (drp->r_xattr_dir != NULL) { 5010 if (ATTRCACHE4_VALID(dvp)) { 5011 VN_HOLD(drp->r_xattr_dir); 5012 *vpp = drp->r_xattr_dir; 5013 mutex_exit(&drp->r_statelock); 5014 nfs_rw_exit(&drp->r_rwlock); 5015 return (0); 5016 } 5017 VN_RELE(drp->r_xattr_dir); 5018 drp->r_xattr_dir = NULL; 5019 } 5020 mutex_exit(&drp->r_statelock); 5021 5022 error = nfs4openattr(dvp, vpp, cflag, cr); 5023 5024 nfs_rw_exit(&drp->r_rwlock); 5025 5026 return (error); 5027 } 5028 5029 static int 5030 nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc) 5031 { 5032 int error; 5033 rnode4_t *drp; 5034 5035 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5036 5037 /* 5038 * If lookup is for "", just return dvp. Don't need 5039 * to send it over the wire, look it up in the dnlc, 5040 * or perform any access checks. 5041 */ 5042 if (*nm == '\0') { 5043 VN_HOLD(dvp); 5044 *vpp = dvp; 5045 return (0); 5046 } 5047 5048 /* 5049 * Can't do lookups in non-directories. 5050 */ 5051 if (dvp->v_type != VDIR) 5052 return (ENOTDIR); 5053 5054 /* 5055 * If lookup is for ".", just return dvp. Don't need 5056 * to send it over the wire or look it up in the dnlc, 5057 * just need to check access. 5058 */ 5059 if (nm[0] == '.' && nm[1] == '\0') { 5060 error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5061 if (error) 5062 return (error); 5063 VN_HOLD(dvp); 5064 *vpp = dvp; 5065 return (0); 5066 } 5067 5068 drp = VTOR4(dvp); 5069 if (!(drp->r_flags & R4LOOKUP)) { 5070 mutex_enter(&drp->r_statelock); 5071 drp->r_flags |= R4LOOKUP; 5072 mutex_exit(&drp->r_statelock); 5073 } 5074 5075 *vpp = NULL; 5076 /* 5077 * Lookup this name in the DNLC. If there is no entry 5078 * lookup over the wire. 5079 */ 5080 if (!skipdnlc) 5081 *vpp = dnlc_lookup(dvp, nm); 5082 if (*vpp == NULL) { 5083 /* 5084 * We need to go over the wire to lookup the name. 
5085 */ 5086 return (nfs4lookupnew_otw(dvp, nm, vpp, cr)); 5087 } 5088 5089 /* 5090 * We hit on the dnlc 5091 */ 5092 if (*vpp != DNLC_NO_VNODE || 5093 (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) { 5094 /* 5095 * But our attrs may not be valid. 5096 */ 5097 if (ATTRCACHE4_VALID(dvp)) { 5098 error = nfs4_waitfor_purge_complete(dvp); 5099 if (error) { 5100 VN_RELE(*vpp); 5101 *vpp = NULL; 5102 return (error); 5103 } 5104 5105 /* 5106 * If after the purge completes, check to make sure 5107 * our attrs are still valid. 5108 */ 5109 if (ATTRCACHE4_VALID(dvp)) { 5110 /* 5111 * If we waited for a purge we may have 5112 * lost our vnode so look it up again. 5113 */ 5114 VN_RELE(*vpp); 5115 *vpp = dnlc_lookup(dvp, nm); 5116 if (*vpp == NULL) 5117 return (nfs4lookupnew_otw(dvp, 5118 nm, vpp, cr)); 5119 5120 /* 5121 * The access cache should almost always hit 5122 */ 5123 error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5124 5125 if (error) { 5126 VN_RELE(*vpp); 5127 *vpp = NULL; 5128 return (error); 5129 } 5130 if (*vpp == DNLC_NO_VNODE) { 5131 VN_RELE(*vpp); 5132 *vpp = NULL; 5133 return (ENOENT); 5134 } 5135 return (0); 5136 } 5137 } 5138 } 5139 5140 ASSERT(*vpp != NULL); 5141 5142 /* 5143 * We may have gotten here we have one of the following cases: 5144 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we 5145 * need to validate them. 5146 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always 5147 * must validate. 5148 * 5149 * Go to the server and check if the directory has changed, if 5150 * it hasn't we are done and can use the dnlc entry. 5151 */ 5152 return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr)); 5153 } 5154 5155 /* 5156 * Go to the server and check if the directory has changed, if 5157 * it hasn't we are done and can use the dnlc entry. If it 5158 * has changed we get a new copy of its attributes and check 5159 * the access for VEXEC, then relookup the filename and 5160 * get its filehandle and attributes. 
5161 * 5162 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR 5163 * if the NVERIFY failed we must 5164 * purge the caches 5165 * cache new attributes (will set r_time_attr_inval) 5166 * cache new access 5167 * recheck VEXEC access 5168 * add name to dnlc, possibly negative 5169 * if LOOKUP succeeded 5170 * cache new attributes 5171 * else 5172 * set a new r_time_attr_inval for dvp 5173 * check to make sure we have access 5174 * 5175 * The vpp returned is the vnode passed in if the directory is valid, 5176 * a new vnode if successful lookup, or NULL on error. 5177 */ 5178 static int 5179 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5180 { 5181 COMPOUND4args_clnt args; 5182 COMPOUND4res_clnt res; 5183 fattr4 *ver_fattr; 5184 fattr4_change dchange; 5185 int32_t *ptr; 5186 int argoplist_size = 7 * sizeof (nfs_argop4); 5187 nfs_argop4 *argop; 5188 int doqueue; 5189 mntinfo4_t *mi; 5190 nfs4_recov_state_t recov_state; 5191 hrtime_t t; 5192 int isdotdot; 5193 vnode_t *nvp; 5194 nfs_fh4 *fhp; 5195 nfs4_sharedfh_t *sfhp; 5196 nfs4_access_type_t cacc; 5197 rnode4_t *nrp; 5198 rnode4_t *drp = VTOR4(dvp); 5199 nfs4_ga_res_t *garp = NULL; 5200 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5201 5202 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5203 ASSERT(nm != NULL); 5204 ASSERT(nm[0] != '\0'); 5205 ASSERT(dvp->v_type == VDIR); 5206 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5207 ASSERT(*vpp != NULL); 5208 5209 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') { 5210 isdotdot = 1; 5211 args.ctag = TAG_LOOKUP_VPARENT; 5212 } else { 5213 /* 5214 * If dvp were a stub, it should have triggered and caused 5215 * a mount for us to get this far. 
5216 */ 5217 ASSERT(!RP_ISSTUB(VTOR4(dvp))); 5218 5219 isdotdot = 0; 5220 args.ctag = TAG_LOOKUP_VALID; 5221 } 5222 5223 mi = VTOMI4(dvp); 5224 recov_state.rs_flags = 0; 5225 recov_state.rs_num_retry_despite_err = 0; 5226 5227 nvp = NULL; 5228 5229 /* Save the original mount point security information */ 5230 (void) save_mnt_secinfo(mi->mi_curr_serv); 5231 5232 recov_retry: 5233 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5234 &recov_state, NULL); 5235 if (e.error) { 5236 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5237 VN_RELE(*vpp); 5238 *vpp = NULL; 5239 return (e.error); 5240 } 5241 5242 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5243 5244 /* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */ 5245 args.array_len = 7; 5246 args.array = argop; 5247 5248 /* 0. putfh file */ 5249 argop[0].argop = OP_CPUTFH; 5250 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5251 5252 /* 1. nverify the change info */ 5253 argop[1].argop = OP_NVERIFY; 5254 ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes; 5255 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5256 ver_fattr->attrlist4 = (char *)&dchange; 5257 ptr = (int32_t *)&dchange; 5258 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5259 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5260 5261 /* 2. getattr directory */ 5262 argop[2].argop = OP_GETATTR; 5263 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5264 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5265 5266 /* 3. access directory */ 5267 argop[3].argop = OP_ACCESS; 5268 argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5269 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5270 5271 /* 4. lookup name */ 5272 if (isdotdot) { 5273 argop[4].argop = OP_LOOKUPP; 5274 } else { 5275 argop[4].argop = OP_CLOOKUP; 5276 argop[4].nfs_argop4_u.opclookup.cname = nm; 5277 } 5278 5279 /* 5. resulting file handle */ 5280 argop[5].argop = OP_GETFH; 5281 5282 /* 6. 
resulting file attributes */ 5283 argop[6].argop = OP_GETATTR; 5284 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5285 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5286 5287 doqueue = 1; 5288 t = gethrtime(); 5289 5290 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5291 5292 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5293 /* 5294 * For WRONGSEC of a non-dotdot case, send secinfo directly 5295 * from this thread, do not go thru the recovery thread since 5296 * we need the nm information. 5297 * 5298 * Not doing dotdot case because there is no specification 5299 * for (PUTFH, SECINFO "..") yet. 5300 */ 5301 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5302 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) 5303 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5304 &recov_state, FALSE); 5305 else 5306 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5307 &recov_state, TRUE); 5308 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5309 kmem_free(argop, argoplist_size); 5310 if (!e.error) 5311 goto recov_retry; 5312 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5313 VN_RELE(*vpp); 5314 *vpp = NULL; 5315 return (e.error); 5316 } 5317 5318 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5319 OP_LOOKUP, NULL) == FALSE) { 5320 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5321 &recov_state, TRUE); 5322 5323 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5324 kmem_free(argop, argoplist_size); 5325 goto recov_retry; 5326 } 5327 } 5328 5329 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5330 5331 if (e.error || res.array_len == 0) { 5332 /* 5333 * If e.error isn't set, then reply has no ops (or we couldn't 5334 * be here). The only legal way to reply without an op array 5335 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5336 * be in the reply for all other status values. 5337 * 5338 * For valid replies without an ops array, return ENOTSUP 5339 * (geterrno4 xlation of VERS_MISMATCH). 
For illegal replies, 5340 * return EIO -- don't trust status. 5341 */ 5342 if (e.error == 0) 5343 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5344 ENOTSUP : EIO; 5345 VN_RELE(*vpp); 5346 *vpp = NULL; 5347 kmem_free(argop, argoplist_size); 5348 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5349 return (e.error); 5350 } 5351 5352 if (res.status != NFS4ERR_SAME) { 5353 e.error = geterrno4(res.status); 5354 5355 /* 5356 * The NVERIFY "failed" so the directory has changed 5357 * First make sure PUTFH succeeded and NVERIFY "failed" 5358 * cleanly. 5359 */ 5360 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5361 (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) { 5362 nfs4_purge_stale_fh(e.error, dvp, cr); 5363 VN_RELE(*vpp); 5364 *vpp = NULL; 5365 goto exit; 5366 } 5367 5368 /* 5369 * We know the NVERIFY "failed" so we must: 5370 * purge the caches (access and indirectly dnlc if needed) 5371 */ 5372 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5373 5374 if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5375 nfs4_purge_stale_fh(e.error, dvp, cr); 5376 VN_RELE(*vpp); 5377 *vpp = NULL; 5378 goto exit; 5379 } 5380 5381 /* 5382 * Install new cached attributes for the directory 5383 */ 5384 nfs4_attr_cache(dvp, 5385 &res.array[2].nfs_resop4_u.opgetattr.ga_res, 5386 t, cr, FALSE, NULL); 5387 5388 if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) { 5389 nfs4_purge_stale_fh(e.error, dvp, cr); 5390 VN_RELE(*vpp); 5391 *vpp = NULL; 5392 e.error = geterrno4(res.status); 5393 goto exit; 5394 } 5395 5396 /* 5397 * Now we know the directory is valid, 5398 * cache new directory access 5399 */ 5400 nfs4_access_cache(drp, 5401 args.array[3].nfs_argop4_u.opaccess.access, 5402 res.array[3].nfs_resop4_u.opaccess.access, cr); 5403 5404 /* 5405 * recheck VEXEC access 5406 */ 5407 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr); 5408 if (cacc != NFS4_ACCESS_ALLOWED) { 5409 /* 5410 * Directory permissions might have been revoked 
5411 */ 5412 if (cacc == NFS4_ACCESS_DENIED) { 5413 e.error = EACCES; 5414 VN_RELE(*vpp); 5415 *vpp = NULL; 5416 goto exit; 5417 } 5418 5419 /* 5420 * Somehow we must not have asked for enough 5421 * so try a singleton ACCESS, should never happen. 5422 */ 5423 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5424 if (e.error) { 5425 VN_RELE(*vpp); 5426 *vpp = NULL; 5427 goto exit; 5428 } 5429 } 5430 5431 e.error = geterrno4(res.status); 5432 if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) { 5433 /* 5434 * The lookup failed, probably no entry 5435 */ 5436 if (e.error == ENOENT && nfs4_lookup_neg_cache) { 5437 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5438 } else { 5439 /* 5440 * Might be some other error, so remove 5441 * the dnlc entry to make sure we start all 5442 * over again, next time. 5443 */ 5444 dnlc_remove(dvp, nm); 5445 } 5446 VN_RELE(*vpp); 5447 *vpp = NULL; 5448 goto exit; 5449 } 5450 5451 if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5452 /* 5453 * The file exists but we can't get its fh for 5454 * some unknown reason. Remove it from the dnlc 5455 * and error out to be safe. 5456 */ 5457 dnlc_remove(dvp, nm); 5458 VN_RELE(*vpp); 5459 *vpp = NULL; 5460 goto exit; 5461 } 5462 fhp = &res.array[5].nfs_resop4_u.opgetfh.object; 5463 if (fhp->nfs_fh4_len == 0) { 5464 /* 5465 * The file exists but a bogus fh 5466 * some unknown reason. Remove it from the dnlc 5467 * and error out to be safe. 
5468 */ 5469 e.error = ENOENT; 5470 dnlc_remove(dvp, nm); 5471 VN_RELE(*vpp); 5472 *vpp = NULL; 5473 goto exit; 5474 } 5475 sfhp = sfh4_get(fhp, mi); 5476 5477 if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK) 5478 garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 5479 5480 /* 5481 * Make the new rnode 5482 */ 5483 if (isdotdot) { 5484 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 5485 if (e.error) { 5486 sfh4_rele(&sfhp); 5487 VN_RELE(*vpp); 5488 *vpp = NULL; 5489 goto exit; 5490 } 5491 /* 5492 * XXX if nfs4_make_dotdot uses an existing rnode 5493 * XXX it doesn't update the attributes. 5494 * XXX for now just save them again to save an OTW 5495 */ 5496 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 5497 } else { 5498 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 5499 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 5500 /* 5501 * If v_type == VNON, then garp was NULL because 5502 * the last op in the compound failed and makenfs4node 5503 * could not find the vnode for sfhp. It created 5504 * a new vnode, so we have nothing to purge here. 5505 */ 5506 if (nvp->v_type == VNON) { 5507 vattr_t vattr; 5508 5509 vattr.va_mask = AT_TYPE; 5510 /* 5511 * N.B. We've already called nfs4_end_fop above. 
5512 */ 5513 e.error = nfs4getattr(nvp, &vattr, cr); 5514 if (e.error) { 5515 sfh4_rele(&sfhp); 5516 VN_RELE(*vpp); 5517 *vpp = NULL; 5518 VN_RELE(nvp); 5519 goto exit; 5520 } 5521 nvp->v_type = vattr.va_type; 5522 } 5523 } 5524 sfh4_rele(&sfhp); 5525 5526 nrp = VTOR4(nvp); 5527 mutex_enter(&nrp->r_statev4_lock); 5528 if (!nrp->created_v4) { 5529 mutex_exit(&nrp->r_statev4_lock); 5530 dnlc_update(dvp, nm, nvp); 5531 } else 5532 mutex_exit(&nrp->r_statev4_lock); 5533 5534 VN_RELE(*vpp); 5535 *vpp = nvp; 5536 } else { 5537 hrtime_t now; 5538 hrtime_t delta = 0; 5539 5540 e.error = 0; 5541 5542 /* 5543 * Because the NVERIFY "succeeded" we know that the 5544 * directory attributes are still valid 5545 * so update r_time_attr_inval 5546 */ 5547 now = gethrtime(); 5548 mutex_enter(&drp->r_statelock); 5549 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5550 delta = now - drp->r_time_attr_saved; 5551 if (delta < mi->mi_acdirmin) 5552 delta = mi->mi_acdirmin; 5553 else if (delta > mi->mi_acdirmax) 5554 delta = mi->mi_acdirmax; 5555 } 5556 drp->r_time_attr_inval = now + delta; 5557 mutex_exit(&drp->r_statelock); 5558 dnlc_update(dvp, nm, *vpp); 5559 5560 /* 5561 * Even though we have a valid directory attr cache 5562 * and dnlc entry, we may not have access. 5563 * This should almost always hit the cache. 
*/
		e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (e.error) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		if (*vpp == DNLC_NO_VNODE) {
			VN_RELE(*vpp);
			*vpp = NULL;
			e.error = ENOENT;
		}
	}

exit:
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	kmem_free(argop, argoplist_size);
	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
	return (e.error);
}

/*
 * We need to go over the wire to lookup the name, but
 * while we are there verify the directory has not
 * changed but if it has, get new attributes and check access
 *
 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH
 *					NVERIFY GETATTR ACCESS
 *
 * With the results:
 *	if the NVERIFY failed we must purge the caches, add new attributes,
 *		and cache new access.
 *	set a new r_time_attr_inval
 *	add name to dnlc, possibly negative
 *	if LOOKUP succeeded
 *		cache new attributes
 *
 * Arguments:
 *	dvp	directory to search; asserted to be a VDIR.
 *	nm	component name; asserted non-NULL, non-empty and not ".".
 *		".." is sent as LOOKUPP, anything else as a LOOKUP of nm.
 *	vpp	asserted to point at NULL on entry.  It is written only on
 *		the success path, where it receives the vnode obtained from
 *		nfs4_make_dotdot() or makenfs4node().
 *	cr	credentials used for the over-the-wire call and cache updates.
 *
 * Returns 0 on success, otherwise an errno value taken from e.error.
 *
 * Every return path calls check_mnt_secinfo() to pair with the
 * save_mnt_secinfo() done before the first attempt.
 */
static int
nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	fattr4 *ver_fattr;
	fattr4_change dchange;
	int32_t *ptr;
	nfs4_ga_res_t *garp = NULL;
	int argoplist_size = 9 * sizeof (nfs_argop4);
	nfs_argop4 *argop;
	int doqueue;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	int isdotdot;
	vnode_t *nvp;
	nfs_fh4 *fhp;
	nfs4_sharedfh_t *sfhp;
	nfs4_access_type_t cacc;
	rnode4_t *nrp;
	rnode4_t *drp = VTOR4(dvp);
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
	ASSERT(nm != NULL);
	ASSERT(nm[0] != '\0');
	ASSERT(dvp->v_type == VDIR);
	ASSERT(nm[0] != '.' || nm[1] != '\0');
	ASSERT(*vpp == NULL);

	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
		isdotdot = 1;
		args.ctag = TAG_LOOKUP_PARENT;
	} else {
		/*
		 * If dvp were a stub, it should have triggered and caused
		 * a mount for us to get this far.
		 */
		ASSERT(!RP_ISSTUB(VTOR4(dvp)));

		isdotdot = 0;
		args.ctag = TAG_LOOKUP;
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	nvp = NULL;

	/* Save the original mount point security information */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

	/*
	 * Each pass through recov_retry allocates a fresh argop array; every
	 * "goto recov_retry" below frees the previous array (and the decoded
	 * reply) first.
	 */
recov_retry:
	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
	    &recov_state, NULL);
	if (e.error) {
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	/* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */
	args.array_len = 9;
	args.array = argop;

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;

	/* 1. savefh for the nverify */
	argop[1].argop = OP_SAVEFH;

	/* 2. lookup name */
	if (isdotdot) {
		argop[2].argop = OP_LOOKUPP;
	} else {
		argop[2].argop = OP_CLOOKUP;
		argop[2].nfs_argop4_u.opclookup.cname = nm;
	}

	/* 3. resulting file handle */
	argop[3].argop = OP_GETFH;

	/* 4. resulting file attributes */
	argop[4].argop = OP_GETATTR;
	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 5. restorefh back the directory for the nverify */
	argop[5].argop = OP_RESTOREFH;

	/*
	 * 6. nverify the change info: the cached r_change of the directory
	 * is XDR-encoded into the local dchange buffer and used as the
	 * attribute list to compare against.
	 */
	argop[6].argop = OP_NVERIFY;
	ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes;
	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
	ver_fattr->attrlist4 = (char *)&dchange;
	ptr = (int32_t *)&dchange;
	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
	ver_fattr->attrlist4_len = sizeof (fattr4_change);

	/* 7. getattr directory */
	argop[7].argop = OP_GETATTR;
	argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 8. access directory */
	argop[8].argop = OP_ACCESS;
	argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
	    ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;

	doqueue = 1;
	/* t is the call timestamp later handed to the attribute caches */
	t = gethrtime();

	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);

	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
		/*
		 * For WRONGSEC of a non-dotdot case, send secinfo directly
		 * from this thread, do not go thru the recovery thread since
		 * we need the nm information.
		 *
		 * Not doing dotdot case because there is no specification
		 * for (PUTFH, SECINFO "..") yet.
		 */
		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
			else
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, TRUE);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			/* secinfo succeeded: retry the whole compound */
			if (!e.error)
				goto recov_retry;
			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
			return (e.error);
		}

		/*
		 * A FALSE return from nfs4_start_recovery() leads to a retry;
		 * otherwise we fall through and report the error below.
		 */
		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
		    OP_LOOKUP, NULL) == FALSE) {
			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
			    &recov_state, TRUE);

			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			goto recov_retry;
		}
	}

	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);

	if (e.error || res.array_len == 0) {
		/*
		 * If e.error isn't set, then reply has no ops (or we couldn't
		 * be here).  The only legal way to reply without an op array
		 * is via NFS4ERR_MINOR_VERS_MISMATCH.  An ops array should
		 * be in the reply for all other status values.
		 *
		 * For valid replies without an ops array, return ENOTSUP
		 * (geterrno4 xlation of VERS_MISMATCH).  For illegal replies,
		 * return EIO -- don't trust status.
		 */
		if (e.error == 0)
			e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
			    ENOTSUP : EIO;

		/*
		 * NOTE(review): unlike the "exit" path, res is not passed to
		 * xdr_free() here -- presumably there is nothing decoded to
		 * release in this case; confirm.
		 */
		kmem_free(argop, argoplist_size);
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	/*
	 * Default error for the early exits below is the translation of the
	 * overall compound status.  The per-op results are then examined in
	 * request order.
	 */
	e.error = geterrno4(res.status);

	/*
	 * The PUTFH and SAVEFH may have failed.
	 */
	if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
	    (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) {
		nfs4_purge_stale_fh(e.error, dvp, cr);
		goto exit;
	}

	/*
	 * Check if the file exists, if it does delay entering
	 * into the dnlc until after we update the directory
	 * attributes so we don't cause it to get purged immediately.
	 */
	if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) {
		/*
		 * The lookup failed, probably no entry
		 */
		if (e.error == ENOENT && nfs4_lookup_neg_cache)
			dnlc_update(dvp, nm, DNLC_NO_VNODE);
		goto exit;
	}

	if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) {
		/*
		 * The file exists but we can't get its fh for
		 * some unknown reason. Error out to be safe.
		 */
		goto exit;
	}

	fhp = &res.array[3].nfs_resop4_u.opgetfh.object;
	if (fhp->nfs_fh4_len == 0) {
		/*
		 * The file exists but we got a bogus (zero-length) fh for
		 * some unknown reason.  Error out to be safe.
		 */
		e.error = EIO;
		goto exit;
	}
	/*
	 * From here on sfhp holds a reference that every exit path must
	 * drop with sfh4_rele().
	 */
	sfhp = sfh4_get(fhp, mi);

	if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) {
		sfh4_rele(&sfhp);
		goto exit;
	}
	garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;

	/*
	 * The RESTOREFH may have failed
	 */
	if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) {
		sfh4_rele(&sfhp);
		e.error = EIO;
		goto exit;
	}

	if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) {
		/*
		 * First make sure the NVERIFY failed as we expected,
		 * if it didn't then be conservative and error out
		 * as we can't trust the directory.
		 * (Any status other than NFS4_OK or NFS4ERR_SAME is treated
		 * as EIO.)
		 */
		if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) {
			sfh4_rele(&sfhp);
			e.error = EIO;
			goto exit;
		}

		/*
		 * We know the NVERIFY "failed" so the directory has changed,
		 * so we must:
		 *	purge the caches (access and indirectly dnlc if needed)
		 */
		nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);

		if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) {
			sfh4_rele(&sfhp);
			goto exit;
		}
		/* Install the new cached attributes for the directory */
		nfs4_attr_cache(dvp,
		    &res.array[7].nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);

		if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			sfh4_rele(&sfhp);
			e.error = geterrno4(res.status);
			goto exit;
		}

		/*
		 * Now we know the directory is valid,
		 * cache new directory access
		 */
		nfs4_access_cache(drp,
		    args.array[8].nfs_argop4_u.opaccess.access,
		    res.array[8].nfs_resop4_u.opaccess.access, cr);

		/*
		 * recheck VEXEC access
		 */
		cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
		if (cacc != NFS4_ACCESS_ALLOWED) {
			/*
			 * Directory permissions might have been revoked
			 */
			if (cacc == NFS4_ACCESS_DENIED) {
				sfh4_rele(&sfhp);
				e.error = EACCES;
				goto exit;
			}

			/*
			 * Somehow we must not have asked for enough
			 * so try a singleton ACCESS; this should never
			 * happen.
			 */
			e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
			if (e.error) {
				sfh4_rele(&sfhp);
				goto exit;
			}
		}

		e.error = geterrno4(res.status);
	} else {
		hrtime_t now;
		hrtime_t delta = 0;

		e.error = 0;

		/*
		 * Because the NVERIFY "succeeded" we know that the
		 * directory attributes are still valid
		 * so update r_time_attr_inval
		 *
		 * Unless attribute caching is disabled (MI4_NOAC or
		 * VNOCACHE), the extension is the time since the attributes
		 * were saved, clamped to [mi_acdirmin, mi_acdirmax].
		 */
		now = gethrtime();
		mutex_enter(&drp->r_statelock);
		if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
			delta = now - drp->r_time_attr_saved;
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		}
		drp->r_time_attr_inval = now + delta;
		mutex_exit(&drp->r_statelock);

		/*
		 * Even though we have a valid directory attr cache,
		 * we may not have access.
		 * This should almost always hit the cache.
		 */
		e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (e.error) {
			sfh4_rele(&sfhp);
			goto exit;
		}
	}

	/*
	 * Now we have successfully completed the lookup, if the
	 * directory has changed we now have the valid attributes.
	 * We also know we have directory access.
	 * Create the new rnode and insert it in the dnlc.
	 */
	if (isdotdot) {
		e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
		if (e.error) {
			sfh4_rele(&sfhp);
			goto exit;
		}
		/*
		 * XXX if nfs4_make_dotdot uses an existing rnode
		 * XXX it doesn't update the attributes.
		 * XXX for now just save them again to save an OTW
		 */
		nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
	} else {
		nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
		    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
	}
	sfh4_rele(&sfhp);

	/*
	 * Enter the name in the dnlc only when the rnode's created_v4 flag
	 * is clear; the flag is sampled under r_statev4_lock and the lock is
	 * dropped before calling dnlc_update().
	 */
	nrp = VTOR4(nvp);
	mutex_enter(&nrp->r_statev4_lock);
	if (!nrp->created_v4) {
		mutex_exit(&nrp->r_statev4_lock);
		dnlc_update(dvp, nm, nvp);
	} else
		mutex_exit(&nrp->r_statev4_lock);

	*vpp = nvp;

	/* Common cleanup: release the decoded reply and the argument array */
exit:
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	kmem_free(argop, argoplist_size);
	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
	return (e.error);
}

#ifdef DEBUG
/*
 * DEBUG-only helper: log, one CE_NOTE line per op, the first argcnt ops of
 * the compound argument array argbase.  'where' is printed as a prefix so
 * the caller can be identified.  Output goes through zcmn_err() for the
 * zone returned by getzoneid().
 *
 * NOTE(review): the loop index is a uint_t that is compared against the
 * int argcnt and printed with "%d"; a negative argcnt would be converted to
 * a very large unsigned bound.  Callers presumably never pass one -- confirm.
 */
void
nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt)
{
	uint_t i, len;
	zoneid_t zoneid = getzoneid();
	char *s;

	zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where);
	for (i = 0; i < argcnt; i++) {
		nfs_argop4 *op = &argbase[i];
		switch (op->argop) {
		case OP_CPUTFH:
		case OP_PUTFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i);
			break;
		case OP_PUTROOTFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i);
			break;
		case OP_CLOOKUP:
			/* name is already a C string; nothing to free */
			s = op->nfs_argop4_u.opclookup.cname;
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
			break;
		case OP_LOOKUP:
			/*
			 * Convert the utf8 name for printing; the converted
			 * string is released below using the length that
			 * utf8_to_str() stored in len.
			 */
			s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname,
			    &len, NULL);
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
			kmem_free(s, len);
			break;
		case OP_LOOKUPP:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i);
			break;
		case OP_GETFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i);
			break;
		case OP_GETATTR:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i);
			break;
		case OP_OPENATTR:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i);
			break;
		default:
			/* anything else is reported by raw opcode number */
			zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", i,
			    op->argop);
			break;
		}
	}
}
#endif

/*
 * nfs4lookup_setup - constructs a multi-lookup compound request.
 *
 * Given the path "nm1/nm2/.../nmn", the following compound requests
 * may be created:
 *
 * Note: Getfh is not needed because filehandle attr is mandatory, but it
 * is faster, for now.
 *
 * l4_getattrs indicates the type of compound requested.
 *
 * LKP4_NO_ATTRIBUTES - no attributes (used by secinfo):
 *
 *	compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} }
 *
 *   total number of ops is n + 1.
 *
 * LKP4_LAST_NAMED_ATTR - multi-component path for a named
 *	attribute: create lookups plus one OPENATTR/GETFH/GETATTR
 *	before the last component, and only get attributes
 *	for the last component.  Note that the second-to-last
 *	pathname component is XATTR_RPATH, which does NOT go
 *	over-the-wire as a lookup.
 *
 *	compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2};
 *		Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr }
 *
 *   and total number of ops is n + 5.
 *
 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named
 *	attribute directory: create lookups plus an OPENATTR
 *	replacing the last lookup.  Note that the last pathname
 *	component is XATTR_RPATH, which does NOT go over-the-wire
 *	as a lookup.
 *
 *	compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr;
 *		Openattr; Getfh; Getattr }
 *
 *   and total number of ops is n + 5.
 *
 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate
 *	nodes too.
 *
 *	compound { Put*fh; Lookup {nm1}; Getfh; Getattr;
 *		Lookup {nm2}; ...  Lookup {nmn}; Getfh; Getattr }
 *
 *   and total number of ops is 3*n + 1.
6076 * 6077 * All cases: returns the index in the arg array of the final LOOKUP op, or 6078 * -1 if no LOOKUPs were used. 6079 */ 6080 int 6081 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh) 6082 { 6083 enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs; 6084 nfs_argop4 *argbase, *argop; 6085 int arglen, argcnt; 6086 int n = 1; /* number of components */ 6087 int nga = 1; /* number of Getattr's in request */ 6088 char c = '\0', *s, *p; 6089 int lookup_idx = -1; 6090 int argoplist_size; 6091 6092 /* set lookuparg response result to 0 */ 6093 lookupargp->resp->status = NFS4_OK; 6094 6095 /* skip leading "/" or "." e.g. ".//./" if there is */ 6096 for (; ; nm++) { 6097 if (*nm != '/' && *nm != '.') 6098 break; 6099 6100 /* ".." is counted as 1 component */ 6101 if (*nm == '.' && *(nm + 1) != '/') 6102 break; 6103 } 6104 6105 /* 6106 * Find n = number of components - nm must be null terminated 6107 * Skip "." components. 6108 */ 6109 if (*nm != '\0') 6110 for (n = 1, s = nm; *s != '\0'; s++) { 6111 if ((*s == '/') && (*(s + 1) != '/') && 6112 (*(s + 1) != '\0') && 6113 !(*(s + 1) == '.' && (*(s + 2) == '/' || 6114 *(s + 2) == '\0'))) 6115 n++; 6116 } 6117 else 6118 n = 0; 6119 6120 /* 6121 * nga is number of components that need Getfh+Getattr 6122 */ 6123 switch (l4_getattrs) { 6124 case LKP4_NO_ATTRIBUTES: 6125 nga = 0; 6126 break; 6127 case LKP4_ALL_ATTRIBUTES: 6128 nga = n; 6129 /* 6130 * Always have at least 1 getfh, getattr pair 6131 */ 6132 if (nga == 0) 6133 nga++; 6134 break; 6135 case LKP4_LAST_ATTRDIR: 6136 case LKP4_LAST_NAMED_ATTR: 6137 nga = n+1; 6138 break; 6139 } 6140 6141 /* 6142 * If change to use the filehandle attr instead of getfh 6143 * the following line can be deleted. 
6144 */ 6145 nga *= 2; 6146 6147 /* 6148 * calculate number of ops in request as 6149 * header + trailer + lookups + getattrs 6150 */ 6151 arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga; 6152 6153 argoplist_size = arglen * sizeof (nfs_argop4); 6154 argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP); 6155 lookupargp->argsp->array = argop; 6156 6157 argcnt = lookupargp->header_len; 6158 argop += argcnt; 6159 6160 /* 6161 * loop and create a lookup op and possibly getattr/getfh for 6162 * each component. Skip "." components. 6163 */ 6164 for (s = nm; *s != '\0'; s = p) { 6165 /* 6166 * Set up a pathname struct for each component if needed 6167 */ 6168 while (*s == '/') 6169 s++; 6170 if (*s == '\0') 6171 break; 6172 6173 for (p = s; (*p != '/') && (*p != '\0'); p++) 6174 ; 6175 c = *p; 6176 *p = '\0'; 6177 6178 if (s[0] == '.' && s[1] == '\0') { 6179 *p = c; 6180 continue; 6181 } 6182 if (l4_getattrs == LKP4_LAST_ATTRDIR && 6183 strcmp(s, XATTR_RPATH) == 0) { 6184 /* getfh XXX may not be needed in future */ 6185 argop->argop = OP_GETFH; 6186 argop++; 6187 argcnt++; 6188 6189 /* getattr */ 6190 argop->argop = OP_GETATTR; 6191 argop->nfs_argop4_u.opgetattr.attr_request = 6192 lookupargp->ga_bits; 6193 argop->nfs_argop4_u.opgetattr.mi = 6194 lookupargp->mi; 6195 argop++; 6196 argcnt++; 6197 6198 /* openattr */ 6199 argop->argop = OP_OPENATTR; 6200 } else if (l4_getattrs == LKP4_LAST_NAMED_ATTR && 6201 strcmp(s, XATTR_RPATH) == 0) { 6202 /* openattr */ 6203 argop->argop = OP_OPENATTR; 6204 argop++; 6205 argcnt++; 6206 6207 /* getfh XXX may not be needed in future */ 6208 argop->argop = OP_GETFH; 6209 argop++; 6210 argcnt++; 6211 6212 /* getattr */ 6213 argop->argop = OP_GETATTR; 6214 argop->nfs_argop4_u.opgetattr.attr_request = 6215 lookupargp->ga_bits; 6216 argop->nfs_argop4_u.opgetattr.mi = 6217 lookupargp->mi; 6218 argop++; 6219 argcnt++; 6220 *p = c; 6221 continue; 6222 } else if (s[0] == '.' && s[1] == '.' 
&& s[2] == '\0') { 6223 /* lookupp */ 6224 argop->argop = OP_LOOKUPP; 6225 } else { 6226 /* lookup */ 6227 argop->argop = OP_LOOKUP; 6228 (void) str_to_utf8(s, 6229 &argop->nfs_argop4_u.oplookup.objname); 6230 } 6231 lookup_idx = argcnt; 6232 argop++; 6233 argcnt++; 6234 6235 *p = c; 6236 6237 if (l4_getattrs == LKP4_ALL_ATTRIBUTES) { 6238 /* getfh XXX may not be needed in future */ 6239 argop->argop = OP_GETFH; 6240 argop++; 6241 argcnt++; 6242 6243 /* getattr */ 6244 argop->argop = OP_GETATTR; 6245 argop->nfs_argop4_u.opgetattr.attr_request = 6246 lookupargp->ga_bits; 6247 argop->nfs_argop4_u.opgetattr.mi = 6248 lookupargp->mi; 6249 argop++; 6250 argcnt++; 6251 } 6252 } 6253 6254 if ((l4_getattrs != LKP4_NO_ATTRIBUTES) && 6255 ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) { 6256 if (needgetfh) { 6257 /* stick in a post-lookup getfh */ 6258 argop->argop = OP_GETFH; 6259 argcnt++; 6260 argop++; 6261 } 6262 /* post-lookup getattr */ 6263 argop->argop = OP_GETATTR; 6264 argop->nfs_argop4_u.opgetattr.attr_request = 6265 lookupargp->ga_bits; 6266 argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi; 6267 argcnt++; 6268 } 6269 argcnt += lookupargp->trailer_len; /* actual op count */ 6270 lookupargp->argsp->array_len = argcnt; 6271 lookupargp->arglen = arglen; 6272 6273 #ifdef DEBUG 6274 if (nfs4_client_lookup_debug) 6275 nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt); 6276 #endif 6277 6278 return (lookup_idx); 6279 } 6280 6281 static int 6282 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr) 6283 { 6284 COMPOUND4args_clnt args; 6285 COMPOUND4res_clnt res; 6286 GETFH4res *gf_res = NULL; 6287 nfs_argop4 argop[4]; 6288 nfs_resop4 *resop = NULL; 6289 nfs4_sharedfh_t *sfhp; 6290 hrtime_t t; 6291 nfs4_error_t e; 6292 6293 rnode4_t *drp; 6294 int doqueue = 1; 6295 vnode_t *vp; 6296 int needrecov = 0; 6297 nfs4_recov_state_t recov_state; 6298 6299 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 6300 6301 *avp = NULL; 6302 
recov_state.rs_flags = 0; 6303 recov_state.rs_num_retry_despite_err = 0; 6304 6305 recov_retry: 6306 /* COMPOUND: putfh, openattr, getfh, getattr */ 6307 args.array_len = 4; 6308 args.array = argop; 6309 args.ctag = TAG_OPENATTR; 6310 6311 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 6312 if (e.error) 6313 return (e.error); 6314 6315 drp = VTOR4(dvp); 6316 6317 /* putfh */ 6318 argop[0].argop = OP_CPUTFH; 6319 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6320 6321 /* openattr */ 6322 argop[1].argop = OP_OPENATTR; 6323 argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE); 6324 6325 /* getfh */ 6326 argop[2].argop = OP_GETFH; 6327 6328 /* getattr */ 6329 argop[3].argop = OP_GETATTR; 6330 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6331 argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 6332 6333 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 6334 "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first", 6335 rnode4info(drp))); 6336 6337 t = gethrtime(); 6338 6339 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 6340 6341 needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp); 6342 if (needrecov) { 6343 bool_t abort; 6344 6345 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 6346 "nfs4openattr: initiating recovery\n")); 6347 6348 abort = nfs4_start_recovery(&e, 6349 VTOMI4(dvp), dvp, NULL, NULL, NULL, 6350 OP_OPENATTR, NULL); 6351 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6352 if (!e.error) { 6353 e.error = geterrno4(res.status); 6354 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6355 } 6356 if (abort == FALSE) 6357 goto recov_retry; 6358 return (e.error); 6359 } 6360 6361 if (e.error) { 6362 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6363 return (e.error); 6364 } 6365 6366 if (res.status) { 6367 /* 6368 * If OTW errro is NOTSUPP, then it should be 6369 * translated to EINVAL. 
All Solaris file system 6370 * implementations return EINVAL to the syscall layer 6371 * when the attrdir cannot be created due to an 6372 * implementation restriction or noxattr mount option. 6373 */ 6374 if (res.status == NFS4ERR_NOTSUPP) { 6375 mutex_enter(&drp->r_statelock); 6376 if (drp->r_xattr_dir) 6377 VN_RELE(drp->r_xattr_dir); 6378 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP); 6379 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP; 6380 mutex_exit(&drp->r_statelock); 6381 6382 e.error = EINVAL; 6383 } else { 6384 e.error = geterrno4(res.status); 6385 } 6386 6387 if (e.error) { 6388 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6389 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 6390 needrecov); 6391 return (e.error); 6392 } 6393 } 6394 6395 resop = &res.array[0]; /* putfh res */ 6396 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK); 6397 6398 resop = &res.array[1]; /* openattr res */ 6399 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK); 6400 6401 resop = &res.array[2]; /* getfh res */ 6402 gf_res = &resop->nfs_resop4_u.opgetfh; 6403 if (gf_res->object.nfs_fh4_len == 0) { 6404 *avp = NULL; 6405 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6406 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6407 return (ENOENT); 6408 } 6409 6410 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp)); 6411 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res, 6412 dvp->v_vfsp, t, cr, dvp, 6413 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp)); 6414 sfh4_rele(&sfhp); 6415 6416 if (e.error) 6417 PURGE_ATTRCACHE4(vp); 6418 6419 mutex_enter(&vp->v_lock); 6420 vp->v_flag |= V_XATTRDIR; 6421 mutex_exit(&vp->v_lock); 6422 6423 *avp = vp; 6424 6425 mutex_enter(&drp->r_statelock); 6426 if (drp->r_xattr_dir) 6427 VN_RELE(drp->r_xattr_dir); 6428 VN_HOLD(vp); 6429 drp->r_xattr_dir = vp; 6430 6431 /* 6432 * Invalidate pathconf4 cache because r_xattr_dir is no longer 6433 * NULL. 
xattrs could be created at any time, and we have no 6434 * way to update pc4_xattr_exists in the base object if/when 6435 * it happens. 6436 */ 6437 drp->r_pathconf.pc4_xattr_valid = 0; 6438 6439 mutex_exit(&drp->r_statelock); 6440 6441 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6442 6443 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6444 6445 return (0); 6446 } 6447 6448 /* ARGSUSED */ 6449 static int 6450 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 6451 int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct, 6452 vsecattr_t *vsecp) 6453 { 6454 int error; 6455 vnode_t *vp = NULL; 6456 rnode4_t *rp; 6457 struct vattr vattr; 6458 rnode4_t *drp; 6459 vnode_t *tempvp; 6460 enum createmode4 createmode; 6461 bool_t must_trunc = FALSE; 6462 int truncating = 0; 6463 6464 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 6465 return (EPERM); 6466 if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) { 6467 return (EINVAL); 6468 } 6469 6470 /* . and .. have special meaning in the protocol, reject them. */ 6471 6472 if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0'))) 6473 return (EISDIR); 6474 6475 drp = VTOR4(dvp); 6476 6477 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 6478 return (EINTR); 6479 6480 top: 6481 /* 6482 * We make a copy of the attributes because the caller does not 6483 * expect us to change what va points to. 6484 */ 6485 vattr = *va; 6486 6487 /* 6488 * If the pathname is "", then dvp is the root vnode of 6489 * a remote file mounted over a local directory. 6490 * All that needs to be done is access 6491 * checking and truncation. Note that we avoid doing 6492 * open w/ create because the parent directory might 6493 * be in pseudo-fs and the open would fail. 
6494 */ 6495 if (*nm == '\0') { 6496 error = 0; 6497 VN_HOLD(dvp); 6498 vp = dvp; 6499 must_trunc = TRUE; 6500 } else { 6501 /* 6502 * We need to go over the wire, just to be sure whether the 6503 * file exists or not. Using the DNLC can be dangerous in 6504 * this case when making a decision regarding existence. 6505 */ 6506 error = nfs4lookup(dvp, nm, &vp, cr, 1); 6507 } 6508 6509 if (exclusive) 6510 createmode = EXCLUSIVE4; 6511 else 6512 createmode = GUARDED4; 6513 6514 /* 6515 * error would be set if the file does not exist on the 6516 * server, so lets go create it. 6517 */ 6518 if (error) { 6519 goto create_otw; 6520 } 6521 6522 /* 6523 * File does exist on the server 6524 */ 6525 if (exclusive == EXCL) 6526 error = EEXIST; 6527 else if (vp->v_type == VDIR && (mode & VWRITE)) 6528 error = EISDIR; 6529 else { 6530 /* 6531 * If vnode is a device, create special vnode. 6532 */ 6533 if (ISVDEV(vp->v_type)) { 6534 tempvp = vp; 6535 vp = specvp(vp, vp->v_rdev, vp->v_type, cr); 6536 VN_RELE(tempvp); 6537 } 6538 if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) { 6539 if ((vattr.va_mask & AT_SIZE) && 6540 vp->v_type == VREG) { 6541 rp = VTOR4(vp); 6542 /* 6543 * Check here for large file handled 6544 * by LF-unaware process (as 6545 * ufs_create() does) 6546 */ 6547 if (!(flags & FOFFMAX)) { 6548 mutex_enter(&rp->r_statelock); 6549 if (rp->r_size > MAXOFF32_T) 6550 error = EOVERFLOW; 6551 mutex_exit(&rp->r_statelock); 6552 } 6553 6554 /* if error is set then we need to return */ 6555 if (error) { 6556 nfs_rw_exit(&drp->r_rwlock); 6557 VN_RELE(vp); 6558 return (error); 6559 } 6560 6561 if (must_trunc) { 6562 vattr.va_mask = AT_SIZE; 6563 error = nfs4setattr(vp, &vattr, 0, cr, 6564 NULL); 6565 } else { 6566 /* 6567 * we know we have a regular file that already 6568 * exists and we may end up truncating the file 6569 * as a result of the open_otw, so flush out 6570 * any dirty pages for this file first. 
6571 */ 6572 if (nfs4_has_pages(vp) && 6573 ((rp->r_flags & R4DIRTY) || 6574 rp->r_count > 0 || 6575 rp->r_mapcnt > 0)) { 6576 error = nfs4_putpage(vp, 6577 (offset_t)0, 0, 0, cr, ct); 6578 if (error && (error == ENOSPC || 6579 error == EDQUOT)) { 6580 mutex_enter( 6581 &rp->r_statelock); 6582 if (!rp->r_error) 6583 rp->r_error = 6584 error; 6585 mutex_exit( 6586 &rp->r_statelock); 6587 } 6588 } 6589 vattr.va_mask = (AT_SIZE | 6590 AT_TYPE | AT_MODE); 6591 vattr.va_type = VREG; 6592 createmode = UNCHECKED4; 6593 truncating = 1; 6594 goto create_otw; 6595 } 6596 } 6597 } 6598 } 6599 nfs_rw_exit(&drp->r_rwlock); 6600 if (error) { 6601 VN_RELE(vp); 6602 } else { 6603 vnode_t *tvp; 6604 rnode4_t *trp; 6605 /* 6606 * existing file got truncated, notify. 6607 */ 6608 tvp = vp; 6609 if (vp->v_type == VREG) { 6610 trp = VTOR4(vp); 6611 if (IS_SHADOW(vp, trp)) 6612 tvp = RTOV4(trp); 6613 } 6614 vnevent_create(tvp, ct); 6615 *vpp = vp; 6616 } 6617 return (error); 6618 6619 create_otw: 6620 dnlc_remove(dvp, nm); 6621 6622 ASSERT(vattr.va_mask & AT_TYPE); 6623 6624 /* 6625 * If not a regular file let nfs4mknod() handle it. 6626 */ 6627 if (vattr.va_type != VREG) { 6628 error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr); 6629 nfs_rw_exit(&drp->r_rwlock); 6630 return (error); 6631 } 6632 6633 /* 6634 * It _is_ a regular file. 6635 */ 6636 ASSERT(vattr.va_mask & AT_MODE); 6637 if (MANDMODE(vattr.va_mode)) { 6638 nfs_rw_exit(&drp->r_rwlock); 6639 return (EACCES); 6640 } 6641 6642 /* 6643 * If this happens to be a mknod of a regular file, then flags will 6644 * have neither FREAD or FWRITE. However, we must set at least one 6645 * for the call to nfs4open_otw. If it's open(O_CREAT) driving 6646 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been 6647 * set (based on openmode specified by app). 
6648 */ 6649 if ((flags & (FREAD|FWRITE)) == 0) 6650 flags |= (FREAD|FWRITE); 6651 6652 error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0); 6653 6654 if (vp != NULL) { 6655 /* if create was successful, throw away the file's pages */ 6656 if (!error && (vattr.va_mask & AT_SIZE)) 6657 nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK), 6658 cr); 6659 /* release the lookup hold */ 6660 VN_RELE(vp); 6661 vp = NULL; 6662 } 6663 6664 /* 6665 * validate that we opened a regular file. This handles a misbehaving 6666 * server that returns an incorrect FH. 6667 */ 6668 if ((error == 0) && *vpp && (*vpp)->v_type != VREG) { 6669 error = EISDIR; 6670 VN_RELE(*vpp); 6671 } 6672 6673 /* 6674 * If this is not an exclusive create, then the CREATE 6675 * request will be made with the GUARDED mode set. This 6676 * means that the server will return EEXIST if the file 6677 * exists. The file could exist because of a retransmitted 6678 * request. In this case, we recover by starting over and 6679 * checking to see whether the file exists. This second 6680 * time through it should and a CREATE request will not be 6681 * sent. 6682 * 6683 * This handles the problem of a dangling CREATE request 6684 * which contains attributes which indicate that the file 6685 * should be truncated. This retransmitted request could 6686 * possibly truncate valid data in the file if not caught 6687 * by the duplicate request mechanism on the server or if 6688 * not caught by other means. The scenario is: 6689 * 6690 * Client transmits CREATE request with size = 0 6691 * Client times out, retransmits request. 6692 * Response to the first request arrives from the server 6693 * and the client proceeds on. 6694 * Client writes data to the file. 6695 * The server now processes retransmitted CREATE request 6696 * and truncates file. 
6697 * 6698 * The use of the GUARDED CREATE request prevents this from 6699 * happening because the retransmitted CREATE would fail 6700 * with EEXIST and would not truncate the file. 6701 */ 6702 if (error == EEXIST && exclusive == NONEXCL) { 6703 #ifdef DEBUG 6704 nfs4_create_misses++; 6705 #endif 6706 goto top; 6707 } 6708 nfs_rw_exit(&drp->r_rwlock); 6709 if (truncating && !error && *vpp) { 6710 vnode_t *tvp; 6711 rnode4_t *trp; 6712 /* 6713 * existing file got truncated, notify. 6714 */ 6715 tvp = *vpp; 6716 trp = VTOR4(tvp); 6717 if (IS_SHADOW(tvp, trp)) 6718 tvp = RTOV4(trp); 6719 vnevent_create(tvp, ct); 6720 } 6721 return (error); 6722 } 6723 6724 /* 6725 * Create compound (for mkdir, mknod, symlink): 6726 * { Putfh <dfh>; Create; Getfh; Getattr } 6727 * It's okay if setattr failed to set gid - this is not considered 6728 * an error, but purge attrs in that case. 6729 */ 6730 static int 6731 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va, 6732 vnode_t **vpp, cred_t *cr, nfs_ftype4 type) 6733 { 6734 int need_end_op = FALSE; 6735 COMPOUND4args_clnt args; 6736 COMPOUND4res_clnt res, *resp = NULL; 6737 nfs_argop4 *argop; 6738 nfs_resop4 *resop; 6739 int doqueue; 6740 mntinfo4_t *mi; 6741 rnode4_t *drp = VTOR4(dvp); 6742 change_info4 *cinfo; 6743 GETFH4res *gf_res; 6744 struct vattr vattr; 6745 vnode_t *vp; 6746 fattr4 *crattr; 6747 bool_t needrecov = FALSE; 6748 nfs4_recov_state_t recov_state; 6749 nfs4_sharedfh_t *sfhp = NULL; 6750 hrtime_t t; 6751 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 6752 int numops, argoplist_size, setgid_flag, idx_create, idx_fattr; 6753 dirattr_info_t dinfo, *dinfop; 6754 servinfo4_t *svp; 6755 bitmap4 supp_attrs; 6756 6757 ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK || 6758 type == NF4CHR || type == NF4SOCK || type == NF4FIFO); 6759 6760 mi = VTOMI4(dvp); 6761 6762 /* 6763 * Make sure we properly deal with setting the right gid 6764 * on a new directory to reflect the parent's setgid bit 
6765 */ 6766 setgid_flag = 0; 6767 if (type == NF4DIR) { 6768 struct vattr dva; 6769 6770 va->va_mode &= ~VSGID; 6771 dva.va_mask = AT_MODE | AT_GID; 6772 if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) { 6773 6774 /* 6775 * If the parent's directory has the setgid bit set 6776 * _and_ the client was able to get a valid mapping 6777 * for the parent dir's owner_group, we want to 6778 * append NVERIFY(owner_group == dva.va_gid) and 6779 * SETTATTR to the CREATE compound. 6780 */ 6781 if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) { 6782 setgid_flag = 1; 6783 va->va_mode |= VSGID; 6784 if (dva.va_gid != GID_NOBODY) { 6785 va->va_mask |= AT_GID; 6786 va->va_gid = dva.va_gid; 6787 } 6788 } 6789 } 6790 } 6791 6792 /* 6793 * Create ops: 6794 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new) 6795 * 5:restorefh(dir) 6:getattr(dir) 6796 * 6797 * if (setgid) 6798 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new) 6799 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 6800 * 8:nverify 9:setattr 6801 */ 6802 if (setgid_flag) { 6803 numops = 10; 6804 idx_create = 1; 6805 idx_fattr = 3; 6806 } else { 6807 numops = 7; 6808 idx_create = 2; 6809 idx_fattr = 4; 6810 } 6811 6812 ASSERT(nfs_zone() == mi->mi_zone); 6813 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) { 6814 return (EINTR); 6815 } 6816 recov_state.rs_flags = 0; 6817 recov_state.rs_num_retry_despite_err = 0; 6818 6819 argoplist_size = numops * sizeof (nfs_argop4); 6820 argop = kmem_alloc(argoplist_size, KM_SLEEP); 6821 6822 recov_retry: 6823 if (type == NF4LNK) 6824 args.ctag = TAG_SYMLINK; 6825 else if (type == NF4DIR) 6826 args.ctag = TAG_MKDIR; 6827 else 6828 args.ctag = TAG_MKNOD; 6829 6830 args.array_len = numops; 6831 args.array = argop; 6832 6833 if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) { 6834 nfs_rw_exit(&drp->r_rwlock); 6835 kmem_free(argop, argoplist_size); 6836 return (e.error); 6837 } 6838 need_end_op = TRUE; 6839 6840 6841 /* 0: putfh directory */ 
6842 argop[0].argop = OP_CPUTFH; 6843 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6844 6845 /* 1/2: Create object */ 6846 argop[idx_create].argop = OP_CCREATE; 6847 argop[idx_create].nfs_argop4_u.opccreate.cname = nm; 6848 argop[idx_create].nfs_argop4_u.opccreate.type = type; 6849 if (type == NF4LNK) { 6850 /* 6851 * symlink, treat name as data 6852 */ 6853 ASSERT(data != NULL); 6854 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata = 6855 (char *)data; 6856 } 6857 if (type == NF4BLK || type == NF4CHR) { 6858 ASSERT(data != NULL); 6859 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata = 6860 *((specdata4 *)data); 6861 } 6862 6863 crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs; 6864 6865 svp = drp->r_server; 6866 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 6867 supp_attrs = svp->sv_supp_attrs; 6868 nfs_rw_exit(&svp->sv_lock); 6869 6870 if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) { 6871 nfs_rw_exit(&drp->r_rwlock); 6872 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 6873 e.error = EINVAL; 6874 kmem_free(argop, argoplist_size); 6875 return (e.error); 6876 } 6877 6878 /* 2/3: getfh fh of created object */ 6879 ASSERT(idx_create + 1 == idx_fattr - 1); 6880 argop[idx_create + 1].argop = OP_GETFH; 6881 6882 /* 3/4: getattr of new object */ 6883 argop[idx_fattr].argop = OP_GETATTR; 6884 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6885 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi; 6886 6887 if (setgid_flag) { 6888 vattr_t _v; 6889 6890 argop[4].argop = OP_SAVEFH; 6891 6892 argop[5].argop = OP_CPUTFH; 6893 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6894 6895 argop[6].argop = OP_GETATTR; 6896 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6897 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6898 6899 argop[7].argop = OP_RESTOREFH; 6900 6901 /* 6902 * nverify 6903 * 6904 * XXX - Revisit the last argument to nfs4_end_op() 6905 * once 5020486 is fixed. 
6906 */ 6907 _v.va_mask = AT_GID; 6908 _v.va_gid = va->va_gid; 6909 if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 6910 supp_attrs)) { 6911 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6912 nfs_rw_exit(&drp->r_rwlock); 6913 nfs4_fattr4_free(crattr); 6914 kmem_free(argop, argoplist_size); 6915 return (e.error); 6916 } 6917 6918 /* 6919 * setattr 6920 * 6921 * We _know_ we're not messing with AT_SIZE or AT_XTIME, 6922 * so no need for stateid or flags. Also we specify NULL 6923 * rp since we're only interested in setting owner_group 6924 * attributes. 6925 */ 6926 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs, 6927 &e.error, 0); 6928 6929 if (e.error) { 6930 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6931 nfs_rw_exit(&drp->r_rwlock); 6932 nfs4_fattr4_free(crattr); 6933 nfs4args_verify_free(&argop[8]); 6934 kmem_free(argop, argoplist_size); 6935 return (e.error); 6936 } 6937 } else { 6938 argop[1].argop = OP_SAVEFH; 6939 6940 argop[5].argop = OP_RESTOREFH; 6941 6942 argop[6].argop = OP_GETATTR; 6943 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6944 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6945 } 6946 6947 dnlc_remove(dvp, nm); 6948 6949 doqueue = 1; 6950 t = gethrtime(); 6951 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 6952 6953 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 6954 if (e.error) { 6955 PURGE_ATTRCACHE4(dvp); 6956 if (!needrecov) 6957 goto out; 6958 } 6959 6960 if (needrecov) { 6961 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 6962 OP_CREATE, NULL) == FALSE) { 6963 nfs4_end_op(mi, dvp, NULL, &recov_state, 6964 needrecov); 6965 need_end_op = FALSE; 6966 nfs4_fattr4_free(crattr); 6967 if (setgid_flag) { 6968 nfs4args_verify_free(&argop[8]); 6969 nfs4args_setattr_free(&argop[9]); 6970 } 6971 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6972 goto recov_retry; 6973 } 6974 } 6975 6976 resp = &res; 6977 6978 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 6979 
6980 if (res.status == NFS4ERR_BADOWNER) 6981 nfs4_log_badowner(mi, OP_CREATE); 6982 6983 e.error = geterrno4(res.status); 6984 6985 /* 6986 * This check is left over from when create was implemented 6987 * using a setattr op (instead of createattrs). If the 6988 * putfh/create/getfh failed, the error was returned. If 6989 * setattr/getattr failed, we keep going. 6990 * 6991 * It might be better to get rid of the GETFH also, and just 6992 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory. 6993 * Then if any of the operations failed, we could return the 6994 * error now, and remove much of the error code below. 6995 */ 6996 if (res.array_len <= idx_fattr) { 6997 /* 6998 * Either Putfh, Create or Getfh failed. 6999 */ 7000 PURGE_ATTRCACHE4(dvp); 7001 /* 7002 * nfs4_purge_stale_fh() may generate otw calls through 7003 * nfs4_invalidate_pages. Hence the need to call 7004 * nfs4_end_op() here to avoid nfs4_start_op() deadlock. 7005 */ 7006 nfs4_end_op(mi, dvp, NULL, &recov_state, 7007 needrecov); 7008 need_end_op = FALSE; 7009 nfs4_purge_stale_fh(e.error, dvp, cr); 7010 goto out; 7011 } 7012 } 7013 7014 resop = &res.array[idx_create]; /* create res */ 7015 cinfo = &resop->nfs_resop4_u.opcreate.cinfo; 7016 7017 resop = &res.array[idx_create + 1]; /* getfh res */ 7018 gf_res = &resop->nfs_resop4_u.opgetfh; 7019 7020 sfhp = sfh4_get(&gf_res->object, mi); 7021 if (e.error) { 7022 *vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp, 7023 fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 7024 if (vp->v_type == VNON) { 7025 vattr.va_mask = AT_TYPE; 7026 /* 7027 * Need to call nfs4_end_op before nfs4getattr to avoid 7028 * potential nfs4_start_op deadlock. See RFE 4777612. 
7029 */ 7030 nfs4_end_op(mi, dvp, NULL, &recov_state, 7031 needrecov); 7032 need_end_op = FALSE; 7033 e.error = nfs4getattr(vp, &vattr, cr); 7034 if (e.error) { 7035 VN_RELE(vp); 7036 *vpp = NULL; 7037 goto out; 7038 } 7039 vp->v_type = vattr.va_type; 7040 } 7041 e.error = 0; 7042 } else { 7043 *vpp = vp = makenfs4node(sfhp, 7044 &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res, 7045 dvp->v_vfsp, t, cr, 7046 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 7047 } 7048 7049 /* 7050 * If compound succeeded, then update dir attrs 7051 */ 7052 if (res.status == NFS4_OK) { 7053 dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 7054 dinfo.di_cred = cr; 7055 dinfo.di_time_call = t; 7056 dinfop = &dinfo; 7057 } else 7058 dinfop = NULL; 7059 7060 /* Update directory cache attribute, readdir and dnlc caches */ 7061 nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop); 7062 7063 out: 7064 if (sfhp != NULL) 7065 sfh4_rele(&sfhp); 7066 nfs_rw_exit(&drp->r_rwlock); 7067 nfs4_fattr4_free(crattr); 7068 if (setgid_flag) { 7069 nfs4args_verify_free(&argop[8]); 7070 nfs4args_setattr_free(&argop[9]); 7071 } 7072 if (resp) 7073 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7074 if (need_end_op) 7075 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 7076 7077 kmem_free(argop, argoplist_size); 7078 return (e.error); 7079 } 7080 7081 /* ARGSUSED */ 7082 static int 7083 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 7084 int mode, vnode_t **vpp, cred_t *cr) 7085 { 7086 int error; 7087 vnode_t *vp; 7088 nfs_ftype4 type; 7089 specdata4 spec, *specp = NULL; 7090 7091 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 7092 7093 switch (va->va_type) { 7094 case VCHR: 7095 case VBLK: 7096 type = (va->va_type == VCHR) ? 
NF4CHR : NF4BLK; 7097 spec.specdata1 = getmajor(va->va_rdev); 7098 spec.specdata2 = getminor(va->va_rdev); 7099 specp = &spec; 7100 break; 7101 7102 case VFIFO: 7103 type = NF4FIFO; 7104 break; 7105 case VSOCK: 7106 type = NF4SOCK; 7107 break; 7108 7109 default: 7110 return (EINVAL); 7111 } 7112 7113 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type); 7114 if (error) { 7115 return (error); 7116 } 7117 7118 /* 7119 * This might not be needed any more; special case to deal 7120 * with problematic v2/v3 servers. Since create was unable 7121 * to set group correctly, not sure what hope setattr has. 7122 */ 7123 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) { 7124 va->va_mask = AT_GID; 7125 (void) nfs4setattr(vp, va, 0, cr, NULL); 7126 } 7127 7128 /* 7129 * If vnode is a device create special vnode 7130 */ 7131 if (ISVDEV(vp->v_type)) { 7132 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 7133 VN_RELE(vp); 7134 } else { 7135 *vpp = vp; 7136 } 7137 return (error); 7138 } 7139 7140 /* 7141 * Remove requires that the current fh be the target directory. 7142 * After the operation, the current fh is unchanged. 7143 * The compound op structure is: 7144 * PUTFH(targetdir), REMOVE 7145 * 7146 * Weirdness: if the vnode to be removed is open 7147 * we rename it instead of removing it and nfs_inactive 7148 * will remove the new name. 
7149 */ 7150 /* ARGSUSED */ 7151 static int 7152 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags) 7153 { 7154 COMPOUND4args_clnt args; 7155 COMPOUND4res_clnt res, *resp = NULL; 7156 REMOVE4res *rm_res; 7157 nfs_argop4 argop[3]; 7158 nfs_resop4 *resop; 7159 vnode_t *vp; 7160 char *tmpname; 7161 int doqueue; 7162 mntinfo4_t *mi; 7163 rnode4_t *rp; 7164 rnode4_t *drp; 7165 int needrecov = 0; 7166 nfs4_recov_state_t recov_state; 7167 int isopen; 7168 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7169 dirattr_info_t dinfo; 7170 7171 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 7172 return (EPERM); 7173 drp = VTOR4(dvp); 7174 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 7175 return (EINTR); 7176 7177 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 7178 if (e.error) { 7179 nfs_rw_exit(&drp->r_rwlock); 7180 return (e.error); 7181 } 7182 7183 if (vp->v_type == VDIR) { 7184 VN_RELE(vp); 7185 nfs_rw_exit(&drp->r_rwlock); 7186 return (EISDIR); 7187 } 7188 7189 /* 7190 * First just remove the entry from the name cache, as it 7191 * is most likely the only entry for this vp. 7192 */ 7193 dnlc_remove(dvp, nm); 7194 7195 rp = VTOR4(vp); 7196 7197 /* 7198 * For regular file types, check to see if the file is open by looking 7199 * at the open streams. 7200 * For all other types, check the reference count on the vnode. Since 7201 * they are not opened OTW they never have an open stream. 7202 * 7203 * If the file is open, rename it to .nfsXXXX. 7204 */ 7205 if (vp->v_type != VREG) { 7206 /* 7207 * If the file has a v_count > 1 then there may be more than one 7208 * entry in the name cache due multiple links or an open file, 7209 * but we don't have the real reference count so flush all 7210 * possible entries. 7211 */ 7212 if (vp->v_count > 1) 7213 dnlc_purge_vp(vp); 7214 7215 /* 7216 * Now we have the real reference count. 
7217 */ 7218 isopen = vp->v_count > 1; 7219 } else { 7220 mutex_enter(&rp->r_os_lock); 7221 isopen = list_head(&rp->r_open_streams) != NULL; 7222 mutex_exit(&rp->r_os_lock); 7223 } 7224 7225 mutex_enter(&rp->r_statelock); 7226 if (isopen && 7227 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) { 7228 mutex_exit(&rp->r_statelock); 7229 tmpname = newname(); 7230 e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct); 7231 if (e.error) 7232 kmem_free(tmpname, MAXNAMELEN); 7233 else { 7234 mutex_enter(&rp->r_statelock); 7235 if (rp->r_unldvp == NULL) { 7236 VN_HOLD(dvp); 7237 rp->r_unldvp = dvp; 7238 if (rp->r_unlcred != NULL) 7239 crfree(rp->r_unlcred); 7240 crhold(cr); 7241 rp->r_unlcred = cr; 7242 rp->r_unlname = tmpname; 7243 } else { 7244 kmem_free(rp->r_unlname, MAXNAMELEN); 7245 rp->r_unlname = tmpname; 7246 } 7247 mutex_exit(&rp->r_statelock); 7248 } 7249 VN_RELE(vp); 7250 nfs_rw_exit(&drp->r_rwlock); 7251 return (e.error); 7252 } 7253 /* 7254 * Actually remove the file/dir 7255 */ 7256 mutex_exit(&rp->r_statelock); 7257 7258 /* 7259 * We need to flush any dirty pages which happen to 7260 * be hanging around before removing the file. 7261 * This shouldn't happen very often since in NFSv4 7262 * we should be close to open consistent. 
7263 */ 7264 if (nfs4_has_pages(vp) && 7265 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 7266 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct); 7267 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 7268 mutex_enter(&rp->r_statelock); 7269 if (!rp->r_error) 7270 rp->r_error = e.error; 7271 mutex_exit(&rp->r_statelock); 7272 } 7273 } 7274 7275 mi = VTOMI4(dvp); 7276 7277 (void) nfs4delegreturn(rp, NFS4_DR_REOPEN); 7278 recov_state.rs_flags = 0; 7279 recov_state.rs_num_retry_despite_err = 0; 7280 7281 recov_retry: 7282 /* 7283 * Remove ops: putfh dir; remove 7284 */ 7285 args.ctag = TAG_REMOVE; 7286 args.array_len = 3; 7287 args.array = argop; 7288 7289 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 7290 if (e.error) { 7291 nfs_rw_exit(&drp->r_rwlock); 7292 VN_RELE(vp); 7293 return (e.error); 7294 } 7295 7296 /* putfh directory */ 7297 argop[0].argop = OP_CPUTFH; 7298 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 7299 7300 /* remove */ 7301 argop[1].argop = OP_CREMOVE; 7302 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 7303 7304 /* getattr dir */ 7305 argop[2].argop = OP_GETATTR; 7306 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7307 argop[2].nfs_argop4_u.opgetattr.mi = mi; 7308 7309 doqueue = 1; 7310 dinfo.di_time_call = gethrtime(); 7311 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7312 7313 PURGE_ATTRCACHE4(vp); 7314 7315 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7316 if (e.error) 7317 PURGE_ATTRCACHE4(dvp); 7318 7319 if (needrecov) { 7320 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, 7321 NULL, NULL, NULL, OP_REMOVE, NULL) == FALSE) { 7322 if (!e.error) 7323 (void) xdr_free(xdr_COMPOUND4res_clnt, 7324 (caddr_t)&res); 7325 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 7326 needrecov); 7327 goto recov_retry; 7328 } 7329 } 7330 7331 /* 7332 * Matching nfs4_end_op() for start_op() above. 
7333 * There is a path in the code below which calls 7334 * nfs4_purge_stale_fh(), which may generate otw calls through 7335 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 7336 * here to avoid nfs4_start_op() deadlock. 7337 */ 7338 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 7339 7340 if (!e.error) { 7341 resp = &res; 7342 7343 if (res.status) { 7344 e.error = geterrno4(res.status); 7345 PURGE_ATTRCACHE4(dvp); 7346 nfs4_purge_stale_fh(e.error, dvp, cr); 7347 } else { 7348 resop = &res.array[1]; /* remove res */ 7349 rm_res = &resop->nfs_resop4_u.opremove; 7350 7351 dinfo.di_garp = 7352 &res.array[2].nfs_resop4_u.opgetattr.ga_res; 7353 dinfo.di_cred = cr; 7354 7355 /* Update directory attr, readdir and dnlc caches */ 7356 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 7357 &dinfo); 7358 } 7359 } 7360 nfs_rw_exit(&drp->r_rwlock); 7361 if (resp) 7362 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7363 7364 if (e.error == 0) { 7365 vnode_t *tvp; 7366 rnode4_t *trp; 7367 trp = VTOR4(vp); 7368 tvp = vp; 7369 if (IS_SHADOW(vp, trp)) 7370 tvp = RTOV4(trp); 7371 vnevent_remove(tvp, dvp, nm, ct); 7372 } 7373 VN_RELE(vp); 7374 return (e.error); 7375 } 7376 7377 /* 7378 * Link requires that the current fh be the target directory and the 7379 * saved fh be the source fh. After the operation, the current fh is unchanged. 
7380 * Thus the compound op structure is: 7381 * PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, RESTOREFH, 7382 * GETATTR(file) 7383 */ 7384 /* ARGSUSED */ 7385 static int 7386 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr, 7387 caller_context_t *ct, int flags) 7388 { 7389 COMPOUND4args_clnt args; 7390 COMPOUND4res_clnt res, *resp = NULL; 7391 LINK4res *ln_res; 7392 int argoplist_size = 7 * sizeof (nfs_argop4); 7393 nfs_argop4 *argop; 7394 nfs_resop4 *resop; 7395 vnode_t *realvp, *nvp; 7396 int doqueue; 7397 mntinfo4_t *mi; 7398 rnode4_t *tdrp; 7399 bool_t needrecov = FALSE; 7400 nfs4_recov_state_t recov_state; 7401 hrtime_t t; 7402 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7403 dirattr_info_t dinfo; 7404 7405 ASSERT(*tnm != '\0'); 7406 ASSERT(tdvp->v_type == VDIR); 7407 ASSERT(nfs4_consistent_type(tdvp)); 7408 ASSERT(nfs4_consistent_type(svp)); 7409 7410 if (nfs_zone() != VTOMI4(tdvp)->mi_zone) 7411 return (EPERM); 7412 if (VOP_REALVP(svp, &realvp, ct) == 0) { 7413 svp = realvp; 7414 ASSERT(nfs4_consistent_type(svp)); 7415 } 7416 7417 tdrp = VTOR4(tdvp); 7418 mi = VTOMI4(svp); 7419 7420 if (!(mi->mi_flags & MI4_LINK)) { 7421 return (EOPNOTSUPP); 7422 } 7423 recov_state.rs_flags = 0; 7424 recov_state.rs_num_retry_despite_err = 0; 7425 7426 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp))) 7427 return (EINTR); 7428 7429 recov_retry: 7430 argop = kmem_alloc(argoplist_size, KM_SLEEP); 7431 7432 args.ctag = TAG_LINK; 7433 7434 /* 7435 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir); 7436 * restorefh; getattr(fl) 7437 */ 7438 args.array_len = 7; 7439 args.array = argop; 7440 7441 e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state); 7442 if (e.error) { 7443 kmem_free(argop, argoplist_size); 7444 nfs_rw_exit(&tdrp->r_rwlock); 7445 return (e.error); 7446 } 7447 7448 /* 0. putfh file */ 7449 argop[0].argop = OP_CPUTFH; 7450 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh; 7451 7452 /* 1. 
save current fh to free up the space for the dir */ 7453 argop[1].argop = OP_SAVEFH; 7454 7455 /* 2. putfh targetdir */ 7456 argop[2].argop = OP_CPUTFH; 7457 argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh; 7458 7459 /* 3. link: current_fh is targetdir, saved_fh is source */ 7460 argop[3].argop = OP_CLINK; 7461 argop[3].nfs_argop4_u.opclink.cnewname = tnm; 7462 7463 /* 4. Get attributes of dir */ 7464 argop[4].argop = OP_GETATTR; 7465 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7466 argop[4].nfs_argop4_u.opgetattr.mi = mi; 7467 7468 /* 5. If link was successful, restore current vp to file */ 7469 argop[5].argop = OP_RESTOREFH; 7470 7471 /* 6. Get attributes of linked object */ 7472 argop[6].argop = OP_GETATTR; 7473 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7474 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7475 7476 dnlc_remove(tdvp, tnm); 7477 7478 doqueue = 1; 7479 t = gethrtime(); 7480 7481 rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e); 7482 7483 needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp); 7484 if (e.error != 0 && !needrecov) { 7485 PURGE_ATTRCACHE4(tdvp); 7486 PURGE_ATTRCACHE4(svp); 7487 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7488 goto out; 7489 } 7490 7491 if (needrecov) { 7492 bool_t abort; 7493 7494 abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp, 7495 NULL, NULL, OP_LINK, NULL); 7496 if (abort == FALSE) { 7497 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, 7498 needrecov); 7499 kmem_free(argop, argoplist_size); 7500 if (!e.error) 7501 (void) xdr_free(xdr_COMPOUND4res_clnt, 7502 (caddr_t)&res); 7503 goto recov_retry; 7504 } else { 7505 if (e.error != 0) { 7506 PURGE_ATTRCACHE4(tdvp); 7507 PURGE_ATTRCACHE4(svp); 7508 nfs4_end_op(VTOMI4(svp), svp, tdvp, 7509 &recov_state, needrecov); 7510 goto out; 7511 } 7512 /* fall through for res.status case */ 7513 } 7514 } 7515 7516 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7517 7518 resp = &res; 7519 if 
(res.status) { 7520 /* If link succeeded, then don't return error */ 7521 e.error = geterrno4(res.status); 7522 if (res.array_len <= 4) { 7523 /* 7524 * Either Putfh, Savefh, Putfh dir, or Link failed 7525 */ 7526 PURGE_ATTRCACHE4(svp); 7527 PURGE_ATTRCACHE4(tdvp); 7528 if (e.error == EOPNOTSUPP) { 7529 mutex_enter(&mi->mi_lock); 7530 mi->mi_flags &= ~MI4_LINK; 7531 mutex_exit(&mi->mi_lock); 7532 } 7533 /* Remap EISDIR to EPERM for non-root user for SVVS */ 7534 /* XXX-LP */ 7535 if (e.error == EISDIR && crgetuid(cr) != 0) 7536 e.error = EPERM; 7537 goto out; 7538 } 7539 } 7540 7541 /* either no error or one of the postop getattr failed */ 7542 7543 /* 7544 * XXX - if LINK succeeded, but no attrs were returned for link 7545 * file, purge its cache. 7546 * 7547 * XXX Perform a simplified version of wcc checking. Instead of 7548 * have another getattr to get pre-op, just purge cache if 7549 * any of the ops prior to and including the getattr failed. 7550 * If the getattr succeeded then update the attrcache accordingly. 7551 */ 7552 7553 /* 7554 * update cache with link file postattrs. 7555 * Note: at this point resop points to link res. 7556 */ 7557 resop = &res.array[3]; /* link res */ 7558 ln_res = &resop->nfs_resop4_u.oplink; 7559 if (res.status == NFS4_OK) 7560 e.error = nfs4_update_attrcache(res.status, 7561 &res.array[6].nfs_resop4_u.opgetattr.ga_res, 7562 t, svp, cr); 7563 7564 /* 7565 * Call makenfs4node to create the new shadow vp for tnm. 7566 * We pass NULL attrs because we just cached attrs for 7567 * the src object. All we're trying to accomplish is to 7568 * to create the new shadow vnode. 
7569 */ 7570 nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr, 7571 tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm, VTOR4(svp)->r_fh)); 7572 7573 /* Update target cache attribute, readdir and dnlc caches */ 7574 dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 7575 dinfo.di_time_call = t; 7576 dinfo.di_cred = cr; 7577 7578 nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo); 7579 ASSERT(nfs4_consistent_type(tdvp)); 7580 ASSERT(nfs4_consistent_type(svp)); 7581 ASSERT(nfs4_consistent_type(nvp)); 7582 VN_RELE(nvp); 7583 7584 if (!e.error) { 7585 vnode_t *tvp; 7586 rnode4_t *trp; 7587 /* 7588 * Notify the source file of this link operation. 7589 */ 7590 trp = VTOR4(svp); 7591 tvp = svp; 7592 if (IS_SHADOW(svp, trp)) 7593 tvp = RTOV4(trp); 7594 vnevent_link(tvp, ct); 7595 } 7596 out: 7597 kmem_free(argop, argoplist_size); 7598 if (resp) 7599 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7600 7601 nfs_rw_exit(&tdrp->r_rwlock); 7602 7603 return (e.error); 7604 } 7605 7606 /* ARGSUSED */ 7607 static int 7608 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 7609 caller_context_t *ct, int flags) 7610 { 7611 vnode_t *realvp; 7612 7613 if (nfs_zone() != VTOMI4(odvp)->mi_zone) 7614 return (EPERM); 7615 if (VOP_REALVP(ndvp, &realvp, ct) == 0) 7616 ndvp = realvp; 7617 7618 return (nfs4rename(odvp, onm, ndvp, nnm, cr, ct)); 7619 } 7620 7621 /* 7622 * nfs4rename does the real work of renaming in NFS Version 4. 7623 * 7624 * A file handle is considered volatile for renaming purposes if either 7625 * of the volatile bits are turned on. However, the compound may differ 7626 * based on the likelihood of the filehandle to change during rename. 
 */
static int
nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	mntinfo4_t *mi;
	vnode_t *nvp = NULL;		/* existing target object, if any */
	vnode_t *ovp = NULL;		/* object being renamed */
	char *tmpname = NULL;		/* temporary name for an active target */
	rnode4_t *rp;
	rnode4_t *odrp;
	rnode4_t *ndrp;
	int did_link = 0;		/* target was preserved with LINK */
	int do_link = 1;		/* cleared after NFS4ERR_FILE_OPEN */
	nfsstat4 stat = NFS4_OK;

	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
	ASSERT(nfs4_consistent_type(odvp));
	ASSERT(nfs4_consistent_type(ndvp));

	/* Neither the source nor the target name may be "." or "..". */
	if (onm[0] == '.' && (onm[1] == '\0' ||
	    (onm[1] == '.' && onm[2] == '\0')))
		return (EINVAL);

	if (nnm[0] == '.' && (nnm[1] == '\0' ||
	    (nnm[1] == '.' && nnm[2] == '\0')))
		return (EINVAL);

	/*
	 * Take both directory rwlocks as writer.  The rnode with the
	 * lower address is always locked first so that two concurrent
	 * renames cannot acquire the pair in opposite orders.
	 */
	odrp = VTOR4(odvp);
	ndrp = VTOR4(ndvp);
	if ((intptr_t)odrp < (intptr_t)ndrp) {
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) {
			nfs_rw_exit(&odrp->r_rwlock);
			return (EINTR);
		}
	} else {
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) {
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EINTR);
		}
	}

	/*
	 * Lookup the target file.  If it exists, it needs to be
	 * checked to see whether it is a mount point and whether
	 * it is active (open).
	 */
	error = nfs4lookup(ndvp, nnm, &nvp, cr, 0);
	if (!error) {
		int isactive;

		ASSERT(nfs4_consistent_type(nvp));
		/*
		 * If this file has been mounted on, then just
		 * return busy because renaming to it would remove
		 * the mounted file system from the name space.
		 */
		if (vn_ismntpt(nvp)) {
			VN_RELE(nvp);
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EBUSY);
		}

		/*
		 * First just remove the entry from the name cache, as it
		 * is most likely the only entry for this vp.
		 */
		dnlc_remove(ndvp, nnm);

		rp = VTOR4(nvp);

		if (nvp->v_type != VREG) {
			/*
			 * Purge the name cache of all references to this vnode
			 * so that we can check the reference count to infer
			 * whether it is active or not.
			 */
			if (nvp->v_count > 1)
				dnlc_purge_vp(nvp);

			isactive = nvp->v_count > 1;
		} else {
			/*
			 * For regular files, "active" means the rnode
			 * has at least one open stream.
			 */
			mutex_enter(&rp->r_os_lock);
			isactive = list_head(&rp->r_open_streams) != NULL;
			mutex_exit(&rp->r_os_lock);
		}

		/*
		 * If the vnode is active and is not a directory,
		 * arrange to rename it to a
		 * temporary file so that it will continue to be
		 * accessible.  This implements the "unlink-open-file"
		 * semantics for the target of a rename operation.
		 * Before doing this though, make sure that the
		 * source and target files are not already the same.
		 */
		if (isactive && nvp->v_type != VDIR) {
			/*
			 * Lookup the source name.
			 */
			error = nfs4lookup(odvp, onm, &ovp, cr, 0);

			/*
			 * The source name *should* already exist.
			 */
			if (error) {
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}

			ASSERT(nfs4_consistent_type(ovp));

			/*
			 * Compare the two vnodes.  If they are the same,
			 * just release all held vnodes and return success.
			 */
			if (VN_CMP(ovp, nvp)) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (0);
			}

			/*
			 * Can't mix and match directories and non-
			 * directories in rename operations.  We already
			 * know that the target is not a directory.  If
			 * the source is a directory, return an error.
			 */
			if (ovp->v_type == VDIR) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (ENOTDIR);
			}
link_call:
			/*
			 * The target file exists, is not the same as
			 * the source file, and is active.  We first
			 * try to Link it to a temporary filename to
			 * avoid having the server removing the file
			 * completely (which could cause data loss to
			 * the user's POV in the event the Rename fails
			 * -- see bug 1165874).
			 */
			/*
			 * The do_link and did_link booleans are
			 * introduced in the event we get NFS4ERR_FILE_OPEN
			 * returned for the Rename.  Some servers can
			 * not Rename over an Open file, so they return
			 * this error.  The client needs to Remove the
			 * newly created Link and do two Renames, just
			 * as if the server didn't support LINK.
			 */
			tmpname = newname();
			error = 0;

			if (do_link) {
				error = nfs4_link(ndvp, nvp, tmpname, cr,
				    NULL, 0);
			}
			/*
			 * Fall back to renaming the target out of the way
			 * when LINK is unsupported or has been ruled out.
			 */
			if (error == EOPNOTSUPP || !do_link) {
				error = nfs4_rename(ndvp, nnm, ndvp, tmpname,
				    cr, NULL, 0);
				did_link = 0;
			} else {
				did_link = 1;
			}
			if (error) {
				kmem_free(tmpname, MAXNAMELEN);
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}

			/*
			 * Record the temporary name (and, if not already
			 * recorded, the directory and credential) in the
			 * target's rnode.  The rnode takes ownership of
			 * tmpname.
			 */
			mutex_enter(&rp->r_statelock);
			if (rp->r_unldvp == NULL) {
				VN_HOLD(ndvp);
				rp->r_unldvp = ndvp;
				if (rp->r_unlcred != NULL)
					crfree(rp->r_unlcred);
				crhold(cr);
				rp->r_unlcred = cr;
				rp->r_unlname = tmpname;
			} else {
				if (rp->r_unlname)
					kmem_free(rp->r_unlname, MAXNAMELEN);
				rp->r_unlname = tmpname;
			}
			mutex_exit(&rp->r_statelock);
		}

		(void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN);

		ASSERT(nfs4_consistent_type(nvp));
	}

	if (ovp == NULL) {
		/*
		 * When renaming directories to be a subdirectory of a
		 * different parent, the dnlc entry for ".." will no
		 * longer be valid, so it must be removed.
		 *
		 * We do a lookup here to determine whether we are renaming
		 * a directory and we need to check if we are renaming
		 * an unlinked file.  This might have already been done
		 * in previous code, so we check ovp == NULL to avoid
		 * doing it twice.
		 */
		error = nfs4lookup(odvp, onm, &ovp, cr, 0);
		/*
		 * The source name *should* already exist.
		 */
		if (error) {
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			if (nvp) {
				VN_RELE(nvp);
			}
			return (error);
		}
		ASSERT(ovp != NULL);
		ASSERT(nfs4_consistent_type(ovp));
	}

	/*
	 * Is the object being renamed a dir, and if so, is
	 * it being renamed to a child of itself?  The underlying
	 * fs should ultimately return EINVAL for this case;
	 * however, buggy beta non-Solaris NFSv4 servers at
	 * interop testing events have allowed this behavior,
	 * and it caused our client to panic due to a recursive
	 * mutex_enter in fn_move.
	 *
	 * The tedious locking in fn_move could be changed to
	 * deal with this case, and the client could avoid the
	 * panic; however, the client would just confuse itself
	 * later and misbehave.  A better way to handle the broken
	 * server is to detect this condition and return EINVAL
	 * without ever sending the bogus rename to the server.
	 * We know the rename is invalid -- just fail it now.
	 */
	if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) {
		VN_RELE(ovp);
		nfs_rw_exit(&odrp->r_rwlock);
		nfs_rw_exit(&ndrp->r_rwlock);
		if (nvp) {
			VN_RELE(nvp);
		}
		return (EINVAL);
	}

	(void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN);

	/*
	 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is
	 * possible for the filehandle to change due to the rename.
	 * If neither of these bits is set, but FH4_VOL_MIGRATION is set,
	 * the fh will not change because of the rename, but we still need
	 * to update its rnode entry with the new name for
	 * an eventual fh change due to migration. The FH4_NOEXPIRE_ON_OPEN
	 * has no effect on these for now, but for future improvements,
	 * we might want to use it too to simplify handling of files
	 * that are open with that flag on. (XXX)
	 */
	mi = VTOMI4(odvp);
	if (NFS4_VOLATILE_FH(mi))
		error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr,
		    &stat);
	else
		error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr,
		    &stat);

	ASSERT(nfs4_consistent_type(odvp));
	ASSERT(nfs4_consistent_type(ndvp));
	ASSERT(nfs4_consistent_type(ovp));

	if (stat == NFS4ERR_FILE_OPEN && did_link) {
		do_link = 0;
		/*
		 * Before the 'link_call' code, we did a nfs4_lookup
		 * that puts a VN_HOLD on nvp.  After the nfs4_link
		 * call we call VN_RELE to match that hold.  We need
		 * to place an additional VN_HOLD here since we will
		 * be hitting that VN_RELE again.
		 */
		VN_HOLD(nvp);

		(void) nfs4_remove(ndvp, tmpname, cr, NULL, 0);

		/* Undo the unlinked file naming stuff we just did */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp) {
			VN_RELE(ndvp);
			rp->r_unldvp = NULL;
			if (rp->r_unlcred != NULL)
				crfree(rp->r_unlcred);
			rp->r_unlcred = NULL;
			/* rp->r_unlname points to tmpname */
			if (rp->r_unlname)
				kmem_free(rp->r_unlname, MAXNAMELEN);
			rp->r_unlname = NULL;
		}
		mutex_exit(&rp->r_statelock);

		if (nvp) {
			VN_RELE(nvp);
		}
		/* Retry, this time renaming the target out of the way. */
		goto link_call;
	}

	if (error) {
		VN_RELE(ovp);
		nfs_rw_exit(&odrp->r_rwlock);
		nfs_rw_exit(&ndrp->r_rwlock);
		if (nvp) {
			VN_RELE(nvp);
		}
		return (error);
	}

	/*
	 * when renaming directories to be a subdirectory of a
	 * different parent, the dnlc entry for ".." will no
	 * longer be valid, so it must be removed
	 */
	rp = VTOR4(ovp);	/* from here on rp is the renamed object */
	if (ndvp != odvp) {
		if (ovp->v_type == VDIR) {
			dnlc_remove(ovp, "..");
			if (rp->r_dir != NULL)
				nfs4_purge_rddir_cache(ovp);
		}
	}

	/*
	 * If we are renaming the unlinked file, update the
	 * r_unldvp and r_unlname as needed.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_unldvp != NULL) {
		if (strcmp(rp->r_unlname, onm) == 0) {
			(void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
			rp->r_unlname[MAXNAMELEN - 1] = '\0';
			if (ndvp != rp->r_unldvp) {
				VN_RELE(rp->r_unldvp);
				rp->r_unldvp = ndvp;
				VN_HOLD(ndvp);
			}
		}
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * Notify the rename vnevents to source vnode, and to the target
	 * vnode if it already existed.
	 */
	if (error == 0) {
		vnode_t *tvp;
		rnode4_t *trp;
		/*
		 * Notify the vnode. Each link is represented by
		 * a different vnode, in nfsv4.  For a shadow vnode the
		 * event is delivered to the rnode's vnode (RTOV4).
		 */
		if (nvp) {
			trp = VTOR4(nvp);
			tvp = nvp;
			if (IS_SHADOW(nvp, trp))
				tvp = RTOV4(trp);
			vnevent_rename_dest(tvp, ndvp, nnm, ct);
		}

		/*
		 * if the source and destination directory are not the
		 * same notify the destination directory.
		 */
		if (VTOR4(odvp) != VTOR4(ndvp)) {
			trp = VTOR4(ndvp);
			tvp = ndvp;
			if (IS_SHADOW(ndvp, trp))
				tvp = RTOV4(trp);
			vnevent_rename_dest_dir(tvp, ct);
		}

		trp = VTOR4(ovp);
		tvp = ovp;
		if (IS_SHADOW(ovp, trp))
			tvp = RTOV4(trp);
		vnevent_rename_src(tvp, odvp, onm, ct);
	}

	if (nvp) {
		VN_RELE(nvp);
	}
	VN_RELE(ovp);

	nfs_rw_exit(&odrp->r_rwlock);
	nfs_rw_exit(&ndrp->r_rwlock);

	return (error);
}

/*
 * When the parent directory has changed, sv_dfh must be updated
 *
 * Points vp's shadow vnode at ndvp's filehandle: a hold is taken on
 * the new directory fh before the hold on the old one is dropped.
 */
static void
update_parentdir_sfh(vnode_t *vp, vnode_t *ndvp)
{
	svnode_t *sv = VTOSV(vp);
	nfs4_sharedfh_t *old_dfh = sv->sv_dfh;
	nfs4_sharedfh_t *new_dfh = VTOR4(ndvp)->r_fh;

	sfh4_hold(new_dfh);
	sv->sv_dfh = new_dfh;
	sfh4_rele(&old_dfh);
}

/*
 * nfs4rename_persistent does the otw portion of renaming in NFS Version 4,
 * when it is known that the filehandle is persistent
through rename. 8059 * 8060 * Rename requires that the current fh be the target directory and the 8061 * saved fh be the source directory. After the operation, the current fh 8062 * is unchanged. 8063 * The compound op structure for persistent fh rename is: 8064 * PUTFH(sourcdir), SAVEFH, PUTFH(targetdir), RENAME 8065 * Rather than bother with the directory postop args, we'll simply 8066 * update that a change occurred in the cache, so no post-op getattrs. 8067 */ 8068 static int 8069 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp, 8070 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 8071 { 8072 COMPOUND4args_clnt args; 8073 COMPOUND4res_clnt res, *resp = NULL; 8074 nfs_argop4 *argop; 8075 nfs_resop4 *resop; 8076 int doqueue, argoplist_size; 8077 mntinfo4_t *mi; 8078 rnode4_t *odrp = VTOR4(odvp); 8079 rnode4_t *ndrp = VTOR4(ndvp); 8080 RENAME4res *rn_res; 8081 bool_t needrecov; 8082 nfs4_recov_state_t recov_state; 8083 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8084 dirattr_info_t dinfo, *dinfop; 8085 8086 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 8087 8088 recov_state.rs_flags = 0; 8089 recov_state.rs_num_retry_despite_err = 0; 8090 8091 /* 8092 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir 8093 * 8094 * If source/target are different dirs, then append putfh(src); getattr 8095 */ 8096 args.array_len = (odvp == ndvp) ? 
5 : 7; 8097 argoplist_size = args.array_len * sizeof (nfs_argop4); 8098 args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP); 8099 8100 recov_retry: 8101 *statp = NFS4_OK; 8102 8103 /* No need to Lookup the file, persistent fh */ 8104 args.ctag = TAG_RENAME; 8105 8106 mi = VTOMI4(odvp); 8107 e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state); 8108 if (e.error) { 8109 kmem_free(argop, argoplist_size); 8110 return (e.error); 8111 } 8112 8113 /* 0: putfh source directory */ 8114 argop[0].argop = OP_CPUTFH; 8115 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8116 8117 /* 1: Save source fh to free up current for target */ 8118 argop[1].argop = OP_SAVEFH; 8119 8120 /* 2: putfh targetdir */ 8121 argop[2].argop = OP_CPUTFH; 8122 argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8123 8124 /* 3: current_fh is targetdir, saved_fh is sourcedir */ 8125 argop[3].argop = OP_CRENAME; 8126 argop[3].nfs_argop4_u.opcrename.coldname = onm; 8127 argop[3].nfs_argop4_u.opcrename.cnewname = nnm; 8128 8129 /* 4: getattr (targetdir) */ 8130 argop[4].argop = OP_GETATTR; 8131 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8132 argop[4].nfs_argop4_u.opgetattr.mi = mi; 8133 8134 if (ndvp != odvp) { 8135 8136 /* 5: putfh (sourcedir) */ 8137 argop[5].argop = OP_CPUTFH; 8138 argop[5].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8139 8140 /* 6: getattr (sourcedir) */ 8141 argop[6].argop = OP_GETATTR; 8142 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8143 argop[6].nfs_argop4_u.opgetattr.mi = mi; 8144 } 8145 8146 dnlc_remove(odvp, onm); 8147 dnlc_remove(ndvp, nnm); 8148 8149 doqueue = 1; 8150 dinfo.di_time_call = gethrtime(); 8151 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8152 8153 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8154 if (e.error) { 8155 PURGE_ATTRCACHE4(odvp); 8156 PURGE_ATTRCACHE4(ndvp); 8157 } else { 8158 *statp = res.status; 8159 } 8160 8161 if (needrecov) { 8162 if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 8163 
OP_RENAME, NULL) == FALSE) { 8164 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 8165 if (!e.error) 8166 (void) xdr_free(xdr_COMPOUND4res_clnt, 8167 (caddr_t)&res); 8168 goto recov_retry; 8169 } 8170 } 8171 8172 if (!e.error) { 8173 resp = &res; 8174 /* 8175 * as long as OP_RENAME 8176 */ 8177 if (res.status != NFS4_OK && res.array_len <= 4) { 8178 e.error = geterrno4(res.status); 8179 PURGE_ATTRCACHE4(odvp); 8180 PURGE_ATTRCACHE4(ndvp); 8181 /* 8182 * System V defines rename to return EEXIST, not 8183 * ENOTEMPTY if the target directory is not empty. 8184 * Over the wire, the error is NFSERR_ENOTEMPTY 8185 * which geterrno4 maps to ENOTEMPTY. 8186 */ 8187 if (e.error == ENOTEMPTY) 8188 e.error = EEXIST; 8189 } else { 8190 8191 resop = &res.array[3]; /* rename res */ 8192 rn_res = &resop->nfs_resop4_u.oprename; 8193 8194 if (res.status == NFS4_OK) { 8195 /* 8196 * Update target attribute, readdir and dnlc 8197 * caches. 8198 */ 8199 dinfo.di_garp = 8200 &res.array[4].nfs_resop4_u.opgetattr.ga_res; 8201 dinfo.di_cred = cr; 8202 dinfop = &dinfo; 8203 } else 8204 dinfop = NULL; 8205 8206 nfs4_update_dircaches(&rn_res->target_cinfo, 8207 ndvp, NULL, NULL, dinfop); 8208 8209 /* 8210 * Update source attribute, readdir and dnlc caches 8211 * 8212 */ 8213 if (ndvp != odvp) { 8214 update_parentdir_sfh(renvp, ndvp); 8215 8216 if (dinfop) 8217 dinfo.di_garp = 8218 &(res.array[6].nfs_resop4_u. 8219 opgetattr.ga_res); 8220 8221 nfs4_update_dircaches(&rn_res->source_cinfo, 8222 odvp, NULL, NULL, dinfop); 8223 } 8224 8225 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, 8226 nnm); 8227 } 8228 } 8229 8230 if (resp) 8231 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8232 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 8233 kmem_free(argop, argoplist_size); 8234 8235 return (e.error); 8236 } 8237 8238 /* 8239 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when 8240 * it is possible for the filehandle to change due to the rename. 
8241 * 8242 * The compound req in this case includes a post-rename lookup and getattr 8243 * to ensure that we have the correct fh and attributes for the object. 8244 * 8245 * Rename requires that the current fh be the target directory and the 8246 * saved fh be the source directory. After the operation, the current fh 8247 * is unchanged. 8248 * 8249 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can 8250 * update the filehandle for the renamed object. We also get the old 8251 * filehandle for historical reasons; this should be taken out sometime. 8252 * This results in a rather cumbersome compound... 8253 * 8254 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old), 8255 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR 8256 * 8257 */ 8258 static int 8259 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp, 8260 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 8261 { 8262 COMPOUND4args_clnt args; 8263 COMPOUND4res_clnt res, *resp = NULL; 8264 int argoplist_size; 8265 nfs_argop4 *argop; 8266 nfs_resop4 *resop; 8267 int doqueue; 8268 mntinfo4_t *mi; 8269 rnode4_t *odrp = VTOR4(odvp); /* old directory */ 8270 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */ 8271 rnode4_t *orp = VTOR4(ovp); /* object being renamed */ 8272 RENAME4res *rn_res; 8273 GETFH4res *ngf_res; 8274 bool_t needrecov; 8275 nfs4_recov_state_t recov_state; 8276 hrtime_t t; 8277 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8278 dirattr_info_t dinfo, *dinfop = &dinfo; 8279 8280 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 8281 8282 recov_state.rs_flags = 0; 8283 recov_state.rs_num_retry_despite_err = 0; 8284 8285 recov_retry: 8286 *statp = NFS4_OK; 8287 8288 /* 8289 * There is a window between the RPC and updating the path and 8290 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery 8291 * code, so that it doesn't try to use the old path during that 8292 * window. 
8293 */ 8294 mutex_enter(&orp->r_statelock); 8295 while (orp->r_flags & R4RECEXPFH) { 8296 klwp_t *lwp = ttolwp(curthread); 8297 8298 if (lwp != NULL) 8299 lwp->lwp_nostop++; 8300 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) { 8301 mutex_exit(&orp->r_statelock); 8302 if (lwp != NULL) 8303 lwp->lwp_nostop--; 8304 return (EINTR); 8305 } 8306 if (lwp != NULL) 8307 lwp->lwp_nostop--; 8308 } 8309 orp->r_flags |= R4RECEXPFH; 8310 mutex_exit(&orp->r_statelock); 8311 8312 mi = VTOMI4(odvp); 8313 8314 args.ctag = TAG_RENAME_VFH; 8315 args.array_len = (odvp == ndvp) ? 10 : 12; 8316 argoplist_size = args.array_len * sizeof (nfs_argop4); 8317 argop = kmem_alloc(argoplist_size, KM_SLEEP); 8318 8319 /* 8320 * Rename ops: 8321 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old), 8322 * PUTFH(targetdir), RENAME, GETATTR(targetdir) 8323 * LOOKUP(trgt), GETFH(new), GETATTR, 8324 * 8325 * if (odvp != ndvp) 8326 * add putfh(sourcedir), getattr(sourcedir) } 8327 */ 8328 args.array = argop; 8329 8330 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8331 &recov_state, NULL); 8332 if (e.error) { 8333 kmem_free(argop, argoplist_size); 8334 mutex_enter(&orp->r_statelock); 8335 orp->r_flags &= ~R4RECEXPFH; 8336 cv_broadcast(&orp->r_cv); 8337 mutex_exit(&orp->r_statelock); 8338 return (e.error); 8339 } 8340 8341 /* 0: putfh source directory */ 8342 argop[0].argop = OP_CPUTFH; 8343 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8344 8345 /* 1: Save source fh to free up current for target */ 8346 argop[1].argop = OP_SAVEFH; 8347 8348 /* 2: Lookup pre-rename fh of renamed object */ 8349 argop[2].argop = OP_CLOOKUP; 8350 argop[2].nfs_argop4_u.opclookup.cname = onm; 8351 8352 /* 3: getfh fh of renamed object (before rename) */ 8353 argop[3].argop = OP_GETFH; 8354 8355 /* 4: putfh targetdir */ 8356 argop[4].argop = OP_CPUTFH; 8357 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8358 8359 /* 5: current_fh is targetdir, saved_fh is sourcedir */ 8360 argop[5].argop = OP_CRENAME; 8361 
argop[5].nfs_argop4_u.opcrename.coldname = onm; 8362 argop[5].nfs_argop4_u.opcrename.cnewname = nnm; 8363 8364 /* 6: getattr of target dir (post op attrs) */ 8365 argop[6].argop = OP_GETATTR; 8366 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8367 argop[6].nfs_argop4_u.opgetattr.mi = mi; 8368 8369 /* 7: Lookup post-rename fh of renamed object */ 8370 argop[7].argop = OP_CLOOKUP; 8371 argop[7].nfs_argop4_u.opclookup.cname = nnm; 8372 8373 /* 8: getfh fh of renamed object (after rename) */ 8374 argop[8].argop = OP_GETFH; 8375 8376 /* 9: getattr of renamed object */ 8377 argop[9].argop = OP_GETATTR; 8378 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8379 argop[9].nfs_argop4_u.opgetattr.mi = mi; 8380 8381 /* 8382 * If source/target dirs are different, then get new post-op 8383 * attrs for source dir also. 8384 */ 8385 if (ndvp != odvp) { 8386 /* 10: putfh (sourcedir) */ 8387 argop[10].argop = OP_CPUTFH; 8388 argop[10].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8389 8390 /* 11: getattr (sourcedir) */ 8391 argop[11].argop = OP_GETATTR; 8392 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8393 argop[11].nfs_argop4_u.opgetattr.mi = mi; 8394 } 8395 8396 dnlc_remove(odvp, onm); 8397 dnlc_remove(ndvp, nnm); 8398 8399 doqueue = 1; 8400 t = gethrtime(); 8401 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8402 8403 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8404 if (e.error) { 8405 PURGE_ATTRCACHE4(odvp); 8406 PURGE_ATTRCACHE4(ndvp); 8407 if (!needrecov) { 8408 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8409 &recov_state, needrecov); 8410 goto out; 8411 } 8412 } else { 8413 *statp = res.status; 8414 } 8415 8416 if (needrecov) { 8417 bool_t abort; 8418 8419 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 8420 OP_RENAME, NULL); 8421 if (abort == FALSE) { 8422 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8423 &recov_state, needrecov); 8424 kmem_free(argop, argoplist_size); 8425 if (!e.error) 8426 (void) 
xdr_free(xdr_COMPOUND4res_clnt, 8427 (caddr_t)&res); 8428 mutex_enter(&orp->r_statelock); 8429 orp->r_flags &= ~R4RECEXPFH; 8430 cv_broadcast(&orp->r_cv); 8431 mutex_exit(&orp->r_statelock); 8432 goto recov_retry; 8433 } else { 8434 if (e.error != 0) { 8435 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8436 &recov_state, needrecov); 8437 goto out; 8438 } 8439 /* fall through for res.status case */ 8440 } 8441 } 8442 8443 resp = &res; 8444 /* 8445 * If OP_RENAME (or any prev op) failed, then return an error. 8446 * OP_RENAME is index 5, so if array len <= 6 we return an error. 8447 */ 8448 if ((res.status != NFS4_OK) && (res.array_len <= 6)) { 8449 /* 8450 * Error in an op other than last Getattr 8451 */ 8452 e.error = geterrno4(res.status); 8453 PURGE_ATTRCACHE4(odvp); 8454 PURGE_ATTRCACHE4(ndvp); 8455 /* 8456 * System V defines rename to return EEXIST, not 8457 * ENOTEMPTY if the target directory is not empty. 8458 * Over the wire, the error is NFSERR_ENOTEMPTY 8459 * which geterrno4 maps to ENOTEMPTY. 8460 */ 8461 if (e.error == ENOTEMPTY) 8462 e.error = EEXIST; 8463 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, 8464 needrecov); 8465 goto out; 8466 } 8467 8468 /* rename results */ 8469 rn_res = &res.array[5].nfs_resop4_u.oprename; 8470 8471 if (res.status == NFS4_OK) { 8472 /* Update target attribute, readdir and dnlc caches */ 8473 dinfo.di_garp = 8474 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 8475 dinfo.di_cred = cr; 8476 dinfo.di_time_call = t; 8477 } else 8478 dinfop = NULL; 8479 8480 /* Update source cache attribute, readdir and dnlc caches */ 8481 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop); 8482 8483 /* Update source cache attribute, readdir and dnlc caches */ 8484 if (ndvp != odvp) { 8485 update_parentdir_sfh(ovp, ndvp); 8486 8487 /* 8488 * If dinfop is non-NULL, then compound succeded, so 8489 * set di_garp to attrs for source dir. dinfop is only 8490 * set to NULL when compound fails. 
8491 */ 8492 if (dinfop) 8493 dinfo.di_garp = 8494 &res.array[11].nfs_resop4_u.opgetattr.ga_res; 8495 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL, 8496 dinfop); 8497 } 8498 8499 /* 8500 * Update the rnode with the new component name and args, 8501 * and if the file handle changed, also update it with the new fh. 8502 * This is only necessary if the target object has an rnode 8503 * entry and there is no need to create one for it. 8504 */ 8505 resop = &res.array[8]; /* getfh new res */ 8506 ngf_res = &resop->nfs_resop4_u.opgetfh; 8507 8508 /* 8509 * Update the path and filehandle for the renamed object. 8510 */ 8511 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm); 8512 8513 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov); 8514 8515 if (res.status == NFS4_OK) { 8516 resop++; /* getattr res */ 8517 e.error = nfs4_update_attrcache(res.status, 8518 &resop->nfs_resop4_u.opgetattr.ga_res, 8519 t, ovp, cr); 8520 } 8521 8522 out: 8523 kmem_free(argop, argoplist_size); 8524 if (resp) 8525 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8526 mutex_enter(&orp->r_statelock); 8527 orp->r_flags &= ~R4RECEXPFH; 8528 cv_broadcast(&orp->r_cv); 8529 mutex_exit(&orp->r_statelock); 8530 8531 return (e.error); 8532 } 8533 8534 /* ARGSUSED */ 8535 static int 8536 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr, 8537 caller_context_t *ct, int flags, vsecattr_t *vsecp) 8538 { 8539 int error; 8540 vnode_t *vp; 8541 8542 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8543 return (EPERM); 8544 /* 8545 * As ".." has special meaning and rather than send a mkdir 8546 * over the wire to just let the server freak out, we just 8547 * short circuit it here and return EEXIST 8548 */ 8549 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8550 return (EEXIST); 8551 8552 /* 8553 * Decision to get the right gid and setgid bit of the 8554 * new directory is now made in call_nfs4_create_req. 
8555 */ 8556 va->va_mask |= AT_MODE; 8557 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR); 8558 if (error) 8559 return (error); 8560 8561 *vpp = vp; 8562 return (0); 8563 } 8564 8565 8566 /* 8567 * rmdir is using the same remove v4 op as does remove. 8568 * Remove requires that the current fh be the target directory. 8569 * After the operation, the current fh is unchanged. 8570 * The compound op structure is: 8571 * PUTFH(targetdir), REMOVE 8572 */ 8573 /*ARGSUSED4*/ 8574 static int 8575 nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, 8576 caller_context_t *ct, int flags) 8577 { 8578 int need_end_op = FALSE; 8579 COMPOUND4args_clnt args; 8580 COMPOUND4res_clnt res, *resp = NULL; 8581 REMOVE4res *rm_res; 8582 nfs_argop4 argop[3]; 8583 nfs_resop4 *resop; 8584 vnode_t *vp; 8585 int doqueue; 8586 mntinfo4_t *mi; 8587 rnode4_t *drp; 8588 bool_t needrecov = FALSE; 8589 nfs4_recov_state_t recov_state; 8590 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8591 dirattr_info_t dinfo, *dinfop; 8592 8593 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8594 return (EPERM); 8595 /* 8596 * As ".." has special meaning and rather than send a rmdir 8597 * over the wire to just let the server freak out, we just 8598 * short circuit it here and return EEXIST 8599 */ 8600 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8601 return (EEXIST); 8602 8603 drp = VTOR4(dvp); 8604 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 8605 return (EINTR); 8606 8607 /* 8608 * Attempt to prevent a rmdir(".") from succeeding. 8609 */ 8610 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 8611 if (e.error) { 8612 nfs_rw_exit(&drp->r_rwlock); 8613 return (e.error); 8614 } 8615 if (vp == cdir) { 8616 VN_RELE(vp); 8617 nfs_rw_exit(&drp->r_rwlock); 8618 return (EINVAL); 8619 } 8620 8621 /* 8622 * Since nfsv4 remove op works on both files and directories, 8623 * check that the removed object is indeed a directory. 
8624 */ 8625 if (vp->v_type != VDIR) { 8626 VN_RELE(vp); 8627 nfs_rw_exit(&drp->r_rwlock); 8628 return (ENOTDIR); 8629 } 8630 8631 /* 8632 * First just remove the entry from the name cache, as it 8633 * is most likely an entry for this vp. 8634 */ 8635 dnlc_remove(dvp, nm); 8636 8637 /* 8638 * If there vnode reference count is greater than one, then 8639 * there may be additional references in the DNLC which will 8640 * need to be purged. First, trying removing the entry for 8641 * the parent directory and see if that removes the additional 8642 * reference(s). If that doesn't do it, then use dnlc_purge_vp 8643 * to completely remove any references to the directory which 8644 * might still exist in the DNLC. 8645 */ 8646 if (vp->v_count > 1) { 8647 dnlc_remove(vp, ".."); 8648 if (vp->v_count > 1) 8649 dnlc_purge_vp(vp); 8650 } 8651 8652 mi = VTOMI4(dvp); 8653 recov_state.rs_flags = 0; 8654 recov_state.rs_num_retry_despite_err = 0; 8655 8656 recov_retry: 8657 args.ctag = TAG_RMDIR; 8658 8659 /* 8660 * Rmdir ops: putfh dir; remove 8661 */ 8662 args.array_len = 3; 8663 args.array = argop; 8664 8665 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 8666 if (e.error) { 8667 nfs_rw_exit(&drp->r_rwlock); 8668 return (e.error); 8669 } 8670 need_end_op = TRUE; 8671 8672 /* putfh directory */ 8673 argop[0].argop = OP_CPUTFH; 8674 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 8675 8676 /* remove */ 8677 argop[1].argop = OP_CREMOVE; 8678 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 8679 8680 /* getattr (postop attrs for dir that contained removed dir) */ 8681 argop[2].argop = OP_GETATTR; 8682 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8683 argop[2].nfs_argop4_u.opgetattr.mi = mi; 8684 8685 dinfo.di_time_call = gethrtime(); 8686 doqueue = 1; 8687 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8688 8689 PURGE_ATTRCACHE4(vp); 8690 8691 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8692 if (e.error) { 8693 PURGE_ATTRCACHE4(dvp); 
8694 } 8695 8696 if (needrecov) { 8697 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL, 8698 NULL, OP_REMOVE, NULL) == FALSE) { 8699 if (!e.error) 8700 (void) xdr_free(xdr_COMPOUND4res_clnt, 8701 (caddr_t)&res); 8702 8703 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 8704 needrecov); 8705 need_end_op = FALSE; 8706 goto recov_retry; 8707 } 8708 } 8709 8710 if (!e.error) { 8711 resp = &res; 8712 8713 /* 8714 * Only return error if first 2 ops (OP_REMOVE or earlier) 8715 * failed. 8716 */ 8717 if (res.status != NFS4_OK && res.array_len <= 2) { 8718 e.error = geterrno4(res.status); 8719 PURGE_ATTRCACHE4(dvp); 8720 nfs4_end_op(VTOMI4(dvp), dvp, NULL, 8721 &recov_state, needrecov); 8722 need_end_op = FALSE; 8723 nfs4_purge_stale_fh(e.error, dvp, cr); 8724 /* 8725 * System V defines rmdir to return EEXIST, not 8726 * ENOTEMPTY if the directory is not empty. Over 8727 * the wire, the error is NFSERR_ENOTEMPTY which 8728 * geterrno4 maps to ENOTEMPTY. 8729 */ 8730 if (e.error == ENOTEMPTY) 8731 e.error = EEXIST; 8732 } else { 8733 resop = &res.array[1]; /* remove res */ 8734 rm_res = &resop->nfs_resop4_u.opremove; 8735 8736 if (res.status == NFS4_OK) { 8737 resop = &res.array[2]; /* dir attrs */ 8738 dinfo.di_garp = 8739 &resop->nfs_resop4_u.opgetattr.ga_res; 8740 dinfo.di_cred = cr; 8741 dinfop = &dinfo; 8742 } else 8743 dinfop = NULL; 8744 8745 /* Update dir attribute, readdir and dnlc caches */ 8746 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 8747 dinfop); 8748 8749 /* destroy rddir cache for dir that was removed */ 8750 if (VTOR4(vp)->r_dir != NULL) 8751 nfs4_purge_rddir_cache(vp); 8752 } 8753 } 8754 8755 if (need_end_op) 8756 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 8757 8758 nfs_rw_exit(&drp->r_rwlock); 8759 8760 if (resp) 8761 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8762 8763 if (e.error == 0) { 8764 vnode_t *tvp; 8765 rnode4_t *trp; 8766 trp = VTOR4(vp); 8767 tvp = vp; 8768 if (IS_SHADOW(vp, trp)) 8769 tvp 
= RTOV4(trp); 8770 vnevent_rmdir(tvp, dvp, nm, ct); 8771 } 8772 8773 VN_RELE(vp); 8774 8775 return (e.error); 8776 } 8777 8778 /* ARGSUSED */ 8779 static int 8780 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr, 8781 caller_context_t *ct, int flags) 8782 { 8783 int error; 8784 vnode_t *vp; 8785 rnode4_t *rp; 8786 char *contents; 8787 mntinfo4_t *mi = VTOMI4(dvp); 8788 8789 if (nfs_zone() != mi->mi_zone) 8790 return (EPERM); 8791 if (!(mi->mi_flags & MI4_SYMLINK)) 8792 return (EOPNOTSUPP); 8793 8794 error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK); 8795 if (error) 8796 return (error); 8797 8798 ASSERT(nfs4_consistent_type(vp)); 8799 rp = VTOR4(vp); 8800 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 8801 8802 contents = kmem_alloc(MAXPATHLEN, KM_SLEEP); 8803 8804 if (contents != NULL) { 8805 mutex_enter(&rp->r_statelock); 8806 if (rp->r_symlink.contents == NULL) { 8807 rp->r_symlink.len = strlen(tnm); 8808 bcopy(tnm, contents, rp->r_symlink.len); 8809 rp->r_symlink.contents = contents; 8810 rp->r_symlink.size = MAXPATHLEN; 8811 mutex_exit(&rp->r_statelock); 8812 } else { 8813 mutex_exit(&rp->r_statelock); 8814 kmem_free((void *)contents, MAXPATHLEN); 8815 } 8816 } 8817 } 8818 VN_RELE(vp); 8819 8820 return (error); 8821 } 8822 8823 8824 /* 8825 * Read directory entries. 8826 * There are some weird things to look out for here. The uio_loffset 8827 * field is either 0 or it is the offset returned from a previous 8828 * readdir. It is an opaque value used by the server to find the 8829 * correct directory block to read. The count field is the number 8830 * of blocks to read on the server. This is advisory only, the server 8831 * may return only one block's worth of entries. Entries may be compressed 8832 * on the server. 
/*
 * Read directory entries.
 * There are some weird things to look out for here.  The uio_loffset
 * field is either 0 or it is the offset returned from a previous
 * readdir.  It is an opaque value used by the server to find the
 * correct directory block to read.  The count field is the number
 * of blocks to read on the server.  This is advisory only, the server
 * may return only one block's worth of entries.  Entries may be compressed
 * on the server.
 *
 * Entries are served through the per-rnode readdir cache: an entry is
 * looked up by (cookie, count), filled synchronously via nfs4readdir() if
 * it is marked RDDIRREQ, copied out with uiomove(), and then, if the
 * directory is flagged R4LOOKUP and EOF was not reached, an asynchronous
 * readahead of the next entry is queued.
 *
 * The caller must hold rp->r_rwlock as reader (asserted below).
 *
 * Returns 0 on success or an errno value: EIO if called from a zone other
 * than the mount's, EINTR if the cache lookup yields no entry, the error
 * from nfs4_validate_caches(), the error recorded in the cache entry, or
 * the error from uiomove().  *eofp (if non-NULL) is set on success.
 * ct and flags are unused.
 */
/* ARGSUSED */
static int
nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	int error;
	uint_t count;
	rnode4_t *rp;
	rddir4_cache *rdc;
	rddir4_cache *rrdc;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	rp = VTOR4(vp);

	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	/*
	 * Make sure that the directory cache is valid.
	 */
	if (rp->r_dir != NULL) {
		if (nfs_disable_rddir_cache != 0) {
			/*
			 * Setting nfs_disable_rddir_cache in /etc/system
			 * allows interoperability with servers that do not
			 * properly update the attributes of directories.
			 * Any cached information gets purged before an
			 * access is made to it.
			 */
			nfs4_purge_rddir_cache(vp);
		}

		error = nfs4_validate_caches(vp, cr);
		if (error)
			return (error);
	}

	/* Only the first iovec is considered, capped at MAXBSIZE. */
	count = MIN(uiop->uio_iov->iov_len, MAXBSIZE);

	/*
	 * Short circuit last readdir which always returns 0 bytes.
	 * This can be done after the directory has been read through
	 * completely at least once.  This will set r_direof which
	 * can be used to find the value of the last cookie.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_direof != NULL &&
	    uiop->uio_loffset == rp->r_direof->nfs4_ncookie) {
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs4_readdir_cache_shorts++;
#endif
		if (eofp)
			*eofp = 1;
		return (0);
	}

	/*
	 * Look for a cache entry.  Cache entries are identified
	 * by the NFS cookie value and the byte count requested.
	 * r_statelock is held across the lookup.
	 */
	rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count);

	/*
	 * If rdc is NULL then the lookup resulted in an unrecoverable error.
	 */
	if (rdc == NULL) {
		mutex_exit(&rp->r_statelock);
		return (EINTR);
	}

	/*
	 * Check to see if we need to fill this entry in.
	 */
	if (rdc->flags & RDDIRREQ) {
		/* Mark the entry as being filled (RDDIR) by this thread. */
		rdc->flags &= ~RDDIRREQ;
		rdc->flags |= RDDIR;
		mutex_exit(&rp->r_statelock);

		/*
		 * Do the readdir.  r_statelock is dropped for the
		 * over-the-wire call.
		 */
		nfs4readdir(vp, rdc, cr);

		/*
		 * Reacquire the lock, so that we can continue
		 */
		mutex_enter(&rp->r_statelock);
		/*
		 * The entry is now complete
		 */
		rdc->flags &= ~RDDIR;
	}

	ASSERT(!(rdc->flags & RDDIR));

	/*
	 * If an error occurred while attempting
	 * to fill the cache entry, mark the entry invalid and
	 * just return the error.
	 */
	if (rdc->error) {
		error = rdc->error;
		rdc->flags |= RDDIRREQ;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * The cache entry is complete and good,
	 * copyout the dirent structs to the calling
	 * thread.  Note that r_statelock is still held here.
	 */
	error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop);

	/*
	 * If no error occurred during the copyout,
	 * update the offset in the uio struct to
	 * contain the value of the next NFS 4 cookie
	 * and set the eof value appropriately.
	 */
	if (!error) {
		uiop->uio_loffset = rdc->nfs4_ncookie;
		if (eofp)
			*eofp = rdc->eof;
	}

	/*
	 * Decide whether to do readahead.  Don't if we
	 * have already read to the end of directory.
	 */
	if (rdc->eof) {
		/*
		 * Make the entry the direof only if it is cached
		 */
		if (rdc->flags & RDDIRCACHED)
			rp->r_direof = rdc;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/* Determine if a readdir readahead should be done */
	if (!(rp->r_flags & R4LOOKUP)) {
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * Now look for a readahead entry.
	 *
	 * Check to see whether we found an entry for the readahead.
	 * If so, we don't need to do anything further, so free the new
	 * entry if one was allocated.  Otherwise, allocate a new entry, add
	 * it to the cache, and then initiate an asynchronous readdir
	 * operation to fill it.
	 */
	rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count);

	/*
	 * A readdir cache entry could not be obtained for the readahead.  In
	 * this case we skip the readahead and return.
	 */
	if (rrdc == NULL) {
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * Check to see if we need to fill this entry in.
	 */
	if (rrdc->flags & RDDIRREQ) {
		rrdc->flags &= ~RDDIRREQ;
		rrdc->flags |= RDDIR;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs4_readdir_readahead++;
#endif
		/*
		 * Do the readdir.  The hold on rrdc is deliberately not
		 * dropped here; it is handed to the async request together
		 * with do_nfs4readdir as the fill routine.
		 */
		nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir);
		return (error);
	}

	/* Readahead entry already filled or in progress: drop both holds. */
	rddir4_cache_rele(rp, rrdc);
	rddir4_cache_rele(rp, rdc);
	mutex_exit(&rp->r_statelock);
	return (error);
}
9041 */ 9042 nfs4readdir(vp, rdc, cr); 9043 9044 mutex_enter(&rp->r_statelock); 9045 /* 9046 * The entry is now complete 9047 */ 9048 rdc->flags &= ~RDDIR; 9049 9050 error = rdc->error; 9051 if (error) 9052 rdc->flags |= RDDIRREQ; 9053 rddir4_cache_rele(rp, rdc); 9054 mutex_exit(&rp->r_statelock); 9055 9056 return (error); 9057 } 9058 9059 /* 9060 * Read directory entries. 9061 * There are some weird things to look out for here. The uio_loffset 9062 * field is either 0 or it is the offset returned from a previous 9063 * readdir. It is an opaque value used by the server to find the 9064 * correct directory block to read. The count field is the number 9065 * of blocks to read on the server. This is advisory only, the server 9066 * may return only one block's worth of entries. Entries may be compressed 9067 * on the server. 9068 * 9069 * Generates the following compound request: 9070 * 1. If readdir offset is zero and no dnlc entry for parent exists, 9071 * must include a Lookupp as well. In this case, send: 9072 * { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr } 9073 * 2. Otherwise just do: { Putfh <fh>; Readdir } 9074 * 9075 * Get complete attributes and filehandles for entries if this is the 9076 * first read of the directory. Otherwise, just get fileid's. 
/*
 * Read directory entries.
 * There are some weird things to look out for here.  The uio_loffset
 * field is either 0 or it is the offset returned from a previous
 * readdir.  It is an opaque value used by the server to find the
 * correct directory block to read.  The count field is the number
 * of blocks to read on the server.  This is advisory only, the server
 * may return only one block's worth of entries.  Entries may be compressed
 * on the server.
 *
 * Generates the following compound request:
 * 1. If readdir offset is zero and no dnlc entry for parent exists,
 *    must include a Lookupp as well. In this case, send:
 *    { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr }
 * 2. Otherwise just do: { Putfh <fh>; Readdir }
 *
 * Get complete attributes and filehandles for entries if this is the
 * first read of the directory. Otherwise, just get fileid's.
 *
 * The function returns nothing; failures are reported to the caller by
 * storing an errno value in rdc->error.  The caller must have set RDDIR
 * in rdc->flags and rdc->entries must be NULL on entry (both asserted).
 * NOTE(review): the successful results are presumably stored into rdc by
 * the READDIR decode path, which is handed rdc through rargs->rdc; that
 * code is not visible here.
 */
static void
nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	READDIR4args *rargs;
	READDIR4res_clnt *rd_res;
	bitmap4 rd_bitsval;
	nfs_argop4 argop[5];
	nfs_resop4 *resop;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);
	int doqueue;
	u_longlong_t nodeid, pnodeid;	/* id's of dir and its parents */
	vnode_t *dvp;
	nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie;
	int num_ops, res_opcnt;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == mi->mi_zone);
	ASSERT(rdc->flags & RDDIR);
	ASSERT(rdc->entries == NULL);

	/*
	 * If rp were a stub, it should have triggered and caused
	 * a mount for us to get this far.
	 */
	ASSERT(!RP_ISSTUB(rp));

	/* Default compound: PUTFH + READDIR. */
	num_ops = 2;
	if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) {
		/*
		 * Since nfsv4 readdir may not return entries for "." and "..",
		 * the client must recreate them:
		 * To find the correct nodeid, do the following:
		 * For current node, get nodeid from dnlc.
		 * - if current node is rootvp, set pnodeid to nodeid.
		 * - else if parent is in the dnlc, get its nodeid from there.
		 * - else add LOOKUPP+GETATTR to compound.
		 *
		 * nodeid and pnodeid are only assigned inside this branch
		 * and are only read later under the same cookie tests.
		 */
		nodeid = rp->r_attr.va_nodeid;
		if (vp->v_flag & VROOT) {
			pnodeid = nodeid;	/* root of mount point */
		} else {
			dvp = dnlc_lookup(vp, "..");
			if (dvp != NULL && dvp != DNLC_NO_VNODE) {
				/* parent in dnlc cache - no need for otw */
				pnodeid = VTOR4(dvp)->r_attr.va_nodeid;
			} else {
				/*
				 * parent not in dnlc cache,
				 * do lookupp to get its id
				 */
				num_ops = 5;
				pnodeid = 0;	/* set later by getattr parent */
			}
			/* Any non-NULL dnlc_lookup() result is released. */
			if (dvp)
				VN_RELE(dvp);
		}
	}
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/* Save the original mount point security flavor */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	args.ctag = TAG_READDIR;

	args.array = argop;
	args.array_len = num_ops;

	if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
	    &recov_state, NULL)) {
		/*
		 * If readdir a node that is a stub for a crossed mount point,
		 * keep the original secinfo flavor for the current file
		 * system, not the crossed one.
		 *
		 * start_fop failed, so there is no matching end_fop here.
		 */
		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
		rdc->error = e.error;
		return;
	}

	/*
	 * Determine which attrs to request for dirents.  This code
	 * must be protected by nfs4_start/end_fop because of r_server
	 * (which will change during failover recovery).
	 *
	 */
	if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) {
		/*
		 * Get all vattr attrs plus filehandle and rdattr_error
		 */
		rd_bitsval = NFS4_VATTR_MASK |
		    FATTR4_RDATTR_ERROR_MASK |
		    FATTR4_FILEHANDLE_MASK;

		/* R4READDIRWATTR is one-shot: clear it once acted upon. */
		if (rp->r_flags & R4READDIRWATTR) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags &= ~R4READDIRWATTR;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		servinfo4_t *svp = rp->r_server;

		/*
		 * Already read directory. Use readdir with
		 * no attrs (except for mounted_on_fileid) for updates.
		 */
		rd_bitsval = FATTR4_RDATTR_ERROR_MASK;

		/*
		 * request mounted on fileid if supported, else request
		 * fileid.  maybe we should verify that fileid is supported
		 * and request something else if not.
		 */
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK)
			rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK;
		nfs_rw_exit(&svp->sv_lock);
	}

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	argop[1].argop = OP_READDIR;
	rargs = &argop[1].nfs_argop4_u.opreaddir;
	/*
	 * 1 and 2 are reserved for client "." and ".." entry offset.
	 * cookie 0 should be used over-the-wire to start reading at
	 * the beginning of the directory excluding "." and "..".
	 */
	if (rdc->nfs4_cookie == 0 ||
	    rdc->nfs4_cookie == 1 ||
	    rdc->nfs4_cookie == 2) {
		rargs->cookie = (nfs_cookie4)0;
		rargs->cookieverf = 0;
	} else {
		rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie;
		mutex_enter(&rp->r_statelock);
		rargs->cookieverf = rp->r_cookieverf4;
		mutex_exit(&rp->r_statelock);
	}
	rargs->dircount = MIN(rdc->buflen, mi->mi_tsize);
	rargs->maxcount = mi->mi_tsize;
	rargs->attr_request = rd_bitsval;
	rargs->rdc = rdc;
	rargs->dvp = vp;
	rargs->mi = mi;
	rargs->cr = cr;


	/*
	 * If count < than the minimum required, we return no entries
	 * and fail with EINVAL
	 */
	if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) {
		rdc->error = EINVAL;
		goto out;
	}

	if (args.array_len == 5) {
		/*
		 * Add lookupp and getattr for parent nodeid.
		 */
		argop[2].argop = OP_LOOKUPP;

		argop[3].argop = OP_GETFH;

		/* getattr parent */
		argop[4].argop = OP_GETATTR;
		argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[4].nfs_argop4_u.opgetattr.mi = mi;
	}

	doqueue = 1;

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	/* capture the time of this call */
	rargs->t = t = gethrtime();

	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);

	/*
	 * If RPC error occurred and it isn't an error that
	 * triggers recovery, then go ahead and fail now.
	 */
	if (e.error != 0 && !needrecov) {
		rdc->error = e.error;
		goto out;
	}

	if (needrecov) {
		bool_t abort;

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4readdir: initiating recovery.\n"));

		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_READDIR, NULL);
		if (abort == FALSE) {
			/*
			 * Recovery was started: undo this attempt (end the
			 * fop, free any decoded results and entries) and
			 * retry the whole compound.
			 */
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			if (rdc->entries != NULL) {
				kmem_free(rdc->entries, rdc->entlen);
				rdc->entries = NULL;
			}
			goto recov_retry;
		}

		if (e.error != 0) {
			rdc->error = e.error;
			goto out;
		}

		/* fall through for res.status case */
	}

	res_opcnt = res.array_len;

	/*
	 * If compound failed first 2 ops (PUTFH+READDIR), then return
	 * failure here.  Subsequent ops are for filling out dot-dot
	 * dirent, and if they fail, we still want to give the caller
	 * the dirents returned by (the successful) READDIR op, so we need
	 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR).
	 *
	 * One example where PUTFH+READDIR ops would succeed but
	 * LOOKUPP+GETATTR would fail would be a dir that has r perm
	 * but lacks x.  In this case, a POSIX server's VOP_READDIR
	 * would succeed; however, VOP_LOOKUP(..) would fail since no
	 * x perm.  We need to come up with a non-vendor-specific way
	 * for a POSIX server to return d_ino from dotdot's dirent if
	 * client only requests mounted_on_fileid, and just say the
	 * LOOKUPP succeeded and fill out the GETATTR.  However, if
	 * client requested any mandatory attrs, server would be required
	 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR
	 * for dotdot.
	 */

	if (res.status) {
		if (res_opcnt <= 2) {
			/*
			 * This path does its own end_fop and secinfo
			 * check and returns directly instead of using
			 * the common "out" exit.
			 */
			e.error = geterrno4(res.status);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
			    &recov_state, needrecov);
			nfs4_purge_stale_fh(e.error, vp, cr);
			rdc->error = e.error;
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			if (rdc->entries != NULL) {
				kmem_free(rdc->entries, rdc->entlen);
				rdc->entries = NULL;
			}
			/*
			 * If readdir a node that is a stub for a
			 * crossed mount point, keep the original
			 * secinfo flavor for the current file system,
			 * not the crossed one.
			 */
			(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
			return;
		}
	}

	resop = &res.array[1];	/* readdir res */
	rd_res = &resop->nfs_resop4_u.opreaddirclnt;

	/* Remember the verifier for subsequent non-zero-cookie requests. */
	mutex_enter(&rp->r_statelock);
	rp->r_cookieverf4 = rd_res->cookieverf;
	mutex_exit(&rp->r_statelock);

	/*
	 * For "." and ".." entries
	 * e.g.
	 *	seek(cookie=0) -> "." entry with d_off = 1
	 *	seek(cookie=1) -> ".." entry with d_off = 2
	 */
	if (cookie == (nfs_cookie4) 0) {
		if (rd_res->dotp)
			rd_res->dotp->d_ino = nodeid;
		if (rd_res->dotdotp)
			rd_res->dotdotp->d_ino = pnodeid;
	}
	if (cookie == (nfs_cookie4) 1) {
		if (rd_res->dotdotp)
			rd_res->dotdotp->d_ino = pnodeid;
	}


	/* LOOKUPP+GETATTR attempted */
	if (args.array_len == 5 && rd_res->dotdotp) {
		if (res.status == NFS4_OK && res_opcnt == 5) {
			nfs_fh4 *fhp;
			nfs4_sharedfh_t *sfhp;
			vnode_t *pvp;
			nfs4_ga_res_t *garp;

			resop++;	/* lookupp */
			resop++;	/* getfh   */
			fhp = &resop->nfs_resop4_u.opgetfh.object;

			resop++;	/* getattr of parent */

			/*
			 * First, take care of finishing the
			 * readdir results.
			 */
			garp = &resop->nfs_resop4_u.opgetattr.ga_res;
			/*
			 * The d_ino of .. must be the inode number
			 * of the mounted filesystem.
			 */
			if (garp->n4g_va.va_mask & AT_NODEID)
				rd_res->dotdotp->d_ino =
				    garp->n4g_va.va_nodeid;


			/*
			 * Next, create the ".." dnlc entry
			 */
			sfhp = sfh4_get(fhp, mi);
			if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) {
				dnlc_update(vp, "..", pvp);
				VN_RELE(pvp);
			}
			sfh4_rele(&sfhp);
		}
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
		KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen;
		mutex_exit(&mi->mi_lock);
	}

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

out:
	/*
	 * If readdir a node that is a stub for a crossed mount point,
	 * keep the original secinfo flavor for the current file system,
	 * not the crossed one.
	 */
	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);

	nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov);
}
9508 */ 9509 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 9510 &first_time, &last_time); 9511 mutex_enter(&rp->r_statelock); 9512 count = MIN(bp->b_bcount, rp->r_size - offset); 9513 mutex_exit(&rp->r_statelock); 9514 if (count < 0) 9515 cmn_err(CE_PANIC, "nfs4_bio: write count < 0"); 9516 #ifdef DEBUG 9517 if (count == 0) { 9518 zoneid_t zoneid = getzoneid(); 9519 9520 zcmn_err(zoneid, CE_WARN, 9521 "nfs4_bio: zero length write at %lld", 9522 offset); 9523 zcmn_err(zoneid, CE_CONT, "flags=0x%x, " 9524 "b_bcount=%ld, file size=%lld", 9525 rp->r_flags, (long)bp->b_bcount, 9526 rp->r_size); 9527 sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh); 9528 if (nfs4_bio_do_stop) 9529 debug_enter("nfs4_bio"); 9530 } 9531 #endif 9532 error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset, 9533 count, cred_otw, stab_comm); 9534 if (error == EACCES && last_time == FALSE) { 9535 crfree(cred_otw); 9536 goto write_again; 9537 } 9538 bp->b_error = error; 9539 if (error && error != EINTR && 9540 !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) { 9541 /* 9542 * Don't print EDQUOT errors on the console. 9543 * Don't print asynchronous EACCES errors. 9544 * Don't print EFBIG errors. 9545 * Print all other write errors. 9546 */ 9547 if (error != EDQUOT && error != EFBIG && 9548 (error != EACCES || 9549 !(bp->b_flags & B_ASYNC))) 9550 nfs4_write_error(bp->b_vp, 9551 error, cred_otw); 9552 /* 9553 * Update r_error and r_flags as appropriate. 9554 * If the error was ESTALE, then mark the 9555 * rnode as not being writeable and save 9556 * the error status. Otherwise, save any 9557 * errors which occur from asynchronous 9558 * page invalidations. Any errors occurring 9559 * from other operations should be saved 9560 * by the caller. 
9561 */ 9562 mutex_enter(&rp->r_statelock); 9563 if (error == ESTALE) { 9564 rp->r_flags |= R4STALE; 9565 if (!rp->r_error) 9566 rp->r_error = error; 9567 } else if (!rp->r_error && 9568 (bp->b_flags & 9569 (B_INVAL|B_FORCE|B_ASYNC)) == 9570 (B_INVAL|B_FORCE|B_ASYNC)) { 9571 rp->r_error = error; 9572 } 9573 mutex_exit(&rp->r_statelock); 9574 } 9575 crfree(cred_otw); 9576 } else 9577 error = rp->r_error; 9578 } 9579 9580 if (error != 0 && error != NFS_EOF) 9581 bp->b_flags |= B_ERROR; 9582 9583 if (osp) 9584 open_stream_rele(osp, rp); 9585 9586 DTRACE_IO1(done, struct buf *, bp); 9587 9588 return (error); 9589 } 9590 9591 /* ARGSUSED */ 9592 int 9593 nfs4_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 9594 { 9595 return (EREMOTE); 9596 } 9597 9598 /* ARGSUSED2 */ 9599 int 9600 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9601 { 9602 rnode4_t *rp = VTOR4(vp); 9603 9604 if (!write_lock) { 9605 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9606 return (V_WRITELOCK_FALSE); 9607 } 9608 9609 if ((rp->r_flags & R4DIRECTIO) || 9610 (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) { 9611 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9612 if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp)) 9613 return (V_WRITELOCK_FALSE); 9614 nfs_rw_exit(&rp->r_rwlock); 9615 } 9616 9617 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE); 9618 return (V_WRITELOCK_TRUE); 9619 } 9620 9621 /* ARGSUSED */ 9622 void 9623 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9624 { 9625 rnode4_t *rp = VTOR4(vp); 9626 9627 nfs_rw_exit(&rp->r_rwlock); 9628 } 9629 9630 /* ARGSUSED */ 9631 static int 9632 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) 9633 { 9634 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9635 return (EIO); 9636 9637 /* 9638 * Because we stuff the readdir cookie into the offset field 9639 * someone may attempt to do an lseek with the cookie which 9640 * we want to succeed. 
9641 */ 9642 if (vp->v_type == VDIR) 9643 return (0); 9644 if (*noffp < 0) 9645 return (EINVAL); 9646 return (0); 9647 } 9648 9649 9650 /* 9651 * Return all the pages from [off..off+len) in file 9652 */ 9653 /* ARGSUSED */ 9654 static int 9655 nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 9656 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 9657 enum seg_rw rw, cred_t *cr, caller_context_t *ct) 9658 { 9659 rnode4_t *rp; 9660 int error; 9661 mntinfo4_t *mi; 9662 9663 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9664 return (EIO); 9665 rp = VTOR4(vp); 9666 if (IS_SHADOW(vp, rp)) 9667 vp = RTOV4(rp); 9668 9669 if (vp->v_flag & VNOMAP) 9670 return (ENOSYS); 9671 9672 if (protp != NULL) 9673 *protp = PROT_ALL; 9674 9675 /* 9676 * Now validate that the caches are up to date. 9677 */ 9678 if (error = nfs4_validate_caches(vp, cr)) 9679 return (error); 9680 9681 mi = VTOMI4(vp); 9682 retry: 9683 mutex_enter(&rp->r_statelock); 9684 9685 /* 9686 * Don't create dirty pages faster than they 9687 * can be cleaned so that the system doesn't 9688 * get imbalanced. If the async queue is 9689 * maxed out, then wait for it to drain before 9690 * creating more dirty pages. Also, wait for 9691 * any threads doing pagewalks in the vop_getattr 9692 * entry points so that they don't block for 9693 * long periods. 9694 */ 9695 if (rw == S_CREATE) { 9696 while ((mi->mi_max_threads != 0 && 9697 rp->r_awcount > 2 * mi->mi_max_threads) || 9698 rp->r_gcount > 0) 9699 cv_wait(&rp->r_cv, &rp->r_statelock); 9700 } 9701 9702 /* 9703 * If we are getting called as a side effect of an nfs_write() 9704 * operation the local file size might not be extended yet. 9705 * In this case we want to be able to return pages of zeroes. 
9706 */ 9707 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) { 9708 NFS4_DEBUG(nfs4_pageio_debug, 9709 (CE_NOTE, "getpage beyond EOF: off=%lld, " 9710 "len=%llu, size=%llu, attrsize =%llu", off, 9711 (u_longlong_t)len, rp->r_size, rp->r_attr.va_size)); 9712 mutex_exit(&rp->r_statelock); 9713 return (EFAULT); /* beyond EOF */ 9714 } 9715 9716 mutex_exit(&rp->r_statelock); 9717 9718 if (len <= PAGESIZE) { 9719 error = nfs4_getapage(vp, off, len, protp, pl, plsz, 9720 seg, addr, rw, cr); 9721 NFS4_DEBUG(nfs4_pageio_debug && error, 9722 (CE_NOTE, "getpage error %d; off=%lld, " 9723 "len=%lld", error, off, (u_longlong_t)len)); 9724 } else { 9725 error = pvn_getpages(nfs4_getapage, vp, off, len, protp, 9726 pl, plsz, seg, addr, rw, cr); 9727 NFS4_DEBUG(nfs4_pageio_debug && error, 9728 (CE_NOTE, "getpages error %d; off=%lld, " 9729 "len=%lld", error, off, (u_longlong_t)len)); 9730 } 9731 9732 switch (error) { 9733 case NFS_EOF: 9734 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE); 9735 goto retry; 9736 case ESTALE: 9737 nfs4_purge_stale_fh(error, vp, cr); 9738 } 9739 9740 return (error); 9741 } 9742 9743 /* 9744 * Called from pvn_getpages or nfs4_getpage to get a particular page. 
/*
 * Called from pvn_getpages or nfs4_getpage to get a particular page.
 *
 * Looks for the page at `off' in the page cache.  On a miss it either
 * queues an asynchronous read (pl == NULL), creates a fresh page
 * (rw == S_CREATE), or klusters and reads the surrounding block through
 * nfs4_bio().  Sequential access additionally queues asynchronous
 * readaheads, tracked through rp->r_nextr.
 *
 * On success with pl != NULL the locked page(s) are returned in pl[].
 * Returns 0 or an errno value (EIO from a foreign zone, EFAULT when a
 * non-segkmap caller reads beyond EOF, or the error from nfs4_bio()).
 */
/* ARGSUSED */
static int
nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr)
{
	rnode4_t *rp;
	uint_t bsize;
	struct buf *bp;
	page_t *pp;
	u_offset_t lbn;
	u_offset_t io_off;
	u_offset_t blkoff;
	u_offset_t rablkoff;
	size_t io_len;
	uint_t blksize;
	int error;
	int readahead;
	int readahead_issued = 0;
	int ra_window; /* readahead window */
	page_t *pagefound;
	page_t *savepp;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	rp = VTOR4(vp);
	ASSERT(!IS_SHADOW(vp, rp));
	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

reread:
	/* Restart point: reset all per-attempt state. */
	bp = NULL;
	pp = NULL;
	pagefound = NULL;

	if (pl != NULL)
		pl[0] = NULL;

	error = 0;
	lbn = off / bsize;
	blkoff = lbn * bsize;

	/*
	 * Queueing up the readahead before doing the synchronous read
	 * results in a significant increase in read throughput because
	 * of the increased parallelism between the async threads and
	 * the process context.
	 */
	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
	    rw != S_CREATE &&
	    !(vp->v_flag & VNOCACHE)) {
		mutex_enter(&rp->r_statelock);

		/*
		 * Calculate the number of readaheads to do.
		 * a) No readaheads at offset = 0.
		 * b) Do maximum(nfs4_nra) readaheads when the readahead
		 *    window is closed.
		 * c) Do readaheads between 1 to (nfs4_nra - 1) depending
		 *    upon how far the readahead window is open or close.
		 * d) No readaheads if rp->r_nextr is not within the scope
		 *    of the readahead window (random i/o).
		 */

		if (off == 0)
			readahead = 0;
		else if (blkoff == rp->r_nextr)
			readahead = nfs4_nra;
		else if (rp->r_nextr > blkoff &&
		    ((ra_window = (rp->r_nextr - blkoff) / bsize)
		    <= (nfs4_nra - 1)))
			readahead = nfs4_nra - ra_window;
		else
			readahead = 0;

		rablkoff = rp->r_nextr;
		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
			/* r_statelock is dropped around each async request. */
			mutex_exit(&rp->r_statelock);
			if (nfs4_async_readahead(vp, rablkoff + bsize,
			    addr + (rablkoff + bsize - off),
			    seg, cr, nfs4_readahead) < 0) {
				/* retake the lock for the exit below */
				mutex_enter(&rp->r_statelock);
				break;
			}
			readahead--;
			rablkoff += bsize;
			/*
			 * Indicate that we did a readahead so
			 * readahead offset is not updated
			 * by the synchronous read below.
			 */
			readahead_issued = 1;
			mutex_enter(&rp->r_statelock);
			/*
			 * set readahead offset to
			 * offset of last async readahead
			 * request.
			 */
			rp->r_nextr = rablkoff;
		}
		mutex_exit(&rp->r_statelock);
	}

again:
	if ((pagefound = page_exists(vp, off)) == NULL) {
		if (pl == NULL) {
			/*
			 * Caller wants no pages back: just queue an
			 * asynchronous read of the block (result ignored).
			 */
			(void) nfs4_async_readahead(vp, blkoff, addr, seg, cr,
			    nfs4_readahead);
		} else if (rw == S_CREATE) {
			/*
			 * Block for this page is not allocated, or the offset
			 * is beyond the current allocation size, or we're
			 * allocating a swap slot and the page was not found,
			 * so allocate it and return a zero page.
			 */
			if ((pp = page_create_va(vp, off,
			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
				cmn_err(CE_PANIC, "nfs4_getapage: page_create");
			io_len = PAGESIZE;
			mutex_enter(&rp->r_statelock);
			rp->r_nextr = off + PAGESIZE;
			mutex_exit(&rp->r_statelock);
		} else {
			/*
			 * Need to go to server to get a block
			 */
			mutex_enter(&rp->r_statelock);
			if (blkoff < rp->r_size &&
			    blkoff + bsize > rp->r_size) {
				/*
				 * If less than a block left in
				 * file read less than a block.
				 */
				if (rp->r_size <= off) {
					/*
					 * Trying to access beyond EOF,
					 * set up to get at least one page.
					 */
					blksize = off + PAGESIZE - blkoff;
				} else
					blksize = rp->r_size - blkoff;
			} else if ((off == 0) ||
			    (off != rp->r_nextr && !readahead_issued)) {
				/*
				 * Start of file, or a non-sequential access
				 * with no readahead queued: read one page.
				 */
				blksize = PAGESIZE;
				blkoff = off; /* block = page here */
			} else
				blksize = bsize;
			mutex_exit(&rp->r_statelock);

			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
			    &io_len, blkoff, blksize, 0);

			/*
			 * Some other thread has entered the page,
			 * so just use it.
			 */
			if (pp == NULL)
				goto again;

			/*
			 * Now round the request size up to page boundaries.
			 * This ensures that the entire page will be
			 * initialized to zeroes if EOF is encountered.
			 */
			io_len = ptob(btopr(io_len));

			bp = pageio_setup(pp, io_len, vp, B_READ);
			ASSERT(bp != NULL);

			/*
			 * pageio_setup should have set b_addr to 0.  This
			 * is correct since we want to do I/O on a page
			 * boundary.  bp_mapin will use this addr to calculate
			 * an offset, and then set b_addr to the kernel virtual
			 * address it allocated for us.
			 */
			ASSERT(bp->b_un.b_addr == 0);

			bp->b_edev = 0;
			bp->b_dev = 0;
			bp->b_lblkno = lbtodb(io_off);
			bp->b_file = vp;
			bp->b_offset = (offset_t)off;
			bp_mapin(bp);

			/*
			 * If doing a write beyond what we believe is EOF,
			 * don't bother trying to read the pages from the
			 * server, we'll just zero the pages here.  We
			 * don't check that the rw flag is S_WRITE here
			 * because some implementations may attempt a
			 * read access to the buffer before copying data.
			 */
			mutex_enter(&rp->r_statelock);
			if (io_off >= rp->r_size && seg == segkmap) {
				mutex_exit(&rp->r_statelock);
				bzero(bp->b_un.b_addr, io_len);
			} else {
				mutex_exit(&rp->r_statelock);
				error = nfs4_bio(bp, NULL, cr, FALSE);
			}

			/*
			 * Unmap the buffer before freeing it.
			 */
			bp_mapout(bp);
			pageio_done(bp);

			/*
			 * Walk the circular page list, tagging every page;
			 * the loop leaves pp back at the list head.
			 */
			savepp = pp;
			do {
				pp->p_fsdata = C_NOCOMMIT;
			} while ((pp = pp->p_next) != savepp);

			if (error == NFS_EOF) {
				/*
				 * If doing a write system call just return
				 * zeroed pages, else user tried to get pages
				 * beyond EOF, return error.  We don't check
				 * that the rw flag is S_WRITE here because
				 * some implementations may attempt a read
				 * access to the buffer before copying data.
				 */
				if (seg == segkmap)
					error = 0;
				else
					error = EFAULT;
			}

			/*
			 * Advance the sequential-read marker only if no
			 * readahead already moved it and the read worked.
			 */
			if (!readahead_issued && !error) {
				mutex_enter(&rp->r_statelock);
				rp->r_nextr = io_off + io_len;
				mutex_exit(&rp->r_statelock);
			}
		}
	}

	/* No goto in this function targets the label below. */
out:
	if (pl == NULL)
		return (error);

	if (error) {
		/* Dispose of any pages set up for the failed read. */
		if (pp != NULL)
			pvn_read_done(pp, B_ERROR);
		return (error);
	}

	if (pagefound) {
		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);

		/*
		 * Page exists in the cache, acquire the appropriate lock.
		 * If this fails, start all over again.
		 */
		if ((pp = page_lookup(vp, off, se)) == NULL) {
#ifdef DEBUG
			nfs4_lostpage++;
#endif
			goto reread;
		}
		pl[0] = pp;
		pl[1] = NULL;
		return (0);
	}

	/* Newly created or freshly read pages: fill in the caller's list. */
	if (pp != NULL)
		pvn_plist_init(pp, pl, plsz, off, io_len, rw);

	return (error);
}
bp_mapin() will use this addr 10068 * to calculate an offset, and then set b_addr to the kernel virtual 10069 * address it allocated for us. 10070 */ 10071 ASSERT(bp->b_un.b_addr == 0); 10072 10073 bp->b_edev = 0; 10074 bp->b_dev = 0; 10075 bp->b_lblkno = lbtodb(io_off); 10076 bp->b_file = vp; 10077 bp->b_offset = (offset_t)blkoff; 10078 bp_mapin(bp); 10079 10080 /* 10081 * If doing a write beyond what we believe is EOF, don't bother trying 10082 * to read the pages from the server, we'll just zero the pages here. 10083 * We don't check that the rw flag is S_WRITE here because some 10084 * implementations may attempt a read access to the buffer before 10085 * copying data. 10086 */ 10087 mutex_enter(&rp->r_statelock); 10088 if (io_off >= rp->r_size && seg == segkmap) { 10089 mutex_exit(&rp->r_statelock); 10090 bzero(bp->b_un.b_addr, io_len); 10091 error = 0; 10092 } else { 10093 mutex_exit(&rp->r_statelock); 10094 error = nfs4_bio(bp, NULL, cr, TRUE); 10095 if (error == NFS_EOF) 10096 error = 0; 10097 } 10098 10099 /* 10100 * Unmap the buffer before freeing it. 10101 */ 10102 bp_mapout(bp); 10103 pageio_done(bp); 10104 10105 savepp = pp; 10106 do { 10107 pp->p_fsdata = C_NOCOMMIT; 10108 } while ((pp = pp->p_next) != savepp); 10109 10110 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 10111 10112 /* 10113 * In case of error set readahead offset 10114 * to the lowest offset. 10115 * pvn_read_done() calls VN_DISPOSE to destroy the pages 10116 */ 10117 if (error && rp->r_nextr > io_off) { 10118 mutex_enter(&rp->r_statelock); 10119 if (rp->r_nextr > io_off) 10120 rp->r_nextr = io_off; 10121 mutex_exit(&rp->r_statelock); 10122 } 10123 } 10124 10125 /* 10126 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 10127 * If len == 0, do from off to EOF. 10128 * 10129 * The normal cases should be len == 0 && off == 0 (entire vp list) or 10130 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 10131 * (from pageout). 
10132 */ 10133 /* ARGSUSED */ 10134 static int 10135 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, 10136 caller_context_t *ct) 10137 { 10138 int error; 10139 rnode4_t *rp; 10140 10141 ASSERT(cr != NULL); 10142 10143 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 10144 return (EIO); 10145 10146 rp = VTOR4(vp); 10147 if (IS_SHADOW(vp, rp)) 10148 vp = RTOV4(rp); 10149 10150 /* 10151 * XXX - Why should this check be made here? 10152 */ 10153 if (vp->v_flag & VNOMAP) 10154 return (ENOSYS); 10155 10156 if (len == 0 && !(flags & B_INVAL) && 10157 (vp->v_vfsp->vfs_flag & VFS_RDONLY)) 10158 return (0); 10159 10160 mutex_enter(&rp->r_statelock); 10161 rp->r_count++; 10162 mutex_exit(&rp->r_statelock); 10163 error = nfs4_putpages(vp, off, len, flags, cr); 10164 mutex_enter(&rp->r_statelock); 10165 rp->r_count--; 10166 cv_broadcast(&rp->r_cv); 10167 mutex_exit(&rp->r_statelock); 10168 10169 return (error); 10170 } 10171 10172 /* 10173 * Write out a single page, possibly klustering adjacent dirty pages. 10174 */ 10175 int 10176 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, 10177 int flags, cred_t *cr) 10178 { 10179 u_offset_t io_off; 10180 u_offset_t lbn_off; 10181 u_offset_t lbn; 10182 size_t io_len; 10183 uint_t bsize; 10184 int error; 10185 rnode4_t *rp; 10186 10187 ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY)); 10188 ASSERT(pp != NULL); 10189 ASSERT(cr != NULL); 10190 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone); 10191 10192 rp = VTOR4(vp); 10193 ASSERT(rp->r_count > 0); 10194 ASSERT(!IS_SHADOW(vp, rp)); 10195 10196 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 10197 lbn = pp->p_offset / bsize; 10198 lbn_off = lbn * bsize; 10199 10200 /* 10201 * Find a kluster that fits in one block, or in 10202 * one page if pages are bigger than blocks. If 10203 * there is less file space allocated than a whole 10204 * page, we'll shorten the i/o request below. 
10205 */ 10206 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off, 10207 roundup(bsize, PAGESIZE), flags); 10208 10209 /* 10210 * pvn_write_kluster shouldn't have returned a page with offset 10211 * behind the original page we were given. Verify that. 10212 */ 10213 ASSERT((pp->p_offset / bsize) >= lbn); 10214 10215 /* 10216 * Now pp will have the list of kept dirty pages marked for 10217 * write back. It will also handle invalidation and freeing 10218 * of pages that are not dirty. Check for page length rounding 10219 * problems. 10220 */ 10221 if (io_off + io_len > lbn_off + bsize) { 10222 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE); 10223 io_len = lbn_off + bsize - io_off; 10224 } 10225 /* 10226 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10227 * consistent value of r_size. R4MODINPROGRESS is set in writerp4(). 10228 * When R4MODINPROGRESS is set it indicates that a uiomove() is in 10229 * progress and the r_size has not been made consistent with the 10230 * new size of the file. When the uiomove() completes the r_size is 10231 * updated and the R4MODINPROGRESS flag is cleared. 10232 * 10233 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10234 * consistent value of r_size. Without this handshaking, it is 10235 * possible that nfs4_bio() picks up the old value of r_size 10236 * before the uiomove() in writerp4() completes. This will result 10237 * in the write through nfs4_bio() being dropped. 10238 * 10239 * More precisely, there is a window between the time the uiomove() 10240 * completes and the time the r_size is updated. If a VOP_PUTPAGE() 10241 * operation intervenes in this window, the page will be picked up, 10242 * because it is dirty (it will be unlocked, unless it was 10243 * pagecreate'd). When the page is picked up as dirty, the dirty 10244 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is 10245 * checked. This will still be the old size. Therefore the page will 10246 * not be written out. 
When segmap_release() calls VOP_PUTPAGE(), 10247 * the page will be found to be clean and the write will be dropped. 10248 */ 10249 if (rp->r_flags & R4MODINPROGRESS) { 10250 mutex_enter(&rp->r_statelock); 10251 if ((rp->r_flags & R4MODINPROGRESS) && 10252 rp->r_modaddr + MAXBSIZE > io_off && 10253 rp->r_modaddr < io_off + io_len) { 10254 page_t *plist; 10255 /* 10256 * A write is in progress for this region of the file. 10257 * If we did not detect R4MODINPROGRESS here then this 10258 * path through nfs_putapage() would eventually go to 10259 * nfs4_bio() and may not write out all of the data 10260 * in the pages. We end up losing data. So we decide 10261 * to set the modified bit on each page in the page 10262 * list and mark the rnode with R4DIRTY. This write 10263 * will be restarted at some later time. 10264 */ 10265 plist = pp; 10266 while (plist != NULL) { 10267 pp = plist; 10268 page_sub(&plist, pp); 10269 hat_setmod(pp); 10270 page_io_unlock(pp); 10271 page_unlock(pp); 10272 } 10273 rp->r_flags |= R4DIRTY; 10274 mutex_exit(&rp->r_statelock); 10275 if (offp) 10276 *offp = io_off; 10277 if (lenp) 10278 *lenp = io_len; 10279 return (0); 10280 } 10281 mutex_exit(&rp->r_statelock); 10282 } 10283 10284 if (flags & B_ASYNC) { 10285 error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr, 10286 nfs4_sync_putapage); 10287 } else 10288 error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr); 10289 10290 if (offp) 10291 *offp = io_off; 10292 if (lenp) 10293 *lenp = io_len; 10294 return (error); 10295 } 10296 10297 static int 10298 nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 10299 int flags, cred_t *cr) 10300 { 10301 int error; 10302 rnode4_t *rp; 10303 10304 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10305 10306 flags |= B_WRITE; 10307 10308 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 10309 10310 rp = VTOR4(vp); 10311 10312 if ((error == ENOSPC || error == EDQUOT || error == EFBIG || 10313 error == EACCES) && 
10314 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) { 10315 if (!(rp->r_flags & R4OUTOFSPACE)) { 10316 mutex_enter(&rp->r_statelock); 10317 rp->r_flags |= R4OUTOFSPACE; 10318 mutex_exit(&rp->r_statelock); 10319 } 10320 flags |= B_ERROR; 10321 pvn_write_done(pp, flags); 10322 /* 10323 * If this was not an async thread, then try again to 10324 * write out the pages, but this time, also destroy 10325 * them whether or not the write is successful. This 10326 * will prevent memory from filling up with these 10327 * pages and destroying them is the only alternative 10328 * if they can't be written out. 10329 * 10330 * Don't do this if this is an async thread because 10331 * when the pages are unlocked in pvn_write_done, 10332 * some other thread could have come along, locked 10333 * them, and queued for an async thread. It would be 10334 * possible for all of the async threads to be tied 10335 * up waiting to lock the pages again and they would 10336 * all already be locked and waiting for an async 10337 * thread to handle them. Deadlock. 
10338 */ 10339 if (!(flags & B_ASYNC)) { 10340 error = nfs4_putpage(vp, io_off, io_len, 10341 B_INVAL | B_FORCE, cr, NULL); 10342 } 10343 } else { 10344 if (error) 10345 flags |= B_ERROR; 10346 else if (rp->r_flags & R4OUTOFSPACE) { 10347 mutex_enter(&rp->r_statelock); 10348 rp->r_flags &= ~R4OUTOFSPACE; 10349 mutex_exit(&rp->r_statelock); 10350 } 10351 pvn_write_done(pp, flags); 10352 if (freemem < desfree) 10353 (void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr, 10354 NFS4_WRITE_NOWAIT); 10355 } 10356 10357 return (error); 10358 } 10359 10360 #ifdef DEBUG 10361 int nfs4_force_open_before_mmap = 0; 10362 #endif 10363 10364 /* ARGSUSED */ 10365 static int 10366 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 10367 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 10368 caller_context_t *ct) 10369 { 10370 struct segvn_crargs vn_a; 10371 int error = 0; 10372 rnode4_t *rp = VTOR4(vp); 10373 mntinfo4_t *mi = VTOMI4(vp); 10374 10375 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10376 return (EIO); 10377 10378 if (vp->v_flag & VNOMAP) 10379 return (ENOSYS); 10380 10381 if (off < 0 || (off + len) < 0) 10382 return (ENXIO); 10383 10384 if (vp->v_type != VREG) 10385 return (ENODEV); 10386 10387 /* 10388 * If the file is delegated to the client don't do anything. 10389 * If the file is not delegated, then validate the data cache. 10390 */ 10391 mutex_enter(&rp->r_statev4_lock); 10392 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) { 10393 mutex_exit(&rp->r_statev4_lock); 10394 error = nfs4_validate_caches(vp, cr); 10395 if (error) 10396 return (error); 10397 } else { 10398 mutex_exit(&rp->r_statev4_lock); 10399 } 10400 10401 /* 10402 * Check to see if the vnode is currently marked as not cachable. 10403 * This means portions of the file are locked (through VOP_FRLOCK). 10404 * In this case the map request must be refused. We use 10405 * rp->r_lkserlock to avoid a race with concurrent lock requests. 
10406 * 10407 * Atomically increment r_inmap after acquiring r_rwlock. The 10408 * idea here is to acquire r_rwlock to block read/write and 10409 * not to protect r_inmap. r_inmap will inform nfs4_read/write() 10410 * that we are in nfs4_map(). Now, r_rwlock is acquired in order 10411 * and we can prevent the deadlock that would have occurred 10412 * when nfs4_addmap() would have acquired it out of order. 10413 * 10414 * Since we are not protecting r_inmap by any lock, we do not 10415 * hold any lock when we decrement it. We atomically decrement 10416 * r_inmap after we release r_lkserlock. 10417 */ 10418 10419 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp))) 10420 return (EINTR); 10421 atomic_add_int(&rp->r_inmap, 1); 10422 nfs_rw_exit(&rp->r_rwlock); 10423 10424 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) { 10425 atomic_add_int(&rp->r_inmap, -1); 10426 return (EINTR); 10427 } 10428 10429 10430 if (vp->v_flag & VNOCACHE) { 10431 error = EAGAIN; 10432 goto done; 10433 } 10434 10435 /* 10436 * Don't allow concurrent locks and mapping if mandatory locking is 10437 * enabled. 10438 */ 10439 if (flk_has_remote_locks(vp)) { 10440 struct vattr va; 10441 va.va_mask = AT_MODE; 10442 error = nfs4getattr(vp, &va, cr); 10443 if (error != 0) 10444 goto done; 10445 if (MANDLOCK(vp, va.va_mode)) { 10446 error = EAGAIN; 10447 goto done; 10448 } 10449 } 10450 10451 /* 10452 * It is possible that the rnode has a lost lock request that we 10453 * are still trying to recover, and that the request conflicts with 10454 * this map request. 10455 * 10456 * An alternative approach would be for nfs4_safemap() to consider 10457 * queued lock requests when deciding whether to set or clear 10458 * VNOCACHE. This would require the frlock code path to call 10459 * nfs4_safemap() after enqueing a lost request. 
10460 */ 10461 if (nfs4_map_lost_lock_conflict(vp)) { 10462 error = EAGAIN; 10463 goto done; 10464 } 10465 10466 as_rangelock(as); 10467 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 10468 if (error != 0) { 10469 as_rangeunlock(as); 10470 goto done; 10471 } 10472 10473 if (vp->v_type == VREG) { 10474 /* 10475 * We need to retrieve the open stream 10476 */ 10477 nfs4_open_stream_t *osp = NULL; 10478 nfs4_open_owner_t *oop = NULL; 10479 10480 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10481 if (oop != NULL) { 10482 /* returns with 'os_sync_lock' held */ 10483 osp = find_open_stream(oop, rp); 10484 open_owner_rele(oop); 10485 } 10486 if (osp == NULL) { 10487 #ifdef DEBUG 10488 if (nfs4_force_open_before_mmap) { 10489 error = EIO; 10490 goto done; 10491 } 10492 #endif 10493 /* returns with 'os_sync_lock' held */ 10494 error = open_and_get_osp(vp, cr, &osp); 10495 if (osp == NULL) { 10496 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10497 "nfs4_map: we tried to OPEN the file " 10498 "but again no osp, so fail with EIO")); 10499 goto done; 10500 } 10501 } 10502 10503 if (osp->os_failed_reopen) { 10504 mutex_exit(&osp->os_sync_lock); 10505 open_stream_rele(osp, rp); 10506 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 10507 "nfs4_map: os_failed_reopen set on " 10508 "osp %p, cr %p, rp %s", (void *)osp, 10509 (void *)cr, rnode4info(rp))); 10510 error = EIO; 10511 goto done; 10512 } 10513 mutex_exit(&osp->os_sync_lock); 10514 open_stream_rele(osp, rp); 10515 } 10516 10517 vn_a.vp = vp; 10518 vn_a.offset = off; 10519 vn_a.type = (flags & MAP_TYPE); 10520 vn_a.prot = (uchar_t)prot; 10521 vn_a.maxprot = (uchar_t)maxprot; 10522 vn_a.flags = (flags & ~MAP_TYPE); 10523 vn_a.cred = cr; 10524 vn_a.amp = NULL; 10525 vn_a.szc = 0; 10526 vn_a.lgrp_mem_policy_flags = 0; 10527 10528 error = as_map(as, *addrp, len, segvn_create, &vn_a); 10529 as_rangeunlock(as); 10530 10531 done: 10532 nfs_rw_exit(&rp->r_lkserlock); 10533 atomic_add_int(&rp->r_inmap, -1); 10534 return 
(error); 10535 } 10536 10537 /* 10538 * We're most likely dealing with a kernel module that likes to READ 10539 * and mmap without OPENing the file (ie: lookup/read/mmap), so lets 10540 * officially OPEN the file to create the necessary client state 10541 * for bookkeeping of os_mmap_read/write counts. 10542 * 10543 * Since VOP_MAP only passes in a pointer to the vnode rather than 10544 * a double pointer, we can't handle the case where nfs4open_otw() 10545 * returns a different vnode than the one passed into VOP_MAP (since 10546 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case, 10547 * we return NULL and let nfs4_map() fail. Note: the only case where 10548 * this should happen is if the file got removed and replaced with the 10549 * same name on the server (in addition to the fact that we're trying 10550 * to VOP_MAP withouth VOP_OPENing the file in the first place). 10551 */ 10552 static int 10553 open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp) 10554 { 10555 rnode4_t *rp, *drp; 10556 vnode_t *dvp, *open_vp; 10557 char file_name[MAXNAMELEN]; 10558 int just_created; 10559 nfs4_open_stream_t *osp; 10560 nfs4_open_owner_t *oop; 10561 int error; 10562 10563 *ospp = NULL; 10564 open_vp = map_vp; 10565 10566 rp = VTOR4(open_vp); 10567 if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0) 10568 return (error); 10569 drp = VTOR4(dvp); 10570 10571 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) { 10572 VN_RELE(dvp); 10573 return (EINTR); 10574 } 10575 10576 if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) { 10577 nfs_rw_exit(&drp->r_rwlock); 10578 VN_RELE(dvp); 10579 return (error); 10580 } 10581 10582 mutex_enter(&rp->r_statev4_lock); 10583 if (rp->created_v4) { 10584 rp->created_v4 = 0; 10585 mutex_exit(&rp->r_statev4_lock); 10586 10587 dnlc_update(dvp, file_name, open_vp); 10588 /* This is needed so we don't bump the open ref count */ 10589 just_created = 1; 10590 } else { 10591 
mutex_exit(&rp->r_statev4_lock); 10592 just_created = 0; 10593 } 10594 10595 VN_HOLD(map_vp); 10596 10597 error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0, 10598 just_created); 10599 if (error) { 10600 nfs_rw_exit(&drp->r_rwlock); 10601 VN_RELE(dvp); 10602 VN_RELE(map_vp); 10603 return (error); 10604 } 10605 10606 nfs_rw_exit(&drp->r_rwlock); 10607 VN_RELE(dvp); 10608 10609 /* 10610 * If nfs4open_otw() returned a different vnode then "undo" 10611 * the open and return failure to the caller. 10612 */ 10613 if (!VN_CMP(open_vp, map_vp)) { 10614 nfs4_error_t e; 10615 10616 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10617 "open returned a different vnode")); 10618 /* 10619 * If there's an error, ignore it, 10620 * and let VOP_INACTIVE handle it. 10621 */ 10622 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10623 CLOSE_NORM, 0, 0, 0); 10624 VN_RELE(map_vp); 10625 return (EIO); 10626 } 10627 10628 VN_RELE(map_vp); 10629 10630 oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp)); 10631 if (!oop) { 10632 nfs4_error_t e; 10633 10634 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10635 "no open owner")); 10636 /* 10637 * If there's an error, ignore it, 10638 * and let VOP_INACTIVE handle it. 10639 */ 10640 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10641 CLOSE_NORM, 0, 0, 0); 10642 return (EIO); 10643 } 10644 osp = find_open_stream(oop, rp); 10645 open_owner_rele(oop); 10646 *ospp = osp; 10647 return (0); 10648 } 10649 10650 /* 10651 * Please be aware that when this function is called, the address space write 10652 * a_lock is held. Do not put over the wire calls in this function. 
10653 */ 10654 /* ARGSUSED */ 10655 static int 10656 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10657 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 10658 caller_context_t *ct) 10659 { 10660 rnode4_t *rp; 10661 int error = 0; 10662 mntinfo4_t *mi; 10663 10664 mi = VTOMI4(vp); 10665 rp = VTOR4(vp); 10666 10667 if (nfs_zone() != mi->mi_zone) 10668 return (EIO); 10669 if (vp->v_flag & VNOMAP) 10670 return (ENOSYS); 10671 10672 /* 10673 * Don't need to update the open stream first, since this 10674 * mmap can't add any additional share access that isn't 10675 * already contained in the open stream (for the case where we 10676 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't 10677 * take into account os_mmap_read[write] counts). 10678 */ 10679 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len)); 10680 10681 if (vp->v_type == VREG) { 10682 /* 10683 * We need to retrieve the open stream and update the counts. 10684 * If there is no open stream here, something is wrong. 10685 */ 10686 nfs4_open_stream_t *osp = NULL; 10687 nfs4_open_owner_t *oop = NULL; 10688 10689 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10690 if (oop != NULL) { 10691 /* returns with 'os_sync_lock' held */ 10692 osp = find_open_stream(oop, rp); 10693 open_owner_rele(oop); 10694 } 10695 if (osp == NULL) { 10696 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10697 "nfs4_addmap: we should have an osp" 10698 "but we don't, so fail with EIO")); 10699 error = EIO; 10700 goto out; 10701 } 10702 10703 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p," 10704 " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot)); 10705 10706 /* 10707 * Update the map count in the open stream. 10708 * This is necessary in the case where we 10709 * open/mmap/close/, then the server reboots, and we 10710 * attempt to reopen. If the mmap doesn't add share 10711 * access then we send an invalid reopen with 10712 * access = NONE. 
10713 * 10714 * We need to specifically check each PROT_* so a mmap 10715 * call of (PROT_WRITE | PROT_EXEC) will ensure us both 10716 * read and write access. A simple comparison of prot 10717 * to ~PROT_WRITE to determine read access is insufficient 10718 * since prot can be |= with PROT_USER, etc. 10719 */ 10720 10721 /* 10722 * Unless we're MAP_SHARED, no sense in adding os_mmap_write 10723 */ 10724 if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 10725 osp->os_mmap_write += btopr(len); 10726 if (maxprot & PROT_READ) 10727 osp->os_mmap_read += btopr(len); 10728 if (maxprot & PROT_EXEC) 10729 osp->os_mmap_read += btopr(len); 10730 /* 10731 * Ensure that os_mmap_read gets incremented, even if 10732 * maxprot were to look like PROT_NONE. 10733 */ 10734 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 10735 !(maxprot & PROT_EXEC)) 10736 osp->os_mmap_read += btopr(len); 10737 osp->os_mapcnt += btopr(len); 10738 mutex_exit(&osp->os_sync_lock); 10739 open_stream_rele(osp, rp); 10740 } 10741 10742 out: 10743 /* 10744 * If we got an error, then undo our 10745 * incrementing of 'r_mapcnt'. 
10746 */ 10747 10748 if (error) { 10749 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len)); 10750 ASSERT(rp->r_mapcnt >= 0); 10751 } 10752 return (error); 10753 } 10754 10755 /* ARGSUSED */ 10756 static int 10757 nfs4_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) 10758 { 10759 10760 return (VTOR4(vp1) == VTOR4(vp2)); 10761 } 10762 10763 /* ARGSUSED */ 10764 static int 10765 nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10766 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr, 10767 caller_context_t *ct) 10768 { 10769 int rc; 10770 u_offset_t start, end; 10771 rnode4_t *rp; 10772 int error = 0, intr = INTR4(vp); 10773 nfs4_error_t e; 10774 10775 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10776 return (EIO); 10777 10778 /* check for valid cmd parameter */ 10779 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW) 10780 return (EINVAL); 10781 10782 /* Verify l_type. */ 10783 switch (bfp->l_type) { 10784 case F_RDLCK: 10785 if (cmd != F_GETLK && !(flag & FREAD)) 10786 return (EBADF); 10787 break; 10788 case F_WRLCK: 10789 if (cmd != F_GETLK && !(flag & FWRITE)) 10790 return (EBADF); 10791 break; 10792 case F_UNLCK: 10793 intr = 0; 10794 break; 10795 10796 default: 10797 return (EINVAL); 10798 } 10799 10800 /* check the validity of the lock range */ 10801 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset)) 10802 return (rc); 10803 if (rc = flk_check_lock_data(start, end, MAXEND)) 10804 return (rc); 10805 10806 /* 10807 * If the filesystem is mounted using local locking, pass the 10808 * request off to the local locking code. 10809 */ 10810 if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) { 10811 if (cmd == F_SETLK || cmd == F_SETLKW) { 10812 /* 10813 * For complete safety, we should be holding 10814 * r_lkserlock. However, we can't call 10815 * nfs4_safelock and then fs_frlock while 10816 * holding r_lkserlock, so just invoke 10817 * nfs4_safelock and expect that this will 10818 * catch enough of the cases. 
10819 */ 10820 if (!nfs4_safelock(vp, bfp, cr)) 10821 return (EAGAIN); 10822 } 10823 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 10824 } 10825 10826 rp = VTOR4(vp); 10827 10828 /* 10829 * Check whether the given lock request can proceed, given the 10830 * current file mappings. 10831 */ 10832 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr)) 10833 return (EINTR); 10834 if (cmd == F_SETLK || cmd == F_SETLKW) { 10835 if (!nfs4_safelock(vp, bfp, cr)) { 10836 rc = EAGAIN; 10837 goto done; 10838 } 10839 } 10840 10841 /* 10842 * Flush the cache after waiting for async I/O to finish. For new 10843 * locks, this is so that the process gets the latest bits from the 10844 * server. For unlocks, this is so that other clients see the 10845 * latest bits once the file has been unlocked. If currently dirty 10846 * pages can't be flushed, then don't allow a lock to be set. But 10847 * allow unlocks to succeed, to avoid having orphan locks on the 10848 * server. 10849 */ 10850 if (cmd != F_GETLK) { 10851 mutex_enter(&rp->r_statelock); 10852 while (rp->r_count > 0) { 10853 if (intr) { 10854 klwp_t *lwp = ttolwp(curthread); 10855 10856 if (lwp != NULL) 10857 lwp->lwp_nostop++; 10858 if (cv_wait_sig(&rp->r_cv, 10859 &rp->r_statelock) == 0) { 10860 if (lwp != NULL) 10861 lwp->lwp_nostop--; 10862 rc = EINTR; 10863 break; 10864 } 10865 if (lwp != NULL) 10866 lwp->lwp_nostop--; 10867 } else 10868 cv_wait(&rp->r_cv, &rp->r_statelock); 10869 } 10870 mutex_exit(&rp->r_statelock); 10871 if (rc != 0) 10872 goto done; 10873 error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct); 10874 if (error) { 10875 if (error == ENOSPC || error == EDQUOT) { 10876 mutex_enter(&rp->r_statelock); 10877 if (!rp->r_error) 10878 rp->r_error = error; 10879 mutex_exit(&rp->r_statelock); 10880 } 10881 if (bfp->l_type != F_UNLCK) { 10882 rc = ENOLCK; 10883 goto done; 10884 } 10885 } 10886 } 10887 10888 /* 10889 * Call the lock manager to do the real work of contacting 10890 * the 
server and obtaining the lock. 10891 */ 10892 nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset, 10893 cr, &e, NULL, NULL); 10894 rc = e.error; 10895 10896 if (rc == 0) 10897 nfs4_lockcompletion(vp, cmd); 10898 10899 done: 10900 nfs_rw_exit(&rp->r_lkserlock); 10901 10902 return (rc); 10903 } 10904 10905 /* 10906 * Free storage space associated with the specified vnode. The portion 10907 * to be freed is specified by bfp->l_start and bfp->l_len (already 10908 * normalized to a "whence" of 0). 10909 * 10910 * This is an experimental facility whose continued existence is not 10911 * guaranteed. Currently, we only support the special case 10912 * of l_len == 0, meaning free to end of file. 10913 */ 10914 /* ARGSUSED */ 10915 static int 10916 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10917 offset_t offset, cred_t *cr, caller_context_t *ct) 10918 { 10919 int error; 10920 10921 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10922 return (EIO); 10923 ASSERT(vp->v_type == VREG); 10924 if (cmd != F_FREESP) 10925 return (EINVAL); 10926 10927 error = convoff(vp, bfp, 0, offset); 10928 if (!error) { 10929 ASSERT(bfp->l_start >= 0); 10930 if (bfp->l_len == 0) { 10931 struct vattr va; 10932 10933 va.va_mask = AT_SIZE; 10934 va.va_size = bfp->l_start; 10935 error = nfs4setattr(vp, &va, 0, cr, NULL); 10936 } else 10937 error = EINVAL; 10938 } 10939 10940 return (error); 10941 } 10942 10943 /* ARGSUSED */ 10944 int 10945 nfs4_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) 10946 { 10947 rnode4_t *rp; 10948 rp = VTOR4(vp); 10949 10950 if (vp->v_type == VREG && IS_SHADOW(vp, rp)) { 10951 vp = RTOV4(rp); 10952 } 10953 *vpp = vp; 10954 return (0); 10955 } 10956 10957 /* 10958 * Setup and add an address space callback to do the work of the delmap call. 10959 * The callback will (and must be) deleted in the actual callback function. 
10960 * 10961 * This is done in order to take care of the problem that we have with holding 10962 * the address space's a_lock for a long period of time (e.g. if the NFS server 10963 * is down). Callbacks will be executed in the address space code while the 10964 * a_lock is not held. Holding the address space's a_lock causes things such 10965 * as ps and fork to hang because they are trying to acquire this lock as well. 10966 */ 10967 /* ARGSUSED */ 10968 static int 10969 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10970 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 10971 caller_context_t *ct) 10972 { 10973 int caller_found; 10974 int error; 10975 rnode4_t *rp; 10976 nfs4_delmap_args_t *dmapp; 10977 nfs4_delmapcall_t *delmap_call; 10978 10979 if (vp->v_flag & VNOMAP) 10980 return (ENOSYS); 10981 10982 /* 10983 * A process may not change zones if it has NFS pages mmap'ed 10984 * in, so we can't legitimately get here from the wrong zone. 10985 */ 10986 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10987 10988 rp = VTOR4(vp); 10989 10990 /* 10991 * The way that the address space of this process deletes its mapping 10992 * of this file is via the following call chains: 10993 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 10994 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 10995 * 10996 * With the use of address space callbacks we are allowed to drop the 10997 * address space lock, a_lock, while executing the NFS operations that 10998 * need to go over the wire. Returning EAGAIN to the caller of this 10999 * function is what drives the execution of the callback that we add 11000 * below. The callback will be executed by the address space code 11001 * after dropping the a_lock. 
When the callback is finished, since 11002 * we dropped the a_lock, it must be re-acquired and segvn_unmap() 11003 * is called again on the same segment to finish the rest of the work 11004 * that needs to happen during unmapping. 11005 * 11006 * This action of calling back into the segment driver causes 11007 * nfs4_delmap() to get called again, but since the callback was 11008 * already executed at this point, it already did the work and there 11009 * is nothing left for us to do. 11010 * 11011 * To Summarize: 11012 * - The first time nfs4_delmap is called by the current thread is when 11013 * we add the caller associated with this delmap to the delmap caller 11014 * list, add the callback, and return EAGAIN. 11015 * - The second time in this call chain when nfs4_delmap is called we 11016 * will find this caller in the delmap caller list and realize there 11017 * is no more work to do thus removing this caller from the list and 11018 * returning the error that was set in the callback execution. 11019 */ 11020 caller_found = nfs4_find_and_delete_delmapcall(rp, &error); 11021 if (caller_found) { 11022 /* 11023 * 'error' is from the actual delmap operations. To avoid 11024 * hangs, we need to handle the return of EAGAIN differently 11025 * since this is what drives the callback execution. 11026 * In this case, we don't want to return EAGAIN and do the 11027 * callback execution because there are none to execute. 
11028 */ 11029 if (error == EAGAIN) 11030 return (0); 11031 else 11032 return (error); 11033 } 11034 11035 /* current caller was not in the list */ 11036 delmap_call = nfs4_init_delmapcall(); 11037 11038 mutex_enter(&rp->r_statelock); 11039 list_insert_tail(&rp->r_indelmap, delmap_call); 11040 mutex_exit(&rp->r_statelock); 11041 11042 dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP); 11043 11044 dmapp->vp = vp; 11045 dmapp->off = off; 11046 dmapp->addr = addr; 11047 dmapp->len = len; 11048 dmapp->prot = prot; 11049 dmapp->maxprot = maxprot; 11050 dmapp->flags = flags; 11051 dmapp->cr = cr; 11052 dmapp->caller = delmap_call; 11053 11054 error = as_add_callback(as, nfs4_delmap_callback, dmapp, 11055 AS_UNMAP_EVENT, addr, len, KM_SLEEP); 11056 11057 return (error ? error : EAGAIN); 11058 } 11059 11060 static nfs4_delmapcall_t * 11061 nfs4_init_delmapcall() 11062 { 11063 nfs4_delmapcall_t *delmap_call; 11064 11065 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP); 11066 delmap_call->call_id = curthread; 11067 delmap_call->error = 0; 11068 11069 return (delmap_call); 11070 } 11071 11072 static void 11073 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call) 11074 { 11075 kmem_free(delmap_call, sizeof (nfs4_delmapcall_t)); 11076 } 11077 11078 /* 11079 * Searches for the current delmap caller (based on curthread) in the list of 11080 * callers. If it is found, we remove it and free the delmap caller. 11081 * Returns: 11082 * 0 if the caller wasn't found 11083 * 1 if the caller was found, removed and freed. *errp will be set 11084 * to what the result of the delmap was. 11085 */ 11086 static int 11087 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp) 11088 { 11089 nfs4_delmapcall_t *delmap_call; 11090 11091 /* 11092 * If the list doesn't exist yet, we create it and return 11093 * that the caller wasn't found. No list = no callers. 
11094 */ 11095 mutex_enter(&rp->r_statelock); 11096 if (!(rp->r_flags & R4DELMAPLIST)) { 11097 /* The list does not exist */ 11098 list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t), 11099 offsetof(nfs4_delmapcall_t, call_node)); 11100 rp->r_flags |= R4DELMAPLIST; 11101 mutex_exit(&rp->r_statelock); 11102 return (0); 11103 } else { 11104 /* The list exists so search it */ 11105 for (delmap_call = list_head(&rp->r_indelmap); 11106 delmap_call != NULL; 11107 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 11108 if (delmap_call->call_id == curthread) { 11109 /* current caller is in the list */ 11110 *errp = delmap_call->error; 11111 list_remove(&rp->r_indelmap, delmap_call); 11112 mutex_exit(&rp->r_statelock); 11113 nfs4_free_delmapcall(delmap_call); 11114 return (1); 11115 } 11116 } 11117 } 11118 mutex_exit(&rp->r_statelock); 11119 return (0); 11120 } 11121 11122 /* 11123 * Remove some pages from an mmap'd vnode. Just update the 11124 * count of pages. If doing close-to-open, then flush and 11125 * commit all of the pages associated with this file. 11126 * Otherwise, start an asynchronous page flush to write out 11127 * any dirty pages. This will also associate a credential 11128 * with the rnode which can be used to write the pages. 11129 */ 11130 /* ARGSUSED */ 11131 static void 11132 nfs4_delmap_callback(struct as *as, void *arg, uint_t event) 11133 { 11134 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 11135 rnode4_t *rp; 11136 mntinfo4_t *mi; 11137 nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg; 11138 11139 rp = VTOR4(dmapp->vp); 11140 mi = VTOMI4(dmapp->vp); 11141 11142 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len)); 11143 ASSERT(rp->r_mapcnt >= 0); 11144 11145 /* 11146 * Initiate a page flush and potential commit if there are 11147 * pages, the file system was not mounted readonly, the segment 11148 * was mapped shared, and the pages themselves were writeable. 
11149 */ 11150 if (nfs4_has_pages(dmapp->vp) && 11151 !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) && 11152 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) { 11153 mutex_enter(&rp->r_statelock); 11154 rp->r_flags |= R4DIRTY; 11155 mutex_exit(&rp->r_statelock); 11156 e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off, 11157 dmapp->len, dmapp->cr); 11158 if (!e.error) { 11159 mutex_enter(&rp->r_statelock); 11160 e.error = rp->r_error; 11161 rp->r_error = 0; 11162 mutex_exit(&rp->r_statelock); 11163 } 11164 } else 11165 e.error = 0; 11166 11167 if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) 11168 (void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len, 11169 B_INVAL, dmapp->cr, NULL); 11170 11171 if (e.error) { 11172 e.stat = puterrno4(e.error); 11173 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 11174 OP_COMMIT, FALSE, NULL, 0, dmapp->vp); 11175 dmapp->caller->error = e.error; 11176 } 11177 11178 /* Check to see if we need to close the file */ 11179 11180 if (dmapp->vp->v_type == VREG) { 11181 nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e, 11182 CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags); 11183 11184 if (e.error != 0 || e.stat != NFS4_OK) { 11185 /* 11186 * Since it is possible that e.error == 0 and 11187 * e.stat != NFS4_OK (and vice versa), 11188 * we do the proper checking in order to get both 11189 * e.error and e.stat reporting the correct info. 
11190 */ 11191 if (e.stat == NFS4_OK) 11192 e.stat = puterrno4(e.error); 11193 if (e.error == 0) 11194 e.error = geterrno4(e.stat); 11195 11196 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 11197 OP_CLOSE, FALSE, NULL, 0, dmapp->vp); 11198 dmapp->caller->error = e.error; 11199 } 11200 } 11201 11202 (void) as_delete_callback(as, arg); 11203 kmem_free(dmapp, sizeof (nfs4_delmap_args_t)); 11204 } 11205 11206 11207 static uint_t 11208 fattr4_maxfilesize_to_bits(uint64_t ll) 11209 { 11210 uint_t l = 1; 11211 11212 if (ll == 0) { 11213 return (0); 11214 } 11215 11216 if (ll & 0xffffffff00000000) { 11217 l += 32; ll >>= 32; 11218 } 11219 if (ll & 0xffff0000) { 11220 l += 16; ll >>= 16; 11221 } 11222 if (ll & 0xff00) { 11223 l += 8; ll >>= 8; 11224 } 11225 if (ll & 0xf0) { 11226 l += 4; ll >>= 4; 11227 } 11228 if (ll & 0xc) { 11229 l += 2; ll >>= 2; 11230 } 11231 if (ll & 0x2) { 11232 l += 1; 11233 } 11234 return (l); 11235 } 11236 11237 static int 11238 nfs4_have_xattrs(vnode_t *vp, ulong_t *valp, cred_t *cr) 11239 { 11240 vnode_t *avp = NULL; 11241 int error; 11242 11243 if ((error = nfs4lookup_xattr(vp, "", &avp, 11244 LOOKUP_XATTR, cr)) == 0) 11245 error = do_xattr_exists_check(avp, valp, cr); 11246 if (avp) 11247 VN_RELE(avp); 11248 11249 return (error); 11250 } 11251 11252 /* ARGSUSED */ 11253 int 11254 nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 11255 caller_context_t *ct) 11256 { 11257 int error; 11258 hrtime_t t; 11259 rnode4_t *rp; 11260 nfs4_ga_res_t gar; 11261 nfs4_ga_ext_res_t ger; 11262 11263 gar.n4g_ext_res = &ger; 11264 11265 if (nfs_zone() != VTOMI4(vp)->mi_zone) 11266 return (EIO); 11267 if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) { 11268 *valp = MAXPATHLEN; 11269 return (0); 11270 } 11271 if (cmd == _PC_ACL_ENABLED) { 11272 *valp = _ACL_ACE_ENABLED; 11273 return (0); 11274 } 11275 11276 rp = VTOR4(vp); 11277 if (cmd == _PC_XATTR_EXISTS) { 11278 /* 11279 * The existence of the xattr directory is not sufficient 11280 * for 
determining whether generic user attributes exists. 11281 * The attribute directory could only be a transient directory 11282 * used for Solaris sysattr support. Do a small readdir 11283 * to verify if the only entries are sysattrs or not. 11284 * 11285 * pc4_xattr_valid can be only be trusted when r_xattr_dir 11286 * is NULL. Once the xadir vp exists, we can create xattrs, 11287 * and we don't have any way to update the "base" object's 11288 * pc4_xattr_exists from the xattr or xadir. Maybe FEM 11289 * could help out. 11290 */ 11291 if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid && 11292 rp->r_xattr_dir == NULL) { 11293 return (nfs4_have_xattrs(vp, valp, cr)); 11294 } 11295 } else { /* OLD CODE */ 11296 if (ATTRCACHE4_VALID(vp)) { 11297 mutex_enter(&rp->r_statelock); 11298 if (rp->r_pathconf.pc4_cache_valid) { 11299 error = 0; 11300 switch (cmd) { 11301 case _PC_FILESIZEBITS: 11302 *valp = 11303 rp->r_pathconf.pc4_filesizebits; 11304 break; 11305 case _PC_LINK_MAX: 11306 *valp = 11307 rp->r_pathconf.pc4_link_max; 11308 break; 11309 case _PC_NAME_MAX: 11310 *valp = 11311 rp->r_pathconf.pc4_name_max; 11312 break; 11313 case _PC_CHOWN_RESTRICTED: 11314 *valp = 11315 rp->r_pathconf.pc4_chown_restricted; 11316 break; 11317 case _PC_NO_TRUNC: 11318 *valp = 11319 rp->r_pathconf.pc4_no_trunc; 11320 break; 11321 default: 11322 error = EINVAL; 11323 break; 11324 } 11325 mutex_exit(&rp->r_statelock); 11326 #ifdef DEBUG 11327 nfs4_pathconf_cache_hits++; 11328 #endif 11329 return (error); 11330 } 11331 mutex_exit(&rp->r_statelock); 11332 } 11333 } 11334 #ifdef DEBUG 11335 nfs4_pathconf_cache_misses++; 11336 #endif 11337 11338 t = gethrtime(); 11339 11340 error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr); 11341 11342 if (error) { 11343 mutex_enter(&rp->r_statelock); 11344 rp->r_pathconf.pc4_cache_valid = FALSE; 11345 rp->r_pathconf.pc4_xattr_valid = FALSE; 11346 mutex_exit(&rp->r_statelock); 11347 return (error); 11348 } 11349 11350 /* interpret 
the max filesize */ 11351 gar.n4g_ext_res->n4g_pc4.pc4_filesizebits = 11352 fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize); 11353 11354 /* Store the attributes we just received */ 11355 nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL); 11356 11357 switch (cmd) { 11358 case _PC_FILESIZEBITS: 11359 *valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits; 11360 break; 11361 case _PC_LINK_MAX: 11362 *valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max; 11363 break; 11364 case _PC_NAME_MAX: 11365 *valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max; 11366 break; 11367 case _PC_CHOWN_RESTRICTED: 11368 *valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted; 11369 break; 11370 case _PC_NO_TRUNC: 11371 *valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc; 11372 break; 11373 case _PC_XATTR_EXISTS: 11374 if (gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists) { 11375 if (error = nfs4_have_xattrs(vp, valp, cr)) 11376 return (error); 11377 } 11378 break; 11379 default: 11380 return (EINVAL); 11381 } 11382 11383 return (0); 11384 } 11385 11386 /* 11387 * Called by async thread to do synchronous pageio. Do the i/o, wait 11388 * for it to complete, and cleanup the page list when done. 11389 */ 11390 static int 11391 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11392 int flags, cred_t *cr) 11393 { 11394 int error; 11395 11396 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11397 11398 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11399 if (flags & B_READ) 11400 pvn_read_done(pp, (error ? B_ERROR : 0) | flags); 11401 else 11402 pvn_write_done(pp, (error ? 
B_ERROR : 0) | flags); 11403 return (error); 11404 } 11405 11406 /* ARGSUSED */ 11407 static int 11408 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11409 int flags, cred_t *cr, caller_context_t *ct) 11410 { 11411 int error; 11412 rnode4_t *rp; 11413 11414 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 11415 return (EIO); 11416 11417 if (pp == NULL) 11418 return (EINVAL); 11419 11420 rp = VTOR4(vp); 11421 mutex_enter(&rp->r_statelock); 11422 rp->r_count++; 11423 mutex_exit(&rp->r_statelock); 11424 11425 if (flags & B_ASYNC) { 11426 error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr, 11427 nfs4_sync_pageio); 11428 } else 11429 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11430 mutex_enter(&rp->r_statelock); 11431 rp->r_count--; 11432 cv_broadcast(&rp->r_cv); 11433 mutex_exit(&rp->r_statelock); 11434 return (error); 11435 } 11436 11437 /* ARGSUSED */ 11438 static void 11439 nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr, 11440 caller_context_t *ct) 11441 { 11442 int error; 11443 rnode4_t *rp; 11444 page_t *plist; 11445 page_t *pptr; 11446 offset3 offset; 11447 count3 len; 11448 k_sigset_t smask; 11449 11450 /* 11451 * We should get called with fl equal to either B_FREE or 11452 * B_INVAL. Any other value is illegal. 11453 * 11454 * The page that we are either supposed to free or destroy 11455 * should be exclusive locked and its io lock should not 11456 * be held. 11457 */ 11458 ASSERT(fl == B_FREE || fl == B_INVAL); 11459 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 11460 11461 rp = VTOR4(vp); 11462 11463 /* 11464 * If the page doesn't need to be committed or we shouldn't 11465 * even bother attempting to commit it, then just make sure 11466 * that the p_fsdata byte is clear and then either free or 11467 * destroy the page as appropriate. 
11468 */ 11469 if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) { 11470 pp->p_fsdata = C_NOCOMMIT; 11471 if (fl == B_FREE) 11472 page_free(pp, dn); 11473 else 11474 page_destroy(pp, dn); 11475 return; 11476 } 11477 11478 /* 11479 * If there is a page invalidation operation going on, then 11480 * if this is one of the pages being destroyed, then just 11481 * clear the p_fsdata byte and then either free or destroy 11482 * the page as appropriate. 11483 */ 11484 mutex_enter(&rp->r_statelock); 11485 if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) { 11486 mutex_exit(&rp->r_statelock); 11487 pp->p_fsdata = C_NOCOMMIT; 11488 if (fl == B_FREE) 11489 page_free(pp, dn); 11490 else 11491 page_destroy(pp, dn); 11492 return; 11493 } 11494 11495 /* 11496 * If we are freeing this page and someone else is already 11497 * waiting to do a commit, then just unlock the page and 11498 * return. That other thread will take care of commiting 11499 * this page. The page can be freed sometime after the 11500 * commit has finished. Otherwise, if the page is marked 11501 * as delay commit, then we may be getting called from 11502 * pvn_write_done, one page at a time. This could result 11503 * in one commit per page, so we end up doing lots of small 11504 * commits instead of fewer larger commits. This is bad, 11505 * we want do as few commits as possible. 11506 */ 11507 if (fl == B_FREE) { 11508 if (rp->r_flags & R4COMMITWAIT) { 11509 page_unlock(pp); 11510 mutex_exit(&rp->r_statelock); 11511 return; 11512 } 11513 if (pp->p_fsdata == C_DELAYCOMMIT) { 11514 pp->p_fsdata = C_COMMIT; 11515 page_unlock(pp); 11516 mutex_exit(&rp->r_statelock); 11517 return; 11518 } 11519 } 11520 11521 /* 11522 * Check to see if there is a signal which would prevent an 11523 * attempt to commit the pages from being successful. If so, 11524 * then don't bother with all of the work to gather pages and 11525 * generate the unsuccessful RPC. 
Just return from here and 11526 * let the page be committed at some later time. 11527 */ 11528 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT); 11529 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) { 11530 sigunintr(&smask); 11531 page_unlock(pp); 11532 mutex_exit(&rp->r_statelock); 11533 return; 11534 } 11535 sigunintr(&smask); 11536 11537 /* 11538 * We are starting to need to commit pages, so let's try 11539 * to commit as many as possible at once to reduce the 11540 * overhead. 11541 * 11542 * Set the `commit inprogress' state bit. We must 11543 * first wait until any current one finishes. Then 11544 * we initialize the c_pages list with this page. 11545 */ 11546 while (rp->r_flags & R4COMMIT) { 11547 rp->r_flags |= R4COMMITWAIT; 11548 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 11549 rp->r_flags &= ~R4COMMITWAIT; 11550 } 11551 rp->r_flags |= R4COMMIT; 11552 mutex_exit(&rp->r_statelock); 11553 ASSERT(rp->r_commit.c_pages == NULL); 11554 rp->r_commit.c_pages = pp; 11555 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11556 rp->r_commit.c_commlen = PAGESIZE; 11557 11558 /* 11559 * Gather together all other pages which can be committed. 11560 * They will all be chained off r_commit.c_pages. 11561 */ 11562 nfs4_get_commit(vp); 11563 11564 /* 11565 * Clear the `commit inprogress' status and disconnect 11566 * the list of pages to be committed from the rnode. 11567 * At this same time, we also save the starting offset 11568 * and length of data to be committed on the server. 
11569 */ 11570 plist = rp->r_commit.c_pages; 11571 rp->r_commit.c_pages = NULL; 11572 offset = rp->r_commit.c_commbase; 11573 len = rp->r_commit.c_commlen; 11574 mutex_enter(&rp->r_statelock); 11575 rp->r_flags &= ~R4COMMIT; 11576 cv_broadcast(&rp->r_commit.c_cv); 11577 mutex_exit(&rp->r_statelock); 11578 11579 if (curproc == proc_pageout || curproc == proc_fsflush || 11580 nfs_zone() != VTOMI4(vp)->mi_zone) { 11581 nfs4_async_commit(vp, plist, offset, len, 11582 cr, do_nfs4_async_commit); 11583 return; 11584 } 11585 11586 /* 11587 * Actually generate the COMMIT op over the wire operation. 11588 */ 11589 error = nfs4_commit(vp, (offset4)offset, (count4)len, cr); 11590 11591 /* 11592 * If we got an error during the commit, just unlock all 11593 * of the pages. The pages will get retransmitted to the 11594 * server during a putpage operation. 11595 */ 11596 if (error) { 11597 while (plist != NULL) { 11598 pptr = plist; 11599 page_sub(&plist, pptr); 11600 page_unlock(pptr); 11601 } 11602 return; 11603 } 11604 11605 /* 11606 * We've tried as hard as we can to commit the data to stable 11607 * storage on the server. We just unlock the rest of the pages 11608 * and clear the commit required state. They will be put 11609 * onto the tail of the cachelist if they are nolonger 11610 * mapped. 11611 */ 11612 while (plist != pp) { 11613 pptr = plist; 11614 page_sub(&plist, pptr); 11615 pptr->p_fsdata = C_NOCOMMIT; 11616 page_unlock(pptr); 11617 } 11618 11619 /* 11620 * It is possible that nfs4_commit didn't return error but 11621 * some other thread has modified the page we are going 11622 * to free/destroy. 11623 * In this case we need to rewrite the page. Do an explicit check 11624 * before attempting to free/destroy the page. If modified, needs to 11625 * be rewritten so unlock the page and return. 
11626 */ 11627 if (hat_ismod(pp)) { 11628 pp->p_fsdata = C_NOCOMMIT; 11629 page_unlock(pp); 11630 return; 11631 } 11632 11633 /* 11634 * Now, as appropriate, either free or destroy the page 11635 * that we were called with. 11636 */ 11637 pp->p_fsdata = C_NOCOMMIT; 11638 if (fl == B_FREE) 11639 page_free(pp, dn); 11640 else 11641 page_destroy(pp, dn); 11642 } 11643 11644 /* 11645 * Commit requires that the current fh be the file written to. 11646 * The compound op structure is: 11647 * PUTFH(file), COMMIT 11648 */ 11649 static int 11650 nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr) 11651 { 11652 COMPOUND4args_clnt args; 11653 COMPOUND4res_clnt res; 11654 COMMIT4res *cm_res; 11655 nfs_argop4 argop[2]; 11656 nfs_resop4 *resop; 11657 int doqueue; 11658 mntinfo4_t *mi; 11659 rnode4_t *rp; 11660 cred_t *cred_otw = NULL; 11661 bool_t needrecov = FALSE; 11662 nfs4_recov_state_t recov_state; 11663 nfs4_open_stream_t *osp = NULL; 11664 bool_t first_time = TRUE; /* first time getting OTW cred */ 11665 bool_t last_time = FALSE; /* last time getting OTW cred */ 11666 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 11667 11668 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11669 11670 rp = VTOR4(vp); 11671 11672 mi = VTOMI4(vp); 11673 recov_state.rs_flags = 0; 11674 recov_state.rs_num_retry_despite_err = 0; 11675 get_commit_cred: 11676 /* 11677 * Releases the osp, if a valid open stream is provided. 11678 * Puts a hold on the cred_otw and the new osp (if found). 
11679 */ 11680 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 11681 &first_time, &last_time); 11682 args.ctag = TAG_COMMIT; 11683 recov_retry: 11684 /* 11685 * Commit ops: putfh file; commit 11686 */ 11687 args.array_len = 2; 11688 args.array = argop; 11689 11690 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11691 &recov_state, NULL); 11692 if (e.error) { 11693 crfree(cred_otw); 11694 if (osp != NULL) 11695 open_stream_rele(osp, rp); 11696 return (e.error); 11697 } 11698 11699 /* putfh directory */ 11700 argop[0].argop = OP_CPUTFH; 11701 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 11702 11703 /* commit */ 11704 argop[1].argop = OP_COMMIT; 11705 argop[1].nfs_argop4_u.opcommit.offset = offset; 11706 argop[1].nfs_argop4_u.opcommit.count = count; 11707 11708 doqueue = 1; 11709 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e); 11710 11711 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 11712 if (!needrecov && e.error) { 11713 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, 11714 needrecov); 11715 crfree(cred_otw); 11716 if (e.error == EACCES && last_time == FALSE) 11717 goto get_commit_cred; 11718 if (osp != NULL) 11719 open_stream_rele(osp, rp); 11720 return (e.error); 11721 } 11722 11723 if (needrecov) { 11724 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 11725 NULL, OP_COMMIT, NULL) == FALSE) { 11726 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11727 &recov_state, needrecov); 11728 if (!e.error) 11729 (void) xdr_free(xdr_COMPOUND4res_clnt, 11730 (caddr_t)&res); 11731 goto recov_retry; 11732 } 11733 if (e.error) { 11734 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11735 &recov_state, needrecov); 11736 crfree(cred_otw); 11737 if (osp != NULL) 11738 open_stream_rele(osp, rp); 11739 return (e.error); 11740 } 11741 /* fall through for res.status case */ 11742 } 11743 11744 if (res.status) { 11745 e.error = geterrno4(res.status); 11746 if (e.error == EACCES && last_time == FALSE) { 11747 crfree(cred_otw); 11748 
nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11749 &recov_state, needrecov); 11750 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11751 goto get_commit_cred; 11752 } 11753 /* 11754 * Can't do a nfs4_purge_stale_fh here because this 11755 * can cause a deadlock. nfs4_commit can 11756 * be called from nfs4_dispose which can be called 11757 * indirectly via pvn_vplist_dirty. nfs4_purge_stale_fh 11758 * can call back to pvn_vplist_dirty. 11759 */ 11760 if (e.error == ESTALE) { 11761 mutex_enter(&rp->r_statelock); 11762 rp->r_flags |= R4STALE; 11763 if (!rp->r_error) 11764 rp->r_error = e.error; 11765 mutex_exit(&rp->r_statelock); 11766 PURGE_ATTRCACHE4(vp); 11767 } else { 11768 mutex_enter(&rp->r_statelock); 11769 if (!rp->r_error) 11770 rp->r_error = e.error; 11771 mutex_exit(&rp->r_statelock); 11772 } 11773 } else { 11774 ASSERT(rp->r_flags & R4HAVEVERF); 11775 resop = &res.array[1]; /* commit res */ 11776 cm_res = &resop->nfs_resop4_u.opcommit; 11777 mutex_enter(&rp->r_statelock); 11778 if (cm_res->writeverf == rp->r_writeverf) { 11779 mutex_exit(&rp->r_statelock); 11780 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11781 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11782 &recov_state, needrecov); 11783 crfree(cred_otw); 11784 if (osp != NULL) 11785 open_stream_rele(osp, rp); 11786 return (0); 11787 } 11788 nfs4_set_mod(vp); 11789 rp->r_writeverf = cm_res->writeverf; 11790 mutex_exit(&rp->r_statelock); 11791 e.error = NFS_VERF_MISMATCH; 11792 } 11793 11794 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11795 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov); 11796 crfree(cred_otw); 11797 if (osp != NULL) 11798 open_stream_rele(osp, rp); 11799 11800 return (e.error); 11801 } 11802 11803 static void 11804 nfs4_set_mod(vnode_t *vp) 11805 { 11806 page_t *pp; 11807 kmutex_t *vphm; 11808 rnode4_t *rp; 11809 11810 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11811 11812 /* make sure we're looking at the master vnode, not a 
shadow */ 11813 11814 rp = VTOR4(vp); 11815 if (IS_SHADOW(vp, rp)) 11816 vp = RTOV4(rp); 11817 11818 vphm = page_vnode_mutex(vp); 11819 mutex_enter(vphm); 11820 /* 11821 * If there are no pages associated with this vnode, then 11822 * just return. 11823 */ 11824 if ((pp = vp->v_pages) == NULL) { 11825 mutex_exit(vphm); 11826 return; 11827 } 11828 11829 do { 11830 if (pp->p_fsdata != C_NOCOMMIT) { 11831 hat_setmod(pp); 11832 pp->p_fsdata = C_NOCOMMIT; 11833 } 11834 } while ((pp = pp->p_vpnext) != vp->v_pages); 11835 mutex_exit(vphm); 11836 } 11837 11838 /* 11839 * This function is used to gather a page list of the pages which 11840 * can be committed on the server. 11841 * 11842 * The calling thread must have set R4COMMIT. This bit is used to 11843 * serialize access to the commit structure in the rnode. As long 11844 * as the thread has set R4COMMIT, then it can manipulate the commit 11845 * structure without requiring any other locks. 11846 * 11847 * When this function is called from nfs4_dispose() the page passed 11848 * into nfs4_dispose() will be SE_EXCL locked, and so this function 11849 * will skip it. This is not a problem since we initially add the 11850 * page to the r_commit page list. 11851 * 11852 */ 11853 static void 11854 nfs4_get_commit(vnode_t *vp) 11855 { 11856 rnode4_t *rp; 11857 page_t *pp; 11858 kmutex_t *vphm; 11859 11860 rp = VTOR4(vp); 11861 11862 ASSERT(rp->r_flags & R4COMMIT); 11863 11864 /* make sure we're looking at the master vnode, not a shadow */ 11865 11866 if (IS_SHADOW(vp, rp)) 11867 vp = RTOV4(rp); 11868 11869 vphm = page_vnode_mutex(vp); 11870 mutex_enter(vphm); 11871 11872 /* 11873 * If there are no pages associated with this vnode, then 11874 * just return. 11875 */ 11876 if ((pp = vp->v_pages) == NULL) { 11877 mutex_exit(vphm); 11878 return; 11879 } 11880 11881 /* 11882 * Step through all of the pages associated with this vnode 11883 * looking for pages which need to be committed. 
11884 */ 11885 do { 11886 /* 11887 * First short-cut everything (without the page_lock) 11888 * and see if this page does not need to be committed 11889 * or is modified if so then we'll just skip it. 11890 */ 11891 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) 11892 continue; 11893 11894 /* 11895 * Attempt to lock the page. If we can't, then 11896 * someone else is messing with it or we have been 11897 * called from nfs4_dispose and this is the page that 11898 * nfs4_dispose was called with.. anyway just skip it. 11899 */ 11900 if (!page_trylock(pp, SE_EXCL)) 11901 continue; 11902 11903 /* 11904 * Lets check again now that we have the page lock. 11905 */ 11906 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 11907 page_unlock(pp); 11908 continue; 11909 } 11910 11911 /* this had better not be a free page */ 11912 ASSERT(PP_ISFREE(pp) == 0); 11913 11914 /* 11915 * The page needs to be committed and we locked it. 11916 * Update the base and length parameters and add it 11917 * to r_pages. 11918 */ 11919 if (rp->r_commit.c_pages == NULL) { 11920 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11921 rp->r_commit.c_commlen = PAGESIZE; 11922 } else if (pp->p_offset < rp->r_commit.c_commbase) { 11923 rp->r_commit.c_commlen = rp->r_commit.c_commbase - 11924 (offset3)pp->p_offset + rp->r_commit.c_commlen; 11925 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11926 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen) 11927 <= pp->p_offset) { 11928 rp->r_commit.c_commlen = (offset3)pp->p_offset - 11929 rp->r_commit.c_commbase + PAGESIZE; 11930 } 11931 page_add(&rp->r_commit.c_pages, pp); 11932 } while ((pp = pp->p_vpnext) != vp->v_pages); 11933 11934 mutex_exit(vphm); 11935 } 11936 11937 /* 11938 * This routine is used to gather together a page list of the pages 11939 * which are to be committed on the server. This routine must not 11940 * be called if the calling thread holds any locked pages. 11941 * 11942 * The calling thread must have set R4COMMIT. 
This bit is used to 11943 * serialize access to the commit structure in the rnode. As long 11944 * as the thread has set R4COMMIT, then it can manipulate the commit 11945 * structure without requiring any other locks. 11946 */ 11947 static void 11948 nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len) 11949 { 11950 11951 rnode4_t *rp; 11952 page_t *pp; 11953 u_offset_t end; 11954 u_offset_t off; 11955 ASSERT(len != 0); 11956 rp = VTOR4(vp); 11957 ASSERT(rp->r_flags & R4COMMIT); 11958 11959 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11960 11961 /* make sure we're looking at the master vnode, not a shadow */ 11962 11963 if (IS_SHADOW(vp, rp)) 11964 vp = RTOV4(rp); 11965 11966 /* 11967 * If there are no pages associated with this vnode, then 11968 * just return. 11969 */ 11970 if ((pp = vp->v_pages) == NULL) 11971 return; 11972 /* 11973 * Calculate the ending offset. 11974 */ 11975 end = soff + len; 11976 for (off = soff; off < end; off += PAGESIZE) { 11977 /* 11978 * Lookup each page by vp, offset. 11979 */ 11980 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL) 11981 continue; 11982 /* 11983 * If this page does not need to be committed or is 11984 * modified, then just skip it. 11985 */ 11986 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 11987 page_unlock(pp); 11988 continue; 11989 } 11990 11991 ASSERT(PP_ISFREE(pp) == 0); 11992 /* 11993 * The page needs to be committed and we locked it. 11994 * Update the base and length parameters and add it 11995 * to r_pages. 11996 */ 11997 if (rp->r_commit.c_pages == NULL) { 11998 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11999 rp->r_commit.c_commlen = PAGESIZE; 12000 } else { 12001 rp->r_commit.c_commlen = (offset3)pp->p_offset - 12002 rp->r_commit.c_commbase + PAGESIZE; 12003 } 12004 page_add(&rp->r_commit.c_pages, pp); 12005 } 12006 } 12007 12008 /* 12009 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap(). 12010 * Flushes and commits data to the server. 
12011 */ 12012 static int 12013 nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr) 12014 { 12015 int error; 12016 verifier4 write_verf; 12017 rnode4_t *rp = VTOR4(vp); 12018 12019 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12020 12021 /* 12022 * Flush the data portion of the file and then commit any 12023 * portions which need to be committed. This may need to 12024 * be done twice if the server has changed state since 12025 * data was last written. The data will need to be 12026 * rewritten to the server and then a new commit done. 12027 * 12028 * In fact, this may need to be done several times if the 12029 * server is having problems and crashing while we are 12030 * attempting to do this. 12031 */ 12032 12033 top: 12034 /* 12035 * Do a flush based on the poff and plen arguments. This 12036 * will synchronously write out any modified pages in the 12037 * range specified by (poff, plen). This starts all of the 12038 * i/o operations which will be waited for in the next 12039 * call to nfs4_putpage 12040 */ 12041 12042 mutex_enter(&rp->r_statelock); 12043 write_verf = rp->r_writeverf; 12044 mutex_exit(&rp->r_statelock); 12045 12046 error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr, NULL); 12047 if (error == EAGAIN) 12048 error = 0; 12049 12050 /* 12051 * Do a flush based on the poff and plen arguments. This 12052 * will synchronously write out any modified pages in the 12053 * range specified by (poff, plen) and wait until all of 12054 * the asynchronous i/o's in that range are done as well. 12055 */ 12056 if (!error) 12057 error = nfs4_putpage(vp, poff, plen, 0, cr, NULL); 12058 12059 if (error) 12060 return (error); 12061 12062 mutex_enter(&rp->r_statelock); 12063 if (rp->r_writeverf != write_verf) { 12064 mutex_exit(&rp->r_statelock); 12065 goto top; 12066 } 12067 mutex_exit(&rp->r_statelock); 12068 12069 /* 12070 * Now commit any pages which might need to be committed. 
12071 * If the error, NFS_VERF_MISMATCH, is returned, then 12072 * start over with the flush operation. 12073 */ 12074 error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT); 12075 12076 if (error == NFS_VERF_MISMATCH) 12077 goto top; 12078 12079 return (error); 12080 } 12081 12082 /* 12083 * nfs4_commit_vp() will wait for other pending commits and 12084 * will either commit the whole file or a range, plen dictates 12085 * if we commit whole file. a value of zero indicates the whole 12086 * file. Called from nfs4_putpage_commit() or nfs4_sync_putapage() 12087 */ 12088 static int 12089 nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, 12090 cred_t *cr, int wait_on_writes) 12091 { 12092 rnode4_t *rp; 12093 page_t *plist; 12094 offset3 offset; 12095 count3 len; 12096 12097 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12098 12099 rp = VTOR4(vp); 12100 12101 /* 12102 * before we gather commitable pages make 12103 * sure there are no outstanding async writes 12104 */ 12105 if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) { 12106 mutex_enter(&rp->r_statelock); 12107 while (rp->r_count > 0) { 12108 cv_wait(&rp->r_cv, &rp->r_statelock); 12109 } 12110 mutex_exit(&rp->r_statelock); 12111 } 12112 12113 /* 12114 * Set the `commit inprogress' state bit. We must 12115 * first wait until any current one finishes. 12116 */ 12117 mutex_enter(&rp->r_statelock); 12118 while (rp->r_flags & R4COMMIT) { 12119 rp->r_flags |= R4COMMITWAIT; 12120 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 12121 rp->r_flags &= ~R4COMMITWAIT; 12122 } 12123 rp->r_flags |= R4COMMIT; 12124 mutex_exit(&rp->r_statelock); 12125 12126 /* 12127 * Gather all of the pages which need to be 12128 * committed. 12129 */ 12130 if (plen == 0) 12131 nfs4_get_commit(vp); 12132 else 12133 nfs4_get_commit_range(vp, poff, plen); 12134 12135 /* 12136 * Clear the `commit inprogress' bit and disconnect the 12137 * page list which was gathered by nfs4_get_commit. 
12138 */ 12139 plist = rp->r_commit.c_pages; 12140 rp->r_commit.c_pages = NULL; 12141 offset = rp->r_commit.c_commbase; 12142 len = rp->r_commit.c_commlen; 12143 mutex_enter(&rp->r_statelock); 12144 rp->r_flags &= ~R4COMMIT; 12145 cv_broadcast(&rp->r_commit.c_cv); 12146 mutex_exit(&rp->r_statelock); 12147 12148 /* 12149 * If any pages need to be committed, commit them and 12150 * then unlock them so that they can be freed some 12151 * time later. 12152 */ 12153 if (plist == NULL) 12154 return (0); 12155 12156 /* 12157 * No error occurred during the flush portion 12158 * of this operation, so now attempt to commit 12159 * the data to stable storage on the server. 12160 * 12161 * This will unlock all of the pages on the list. 12162 */ 12163 return (nfs4_sync_commit(vp, plist, offset, len, cr)); 12164 } 12165 12166 static int 12167 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 12168 cred_t *cr) 12169 { 12170 int error; 12171 page_t *pp; 12172 12173 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12174 12175 error = nfs4_commit(vp, (offset4)offset, (count3)count, cr); 12176 12177 /* 12178 * If we got an error, then just unlock all of the pages 12179 * on the list. 12180 */ 12181 if (error) { 12182 while (plist != NULL) { 12183 pp = plist; 12184 page_sub(&plist, pp); 12185 page_unlock(pp); 12186 } 12187 return (error); 12188 } 12189 /* 12190 * We've tried as hard as we can to commit the data to stable 12191 * storage on the server. We just unlock the pages and clear 12192 * the commit required state. They will get freed later. 
12193 */ 12194 while (plist != NULL) { 12195 pp = plist; 12196 page_sub(&plist, pp); 12197 pp->p_fsdata = C_NOCOMMIT; 12198 page_unlock(pp); 12199 } 12200 12201 return (error); 12202 } 12203 12204 static void 12205 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 12206 cred_t *cr) 12207 { 12208 12209 (void) nfs4_sync_commit(vp, plist, offset, count, cr); 12210 } 12211 12212 /*ARGSUSED*/ 12213 static int 12214 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 12215 caller_context_t *ct) 12216 { 12217 int error = 0; 12218 mntinfo4_t *mi; 12219 vattr_t va; 12220 vsecattr_t nfsace4_vsap; 12221 12222 mi = VTOMI4(vp); 12223 if (nfs_zone() != mi->mi_zone) 12224 return (EIO); 12225 if (mi->mi_flags & MI4_ACL) { 12226 /* if we have a delegation, return it */ 12227 if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE) 12228 (void) nfs4delegreturn(VTOR4(vp), 12229 NFS4_DR_REOPEN|NFS4_DR_PUSH); 12230 12231 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, 12232 NFS4_ACL_SET); 12233 if (error) /* EINVAL */ 12234 return (error); 12235 12236 if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) { 12237 /* 12238 * These are aclent_t type entries. 12239 */ 12240 error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap, 12241 vp->v_type == VDIR, FALSE); 12242 if (error) 12243 return (error); 12244 } else { 12245 /* 12246 * These are ace_t type entries. 
12247 */ 12248 error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap, 12249 FALSE); 12250 if (error) 12251 return (error); 12252 } 12253 bzero(&va, sizeof (va)); 12254 error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap); 12255 vs_ace4_destroy(&nfsace4_vsap); 12256 return (error); 12257 } 12258 return (ENOSYS); 12259 } 12260 12261 /* ARGSUSED */ 12262 int 12263 nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 12264 caller_context_t *ct) 12265 { 12266 int error; 12267 mntinfo4_t *mi; 12268 nfs4_ga_res_t gar; 12269 rnode4_t *rp = VTOR4(vp); 12270 12271 mi = VTOMI4(vp); 12272 if (nfs_zone() != mi->mi_zone) 12273 return (EIO); 12274 12275 bzero(&gar, sizeof (gar)); 12276 gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask; 12277 12278 /* 12279 * vsecattr->vsa_mask holds the original acl request mask. 12280 * This is needed when determining what to return. 12281 * (See: nfs4_create_getsecattr_return()) 12282 */ 12283 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET); 12284 if (error) /* EINVAL */ 12285 return (error); 12286 12287 if (mi->mi_flags & MI4_ACL) { 12288 /* 12289 * Check if the data is cached and the cache is valid. If it 12290 * is we don't go over the wire. 12291 */ 12292 if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) { 12293 mutex_enter(&rp->r_statelock); 12294 if (rp->r_secattr != NULL) { 12295 error = nfs4_create_getsecattr_return( 12296 rp->r_secattr, vsecattr, rp->r_attr.va_uid, 12297 rp->r_attr.va_gid, 12298 vp->v_type == VDIR); 12299 if (!error) { /* error == 0 - Success! */ 12300 mutex_exit(&rp->r_statelock); 12301 return (error); 12302 } 12303 } 12304 mutex_exit(&rp->r_statelock); 12305 } 12306 12307 /* 12308 * The getattr otw call will always get both the acl, in 12309 * the form of a list of nfsace4's, and the number of acl 12310 * entries; independent of the value of gar.n4g_vsa.vsa_mask. 
12311 */ 12312 gar.n4g_va.va_mask = AT_ALL; 12313 error = nfs4_getattr_otw(vp, &gar, cr, 1); 12314 if (error) { 12315 vs_ace4_destroy(&gar.n4g_vsa); 12316 if (error == ENOTSUP || error == EOPNOTSUPP) 12317 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12318 return (error); 12319 } 12320 12321 if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) { 12322 /* 12323 * No error was returned, but according to the response 12324 * bitmap, neither was an acl. 12325 */ 12326 vs_ace4_destroy(&gar.n4g_vsa); 12327 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12328 return (error); 12329 } 12330 12331 /* 12332 * Update the cache with the ACL. 12333 */ 12334 nfs4_acl_fill_cache(rp, &gar.n4g_vsa); 12335 12336 error = nfs4_create_getsecattr_return(&gar.n4g_vsa, 12337 vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid, 12338 vp->v_type == VDIR); 12339 vs_ace4_destroy(&gar.n4g_vsa); 12340 if ((error) && (vsecattr->vsa_mask & 12341 (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) && 12342 (error != EACCES)) { 12343 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12344 } 12345 return (error); 12346 } 12347 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12348 return (error); 12349 } 12350 12351 /* 12352 * The function returns: 12353 * - 0 (zero) if the passed in "acl_mask" is a valid request. 12354 * - EINVAL if the passed in "acl_mask" is an invalid request. 12355 * 12356 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if: 12357 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12358 * 12359 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if: 12360 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12361 * - We have a count field set without the corresponding acl field set. (e.g. - 12362 * VSA_ACECNT is set, but VSA_ACE is not) 12363 */ 12364 static int 12365 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op) 12366 { 12367 /* Shortcut the masks that are always valid. 
*/ 12368 if (acl_mask == (VSA_ACE | VSA_ACECNT)) 12369 return (0); 12370 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) 12371 return (0); 12372 12373 if (acl_mask & (VSA_ACE | VSA_ACECNT)) { 12374 /* 12375 * We can't have any VSA_ACL type stuff in the mask now. 12376 */ 12377 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12378 VSA_DFACLCNT)) 12379 return (EINVAL); 12380 12381 if (op == NFS4_ACL_SET) { 12382 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE)) 12383 return (EINVAL); 12384 } 12385 } 12386 12387 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) { 12388 /* 12389 * We can't have any VSA_ACE type stuff in the mask now. 12390 */ 12391 if (acl_mask & (VSA_ACE | VSA_ACECNT)) 12392 return (EINVAL); 12393 12394 if (op == NFS4_ACL_SET) { 12395 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL)) 12396 return (EINVAL); 12397 12398 if ((acl_mask & VSA_DFACLCNT) && 12399 !(acl_mask & VSA_DFACL)) 12400 return (EINVAL); 12401 } 12402 } 12403 return (0); 12404 } 12405 12406 /* 12407 * The theory behind creating the correct getsecattr return is simply this: 12408 * "Don't return anything that the caller is not expecting to have to free." 12409 */ 12410 static int 12411 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap, 12412 uid_t uid, gid_t gid, int isdir) 12413 { 12414 int error = 0; 12415 /* Save the mask since the translators modify it. */ 12416 uint_t orig_mask = vsap->vsa_mask; 12417 12418 if (orig_mask & (VSA_ACE | VSA_ACECNT)) { 12419 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, 12420 FALSE, ((orig_mask & VSA_ACE) ? FALSE : TRUE)); 12421 12422 if (error) 12423 return (error); 12424 12425 /* 12426 * If the caller only asked for the ace count (VSA_ACECNT) 12427 * don't give them the full acl (VSA_ACE), free it. 
12428 */ 12429 if (!orig_mask & VSA_ACE) { 12430 if (vsap->vsa_aclentp != NULL) { 12431 kmem_free(vsap->vsa_aclentp, 12432 vsap->vsa_aclcnt * sizeof (ace_t)); 12433 vsap->vsa_aclentp = NULL; 12434 } 12435 } 12436 vsap->vsa_mask = orig_mask; 12437 12438 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12439 VSA_DFACLCNT)) { 12440 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid, 12441 isdir, FALSE, 12442 ((orig_mask & (VSA_ACL | VSA_DFACL)) ? FALSE : TRUE)); 12443 12444 if (error) 12445 return (error); 12446 12447 /* 12448 * If the caller only asked for the acl count (VSA_ACLCNT) 12449 * and/or the default acl count (VSA_DFACLCNT) don't give them 12450 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it. 12451 */ 12452 if (!orig_mask & VSA_ACL) { 12453 if (vsap->vsa_aclentp != NULL) { 12454 kmem_free(vsap->vsa_aclentp, 12455 vsap->vsa_aclcnt * sizeof (aclent_t)); 12456 vsap->vsa_aclentp = NULL; 12457 } 12458 } 12459 12460 if (!orig_mask & VSA_DFACL) { 12461 if (vsap->vsa_dfaclentp != NULL) { 12462 kmem_free(vsap->vsa_dfaclentp, 12463 vsap->vsa_dfaclcnt * sizeof (aclent_t)); 12464 vsap->vsa_dfaclentp = NULL; 12465 } 12466 } 12467 vsap->vsa_mask = orig_mask; 12468 } 12469 return (0); 12470 } 12471 12472 /* ARGSUSED */ 12473 int 12474 nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr, 12475 caller_context_t *ct) 12476 { 12477 int error; 12478 12479 if (nfs_zone() != VTOMI4(vp)->mi_zone) 12480 return (EIO); 12481 /* 12482 * check for valid cmd parameter 12483 */ 12484 if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS) 12485 return (EINVAL); 12486 12487 /* 12488 * Check access permissions 12489 */ 12490 if ((cmd & F_SHARE) && 12491 (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) || 12492 (shr->s_access == F_WRACC && (flag & FWRITE) == 0))) 12493 return (EBADF); 12494 12495 /* 12496 * If the filesystem is mounted using local locking, pass the 12497 * request off to the local share code. 
12498 */ 12499 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) 12500 return (fs_shrlock(vp, cmd, shr, flag, cr, ct)); 12501 12502 switch (cmd) { 12503 case F_SHARE: 12504 case F_UNSHARE: 12505 /* 12506 * This will be properly implemented later, 12507 * see RFE: 4823948 . 12508 */ 12509 error = EAGAIN; 12510 break; 12511 12512 case F_HASREMOTELOCKS: 12513 /* 12514 * NFS client can't store remote locks itself 12515 */ 12516 shr->s_access = 0; 12517 error = 0; 12518 break; 12519 12520 default: 12521 error = EINVAL; 12522 break; 12523 } 12524 12525 return (error); 12526 } 12527 12528 /* 12529 * Common code called by directory ops to update the attrcache 12530 */ 12531 static int 12532 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp, 12533 hrtime_t t, vnode_t *vp, cred_t *cr) 12534 { 12535 int error = 0; 12536 12537 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12538 12539 if (status != NFS4_OK) { 12540 /* getattr not done or failed */ 12541 PURGE_ATTRCACHE4(vp); 12542 return (error); 12543 } 12544 12545 if (garp) { 12546 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL); 12547 } else { 12548 PURGE_ATTRCACHE4(vp); 12549 } 12550 return (error); 12551 } 12552 12553 /* 12554 * Update directory caches for directory modification ops (link, rename, etc.) 12555 * When dinfo is NULL, manage dircaches in the old way. 12556 */ 12557 static void 12558 nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm, 12559 dirattr_info_t *dinfo) 12560 { 12561 rnode4_t *drp = VTOR4(dvp); 12562 12563 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 12564 12565 /* Purge rddir cache for dir since it changed */ 12566 if (drp->r_dir != NULL) 12567 nfs4_purge_rddir_cache(dvp); 12568 12569 /* 12570 * If caller provided dinfo, then use it to manage dir caches. 
12571 */ 12572 if (dinfo != NULL) { 12573 if (vp != NULL) { 12574 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12575 if (!VTOR4(vp)->created_v4) { 12576 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12577 dnlc_update(dvp, nm, vp); 12578 } else { 12579 /* 12580 * XXX don't update if the created_v4 flag is 12581 * set 12582 */ 12583 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12584 NFS4_DEBUG(nfs4_client_state_debug, 12585 (CE_NOTE, "nfs4_update_dircaches: " 12586 "don't update dnlc: created_v4 flag")); 12587 } 12588 } 12589 12590 nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call, 12591 dinfo->di_cred, FALSE, cinfo); 12592 12593 return; 12594 } 12595 12596 /* 12597 * Caller didn't provide dinfo, then check change_info4 to update DNLC. 12598 * Since caller modified dir but didn't receive post-dirmod-op dir 12599 * attrs, the dir's attrs must be purged. 12600 * 12601 * XXX this check and dnlc update/purge should really be atomic, 12602 * XXX but can't use rnode statelock because it'll deadlock in 12603 * XXX dnlc_purge_vp, however, the risk is minimal even if a race 12604 * XXX does occur. 12605 * 12606 * XXX We also may want to check that atomic is true in the 12607 * XXX change_info struct. If it is not, the change_info may 12608 * XXX reflect changes by more than one clients which means that 12609 * XXX our cache may not be valid. 
12610 */ 12611 PURGE_ATTRCACHE4(dvp); 12612 if (drp->r_change == cinfo->before) { 12613 /* no changes took place in the directory prior to our link */ 12614 if (vp != NULL) { 12615 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12616 if (!VTOR4(vp)->created_v4) { 12617 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12618 dnlc_update(dvp, nm, vp); 12619 } else { 12620 /* 12621 * XXX dont' update if the created_v4 flag 12622 * is set 12623 */ 12624 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12625 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 12626 "nfs4_update_dircaches: don't" 12627 " update dnlc: created_v4 flag")); 12628 } 12629 } 12630 } else { 12631 /* Another client modified directory - purge its dnlc cache */ 12632 dnlc_purge_vp(dvp); 12633 } 12634 } 12635 12636 /* 12637 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a 12638 * file. 12639 * 12640 * The 'reopening_file' boolean should be set to TRUE if we are reopening this 12641 * file (ie: client recovery) and otherwise set to FALSE. 12642 * 12643 * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery 12644 * initiated) calling functions. 12645 * 12646 * 'resend' is set to TRUE if this is a OPEN_CONFIRM issued as a result 12647 * of resending a 'lost' open request. 12648 * 12649 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken 12650 * server that hands out BAD_SEQID on open confirm. 12651 * 12652 * Errors are returned via the nfs4_error_t parameter. 
 */
void
nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr,
    bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop,
    bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 argop[2];
	nfs_resop4 *resop;
	int doqueue = 1;
	mntinfo4_t *mi;
	OPEN_CONFIRM4args *open_confirm_args;
	int needrecov;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
#if DEBUG
	/* Debug-only check of oo_seqid_inuse on the open owner. */
	mutex_enter(&oop->oo_lock);
	ASSERT(oop->oo_seqid_inuse);
	mutex_exit(&oop->oo_lock);
#endif

	/* Re-entered via goto after a timeout or NFS4ERR_RESOURCE. */
recov_retry_confirm:
	nfs4_error_zinit(ep);
	*retry_open = FALSE;

	if (resend)
		args.ctag = TAG_OPEN_CONFIRM_LOST;
	else
		args.ctag = TAG_OPEN_CONFIRM;

	/* Two-op compound: CPUTFH followed by OPEN_CONFIRM. */
	args.array_len = 2;
	args.array = argop;

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	argop[1].argop = OP_OPEN_CONFIRM;
	open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm;

	/* The caller's seqid is advanced on every attempt, retries included. */
	(*seqid) += 1;
	open_confirm_args->seqid = *seqid;
	open_confirm_args->open_stateid = *stateid;

	mi = VTOMI4(vp);

	rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);

	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
		nfs4_set_open_seqid((*seqid), oop, args.ctag);
	}

	needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
	/* Error with no recovery needed: nothing to free, just return. */
	if (!needrecov && ep->error)
		return;

	if (needrecov) {
		bool_t abort = FALSE;

		/* Recovery is only started here when not already reopening. */
		if (reopening_file == FALSE) {
			nfs4_bseqid_entry_t *bsep = NULL;

			if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
				bsep = nfs4_create_bseqid_entry(oop, NULL,
				    vp, 0, args.ctag,
				    open_confirm_args->seqid);

			abort = nfs4_start_recovery(ep, VTOMI4(vp), vp,
			    NULL, NULL, NULL, OP_OPEN_CONFIRM, bsep);
			if (bsep) {
				kmem_free(bsep, sizeof (*bsep));
				/* Give up once the BAD_SEQID budget hits 0. */
				if (num_bseqid_retryp &&
				    --(*num_bseqid_retryp) == 0)
					abort = TRUE;
			}
		}
		/*
		 * NOTE(review): res.status is read here even when ep->error
		 * is set; confirm rfs4call() leaves res in a defined state
		 * on RPC failure.
		 */
		if ((ep->error == ETIMEDOUT ||
		    res.status == NFS4ERR_RESOURCE) &&
		    abort == FALSE && resend == FALSE) {
			if (!ep->error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);

			delay(SEC_TO_TICK(confirm_retry_sec));
			goto recov_retry_confirm;
		}
		/* State may have changed so retry the entire OPEN op */
		if (abort == FALSE)
			*retry_open = TRUE;
		else
			*retry_open = FALSE;
		if (!ep->error)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	/* Server-side failure that does not need recovery. */
	if (res.status) {
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	/* Success: hand the confirmed stateid back through *stateid. */
	resop = &res.array[1];	/* open confirm res */
	bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid,
	    stateid, sizeof (*stateid));

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}

/*
 * Return the credentials associated with a client state object.  The
 * caller is responsible for freeing the credentials.
 *
 * The returned cred is osp->os_open_owner->oo_cred with an extra hold
 * taken via crhold().
 */

static cred_t *
state_to_cred(nfs4_open_stream_t *osp)
{
	cred_t *cr;

	/*
	 * It's ok to not lock the open stream and open owner to get
	 * the oo_cred since this is only written once (upon creation)
	 * and will not change.
	 */
	cr = osp->os_open_owner->oo_cred;
	crhold(cr);

	return (cr);
}

/*
 * nfs4_find_sysid
 *
 * Find the sysid for the knetconfig associated with the given mi.
12787 */ 12788 static struct lm_sysid * 12789 nfs4_find_sysid(mntinfo4_t *mi) 12790 { 12791 ASSERT(nfs_zone() == mi->mi_zone); 12792 12793 /* 12794 * Switch from RDMA knconf to original mount knconf 12795 */ 12796 return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr, 12797 mi->mi_curr_serv->sv_hostname, NULL)); 12798 } 12799 12800 #ifdef DEBUG 12801 /* 12802 * Return a string version of the call type for easy reading. 12803 */ 12804 static char * 12805 nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype) 12806 { 12807 switch (ctype) { 12808 case NFS4_LCK_CTYPE_NORM: 12809 return ("NORMAL"); 12810 case NFS4_LCK_CTYPE_RECLAIM: 12811 return ("RECLAIM"); 12812 case NFS4_LCK_CTYPE_RESEND: 12813 return ("RESEND"); 12814 case NFS4_LCK_CTYPE_REINSTATE: 12815 return ("REINSTATE"); 12816 default: 12817 cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal " 12818 "type %d", ctype); 12819 return (""); 12820 } 12821 } 12822 #endif 12823 12824 /* 12825 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type 12826 * Unlock requests don't have an over-the-wire locktype, so we just return 12827 * something non-threatening. 12828 */ 12829 12830 static nfs_lock_type4 12831 flk_to_locktype(int cmd, int l_type) 12832 { 12833 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK); 12834 12835 switch (l_type) { 12836 case F_UNLCK: 12837 return (READ_LT); 12838 case F_RDLCK: 12839 if (cmd == F_SETLK) 12840 return (READ_LT); 12841 else 12842 return (READW_LT); 12843 case F_WRLCK: 12844 if (cmd == F_SETLK) 12845 return (WRITE_LT); 12846 else 12847 return (WRITEW_LT); 12848 } 12849 panic("flk_to_locktype"); 12850 /*NOTREACHED*/ 12851 } 12852 12853 /* 12854 * Do some preliminary checks for nfs4frlock. 
12855 */ 12856 static int 12857 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp, 12858 u_offset_t offset) 12859 { 12860 int error = 0; 12861 12862 /* 12863 * If we are setting a lock, check that the file is opened 12864 * with the correct mode. 12865 */ 12866 if (cmd == F_SETLK || cmd == F_SETLKW) { 12867 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) || 12868 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) { 12869 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12870 "nfs4frlock_validate_args: file was opened with " 12871 "incorrect mode")); 12872 return (EBADF); 12873 } 12874 } 12875 12876 /* Convert the offset. It may need to be restored before returning. */ 12877 if (error = convoff(vp, flk, 0, offset)) { 12878 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12879 "nfs4frlock_validate_args: convoff => error= %d\n", 12880 error)); 12881 return (error); 12882 } 12883 12884 return (error); 12885 } 12886 12887 /* 12888 * Set the flock64's lm_sysid for nfs4frlock. 12889 */ 12890 static int 12891 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk) 12892 { 12893 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12894 12895 /* Find the lm_sysid */ 12896 *lspp = nfs4_find_sysid(VTOMI4(vp)); 12897 12898 if (*lspp == NULL) { 12899 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12900 "nfs4frlock_get_sysid: no sysid, return ENOLCK")); 12901 return (ENOLCK); 12902 } 12903 12904 flk->l_sysid = lm_sysidt(*lspp); 12905 12906 return (0); 12907 } 12908 12909 /* 12910 * Do the remaining preliminary setup for nfs4frlock. 12911 */ 12912 static void 12913 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep, 12914 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr, 12915 cred_t **cred_otw) 12916 { 12917 /* 12918 * set tick_delay to the base delay time. 
12919 * (NFS4_BASE_WAIT_TIME is in secs) 12920 */ 12921 12922 *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000); 12923 12924 /* 12925 * If lock is relative to EOF, we need the newest length of the 12926 * file. Therefore invalidate the ATTR_CACHE. 12927 */ 12928 12929 *whencep = flk->l_whence; 12930 12931 if (*whencep == 2) /* SEEK_END */ 12932 PURGE_ATTRCACHE4(vp); 12933 12934 recov_statep->rs_flags = 0; 12935 recov_statep->rs_num_retry_despite_err = 0; 12936 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL); 12937 } 12938 12939 /* 12940 * Initialize and allocate the data structures necessary for 12941 * the nfs4frlock call. 12942 * Allocates argsp's op array, frees up the saved_rqstpp if there is one. 12943 */ 12944 static void 12945 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp, 12946 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd, 12947 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp, 12948 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp) 12949 { 12950 int argoplist_size; 12951 int num_ops = 2; 12952 12953 *retry = FALSE; 12954 *did_start_fop = FALSE; 12955 *skip_get_err = FALSE; 12956 lost_rqstp->lr_op = 0; 12957 argoplist_size = num_ops * sizeof (nfs_argop4); 12958 /* fill array with zero */ 12959 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP); 12960 12961 *argspp = argsp; 12962 *respp = NULL; 12963 12964 argsp->array_len = num_ops; 12965 argsp->array = *argopp; 12966 12967 /* initialize in case of error; will get real value down below */ 12968 argsp->ctag = TAG_NONE; 12969 12970 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) 12971 *op_hintp = OH_LOCKU; 12972 else 12973 *op_hintp = OH_OTHER; 12974 } 12975 12976 /* 12977 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign 12978 * the proper nfs4_server_t for this instance of nfs4frlock. 12979 * Returns 0 (success) or an errno value. 
12980 */ 12981 static int 12982 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp, 12983 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep, 12984 bool_t *did_start_fop, bool_t *startrecovp) 12985 { 12986 int error = 0; 12987 rnode4_t *rp; 12988 12989 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12990 12991 if (ctype == NFS4_LCK_CTYPE_NORM) { 12992 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint, 12993 recov_statep, startrecovp); 12994 if (error) 12995 return (error); 12996 *did_start_fop = TRUE; 12997 } else { 12998 *did_start_fop = FALSE; 12999 *startrecovp = FALSE; 13000 } 13001 13002 if (!error) { 13003 rp = VTOR4(vp); 13004 13005 /* If the file failed recovery, just quit. */ 13006 mutex_enter(&rp->r_statelock); 13007 if (rp->r_flags & R4RECOVERR) { 13008 error = EIO; 13009 } 13010 mutex_exit(&rp->r_statelock); 13011 } 13012 13013 return (error); 13014 } 13015 13016 /* 13017 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A 13018 * resend nfs4frlock call is initiated by the recovery framework. 13019 * Acquires the lop and oop seqid synchronization. 
 */
static void
nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp,
    COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp,
    nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
    LOCK4args **lock_argsp, LOCKU4args **locku_argsp)
{
	/*
	 * NOTE(review): resend_rqstp is dereferenced here, before the
	 * ASSERT(resend_rqstp != NULL) below.
	 */
	mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp);
	int error;

	NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug),
	    (CE_NOTE,
	    "nfs4frlock_setup_resend_lock_args: have lost lock to resend"));
	ASSERT(resend_rqstp != NULL);
	ASSERT(resend_rqstp->lr_op == OP_LOCK ||
	    resend_rqstp->lr_op == OP_LOCKU);

	/* Open owner is optional: hold it and start its seqid sync if set. */
	*oopp = resend_rqstp->lr_oop;
	if (resend_rqstp->lr_oop) {
		open_owner_hold(resend_rqstp->lr_oop);
		error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi);
		ASSERT(error == 0);	/* recov thread always succeeds */
	}

	/* Must resend this lost lock/locku request. */
	ASSERT(resend_rqstp->lr_lop != NULL);
	*lopp = resend_rqstp->lr_lop;
	lock_owner_hold(resend_rqstp->lr_lop);
	error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi);
	ASSERT(error == 0);	/* recov thread always succeeds */

	/* Open stream is optional as well; take a hold when present. */
	*ospp = resend_rqstp->lr_osp;
	if (*ospp)
		open_stream_hold(resend_rqstp->lr_osp);

	if (resend_rqstp->lr_op == OP_LOCK) {
		LOCK4args *lock_args;

		argop->argop = OP_LOCK;
		*lock_argsp = lock_args = &argop->nfs_argop4_u.oplock;
		lock_args->locktype = resend_rqstp->lr_locktype;
		lock_args->reclaim =
		    (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM);
		lock_args->offset = resend_rqstp->lr_flk->l_start;
		lock_args->length = resend_rqstp->lr_flk->l_len;
		/* A zero l_len is sent as an all-ones length. */
		if (lock_args->length == 0)
			lock_args->length = ~lock_args->length;
		nfs4_setup_lock_args(*lopp, *oopp, *ospp,
		    mi2clientid(mi), &lock_args->locker);

		/* Tag the compound according to why the lock is resent. */
		switch (resend_rqstp->lr_ctype) {
		case NFS4_LCK_CTYPE_RESEND:
			argsp->ctag = TAG_LOCK_RESEND;
			break;
		case NFS4_LCK_CTYPE_REINSTATE:
			argsp->ctag = TAG_LOCK_REINSTATE;
			break;
		case NFS4_LCK_CTYPE_RECLAIM:
			argsp->ctag = TAG_LOCK_RECLAIM;
			break;
		default:
			argsp->ctag = TAG_LOCK_UNKNOWN;
			break;
		}
	} else {
		LOCKU4args *locku_args;
		nfs4_lock_owner_t *lop = resend_rqstp->lr_lop;

		argop->argop = OP_LOCKU;
		*locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku;
		locku_args->locktype = READ_LT;
		locku_args->seqid = lop->lock_seqid + 1;
		/* The lock stateid is copied under lo_lock. */
		mutex_enter(&lop->lo_lock);
		locku_args->lock_stateid = lop->lock_stateid;
		mutex_exit(&lop->lo_lock);
		locku_args->offset = resend_rqstp->lr_flk->l_start;
		locku_args->length = resend_rqstp->lr_flk->l_len;
		/* A zero l_len is sent as an all-ones length. */
		if (locku_args->length == 0)
			locku_args->length = ~locku_args->length;

		switch (resend_rqstp->lr_ctype) {
		case NFS4_LCK_CTYPE_RESEND:
			argsp->ctag = TAG_LOCKU_RESEND;
			break;
		case NFS4_LCK_CTYPE_REINSTATE:
			argsp->ctag = TAG_LOCKU_REINSTATE;
			break;
		default:
			argsp->ctag = TAG_LOCK_UNKNOWN;
			break;
		}
	}
}

/*
 * Setup the LOCKT4 arguments.
 *
 * Fills in argop as an OP_LOCKT, tags the compound TAG_LOCKT and returns a
 * pointer to the LOCKT4args through *lockt_argsp.  Only called with
 * ctype == NFS4_LCK_CTYPE_NORM (asserted).
 */
static void
nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
    LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk,
    rnode4_t *rp)
{
	LOCKT4args *lockt_args;

	ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
	ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
	argop->argop = OP_LOCKT;
	argsp->ctag = TAG_LOCKT;
	lockt_args = &argop->nfs_argop4_u.oplockt;

	/*
	 * The locktype will be READ_LT unless it's
	 * a write lock. We do this because the Solaris
	 * system call allows the combination of
	 * F_UNLCK and F_GETLK* and so in that case the
	 * unlock is mapped to a read.
	 */
	if (flk->l_type == F_WRLCK)
		lockt_args->locktype = WRITE_LT;
	else
		lockt_args->locktype = READ_LT;

	lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp)));
	/* set the lock owner4 args */
	nfs4_setlockowner_args(&lockt_args->owner, rp,
	    ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
	    flk->l_pid);
	lockt_args->offset = flk->l_start;
	lockt_args->length = flk->l_len;
	/* A zero l_len is sent as an all-ones length. */
	if (flk->l_len == 0)
		lockt_args->length = ~lockt_args->length;

	*lockt_argsp = lockt_args;
}

/*
 * If the client is holding a delegation, and the open stream to be used
 * with this lock request is a delegation open stream, then re-open the stream.
 * Sets the nfs4_error_t to all zeros unless the open stream has already
 * failed a reopen or we couldn't find the open stream.  NFS4ERR_DELAY
 * means the caller should retry (like a recovery retry).
 *
 * 'lt' is the lock type (F_RDLCK / F_WRLCK / F_UNLCK) being requested; it
 * is compared against the access bits recorded in os_dc_openacc.
 */
static void
nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt)
{
	open_delegation_type4 dt;
	bool_t reopen_needed, force;
	nfs4_open_stream_t *osp;
	open_claim_type4 oclaim;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);

	ASSERT(nfs_zone() == mi->mi_zone);

	nfs4_error_zinit(ep);

	/* Snapshot the delegation type under r_statev4_lock. */
	mutex_enter(&rp->r_statev4_lock);
	dt = rp->r_deleg_type;
	mutex_exit(&rp->r_statev4_lock);

	if (dt != OPEN_DELEGATE_NONE) {
		nfs4_open_owner_t *oop;

		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
		if (!oop) {
			ep->stat = NFS4ERR_IO;
			return;
		}
		/* returns with 'os_sync_lock' held */
		osp = find_open_stream(oop, rp);
		if (!osp) {
			open_owner_rele(oop);
			ep->stat = NFS4ERR_IO;
			return;
		}

		if (osp->os_failed_reopen) {
			NFS4_DEBUG((nfs4_open_stream_debug ||
			    nfs4_client_lock_debug), (CE_NOTE,
			    "nfs4frlock_check_deleg: os_failed_reopen set "
			    "for osp %p, cr %p, rp %s", (void *)osp,
			    (void *)cr, rnode4info(rp)));
			mutex_exit(&osp->os_sync_lock);
			open_stream_rele(osp, rp);
			open_owner_rele(oop);
			ep->stat = NFS4ERR_IO;
			return;
		}

		/*
		 * Determine whether a reopen is needed.  If this
		 * is a delegation open stream, then send the open
		 * to the server to give visibility to the open owner.
		 * Even if it isn't a delegation open stream, we need
		 * to check if the previous open CLAIM_DELEGATE_CUR
		 * was sufficient.
		 */

		reopen_needed = osp->os_delegation ||
		    ((lt == F_RDLCK &&
		    !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) ||
		    (lt == F_WRLCK &&
		    !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE)));

		/* The open stream reference is kept past this point. */
		mutex_exit(&osp->os_sync_lock);
		open_owner_rele(oop);

		if (reopen_needed) {
			/*
			 * Always use CLAIM_PREVIOUS after server reboot.
			 * The server will reject CLAIM_DELEGATE_CUR if
			 * it is used during the grace period.
			 */
			mutex_enter(&mi->mi_lock);
			if (mi->mi_recovflags & MI4R_SRV_REBOOT) {
				oclaim = CLAIM_PREVIOUS;
				force = TRUE;
			} else {
				oclaim = CLAIM_DELEGATE_CUR;
				force = FALSE;
			}
			mutex_exit(&mi->mi_lock);

			nfs4_reopen(vp, osp, ep, oclaim, force, FALSE);
			/* EAGAIN is reported to the caller as NFS4ERR_DELAY. */
			if (ep->error == EAGAIN) {
				nfs4_error_zinit(ep);
				ep->stat = NFS4ERR_DELAY;
			}
		}
		open_stream_rele(osp, rp);
		osp = NULL;
	}
}

/*
 * Setup the LOCKU4 arguments.
 * Returns errors via the nfs4_error_t.
 * NFS4_OK		no problems.  *go_otwp is TRUE if call should go
 *			over-the-wire.  The caller must release the
 *			reference on *lopp.
 * NFS4ERR_DELAY	caller should retry (like recovery retry)
 * (other)		unrecoverable error.
 */
static void
nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
    LOCKU4args **locku_argsp, flock64_t *flk,
    nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp,
    vnode_t *vp, int flag, u_offset_t offset, cred_t *cr,
    bool_t *skip_get_err, bool_t *go_otwp)
{
	nfs4_lock_owner_t *lop = NULL;
	LOCKU4args *locku_args;
	pid_t pid;
	bool_t is_spec = FALSE;
	rnode4_t *rp = VTOR4(vp);

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT(ctype == NFS4_LCK_CTYPE_NORM);

	/* May reopen a delegation open stream; bail on any failure. */
	nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK);
	if (ep->error || ep->stat)
		return;

	argop->argop = OP_LOCKU;
	/*
	 * NOTE(review): ctype is asserted to be NFS4_LCK_CTYPE_NORM above,
	 * so the REINSTATE arm is only reachable on non-DEBUG kernels.
	 */
	if (ctype == NFS4_LCK_CTYPE_REINSTATE)
		argsp->ctag = TAG_LOCKU_REINSTATE;
	else
		argsp->ctag = TAG_LOCKU;
	locku_args = &argop->nfs_argop4_u.oplocku;
	*locku_argsp = locku_args;

	/*
	 * XXX what should locku_args->locktype be?
	 * setting to ALWAYS be READ_LT so at least
	 * it is a valid locktype.
	 */

	locku_args->locktype = READ_LT;

	pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
	    flk->l_pid;

	/*
	 * Get the lock owner stateid.  If no lock owner
	 * exists, return success.
	 */
	lop = find_lock_owner(rp, pid, LOWN_ANY);
	*lopp = lop;
	if (lop && CLNT_ISSPECIAL(&lop->lock_stateid))
		is_spec = TRUE;
	if (!lop || is_spec) {
		/*
		 * No lock owner so no locks to unlock.
		 * Return success.  If there was a failed
		 * reclaim earlier, the lock might still be
		 * registered with the local locking code,
		 * so notify it of the unlock.
		 *
		 * If the lockowner is using a special stateid,
		 * then the original lock request (that created
		 * this lockowner) was never successful, so we
		 * have no lock to undo OTW.
		 */
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock_setup_locku_args: LOCKU: no lock owner "
		    "(%ld) so return success", (long)pid));

		if (ctype == NFS4_LCK_CTYPE_NORM)
			flk->l_pid = curproc->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
		/*
		 * Release our hold and NULL out so final_cleanup
		 * doesn't try to end a lock seqid sync we
		 * never started.
		 */
		if (is_spec) {
			lock_owner_rele(lop);
			*lopp = NULL;
		}
		*skip_get_err = TRUE;
		*go_otwp = FALSE;
		return;
	}

	/*
	 * On EAGAIN the hold is dropped and *lopp cleared; *go_otwp is left
	 * untouched on this path, so the caller is expected to look at
	 * ep->error first.
	 */
	ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp));
	if (ep->error == EAGAIN) {
		lock_owner_rele(lop);
		*lopp = NULL;
		return;
	}

	/* The lock stateid is copied under lo_lock. */
	mutex_enter(&lop->lo_lock);
	locku_args->lock_stateid = lop->lock_stateid;
	mutex_exit(&lop->lo_lock);
	locku_args->seqid = lop->lock_seqid + 1;

	/* leave the ref count on lop, rele after RPC call */

	locku_args->offset = flk->l_start;
	locku_args->length = flk->l_len;
	/* A zero l_len is sent as an all-ones length. */
	if (flk->l_len == 0)
		locku_args->length = ~locku_args->length;

	*go_otwp = TRUE;
}

/*
 * Setup the LOCK4 arguments.
 *
 * Returns errors via the nfs4_error_t.
13370 * NFS4_OK no problems 13371 * NFS4ERR_DELAY caller should retry (like recovery retry) 13372 * (other) unrecoverable error 13373 */ 13374 static void 13375 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp, 13376 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13377 nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp, 13378 flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep) 13379 { 13380 LOCK4args *lock_args; 13381 nfs4_open_owner_t *oop = NULL; 13382 nfs4_open_stream_t *osp = NULL; 13383 nfs4_lock_owner_t *lop = NULL; 13384 pid_t pid; 13385 rnode4_t *rp = VTOR4(vp); 13386 13387 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13388 13389 nfs4frlock_check_deleg(vp, ep, cr, flk->l_type); 13390 if (ep->error || ep->stat != NFS4_OK) 13391 return; 13392 13393 argop->argop = OP_LOCK; 13394 if (ctype == NFS4_LCK_CTYPE_NORM) 13395 argsp->ctag = TAG_LOCK; 13396 else if (ctype == NFS4_LCK_CTYPE_RECLAIM) 13397 argsp->ctag = TAG_RELOCK; 13398 else 13399 argsp->ctag = TAG_LOCK_REINSTATE; 13400 lock_args = &argop->nfs_argop4_u.oplock; 13401 lock_args->locktype = flk_to_locktype(cmd, flk->l_type); 13402 lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0; 13403 /* 13404 * Get the lock owner. If no lock owner exists, 13405 * create a 'temporary' one and grab the open seqid 13406 * synchronization (which puts a hold on the open 13407 * owner and open stream). 13408 * This also grabs the lock seqid synchronization. 13409 */ 13410 pid = ctype == NFS4_LCK_CTYPE_NORM ? 
curproc->p_pid : flk->l_pid; 13411 ep->stat = 13412 nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop); 13413 13414 if (ep->stat != NFS4_OK) 13415 goto out; 13416 13417 nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)), 13418 &lock_args->locker); 13419 13420 lock_args->offset = flk->l_start; 13421 lock_args->length = flk->l_len; 13422 if (flk->l_len == 0) 13423 lock_args->length = ~lock_args->length; 13424 *lock_argsp = lock_args; 13425 out: 13426 *oopp = oop; 13427 *ospp = osp; 13428 *lopp = lop; 13429 } 13430 13431 /* 13432 * After we get the reply from the server, record the proper information 13433 * for possible resend lock requests. 13434 * 13435 * Allocates memory for the saved_rqstp if we have a lost lock to save. 13436 */ 13437 static void 13438 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error, 13439 nfs_lock_type4 locktype, nfs4_open_owner_t *oop, 13440 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13441 nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp) 13442 { 13443 bool_t unlock = (flk->l_type == F_UNLCK); 13444 13445 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13446 ASSERT(ctype == NFS4_LCK_CTYPE_NORM || 13447 ctype == NFS4_LCK_CTYPE_REINSTATE); 13448 13449 if (error != 0 && !unlock) { 13450 NFS4_DEBUG((nfs4_lost_rqst_debug || 13451 nfs4_client_lock_debug), (CE_NOTE, 13452 "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 " 13453 " for lop %p", (void *)lop)); 13454 ASSERT(lop != NULL); 13455 mutex_enter(&lop->lo_lock); 13456 lop->lo_pending_rqsts = 1; 13457 mutex_exit(&lop->lo_lock); 13458 } 13459 13460 lost_rqstp->lr_putfirst = FALSE; 13461 lost_rqstp->lr_op = 0; 13462 13463 /* 13464 * For lock/locku requests, we treat EINTR as ETIMEDOUT for 13465 * recovery purposes so that the lock request that was sent 13466 * can be saved and re-issued later. Ditto for EIO from a forced 13467 * unmount. 
This is done to have the client's local locking state 13468 * match the v4 server's state; that is, the request was 13469 * potentially received and accepted by the server but the client 13470 * thinks it was not. 13471 */ 13472 if (error == ETIMEDOUT || error == EINTR || 13473 NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 13474 NFS4_DEBUG((nfs4_lost_rqst_debug || 13475 nfs4_client_lock_debug), (CE_NOTE, 13476 "nfs4frlock_save_lost_rqst: got a lost %s lock for " 13477 "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK", 13478 (void *)lop, (void *)oop, (void *)osp)); 13479 if (unlock) 13480 lost_rqstp->lr_op = OP_LOCKU; 13481 else { 13482 lost_rqstp->lr_op = OP_LOCK; 13483 lost_rqstp->lr_locktype = locktype; 13484 } 13485 /* 13486 * Objects are held and rele'd via the recovery code. 13487 * See nfs4_save_lost_rqst. 13488 */ 13489 lost_rqstp->lr_vp = vp; 13490 lost_rqstp->lr_dvp = NULL; 13491 lost_rqstp->lr_oop = oop; 13492 lost_rqstp->lr_osp = osp; 13493 lost_rqstp->lr_lop = lop; 13494 lost_rqstp->lr_cr = cr; 13495 switch (ctype) { 13496 case NFS4_LCK_CTYPE_NORM: 13497 flk->l_pid = ttoproc(curthread)->p_pid; 13498 lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND; 13499 break; 13500 case NFS4_LCK_CTYPE_REINSTATE: 13501 lost_rqstp->lr_putfirst = TRUE; 13502 lost_rqstp->lr_ctype = ctype; 13503 break; 13504 default: 13505 break; 13506 } 13507 lost_rqstp->lr_flk = flk; 13508 } 13509 } 13510 13511 /* 13512 * Update lop's seqid. Also update the seqid stored in a resend request, 13513 * if any. (Some recovery errors increment the seqid, and we may have to 13514 * send the resend request again.) 
13515 */ 13516 13517 static void 13518 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args, 13519 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type) 13520 { 13521 if (lock_args) { 13522 if (lock_args->locker.new_lock_owner == TRUE) 13523 nfs4_get_and_set_next_open_seqid(oop, tag_type); 13524 else { 13525 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13526 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop); 13527 } 13528 } else if (locku_args) { 13529 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13530 nfs4_set_lock_seqid(lop->lock_seqid +1, lop); 13531 } 13532 } 13533 13534 /* 13535 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13536 * COMPOUND4 args/res for calls that need to retry. 13537 * Switches the *cred_otwp to base_cr. 13538 */ 13539 static void 13540 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint, 13541 nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop, 13542 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error, 13543 nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp, 13544 nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp) 13545 { 13546 nfs4_open_owner_t *oop = *oopp; 13547 nfs4_open_stream_t *osp = *ospp; 13548 nfs4_lock_owner_t *lop = *lopp; 13549 nfs_argop4 *argop = (*argspp)->array; 13550 13551 if (*did_start_fop) { 13552 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13553 needrecov); 13554 *did_start_fop = FALSE; 13555 } 13556 ASSERT((*argspp)->array_len == 2); 13557 if (argop[1].argop == OP_LOCK) 13558 nfs4args_lock_free(&argop[1]); 13559 else if (argop[1].argop == OP_LOCKT) 13560 nfs4args_lockt_free(&argop[1]); 13561 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13562 if (!error) 13563 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13564 *argspp = NULL; 13565 *respp = NULL; 13566 13567 if (lop) { 13568 nfs4_end_lock_seqid_sync(lop); 13569 lock_owner_rele(lop); 13570 *lopp = NULL; 13571 } 13572 13573 /* need to free up 
the reference on osp for lock args */ 13574 if (osp != NULL) { 13575 open_stream_rele(osp, VTOR4(vp)); 13576 *ospp = NULL; 13577 } 13578 13579 /* need to free up the reference on oop for lock args */ 13580 if (oop != NULL) { 13581 nfs4_end_open_seqid_sync(oop); 13582 open_owner_rele(oop); 13583 *oopp = NULL; 13584 } 13585 13586 crfree(*cred_otwp); 13587 *cred_otwp = base_cr; 13588 crhold(*cred_otwp); 13589 } 13590 13591 /* 13592 * Function to process the client's recovery for nfs4frlock. 13593 * Returns TRUE if we should retry the lock request; FALSE otherwise. 13594 * 13595 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13596 * COMPOUND4 args/res for calls that need to retry. 13597 * 13598 * Note: the rp's r_lkserlock is *not* dropped during this path. 13599 */ 13600 static bool_t 13601 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep, 13602 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13603 LOCK4args *lock_args, LOCKU4args *locku_args, 13604 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13605 nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp, 13606 nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint, 13607 bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk) 13608 { 13609 nfs4_open_owner_t *oop = *oopp; 13610 nfs4_open_stream_t *osp = *ospp; 13611 nfs4_lock_owner_t *lop = *lopp; 13612 13613 bool_t abort, retry; 13614 13615 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13616 ASSERT((*argspp) != NULL); 13617 ASSERT((*respp) != NULL); 13618 if (lock_args || locku_args) 13619 ASSERT(lop != NULL); 13620 13621 NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug), 13622 (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n")); 13623 13624 retry = TRUE; 13625 abort = FALSE; 13626 if (needrecov) { 13627 nfs4_bseqid_entry_t *bsep = NULL; 13628 nfs_opnum4 op; 13629 13630 op = lock_args ? OP_LOCK : locku_args ? 
OP_LOCKU : OP_LOCKT; 13631 13632 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) { 13633 seqid4 seqid; 13634 13635 if (lock_args) { 13636 if (lock_args->locker.new_lock_owner == TRUE) 13637 seqid = lock_args->locker.locker4_u. 13638 open_owner.open_seqid; 13639 else 13640 seqid = lock_args->locker.locker4_u. 13641 lock_owner.lock_seqid; 13642 } else if (locku_args) { 13643 seqid = locku_args->seqid; 13644 } else { 13645 seqid = 0; 13646 } 13647 13648 bsep = nfs4_create_bseqid_entry(oop, lop, vp, 13649 flk->l_pid, (*argspp)->ctag, seqid); 13650 } 13651 13652 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 13653 (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK || 13654 lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp : 13655 NULL, op, bsep); 13656 13657 if (bsep) 13658 kmem_free(bsep, sizeof (*bsep)); 13659 } 13660 13661 /* 13662 * Return that we do not want to retry the request for 3 cases: 13663 * 1. If we received EINTR or are bailing out because of a forced 13664 * unmount, we came into this code path just for the sake of 13665 * initiating recovery, we now need to return the error. 13666 * 2. If we have aborted recovery. 13667 * 3. We received NFS4ERR_BAD_SEQID. 
13668 */ 13669 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) || 13670 abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID)) 13671 retry = FALSE; 13672 13673 if (*did_start_fop == TRUE) { 13674 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13675 needrecov); 13676 *did_start_fop = FALSE; 13677 } 13678 13679 if (retry == TRUE) { 13680 nfs_argop4 *argop; 13681 13682 argop = (*argspp)->array; 13683 ASSERT((*argspp)->array_len == 2); 13684 13685 if (argop[1].argop == OP_LOCK) 13686 nfs4args_lock_free(&argop[1]); 13687 else if (argop[1].argop == OP_LOCKT) 13688 nfs4args_lockt_free(&argop[1]); 13689 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13690 if (!ep->error) 13691 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13692 *respp = NULL; 13693 *argspp = NULL; 13694 } 13695 13696 if (lop != NULL) { 13697 nfs4_end_lock_seqid_sync(lop); 13698 lock_owner_rele(lop); 13699 } 13700 13701 *lopp = NULL; 13702 13703 /* need to free up the reference on osp for lock args */ 13704 if (osp != NULL) { 13705 open_stream_rele(osp, rp); 13706 *ospp = NULL; 13707 } 13708 13709 /* need to free up the reference on oop for lock args */ 13710 if (oop != NULL) { 13711 nfs4_end_open_seqid_sync(oop); 13712 open_owner_rele(oop); 13713 *oopp = NULL; 13714 } 13715 13716 return (retry); 13717 } 13718 13719 /* 13720 * Handles the successful reply from the server for nfs4frlock. 
13721 */ 13722 static void 13723 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk, 13724 vnode_t *vp, int flag, u_offset_t offset, 13725 nfs4_lost_rqst_t *resend_rqstp) 13726 { 13727 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13728 if ((cmd == F_SETLK || cmd == F_SETLKW) && 13729 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) { 13730 if (ctype == NFS4_LCK_CTYPE_NORM) { 13731 flk->l_pid = ttoproc(curthread)->p_pid; 13732 /* 13733 * We do not register lost locks locally in 13734 * the 'resend' case since the user/application 13735 * doesn't think we have the lock. 13736 */ 13737 ASSERT(!resend_rqstp); 13738 nfs4_register_lock_locally(vp, flk, flag, offset); 13739 } 13740 } 13741 } 13742 13743 /* 13744 * Handle the DENIED reply from the server for nfs4frlock. 13745 * Returns TRUE if we should retry the request; FALSE otherwise. 13746 * 13747 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13748 * COMPOUND4 args/res for calls that need to retry. Can also 13749 * drop and regrab the r_lkserlock. 13750 */ 13751 static bool_t 13752 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args, 13753 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp, 13754 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd, 13755 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint, 13756 nfs4_recov_state_t *recov_statep, int needrecov, 13757 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13758 clock_t *tick_delayp, short *whencep, int *errorp, 13759 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop, 13760 bool_t *skip_get_err) 13761 { 13762 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13763 13764 if (lock_args) { 13765 nfs4_open_owner_t *oop = *oopp; 13766 nfs4_open_stream_t *osp = *ospp; 13767 nfs4_lock_owner_t *lop = *lopp; 13768 int intr; 13769 13770 /* 13771 * Blocking lock needs to sleep and retry from the request. 
13772 * 13773 * Do not block and wait for 'resend' or 'reinstate' 13774 * lock requests, just return the error. 13775 * 13776 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW. 13777 */ 13778 if (cmd == F_SETLKW) { 13779 rnode4_t *rp = VTOR4(vp); 13780 nfs_argop4 *argop = (*argspp)->array; 13781 13782 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13783 13784 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 13785 recov_statep, needrecov); 13786 *did_start_fop = FALSE; 13787 ASSERT((*argspp)->array_len == 2); 13788 if (argop[1].argop == OP_LOCK) 13789 nfs4args_lock_free(&argop[1]); 13790 else if (argop[1].argop == OP_LOCKT) 13791 nfs4args_lockt_free(&argop[1]); 13792 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13793 if (*respp) 13794 (void) xdr_free(xdr_COMPOUND4res_clnt, 13795 (caddr_t)*respp); 13796 *argspp = NULL; 13797 *respp = NULL; 13798 nfs4_end_lock_seqid_sync(lop); 13799 lock_owner_rele(lop); 13800 *lopp = NULL; 13801 if (osp != NULL) { 13802 open_stream_rele(osp, rp); 13803 *ospp = NULL; 13804 } 13805 if (oop != NULL) { 13806 nfs4_end_open_seqid_sync(oop); 13807 open_owner_rele(oop); 13808 *oopp = NULL; 13809 } 13810 13811 nfs_rw_exit(&rp->r_lkserlock); 13812 13813 intr = nfs4_block_and_wait(tick_delayp, rp); 13814 13815 if (intr) { 13816 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13817 RW_WRITER, FALSE); 13818 *errorp = EINTR; 13819 return (FALSE); 13820 } 13821 13822 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13823 RW_WRITER, FALSE); 13824 13825 /* 13826 * Make sure we are still safe to lock with 13827 * regards to mmapping. 
13828 */ 13829 if (!nfs4_safelock(vp, flk, cr)) { 13830 *errorp = EAGAIN; 13831 return (FALSE); 13832 } 13833 13834 return (TRUE); 13835 } 13836 if (ctype == NFS4_LCK_CTYPE_NORM) 13837 *errorp = EAGAIN; 13838 *skip_get_err = TRUE; 13839 flk->l_whence = 0; 13840 *whencep = 0; 13841 return (FALSE); 13842 } else if (lockt_args) { 13843 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13844 "nfs4frlock_results_denied: OP_LOCKT DENIED")); 13845 13846 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied, 13847 flk, lockt_args); 13848 13849 /* according to NLM code */ 13850 *errorp = 0; 13851 *whencep = 0; 13852 *skip_get_err = TRUE; 13853 return (FALSE); 13854 } 13855 return (FALSE); 13856 } 13857 13858 /* 13859 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock. 13860 */ 13861 static void 13862 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp) 13863 { 13864 switch (resp->status) { 13865 case NFS4ERR_ACCESS: 13866 case NFS4ERR_ADMIN_REVOKED: 13867 case NFS4ERR_BADHANDLE: 13868 case NFS4ERR_BAD_RANGE: 13869 case NFS4ERR_BAD_SEQID: 13870 case NFS4ERR_BAD_STATEID: 13871 case NFS4ERR_BADXDR: 13872 case NFS4ERR_DEADLOCK: 13873 case NFS4ERR_DELAY: 13874 case NFS4ERR_EXPIRED: 13875 case NFS4ERR_FHEXPIRED: 13876 case NFS4ERR_GRACE: 13877 case NFS4ERR_INVAL: 13878 case NFS4ERR_ISDIR: 13879 case NFS4ERR_LEASE_MOVED: 13880 case NFS4ERR_LOCK_NOTSUPP: 13881 case NFS4ERR_LOCK_RANGE: 13882 case NFS4ERR_MOVED: 13883 case NFS4ERR_NOFILEHANDLE: 13884 case NFS4ERR_NO_GRACE: 13885 case NFS4ERR_OLD_STATEID: 13886 case NFS4ERR_OPENMODE: 13887 case NFS4ERR_RECLAIM_BAD: 13888 case NFS4ERR_RECLAIM_CONFLICT: 13889 case NFS4ERR_RESOURCE: 13890 case NFS4ERR_SERVERFAULT: 13891 case NFS4ERR_STALE: 13892 case NFS4ERR_STALE_CLIENTID: 13893 case NFS4ERR_STALE_STATEID: 13894 return; 13895 default: 13896 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13897 "nfs4frlock_results_default: got unrecognizable " 13898 "res.status %d", resp->status)); 13899 *errorp = NFS4ERR_INVAL; 
13900 } 13901 } 13902 13903 /* 13904 * The lock request was successful, so update the client's state. 13905 */ 13906 static void 13907 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args, 13908 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop, 13909 vnode_t *vp, flock64_t *flk, cred_t *cr, 13910 nfs4_lost_rqst_t *resend_rqstp) 13911 { 13912 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13913 13914 if (lock_args) { 13915 LOCK4res *lock_res; 13916 13917 lock_res = &resop->nfs_resop4_u.oplock; 13918 /* update the stateid with server's response */ 13919 13920 if (lock_args->locker.new_lock_owner == TRUE) { 13921 mutex_enter(&lop->lo_lock); 13922 lop->lo_just_created = NFS4_PERM_CREATED; 13923 mutex_exit(&lop->lo_lock); 13924 } 13925 13926 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid); 13927 13928 /* 13929 * If the lock was the result of a resending a lost 13930 * request, we've synched up the stateid and seqid 13931 * with the server, but now the server might be out of sync 13932 * with what the application thinks it has for locks. 13933 * Clean that up here. It's unclear whether we should do 13934 * this even if the filesystem has been forcibly unmounted. 13935 * For most servers, it's probably wasted effort, but 13936 * RFC3530 lets servers require that unlocks exactly match 13937 * the locks that are held. 
13938 */ 13939 if (resend_rqstp != NULL && 13940 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) { 13941 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop); 13942 } else { 13943 flk->l_whence = 0; 13944 } 13945 } else if (locku_args) { 13946 LOCKU4res *locku_res; 13947 13948 locku_res = &resop->nfs_resop4_u.oplocku; 13949 13950 /* Update the stateid with the server's response */ 13951 nfs4_set_lock_stateid(lop, locku_res->lock_stateid); 13952 } else if (lockt_args) { 13953 /* Switch the lock type to express success, see fcntl */ 13954 flk->l_type = F_UNLCK; 13955 flk->l_whence = 0; 13956 } 13957 } 13958 13959 /* 13960 * Do final cleanup before exiting nfs4frlock. 13961 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13962 * COMPOUND4 args/res for calls that haven't already. 13963 */ 13964 static void 13965 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp, 13966 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint, 13967 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop, 13968 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13969 short whence, u_offset_t offset, struct lm_sysid *ls, 13970 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args, 13971 bool_t did_start_fop, bool_t skip_get_err, 13972 cred_t *cred_otw, cred_t *cred) 13973 { 13974 mntinfo4_t *mi = VTOMI4(vp); 13975 rnode4_t *rp = VTOR4(vp); 13976 int error = *errorp; 13977 nfs_argop4 *argop; 13978 13979 ASSERT(nfs_zone() == mi->mi_zone); 13980 /* 13981 * The client recovery code wants the raw status information, 13982 * so don't map the NFS status code to an errno value for 13983 * non-normal call types. 
13984 */ 13985 if (ctype == NFS4_LCK_CTYPE_NORM) { 13986 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE) 13987 *errorp = geterrno4(resp->status); 13988 if (did_start_fop == TRUE) 13989 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep, 13990 needrecov); 13991 13992 /* 13993 * We've established a new lock on the server, so invalidate 13994 * the pages associated with the vnode to get the most up to 13995 * date pages from the server after acquiring the lock. We 13996 * want to be sure that the read operation gets the newest data. 13997 * N.B. 13998 * We used to do this in nfs4frlock_results_ok but that doesn't 13999 * work since VOP_PUTPAGE can call nfs4_commit which calls 14000 * nfs4_start_fop. We flush the pages below after calling 14001 * nfs4_end_fop above 14002 */ 14003 if (!error && resp && resp->status == NFS4_OK) { 14004 int error; 14005 14006 error = VOP_PUTPAGE(vp, (u_offset_t)0, 14007 0, B_INVAL, cred, NULL); 14008 14009 if (error && (error == ENOSPC || error == EDQUOT)) { 14010 rnode4_t *rp = VTOR4(vp); 14011 14012 mutex_enter(&rp->r_statelock); 14013 if (!rp->r_error) 14014 rp->r_error = error; 14015 mutex_exit(&rp->r_statelock); 14016 } 14017 } 14018 } 14019 if (argsp) { 14020 ASSERT(argsp->array_len == 2); 14021 argop = argsp->array; 14022 if (argop[1].argop == OP_LOCK) 14023 nfs4args_lock_free(&argop[1]); 14024 else if (argop[1].argop == OP_LOCKT) 14025 nfs4args_lockt_free(&argop[1]); 14026 kmem_free(argop, 2 * sizeof (nfs_argop4)); 14027 if (resp) 14028 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 14029 } 14030 14031 /* free the reference on the lock owner */ 14032 if (lop != NULL) { 14033 nfs4_end_lock_seqid_sync(lop); 14034 lock_owner_rele(lop); 14035 } 14036 14037 /* need to free up the reference on osp for lock args */ 14038 if (osp != NULL) 14039 open_stream_rele(osp, rp); 14040 14041 /* need to free up the reference on oop for lock args */ 14042 if (oop != NULL) { 14043 nfs4_end_open_seqid_sync(oop); 14044 
open_owner_rele(oop); 14045 } 14046 14047 (void) convoff(vp, flk, whence, offset); 14048 14049 lm_rel_sysid(ls); 14050 14051 /* 14052 * Record debug information in the event we get EINVAL. 14053 */ 14054 mutex_enter(&mi->mi_lock); 14055 if (*errorp == EINVAL && (lock_args || locku_args) && 14056 (!(mi->mi_flags & MI4_POSIX_LOCK))) { 14057 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) { 14058 zcmn_err(getzoneid(), CE_NOTE, 14059 "%s operation failed with " 14060 "EINVAL probably since the server, %s," 14061 " doesn't support POSIX style locking", 14062 lock_args ? "LOCK" : "LOCKU", 14063 mi->mi_curr_serv->sv_hostname); 14064 mi->mi_flags |= MI4_LOCK_DEBUG; 14065 } 14066 } 14067 mutex_exit(&mi->mi_lock); 14068 14069 if (cred_otw) 14070 crfree(cred_otw); 14071 } 14072 14073 /* 14074 * This calls the server and the local locking code. 14075 * 14076 * Client locks are registerred locally by oring the sysid with 14077 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid. 14078 * We need to distinguish between the two to avoid collision in case one 14079 * machine is used as both client and server. 14080 * 14081 * Blocking lock requests will continually retry to acquire the lock 14082 * forever. 14083 * 14084 * The ctype is defined as follows: 14085 * NFS4_LCK_CTYPE_NORM: normal lock request. 14086 * 14087 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client 14088 * recovery, get the pid from flk instead of curproc, and don't reregister 14089 * the lock locally. 14090 * 14091 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition 14092 * that we will use the information passed in via resend_rqstp to setup the 14093 * lock/locku request. This resend is the exact same request as the 'lost 14094 * lock', and is initiated by the recovery framework. A successful resend 14095 * request can initiate one or more reinstate requests. 
14096 * 14097 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it 14098 * does not trigger additional reinstate requests. This lock call type is 14099 * set for setting the v4 server's locking state back to match what the 14100 * client's local locking state is in the event of a received 'lost lock'. 14101 * 14102 * Errors are returned via the nfs4_error_t parameter. 14103 */ 14104 void 14105 nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk, 14106 int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep, 14107 nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp) 14108 { 14109 COMPOUND4args_clnt args, *argsp = NULL; 14110 COMPOUND4res_clnt res, *resp = NULL; 14111 nfs_argop4 *argop; 14112 nfs_resop4 *resop; 14113 rnode4_t *rp; 14114 int doqueue = 1; 14115 clock_t tick_delay; /* delay in clock ticks */ 14116 struct lm_sysid *ls; 14117 LOCK4args *lock_args = NULL; 14118 LOCKU4args *locku_args = NULL; 14119 LOCKT4args *lockt_args = NULL; 14120 nfs4_open_owner_t *oop = NULL; 14121 nfs4_open_stream_t *osp = NULL; 14122 nfs4_lock_owner_t *lop = NULL; 14123 bool_t needrecov = FALSE; 14124 nfs4_recov_state_t recov_state; 14125 short whence; 14126 nfs4_op_hint_t op_hint; 14127 nfs4_lost_rqst_t lost_rqst; 14128 bool_t retry = FALSE; 14129 bool_t did_start_fop = FALSE; 14130 bool_t skip_get_err = FALSE; 14131 cred_t *cred_otw = NULL; 14132 bool_t recovonly; /* just queue request */ 14133 int frc_no_reclaim = 0; 14134 #ifdef DEBUG 14135 char *name; 14136 #endif 14137 14138 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14139 14140 #ifdef DEBUG 14141 name = fn_name(VTOSV(vp)->sv_name); 14142 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: " 14143 "%s: cmd %d, type %d, offset %llu, start %"PRIx64", " 14144 "length %"PRIu64", pid %d, sysid %d, call type %s, " 14145 "resend request %s", name, cmd, flk->l_type, offset, flk->l_start, 14146 flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? 
curproc->p_pid : 14147 flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype), 14148 resend_rqstp ? "TRUE" : "FALSE")); 14149 kmem_free(name, MAXNAMELEN); 14150 #endif 14151 14152 nfs4_error_zinit(ep); 14153 ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset); 14154 if (ep->error) 14155 return; 14156 ep->error = nfs4frlock_get_sysid(&ls, vp, flk); 14157 if (ep->error) 14158 return; 14159 nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence, 14160 vp, cr, &cred_otw); 14161 14162 recov_retry: 14163 nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd, 14164 &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst); 14165 rp = VTOR4(vp); 14166 14167 ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state, 14168 &did_start_fop, &recovonly); 14169 14170 if (ep->error) 14171 goto out; 14172 14173 if (recovonly) { 14174 /* 14175 * Leave the request for the recovery system to deal with. 14176 */ 14177 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 14178 ASSERT(cmd != F_GETLK); 14179 ASSERT(flk->l_type == F_UNLCK); 14180 14181 nfs4_error_init(ep, EINTR); 14182 needrecov = TRUE; 14183 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14184 if (lop != NULL) { 14185 nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT, 14186 NULL, NULL, lop, flk, &lost_rqst, cr, vp); 14187 (void) nfs4_start_recovery(ep, 14188 VTOMI4(vp), vp, NULL, NULL, 14189 (lost_rqst.lr_op == OP_LOCK || 14190 lost_rqst.lr_op == OP_LOCKU) ? 14191 &lost_rqst : NULL, OP_LOCKU, NULL); 14192 lock_owner_rele(lop); 14193 lop = NULL; 14194 } 14195 flk->l_pid = curproc->p_pid; 14196 nfs4_register_lock_locally(vp, flk, flag, offset); 14197 goto out; 14198 } 14199 14200 /* putfh directory fh */ 14201 argop[0].argop = OP_CPUTFH; 14202 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 14203 14204 /* 14205 * Set up the over-the-wire arguments and get references to the 14206 * open owner, etc. 
14207 */ 14208 14209 if (ctype == NFS4_LCK_CTYPE_RESEND || 14210 ctype == NFS4_LCK_CTYPE_REINSTATE) { 14211 nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp, 14212 &argop[1], &lop, &oop, &osp, &lock_args, &locku_args); 14213 } else { 14214 bool_t go_otw = TRUE; 14215 14216 ASSERT(resend_rqstp == NULL); 14217 14218 switch (cmd) { 14219 case F_GETLK: 14220 case F_O_GETLK: 14221 nfs4frlock_setup_lockt_args(ctype, &argop[1], 14222 &lockt_args, argsp, flk, rp); 14223 break; 14224 case F_SETLKW: 14225 case F_SETLK: 14226 if (flk->l_type == F_UNLCK) 14227 nfs4frlock_setup_locku_args(ctype, 14228 &argop[1], &locku_args, flk, 14229 &lop, ep, argsp, 14230 vp, flag, offset, cr, 14231 &skip_get_err, &go_otw); 14232 else 14233 nfs4frlock_setup_lock_args(ctype, 14234 &lock_args, &oop, &osp, &lop, &argop[1], 14235 argsp, flk, cmd, vp, cr, ep); 14236 14237 if (ep->error) 14238 goto out; 14239 14240 switch (ep->stat) { 14241 case NFS4_OK: 14242 break; 14243 case NFS4ERR_DELAY: 14244 /* recov thread never gets this error */ 14245 ASSERT(resend_rqstp == NULL); 14246 ASSERT(did_start_fop); 14247 14248 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 14249 &recov_state, TRUE); 14250 did_start_fop = FALSE; 14251 if (argop[1].argop == OP_LOCK) 14252 nfs4args_lock_free(&argop[1]); 14253 else if (argop[1].argop == OP_LOCKT) 14254 nfs4args_lockt_free(&argop[1]); 14255 kmem_free(argop, 2 * sizeof (nfs_argop4)); 14256 argsp = NULL; 14257 goto recov_retry; 14258 default: 14259 ep->error = EIO; 14260 goto out; 14261 } 14262 break; 14263 default: 14264 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14265 "nfs4_frlock: invalid cmd %d", cmd)); 14266 ep->error = EINVAL; 14267 goto out; 14268 } 14269 14270 if (!go_otw) 14271 goto out; 14272 } 14273 14274 /* XXX should we use the local reclock as a cache ? */ 14275 /* 14276 * Unregister the lock with the local locking code before 14277 * contacting the server. 
This avoids a potential race where 14278 * another process gets notified that it has been granted a lock 14279 * before we can unregister ourselves locally. 14280 */ 14281 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) { 14282 if (ctype == NFS4_LCK_CTYPE_NORM) 14283 flk->l_pid = ttoproc(curthread)->p_pid; 14284 nfs4_register_lock_locally(vp, flk, flag, offset); 14285 } 14286 14287 /* 14288 * Send the server the lock request. Continually loop with a delay 14289 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE. 14290 */ 14291 resp = &res; 14292 14293 NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug), 14294 (CE_NOTE, 14295 "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first", 14296 rnode4info(rp))); 14297 14298 if (lock_args && frc_no_reclaim) { 14299 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM); 14300 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14301 "nfs4frlock: frc_no_reclaim: clearing reclaim")); 14302 lock_args->reclaim = FALSE; 14303 if (did_reclaimp) 14304 *did_reclaimp = 0; 14305 } 14306 14307 /* 14308 * Do the OTW call. 
14309 */ 14310 rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep); 14311 14312 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14313 "nfs4frlock: error %d, status %d", ep->error, resp->status)); 14314 14315 needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp); 14316 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14317 "nfs4frlock: needrecov %d", needrecov)); 14318 14319 if (ep->error == 0 && nfs4_need_to_bump_seqid(resp)) 14320 nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop, 14321 args.ctag); 14322 14323 /* 14324 * Check if one of these mutually exclusive error cases has 14325 * happened: 14326 * need to swap credentials due to access error 14327 * recovery is needed 14328 * different error (only known case is missing Kerberos ticket) 14329 */ 14330 14331 if ((ep->error == EACCES || 14332 (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) && 14333 cred_otw != cr) { 14334 nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov, 14335 &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp, 14336 cr, &cred_otw); 14337 goto recov_retry; 14338 } 14339 14340 if (needrecov) { 14341 /* 14342 * LOCKT requests don't need to recover from lost 14343 * requests since they don't create/modify state. 14344 */ 14345 if ((ep->error == EINTR || 14346 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) && 14347 lockt_args) 14348 goto out; 14349 /* 14350 * Do not attempt recovery for requests initiated by 14351 * the recovery framework. Let the framework redrive them. 14352 */ 14353 if (ctype != NFS4_LCK_CTYPE_NORM) 14354 goto out; 14355 else { 14356 ASSERT(resend_rqstp == NULL); 14357 } 14358 14359 nfs4frlock_save_lost_rqst(ctype, ep->error, 14360 flk_to_locktype(cmd, flk->l_type), 14361 oop, osp, lop, flk, &lost_rqst, cred_otw, vp); 14362 14363 retry = nfs4frlock_recovery(needrecov, ep, &argsp, 14364 &resp, lock_args, locku_args, &oop, &osp, &lop, 14365 rp, vp, &recov_state, op_hint, &did_start_fop, 14366 cmd != F_GETLK ? 
&lost_rqst : NULL, flk); 14367 14368 if (retry) { 14369 ASSERT(oop == NULL); 14370 ASSERT(osp == NULL); 14371 ASSERT(lop == NULL); 14372 goto recov_retry; 14373 } 14374 goto out; 14375 } 14376 14377 /* 14378 * Bail out if have reached this point with ep->error set. Can 14379 * happen if (ep->error == EACCES && !needrecov && cred_otw == cr). 14380 * This happens if Kerberos ticket has expired or has been 14381 * destroyed. 14382 */ 14383 if (ep->error != 0) 14384 goto out; 14385 14386 /* 14387 * Process the reply. 14388 */ 14389 switch (resp->status) { 14390 case NFS4_OK: 14391 resop = &resp->array[1]; 14392 nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset, 14393 resend_rqstp); 14394 /* 14395 * Have a successful lock operation, now update state. 14396 */ 14397 nfs4frlock_update_state(lock_args, locku_args, lockt_args, 14398 resop, lop, vp, flk, cr, resend_rqstp); 14399 break; 14400 14401 case NFS4ERR_DENIED: 14402 resop = &resp->array[1]; 14403 retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args, 14404 &oop, &osp, &lop, cmd, vp, flk, op_hint, 14405 &recov_state, needrecov, &argsp, &resp, 14406 &tick_delay, &whence, &ep->error, resop, cr, 14407 &did_start_fop, &skip_get_err); 14408 14409 if (retry) { 14410 ASSERT(oop == NULL); 14411 ASSERT(osp == NULL); 14412 ASSERT(lop == NULL); 14413 goto recov_retry; 14414 } 14415 break; 14416 /* 14417 * If the server won't let us reclaim, fall-back to trying to lock 14418 * the file from scratch. Code elsewhere will check the changeinfo 14419 * to ensure the file hasn't been changed. 
14420 */ 14421 case NFS4ERR_NO_GRACE: 14422 if (lock_args && lock_args->reclaim == TRUE) { 14423 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM); 14424 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14425 "nfs4frlock: reclaim: NFS4ERR_NO_GRACE")); 14426 frc_no_reclaim = 1; 14427 /* clean up before retrying */ 14428 needrecov = 0; 14429 (void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp, 14430 lock_args, locku_args, &oop, &osp, &lop, rp, vp, 14431 &recov_state, op_hint, &did_start_fop, NULL, flk); 14432 goto recov_retry; 14433 } 14434 /* FALLTHROUGH */ 14435 14436 default: 14437 nfs4frlock_results_default(resp, &ep->error); 14438 break; 14439 } 14440 out: 14441 /* 14442 * Process and cleanup from error. Make interrupted unlock 14443 * requests look successful, since they will be handled by the 14444 * client recovery code. 14445 */ 14446 nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state, 14447 needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error, 14448 lock_args, locku_args, did_start_fop, 14449 skip_get_err, cred_otw, cr); 14450 14451 if (ep->error == EINTR && flk->l_type == F_UNLCK && 14452 (cmd == F_SETLK || cmd == F_SETLKW)) 14453 ep->error = 0; 14454 } 14455 14456 /* 14457 * nfs4_safelock: 14458 * 14459 * Return non-zero if the given lock request can be handled without 14460 * violating the constraints on concurrent mapping and locking. 14461 */ 14462 14463 static int 14464 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr) 14465 { 14466 rnode4_t *rp = VTOR4(vp); 14467 struct vattr va; 14468 int error; 14469 14470 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14471 ASSERT(rp->r_mapcnt >= 0); 14472 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: " 14473 "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ? 14474 "write" : bfp->l_type == F_RDLCK ? 
"read" : "unlock", 14475 bfp->l_start, bfp->l_len, rp->r_mapcnt)); 14476 14477 if (rp->r_mapcnt == 0) 14478 return (1); /* always safe if not mapped */ 14479 14480 /* 14481 * If the file is already mapped and there are locks, then they 14482 * should be all safe locks. So adding or removing a lock is safe 14483 * as long as the new request is safe (i.e., whole-file, meaning 14484 * length and starting offset are both zero). 14485 */ 14486 14487 if (bfp->l_start != 0 || bfp->l_len != 0) { 14488 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14489 "cannot lock a memory mapped file unless locking the " 14490 "entire file: start %"PRIx64", len %"PRIx64, 14491 bfp->l_start, bfp->l_len)); 14492 return (0); 14493 } 14494 14495 /* mandatory locking and mapping don't mix */ 14496 va.va_mask = AT_MODE; 14497 error = VOP_GETATTR(vp, &va, 0, cr, NULL); 14498 if (error != 0) { 14499 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14500 "getattr error %d", error)); 14501 return (0); /* treat errors conservatively */ 14502 } 14503 if (MANDLOCK(vp, va.va_mode)) { 14504 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14505 "cannot mandatory lock and mmap a file")); 14506 return (0); 14507 } 14508 14509 return (1); 14510 } 14511 14512 14513 /* 14514 * Register the lock locally within Solaris. 14515 * As the client, we "or" the sysid with LM_SYSID_CLIENT when 14516 * recording locks locally. 14517 * 14518 * This should handle conflicts/cooperation with NFS v2/v3 since all locks 14519 * are registered locally. 
14520 */ 14521 void 14522 nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag, 14523 u_offset_t offset) 14524 { 14525 int oldsysid; 14526 int error; 14527 #ifdef DEBUG 14528 char *name; 14529 #endif 14530 14531 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14532 14533 #ifdef DEBUG 14534 name = fn_name(VTOSV(vp)->sv_name); 14535 NFS4_DEBUG(nfs4_client_lock_debug, 14536 (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, " 14537 "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d", 14538 name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid, 14539 flk->l_sysid)); 14540 kmem_free(name, MAXNAMELEN); 14541 #endif 14542 14543 /* register the lock with local locking */ 14544 oldsysid = flk->l_sysid; 14545 flk->l_sysid |= LM_SYSID_CLIENT; 14546 error = reclock(vp, flk, SETFLCK, flag, offset, NULL); 14547 #ifdef DEBUG 14548 if (error != 0) { 14549 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14550 "nfs4_register_lock_locally: could not register with" 14551 " local locking")); 14552 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14553 "error %d, vp 0x%p, pid %d, sysid 0x%x", 14554 error, (void *)vp, flk->l_pid, flk->l_sysid)); 14555 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14556 "type %d off 0x%" PRIx64 " len 0x%" PRIx64, 14557 flk->l_type, flk->l_start, flk->l_len)); 14558 (void) reclock(vp, flk, 0, flag, offset, NULL); 14559 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14560 "blocked by pid %d sysid 0x%x type %d " 14561 "off 0x%" PRIx64 " len 0x%" PRIx64, 14562 flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start, 14563 flk->l_len)); 14564 } 14565 #endif 14566 flk->l_sysid = oldsysid; 14567 } 14568 14569 /* 14570 * nfs4_lockrelease: 14571 * 14572 * Release any locks on the given vnode that are held by the current 14573 * process. Also removes the lock owner (if one exists) from the rnode's 14574 * list. 
14575 */ 14576 static int 14577 nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 14578 { 14579 flock64_t ld; 14580 int ret, error; 14581 rnode4_t *rp; 14582 nfs4_lock_owner_t *lop; 14583 nfs4_recov_state_t recov_state; 14584 mntinfo4_t *mi; 14585 bool_t possible_orphan = FALSE; 14586 bool_t recovonly; 14587 14588 ASSERT((uintptr_t)vp > KERNELBASE); 14589 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14590 14591 rp = VTOR4(vp); 14592 mi = VTOMI4(vp); 14593 14594 /* 14595 * If we have not locked anything then we can 14596 * just return since we have no work to do. 14597 */ 14598 if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) { 14599 return (0); 14600 } 14601 14602 /* 14603 * We need to comprehend that another thread may 14604 * kick off recovery and the lock_owner we have stashed 14605 * in lop might be invalid so we should NOT cache it 14606 * locally! 14607 */ 14608 recov_state.rs_flags = 0; 14609 recov_state.rs_num_retry_despite_err = 0; 14610 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14611 &recovonly); 14612 if (error) { 14613 mutex_enter(&rp->r_statelock); 14614 rp->r_flags |= R4LODANGLERS; 14615 mutex_exit(&rp->r_statelock); 14616 return (error); 14617 } 14618 14619 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14620 14621 /* 14622 * Check if the lock owner might have a lock (request was sent but 14623 * no response was received). Also check if there are any remote 14624 * locks on the file. (In theory we shouldn't have to make this 14625 * second check if there's no lock owner, but for now we'll be 14626 * conservative and do it anyway.) If either condition is true, 14627 * send an unlock for the entire file to the server. 14628 * 14629 * Note that no explicit synchronization is needed here. At worst, 14630 * flk_has_remote_locks() will return a false positive, in which case 14631 * the unlock call wastes time but doesn't harm correctness. 
14632 */ 14633 14634 if (lop) { 14635 mutex_enter(&lop->lo_lock); 14636 possible_orphan = lop->lo_pending_rqsts; 14637 mutex_exit(&lop->lo_lock); 14638 lock_owner_rele(lop); 14639 } 14640 14641 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14642 14643 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14644 "nfs4_lockrelease: possible orphan %d, remote locks %d, for " 14645 "lop %p.", possible_orphan, flk_has_remote_locks(vp), 14646 (void *)lop)); 14647 14648 if (possible_orphan || flk_has_remote_locks(vp)) { 14649 ld.l_type = F_UNLCK; /* set to unlock entire file */ 14650 ld.l_whence = 0; /* unlock from start of file */ 14651 ld.l_start = 0; 14652 ld.l_len = 0; /* do entire file */ 14653 14654 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, 14655 cr, NULL); 14656 14657 if (ret != 0) { 14658 /* 14659 * If VOP_FRLOCK fails, make sure we unregister 14660 * local locks before we continue. 14661 */ 14662 ld.l_pid = ttoproc(curthread)->p_pid; 14663 nfs4_register_lock_locally(vp, &ld, flag, offset); 14664 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14665 "nfs4_lockrelease: lock release error on vp" 14666 " %p: error %d.\n", (void *)vp, ret)); 14667 } 14668 } 14669 14670 recov_state.rs_flags = 0; 14671 recov_state.rs_num_retry_despite_err = 0; 14672 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14673 &recovonly); 14674 if (error) { 14675 mutex_enter(&rp->r_statelock); 14676 rp->r_flags |= R4LODANGLERS; 14677 mutex_exit(&rp->r_statelock); 14678 return (error); 14679 } 14680 14681 /* 14682 * So, here we're going to need to retrieve the lock-owner 14683 * again (in case recovery has done a switch-a-roo) and 14684 * remove it because we can. 
14685 */ 14686 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14687 14688 if (lop) { 14689 nfs4_rnode_remove_lock_owner(rp, lop); 14690 lock_owner_rele(lop); 14691 } 14692 14693 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14694 return (0); 14695 } 14696 14697 /* 14698 * Wait for 'tick_delay' clock ticks. 14699 * Implement exponential backoff until hit the lease_time of this nfs4_server. 14700 * NOTE: lock_lease_time is in seconds. 14701 * 14702 * XXX For future improvements, should implement a waiting queue scheme. 14703 */ 14704 static int 14705 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp) 14706 { 14707 long milliseconds_delay; 14708 time_t lock_lease_time; 14709 14710 /* wait tick_delay clock ticks or siginteruptus */ 14711 if (delay_sig(*tick_delay)) { 14712 return (EINTR); 14713 } 14714 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: " 14715 "reissue the lock request: blocked for %ld clock ticks: %ld " 14716 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000)); 14717 14718 /* get the lease time */ 14719 lock_lease_time = r2lease_time(rp); 14720 14721 /* drv_hztousec converts ticks to microseconds */ 14722 milliseconds_delay = drv_hztousec(*tick_delay) / 1000; 14723 if (milliseconds_delay < lock_lease_time * 1000) { 14724 *tick_delay = 2 * *tick_delay; 14725 if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000) 14726 *tick_delay = drv_usectohz(lock_lease_time*1000*1000); 14727 } 14728 return (0); 14729 } 14730 14731 14732 void 14733 nfs4_vnops_init(void) 14734 { 14735 } 14736 14737 void 14738 nfs4_vnops_fini(void) 14739 { 14740 } 14741 14742 /* 14743 * Return a reference to the directory (parent) vnode for a given vnode, 14744 * using the saved pathname information and the directory file handle. The 14745 * caller is responsible for disposing of the reference. 14746 * Returns zero or an errno value. 
14747 * 14748 * Caller should set need_start_op to FALSE if it is the recovery 14749 * thread, or if a start_fop has already been done. Otherwise, TRUE. 14750 */ 14751 int 14752 vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op) 14753 { 14754 svnode_t *svnp; 14755 vnode_t *dvp = NULL; 14756 servinfo4_t *svp; 14757 nfs4_fname_t *mfname; 14758 int error; 14759 14760 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14761 14762 if (vp->v_flag & VROOT) { 14763 nfs4_sharedfh_t *sfh; 14764 nfs_fh4 fh; 14765 mntinfo4_t *mi; 14766 14767 ASSERT(vp->v_type == VREG); 14768 14769 mi = VTOMI4(vp); 14770 svp = mi->mi_curr_serv; 14771 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14772 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len; 14773 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf; 14774 sfh = sfh4_get(&fh, VTOMI4(vp)); 14775 nfs_rw_exit(&svp->sv_lock); 14776 mfname = mi->mi_fname; 14777 fn_hold(mfname); 14778 dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0); 14779 sfh4_rele(&sfh); 14780 14781 if (dvp->v_type == VNON) 14782 dvp->v_type = VDIR; 14783 *dvpp = dvp; 14784 return (0); 14785 } 14786 14787 svnp = VTOSV(vp); 14788 14789 if (svnp == NULL) { 14790 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14791 "shadow node is NULL")); 14792 return (EINVAL); 14793 } 14794 14795 if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) { 14796 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14797 "shadow node name or dfh val == NULL")); 14798 return (EINVAL); 14799 } 14800 14801 error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp, 14802 (int)need_start_op); 14803 if (error != 0) { 14804 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14805 "nfs4_make_dotdot returned %d", error)); 14806 return (error); 14807 } 14808 if (!dvp) { 14809 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14810 "nfs4_make_dotdot returned a NULL dvp")); 14811 return (EIO); 14812 } 14813 if (dvp->v_type == VNON) 14814 dvp->v_type = VDIR; 14815 
ASSERT(dvp->v_type == VDIR); 14816 if (VTOR4(vp)->r_flags & R4ISXATTR) { 14817 mutex_enter(&dvp->v_lock); 14818 dvp->v_flag |= V_XATTRDIR; 14819 mutex_exit(&dvp->v_lock); 14820 } 14821 *dvpp = dvp; 14822 return (0); 14823 } 14824 14825 /* 14826 * Copy the (final) component name of vp to fnamep. maxlen is the maximum 14827 * length that fnamep can accept, including the trailing null. 14828 * Returns 0 if okay, returns an errno value if there was a problem. 14829 */ 14830 14831 int 14832 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen) 14833 { 14834 char *fn; 14835 int err = 0; 14836 servinfo4_t *svp; 14837 svnode_t *shvp; 14838 14839 /* 14840 * If the file being opened has VROOT set, then this is 14841 * a "file" mount. sv_name will not be interesting, so 14842 * go back to the servinfo4 to get the original mount 14843 * path and strip off all but the final edge. Otherwise 14844 * just return the name from the shadow vnode. 14845 */ 14846 14847 if (vp->v_flag & VROOT) { 14848 14849 svp = VTOMI4(vp)->mi_curr_serv; 14850 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14851 14852 fn = strrchr(svp->sv_path, '/'); 14853 if (fn == NULL) 14854 err = EINVAL; 14855 else 14856 fn++; 14857 } else { 14858 shvp = VTOSV(vp); 14859 fn = fn_name(shvp->sv_name); 14860 } 14861 14862 if (err == 0) 14863 if (strlen(fn) < maxlen) 14864 (void) strcpy(fnamep, fn); 14865 else 14866 err = ENAMETOOLONG; 14867 14868 if (vp->v_flag & VROOT) 14869 nfs_rw_exit(&svp->sv_lock); 14870 else 14871 kmem_free(fn, MAXNAMELEN); 14872 14873 return (err); 14874 } 14875 14876 /* 14877 * Bookkeeping for a close that doesn't need to go over the wire. 14878 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise 14879 * it is left at 1. 
14880 */ 14881 void 14882 nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp) 14883 { 14884 rnode4_t *rp; 14885 mntinfo4_t *mi; 14886 14887 mi = VTOMI4(vp); 14888 rp = VTOR4(vp); 14889 14890 NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: " 14891 "rp=%p osp=%p", (void *)rp, (void *)osp)); 14892 ASSERT(nfs_zone() == mi->mi_zone); 14893 ASSERT(mutex_owned(&osp->os_sync_lock)); 14894 ASSERT(*have_lockp); 14895 14896 if (!osp->os_valid || 14897 osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) { 14898 return; 14899 } 14900 14901 /* 14902 * This removes the reference obtained at OPEN; ie, 14903 * when the open stream structure was created. 14904 * 14905 * We don't have to worry about calling 'open_stream_rele' 14906 * since we our currently holding a reference to this 14907 * open stream which means the count can not go to 0 with 14908 * this decrement. 14909 */ 14910 ASSERT(osp->os_ref_count >= 2); 14911 osp->os_ref_count--; 14912 osp->os_valid = 0; 14913 mutex_exit(&osp->os_sync_lock); 14914 *have_lockp = 0; 14915 14916 nfs4_dec_state_ref_count(mi); 14917 } 14918 14919 /* 14920 * Close all remaining open streams on the rnode. These open streams 14921 * could be here because: 14922 * - The close attempted at either close or delmap failed 14923 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE 14924 * - Someone did mknod on a regular file but never opened it 14925 */ 14926 int 14927 nfs4close_all(vnode_t *vp, cred_t *cr) 14928 { 14929 nfs4_open_stream_t *osp; 14930 int error; 14931 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 14932 rnode4_t *rp; 14933 14934 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14935 14936 error = 0; 14937 rp = VTOR4(vp); 14938 14939 /* 14940 * At this point, all we know is that the last time 14941 * someone called vn_rele, the count was 1. Since then, 14942 * the vnode could have been re-activated. 
We want to 14943 * loop through the open streams and close each one, but 14944 * we have to be careful since once we release the rnode 14945 * hash bucket lock, someone else is free to come in and 14946 * re-activate the rnode and add new open streams. The 14947 * strategy is take the rnode hash bucket lock, verify that 14948 * the count is still 1, grab the open stream off the 14949 * head of the list and mark it invalid, then release the 14950 * rnode hash bucket lock and proceed with that open stream. 14951 * This is ok because nfs4close_one() will acquire the proper 14952 * open/create to close/destroy synchronization for open 14953 * streams, and will ensure that if someone has reopened 14954 * the open stream after we've dropped the hash bucket lock 14955 * then we'll just simply return without destroying the 14956 * open stream. 14957 * Repeat until the list is empty. 14958 */ 14959 14960 for (;;) { 14961 14962 /* make sure vnode hasn't been reactivated */ 14963 rw_enter(&rp->r_hashq->r_lock, RW_READER); 14964 mutex_enter(&vp->v_lock); 14965 if (vp->v_count > 1) { 14966 mutex_exit(&vp->v_lock); 14967 rw_exit(&rp->r_hashq->r_lock); 14968 break; 14969 } 14970 /* 14971 * Grabbing r_os_lock before releasing v_lock prevents 14972 * a window where the rnode/open stream could get 14973 * reactivated (and os_force_close set to 0) before we 14974 * had a chance to set os_force_close to 1. 
14975 */ 14976 mutex_enter(&rp->r_os_lock); 14977 mutex_exit(&vp->v_lock); 14978 14979 osp = list_head(&rp->r_open_streams); 14980 if (!osp) { 14981 /* nothing left to CLOSE OTW, so return */ 14982 mutex_exit(&rp->r_os_lock); 14983 rw_exit(&rp->r_hashq->r_lock); 14984 break; 14985 } 14986 14987 mutex_enter(&rp->r_statev4_lock); 14988 /* the file can't still be mem mapped */ 14989 ASSERT(rp->r_mapcnt == 0); 14990 if (rp->created_v4) 14991 rp->created_v4 = 0; 14992 mutex_exit(&rp->r_statev4_lock); 14993 14994 /* 14995 * Grab a ref on this open stream; nfs4close_one 14996 * will mark it as invalid 14997 */ 14998 mutex_enter(&osp->os_sync_lock); 14999 osp->os_ref_count++; 15000 osp->os_force_close = 1; 15001 mutex_exit(&osp->os_sync_lock); 15002 mutex_exit(&rp->r_os_lock); 15003 rw_exit(&rp->r_hashq->r_lock); 15004 15005 nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0); 15006 15007 /* Update error if it isn't already non-zero */ 15008 if (error == 0) { 15009 if (e.error) 15010 error = e.error; 15011 else if (e.stat) 15012 error = geterrno4(e.stat); 15013 } 15014 15015 #ifdef DEBUG 15016 nfs4close_all_cnt++; 15017 #endif 15018 /* Release the ref on osp acquired above. */ 15019 open_stream_rele(osp, rp); 15020 15021 /* Proceed to the next open stream, if any */ 15022 } 15023 return (error); 15024 } 15025 15026 /* 15027 * nfs4close_one - close one open stream for a file if needed. 15028 * 15029 * "close_type" indicates which close path this is: 15030 * CLOSE_NORM: close initiated via VOP_CLOSE. 15031 * CLOSE_DELMAP: close initiated via VOP_DELMAP. 15032 * CLOSE_FORCE: close initiated via VOP_INACTIVE. This path forces 15033 * the close and release of client state for this open stream 15034 * (unless someone else has the open stream open). 15035 * CLOSE_RESEND: indicates the request is a replay of an earlier request 15036 * (e.g., due to abort because of a signal). 15037 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN. 
15038 * 15039 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client 15040 * recovery. Instead, the caller is expected to deal with retries. 15041 * 15042 * The caller can either pass in the osp ('provided_osp') or not. 15043 * 15044 * 'access_bits' represents the access we are closing/downgrading. 15045 * 15046 * 'len', 'prot', and 'mmap_flags' are used for CLOSE_DELMAP. 'len' is the 15047 * number of bytes we are unmapping, 'maxprot' is the mmap protection, and 15048 * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED). 15049 * 15050 * Errors are returned via the nfs4_error_t. 15051 */ 15052 void 15053 nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr, 15054 int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep, 15055 nfs4_close_type_t close_type, size_t len, uint_t maxprot, 15056 uint_t mmap_flags) 15057 { 15058 nfs4_open_owner_t *oop; 15059 nfs4_open_stream_t *osp = NULL; 15060 int retry = 0; 15061 int num_retries = NFS4_NUM_RECOV_RETRIES; 15062 rnode4_t *rp; 15063 mntinfo4_t *mi; 15064 nfs4_recov_state_t recov_state; 15065 cred_t *cred_otw = NULL; 15066 bool_t recovonly = FALSE; 15067 int isrecov; 15068 int force_close; 15069 int close_failed = 0; 15070 int did_dec_count = 0; 15071 int did_start_op = 0; 15072 int did_force_recovlock = 0; 15073 int did_start_seqid_sync = 0; 15074 int have_sync_lock = 0; 15075 15076 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 15077 15078 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, " 15079 "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x", 15080 (void *)vp, (void *)provided_osp, (void *)lrp, close_type, 15081 len, maxprot, mmap_flags, access_bits)); 15082 15083 nfs4_error_zinit(ep); 15084 rp = VTOR4(vp); 15085 mi = VTOMI4(vp); 15086 isrecov = (close_type == CLOSE_RESEND || 15087 close_type == CLOSE_AFTER_RESEND); 15088 15089 /* 15090 * First get the open owner. 
15091 */ 15092 if (!provided_osp) { 15093 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 15094 } else { 15095 oop = provided_osp->os_open_owner; 15096 ASSERT(oop != NULL); 15097 open_owner_hold(oop); 15098 } 15099 15100 if (!oop) { 15101 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15102 "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, " 15103 "close type %d", (void *)rp, (void *)mi, (void *)cr, 15104 (void *)provided_osp, close_type)); 15105 ep->error = EIO; 15106 goto out; 15107 } 15108 15109 cred_otw = nfs4_get_otw_cred(cr, mi, oop); 15110 recov_retry: 15111 osp = NULL; 15112 close_failed = 0; 15113 force_close = (close_type == CLOSE_FORCE); 15114 retry = 0; 15115 did_start_op = 0; 15116 did_force_recovlock = 0; 15117 did_start_seqid_sync = 0; 15118 have_sync_lock = 0; 15119 recovonly = FALSE; 15120 recov_state.rs_flags = 0; 15121 recov_state.rs_num_retry_despite_err = 0; 15122 15123 /* 15124 * Second synchronize with recovery. 15125 */ 15126 if (!isrecov) { 15127 ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE, 15128 &recov_state, &recovonly); 15129 if (!ep->error) { 15130 did_start_op = 1; 15131 } else { 15132 close_failed = 1; 15133 /* 15134 * If we couldn't get start_fop, but have to 15135 * cleanup state, then at least acquire the 15136 * mi_recovlock so we can synchronize with 15137 * recovery. 15138 */ 15139 if (close_type == CLOSE_FORCE) { 15140 (void) nfs_rw_enter_sig(&mi->mi_recovlock, 15141 RW_READER, FALSE); 15142 did_force_recovlock = 1; 15143 } else 15144 goto out; 15145 } 15146 } 15147 15148 /* 15149 * We cannot attempt to get the open seqid sync if nfs4_start_fop 15150 * set 'recovonly' to TRUE since most likely this is due to 15151 * reovery being active (MI4_RECOV_ACTIV). If recovery is active, 15152 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us 15153 * to retry, causing us to loop until recovery finishes. 
Plus we 15154 * don't need protection over the open seqid since we're not going 15155 * OTW, hence don't need to use the seqid. 15156 */ 15157 if (recovonly == FALSE) { 15158 /* need to grab the open owner sync before 'os_sync_lock' */ 15159 ep->error = nfs4_start_open_seqid_sync(oop, mi); 15160 if (ep->error == EAGAIN) { 15161 ASSERT(!isrecov); 15162 if (did_start_op) 15163 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15164 &recov_state, TRUE); 15165 if (did_force_recovlock) 15166 nfs_rw_exit(&mi->mi_recovlock); 15167 goto recov_retry; 15168 } 15169 did_start_seqid_sync = 1; 15170 } 15171 15172 /* 15173 * Third get an open stream and acquire 'os_sync_lock' to 15174 * sychronize the opening/creating of an open stream with the 15175 * closing/destroying of an open stream. 15176 */ 15177 if (!provided_osp) { 15178 /* returns with 'os_sync_lock' held */ 15179 osp = find_open_stream(oop, rp); 15180 if (!osp) { 15181 ep->error = EIO; 15182 goto out; 15183 } 15184 } else { 15185 osp = provided_osp; 15186 open_stream_hold(osp); 15187 mutex_enter(&osp->os_sync_lock); 15188 } 15189 have_sync_lock = 1; 15190 15191 ASSERT(oop == osp->os_open_owner); 15192 15193 /* 15194 * Fourth, do any special pre-OTW CLOSE processing 15195 * based on the specific close type. 15196 */ 15197 if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) && 15198 !did_dec_count) { 15199 ASSERT(osp->os_open_ref_count > 0); 15200 osp->os_open_ref_count--; 15201 did_dec_count = 1; 15202 if (osp->os_open_ref_count == 0) 15203 osp->os_final_close = 1; 15204 } 15205 15206 if (close_type == CLOSE_FORCE) { 15207 /* see if somebody reopened the open stream. 
*/ 15208 if (!osp->os_force_close) { 15209 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, 15210 "nfs4close_one: skip CLOSE_FORCE as osp %p " 15211 "was reopened, vp %p", (void *)osp, (void *)vp)); 15212 ep->error = 0; 15213 ep->stat = NFS4_OK; 15214 goto out; 15215 } 15216 15217 if (!osp->os_final_close && !did_dec_count) { 15218 osp->os_open_ref_count--; 15219 did_dec_count = 1; 15220 } 15221 15222 /* 15223 * We can't depend on os_open_ref_count being 0 due to the 15224 * way executables are opened (VN_RELE to match a VOP_OPEN). 15225 */ 15226 #ifdef NOTYET 15227 ASSERT(osp->os_open_ref_count == 0); 15228 #endif 15229 if (osp->os_open_ref_count != 0) { 15230 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, 15231 "nfs4close_one: should panic here on an " 15232 "ASSERT(osp->os_open_ref_count == 0). Ignoring " 15233 "since this is probably the exec problem.")); 15234 15235 osp->os_open_ref_count = 0; 15236 } 15237 15238 /* 15239 * There is the possibility that nfs4close_one() 15240 * for close_type == CLOSE_DELMAP couldn't find the 15241 * open stream, thus couldn't decrement its os_mapcnt; 15242 * therefore we can't use this ASSERT yet. 
15243 */ 15244 #ifdef NOTYET 15245 ASSERT(osp->os_mapcnt == 0); 15246 #endif 15247 osp->os_mapcnt = 0; 15248 } 15249 15250 if (close_type == CLOSE_DELMAP && !did_dec_count) { 15251 ASSERT(osp->os_mapcnt >= btopr(len)); 15252 15253 if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 15254 osp->os_mmap_write -= btopr(len); 15255 if (maxprot & PROT_READ) 15256 osp->os_mmap_read -= btopr(len); 15257 if (maxprot & PROT_EXEC) 15258 osp->os_mmap_read -= btopr(len); 15259 /* mirror the PROT_NONE check in nfs4_addmap() */ 15260 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 15261 !(maxprot & PROT_EXEC)) 15262 osp->os_mmap_read -= btopr(len); 15263 osp->os_mapcnt -= btopr(len); 15264 did_dec_count = 1; 15265 } 15266 15267 if (recovonly) { 15268 nfs4_lost_rqst_t lost_rqst; 15269 15270 /* request should not already be in recovery queue */ 15271 ASSERT(lrp == NULL); 15272 nfs4_error_init(ep, EINTR); 15273 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop, 15274 osp, cred_otw, vp); 15275 mutex_exit(&osp->os_sync_lock); 15276 have_sync_lock = 0; 15277 (void) nfs4_start_recovery(ep, mi, vp, NULL, NULL, 15278 lost_rqst.lr_op == OP_CLOSE ? 15279 &lost_rqst : NULL, OP_CLOSE, NULL); 15280 close_failed = 1; 15281 force_close = 0; 15282 goto close_cleanup; 15283 } 15284 15285 /* 15286 * If a previous OTW call got NFS4ERR_BAD_SEQID, then 15287 * we stopped operating on the open owner's <old oo_name, old seqid> 15288 * space, which means we stopped operating on the open stream 15289 * too. So don't go OTW (as the seqid is likely bad, and the 15290 * stateid could be stale, potentially triggering a false 15291 * setclientid), and just clean up the client's internal state. 
15292 */ 15293 if (osp->os_orig_oo_name != oop->oo_name) { 15294 NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug, 15295 (CE_NOTE, "nfs4close_one: skip OTW close for osp %p " 15296 "oop %p due to bad seqid (orig oo_name %" PRIx64 " current " 15297 "oo_name %" PRIx64")", 15298 (void *)osp, (void *)oop, osp->os_orig_oo_name, 15299 oop->oo_name)); 15300 close_failed = 1; 15301 } 15302 15303 /* If the file failed recovery, just quit. */ 15304 mutex_enter(&rp->r_statelock); 15305 if (rp->r_flags & R4RECOVERR) { 15306 close_failed = 1; 15307 } 15308 mutex_exit(&rp->r_statelock); 15309 15310 /* 15311 * If the force close path failed to obtain start_fop 15312 * then skip the OTW close and just remove the state. 15313 */ 15314 if (close_failed) 15315 goto close_cleanup; 15316 15317 /* 15318 * Fifth, check to see if there are still mapped pages or other 15319 * opens using this open stream. If there are then we can't 15320 * close yet but we can see if an OPEN_DOWNGRADE is necessary. 15321 */ 15322 if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) { 15323 nfs4_lost_rqst_t new_lost_rqst; 15324 bool_t needrecov = FALSE; 15325 cred_t *odg_cred_otw = NULL; 15326 seqid4 open_dg_seqid = 0; 15327 15328 if (osp->os_delegation) { 15329 /* 15330 * If this open stream was never OPENed OTW then we 15331 * surely can't DOWNGRADE it (especially since the 15332 * osp->open_stateid is really a delegation stateid 15333 * when os_delegation is 1). 
15334 */ 15335 if (access_bits & FREAD) 15336 osp->os_share_acc_read--; 15337 if (access_bits & FWRITE) 15338 osp->os_share_acc_write--; 15339 osp->os_share_deny_none--; 15340 nfs4_error_zinit(ep); 15341 goto out; 15342 } 15343 nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr, 15344 lrp, ep, &odg_cred_otw, &open_dg_seqid); 15345 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp); 15346 if (needrecov && !isrecov) { 15347 bool_t abort; 15348 nfs4_bseqid_entry_t *bsep = NULL; 15349 15350 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) 15351 bsep = nfs4_create_bseqid_entry(oop, NULL, 15352 vp, 0, 15353 lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG, 15354 open_dg_seqid); 15355 15356 nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst, 15357 oop, osp, odg_cred_otw, vp, access_bits, 0); 15358 mutex_exit(&osp->os_sync_lock); 15359 have_sync_lock = 0; 15360 abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL, 15361 new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ? 15362 &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE, 15363 bsep); 15364 if (odg_cred_otw) 15365 crfree(odg_cred_otw); 15366 if (bsep) 15367 kmem_free(bsep, sizeof (*bsep)); 15368 15369 if (abort == TRUE) 15370 goto out; 15371 15372 if (did_start_seqid_sync) { 15373 nfs4_end_open_seqid_sync(oop); 15374 did_start_seqid_sync = 0; 15375 } 15376 open_stream_rele(osp, rp); 15377 15378 if (did_start_op) 15379 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15380 &recov_state, FALSE); 15381 if (did_force_recovlock) 15382 nfs_rw_exit(&mi->mi_recovlock); 15383 15384 goto recov_retry; 15385 } else { 15386 if (odg_cred_otw) 15387 crfree(odg_cred_otw); 15388 } 15389 goto out; 15390 } 15391 15392 /* 15393 * If this open stream was created as the results of an open 15394 * while holding a delegation, then just release it; no need 15395 * to do an OTW close. Otherwise do a "normal" OTW close. 
15396 */ 15397 if (osp->os_delegation) { 15398 nfs4close_notw(vp, osp, &have_sync_lock); 15399 nfs4_error_zinit(ep); 15400 goto out; 15401 } 15402 15403 /* 15404 * If this stream is not valid, we're done. 15405 */ 15406 if (!osp->os_valid) { 15407 nfs4_error_zinit(ep); 15408 goto out; 15409 } 15410 15411 /* 15412 * Last open or mmap ref has vanished, need to do an OTW close. 15413 * First check to see if a close is still necessary. 15414 */ 15415 if (osp->os_failed_reopen) { 15416 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15417 "don't close OTW osp %p since reopen failed.", 15418 (void *)osp)); 15419 /* 15420 * Reopen of the open stream failed, hence the 15421 * stateid of the open stream is invalid/stale, and 15422 * sending this OTW would incorrectly cause another 15423 * round of recovery. In this case, we need to set 15424 * the 'os_valid' bit to 0 so another thread doesn't 15425 * come in and re-open this open stream before 15426 * this "closing" thread cleans up state (decrementing 15427 * the nfs4_server_t's state_ref_count and decrementing 15428 * the os_ref_count). 15429 */ 15430 osp->os_valid = 0; 15431 /* 15432 * This removes the reference obtained at OPEN; ie, 15433 * when the open stream structure was created. 15434 * 15435 * We don't have to worry about calling 'open_stream_rele' 15436 * since we our currently holding a reference to this 15437 * open stream which means the count can not go to 0 with 15438 * this decrement. 15439 */ 15440 ASSERT(osp->os_ref_count >= 2); 15441 osp->os_ref_count--; 15442 nfs4_error_zinit(ep); 15443 close_failed = 0; 15444 goto close_cleanup; 15445 } 15446 15447 ASSERT(osp->os_ref_count > 1); 15448 15449 /* 15450 * Sixth, try the CLOSE OTW. 
15451 */ 15452 nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync, 15453 close_type, ep, &have_sync_lock); 15454 15455 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) { 15456 /* 15457 * Let the recovery thread be responsible for 15458 * removing the state for CLOSE. 15459 */ 15460 close_failed = 1; 15461 force_close = 0; 15462 retry = 0; 15463 } 15464 15465 /* See if we need to retry with a different cred */ 15466 if ((ep->error == EACCES || 15467 (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) && 15468 cred_otw != cr) { 15469 crfree(cred_otw); 15470 cred_otw = cr; 15471 crhold(cred_otw); 15472 retry = 1; 15473 } 15474 15475 if (ep->error || ep->stat) 15476 close_failed = 1; 15477 15478 if (retry && !isrecov && num_retries-- > 0) { 15479 if (have_sync_lock) { 15480 mutex_exit(&osp->os_sync_lock); 15481 have_sync_lock = 0; 15482 } 15483 if (did_start_seqid_sync) { 15484 nfs4_end_open_seqid_sync(oop); 15485 did_start_seqid_sync = 0; 15486 } 15487 open_stream_rele(osp, rp); 15488 15489 if (did_start_op) 15490 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15491 &recov_state, FALSE); 15492 if (did_force_recovlock) 15493 nfs_rw_exit(&mi->mi_recovlock); 15494 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15495 "nfs4close_one: need to retry the close " 15496 "operation")); 15497 goto recov_retry; 15498 } 15499 close_cleanup: 15500 /* 15501 * Seventh and lastly, process our results. 15502 */ 15503 if (close_failed && force_close) { 15504 /* 15505 * It's ok to drop and regrab the 'os_sync_lock' since 15506 * nfs4close_notw() will recheck to make sure the 15507 * "close"/removal of state should happen. 15508 */ 15509 if (!have_sync_lock) { 15510 mutex_enter(&osp->os_sync_lock); 15511 have_sync_lock = 1; 15512 } 15513 /* 15514 * This is last call, remove the ref on the open 15515 * stream created by open and clean everything up. 
15516 */ 15517 osp->os_pending_close = 0; 15518 nfs4close_notw(vp, osp, &have_sync_lock); 15519 nfs4_error_zinit(ep); 15520 } 15521 15522 if (!close_failed) { 15523 if (have_sync_lock) { 15524 osp->os_pending_close = 0; 15525 mutex_exit(&osp->os_sync_lock); 15526 have_sync_lock = 0; 15527 } else { 15528 mutex_enter(&osp->os_sync_lock); 15529 osp->os_pending_close = 0; 15530 mutex_exit(&osp->os_sync_lock); 15531 } 15532 if (did_start_op && recov_state.rs_sp != NULL) { 15533 mutex_enter(&recov_state.rs_sp->s_lock); 15534 nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi); 15535 mutex_exit(&recov_state.rs_sp->s_lock); 15536 } else { 15537 nfs4_dec_state_ref_count(mi); 15538 } 15539 nfs4_error_zinit(ep); 15540 } 15541 15542 out: 15543 if (have_sync_lock) 15544 mutex_exit(&osp->os_sync_lock); 15545 if (did_start_op) 15546 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state, 15547 recovonly ? TRUE : FALSE); 15548 if (did_force_recovlock) 15549 nfs_rw_exit(&mi->mi_recovlock); 15550 if (cred_otw) 15551 crfree(cred_otw); 15552 if (osp) 15553 open_stream_rele(osp, rp); 15554 if (oop) { 15555 if (did_start_seqid_sync) 15556 nfs4_end_open_seqid_sync(oop); 15557 open_owner_rele(oop); 15558 } 15559 } 15560 15561 /* 15562 * Convert information returned by the server in the LOCK4denied 15563 * structure to the form required by fcntl. 15564 */ 15565 static void 15566 denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args) 15567 { 15568 nfs4_lo_name_t *lo; 15569 15570 #ifdef DEBUG 15571 if (denied_to_flk_debug) { 15572 lockt_denied_debug = lockt_denied; 15573 debug_enter("lockt_denied"); 15574 } 15575 #endif 15576 15577 flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK; 15578 flk->l_whence = 0; /* aka SEEK_SET */ 15579 flk->l_start = lockt_denied->offset; 15580 flk->l_len = lockt_denied->length; 15581 15582 /* 15583 * If the blocking clientid matches our client id, then we can 15584 * interpret the lockowner (since we built it). 
If not, then 15585 * fabricate a sysid and pid. Note that the l_sysid field 15586 * in *flk already has the local sysid. 15587 */ 15588 15589 if (lockt_denied->owner.clientid == lockt_args->owner.clientid) { 15590 15591 if (lockt_denied->owner.owner_len == sizeof (*lo)) { 15592 lo = (nfs4_lo_name_t *) 15593 lockt_denied->owner.owner_val; 15594 15595 flk->l_pid = lo->ln_pid; 15596 } else { 15597 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 15598 "denied_to_flk: bad lock owner length\n")); 15599 15600 flk->l_pid = lo_to_pid(&lockt_denied->owner); 15601 } 15602 } else { 15603 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 15604 "denied_to_flk: foreign clientid\n")); 15605 15606 /* 15607 * Construct a new sysid which should be different from 15608 * sysids of other systems. 15609 */ 15610 15611 flk->l_sysid++; 15612 flk->l_pid = lo_to_pid(&lockt_denied->owner); 15613 } 15614 } 15615 15616 static pid_t 15617 lo_to_pid(lock_owner4 *lop) 15618 { 15619 pid_t pid = 0; 15620 uchar_t *cp; 15621 int i; 15622 15623 cp = (uchar_t *)&lop->clientid; 15624 15625 for (i = 0; i < sizeof (lop->clientid); i++) 15626 pid += (pid_t)*cp++; 15627 15628 cp = (uchar_t *)lop->owner_val; 15629 15630 for (i = 0; i < lop->owner_len; i++) 15631 pid += (pid_t)*cp++; 15632 15633 return (pid); 15634 } 15635 15636 /* 15637 * Given a lock pointer, returns the length of that lock. 15638 * "end" is the last locked offset the "l_len" covers from 15639 * the start of the lock. 15640 */ 15641 static off64_t 15642 lock_to_end(flock64_t *lock) 15643 { 15644 off64_t lock_end; 15645 15646 if (lock->l_len == 0) 15647 lock_end = (off64_t)MAXEND; 15648 else 15649 lock_end = lock->l_start + lock->l_len - 1; 15650 15651 return (lock_end); 15652 } 15653 15654 /* 15655 * Given the end of a lock, it will return you the length "l_len" for that lock. 
15656 */ 15657 static off64_t 15658 end_to_len(off64_t start, off64_t end) 15659 { 15660 off64_t lock_len; 15661 15662 ASSERT(end >= start); 15663 if (end == MAXEND) 15664 lock_len = 0; 15665 else 15666 lock_len = end - start + 1; 15667 15668 return (lock_len); 15669 } 15670 15671 /* 15672 * On given end for a lock it determines if it is the last locked offset 15673 * or not, if so keeps it as is, else adds one to return the length for 15674 * valid start. 15675 */ 15676 static off64_t 15677 start_check(off64_t x) 15678 { 15679 if (x == MAXEND) 15680 return (x); 15681 else 15682 return (x + 1); 15683 } 15684 15685 /* 15686 * See if these two locks overlap, and if so return 1; 15687 * otherwise, return 0. 15688 */ 15689 static int 15690 locks_intersect(flock64_t *llfp, flock64_t *curfp) 15691 { 15692 off64_t llfp_end, curfp_end; 15693 15694 llfp_end = lock_to_end(llfp); 15695 curfp_end = lock_to_end(curfp); 15696 15697 if (((llfp_end >= curfp->l_start) && 15698 (llfp->l_start <= curfp->l_start)) || 15699 ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start))) 15700 return (1); 15701 return (0); 15702 } 15703 15704 /* 15705 * Determine what the intersecting lock region is, and add that to the 15706 * 'nl_llpp' locklist in increasing order (by l_start). 
15707 */ 15708 static void 15709 nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp, 15710 locklist_t **nl_llpp, vnode_t *vp) 15711 { 15712 locklist_t *intersect_llp, *tmp_fllp, *cur_fllp; 15713 off64_t lost_flp_end, local_flp_end, len, start; 15714 15715 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:")); 15716 15717 if (!locks_intersect(lost_flp, local_flp)) 15718 return; 15719 15720 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: " 15721 "locks intersect")); 15722 15723 lost_flp_end = lock_to_end(lost_flp); 15724 local_flp_end = lock_to_end(local_flp); 15725 15726 /* Find the starting point of the intersecting region */ 15727 if (local_flp->l_start > lost_flp->l_start) 15728 start = local_flp->l_start; 15729 else 15730 start = lost_flp->l_start; 15731 15732 /* Find the lenght of the intersecting region */ 15733 if (lost_flp_end < local_flp_end) 15734 len = end_to_len(start, lost_flp_end); 15735 else 15736 len = end_to_len(start, local_flp_end); 15737 15738 /* 15739 * Prepare the flock structure for the intersection found and insert 15740 * it into the new list in increasing l_start order. This list contains 15741 * intersections of locks registered by the client with the local host 15742 * and the lost lock. 15743 * The lock type of this lock is the same as that of the local_flp. 
15744 */ 15745 intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP); 15746 intersect_llp->ll_flock.l_start = start; 15747 intersect_llp->ll_flock.l_len = len; 15748 intersect_llp->ll_flock.l_type = local_flp->l_type; 15749 intersect_llp->ll_flock.l_pid = local_flp->l_pid; 15750 intersect_llp->ll_flock.l_sysid = local_flp->l_sysid; 15751 intersect_llp->ll_flock.l_whence = 0; /* aka SEEK_SET */ 15752 intersect_llp->ll_vp = vp; 15753 15754 tmp_fllp = *nl_llpp; 15755 cur_fllp = NULL; 15756 while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start < 15757 intersect_llp->ll_flock.l_start) { 15758 cur_fllp = tmp_fllp; 15759 tmp_fllp = tmp_fllp->ll_next; 15760 } 15761 if (cur_fllp == NULL) { 15762 /* first on the list */ 15763 intersect_llp->ll_next = *nl_llpp; 15764 *nl_llpp = intersect_llp; 15765 } else { 15766 intersect_llp->ll_next = cur_fllp->ll_next; 15767 cur_fllp->ll_next = intersect_llp; 15768 } 15769 15770 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: " 15771 "created lock region: start %"PRIx64" end %"PRIx64" : %s\n", 15772 intersect_llp->ll_flock.l_start, 15773 intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len, 15774 intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE")); 15775 } 15776 15777 /* 15778 * Our local locking current state is potentially different than 15779 * what the NFSv4 server thinks we have due to a lost lock that was 15780 * resent and then received. We need to reset our "NFSv4" locking 15781 * state to match the current local locking state for this pid since 15782 * that is what the user/application sees as what the world is. 15783 * 15784 * We cannot afford to drop the open/lock seqid sync since then we can 15785 * get confused about what the current local locking state "is" versus 15786 * "was". 15787 * 15788 * If we are unable to fix up the locks, we send SIGLOST to the affected 15789 * process. 
This is not done if the filesystem has been forcibly 15790 * unmounted, in case the process has already exited and a new process 15791 * exists with the same pid. 15792 */ 15793 static void 15794 nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr, 15795 nfs4_lock_owner_t *lop) 15796 { 15797 locklist_t *locks, *llp, *ri_llp, *tmp_llp; 15798 mntinfo4_t *mi = VTOMI4(vp); 15799 const int cmd = F_SETLK; 15800 off64_t cur_start, llp_ll_flock_end, lost_flp_end; 15801 flock64_t ul_fl; 15802 15803 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15804 "nfs4_reinstitute_local_lock_state")); 15805 15806 /* 15807 * Find active locks for this vp from the local locking code. 15808 * Scan through this list and find out the locks that intersect with 15809 * the lost lock. Once we find the lock that intersects, add the 15810 * intersection area as a new lock to a new list "ri_llp". The lock 15811 * type of the intersection region lock added to ri_llp is the same 15812 * as that found in the active lock list, "list". The intersecting 15813 * region locks are added to ri_llp in increasing l_start order. 15814 */ 15815 ASSERT(nfs_zone() == mi->mi_zone); 15816 15817 locks = flk_active_locks_for_vp(vp); 15818 ri_llp = NULL; 15819 15820 for (llp = locks; llp != NULL; llp = llp->ll_next) { 15821 ASSERT(llp->ll_vp == vp); 15822 /* 15823 * Pick locks that belong to this pid/lockowner 15824 */ 15825 if (llp->ll_flock.l_pid != lost_flp->l_pid) 15826 continue; 15827 15828 nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp); 15829 } 15830 15831 /* 15832 * Now we have the list of intersections with the lost lock. These are 15833 * the locks that were/are active before the server replied to the 15834 * last/lost lock. Issue these locks to the server here. Playing these 15835 * locks to the server will re-establish aur current local locking state 15836 * with the v4 server. 15837 * If we get an error, send SIGLOST to the application for that lock. 
15838 */ 15839 15840 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) { 15841 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15842 "nfs4_reinstitute_local_lock_state: need to issue " 15843 "flock: [%"PRIx64" - %"PRIx64"] : %s", 15844 llp->ll_flock.l_start, 15845 llp->ll_flock.l_start + llp->ll_flock.l_len, 15846 llp->ll_flock.l_type == F_RDLCK ? "READ" : 15847 llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID")); 15848 /* 15849 * No need to relock what we already have 15850 */ 15851 if (llp->ll_flock.l_type == lost_flp->l_type) 15852 continue; 15853 15854 push_reinstate(vp, cmd, &llp->ll_flock, cr, lop); 15855 } 15856 15857 /* 15858 * Now keeping the start of the lost lock as our reference parse the 15859 * newly created ri_llp locklist to find the ranges that we have locked 15860 * with the v4 server but not in the current local locking. We need 15861 * to unlock these ranges. 15862 * These ranges can also be reffered to as those ranges, where the lost 15863 * lock does not overlap with the locks in the ri_llp but are locked 15864 * since the server replied to the lost lock. 
15865 */ 15866 cur_start = lost_flp->l_start; 15867 lost_flp_end = lock_to_end(lost_flp); 15868 15869 ul_fl.l_type = F_UNLCK; 15870 ul_fl.l_whence = 0; /* aka SEEK_SET */ 15871 ul_fl.l_sysid = lost_flp->l_sysid; 15872 ul_fl.l_pid = lost_flp->l_pid; 15873 15874 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) { 15875 llp_ll_flock_end = lock_to_end(&llp->ll_flock); 15876 15877 if (llp->ll_flock.l_start <= cur_start) { 15878 cur_start = start_check(llp_ll_flock_end); 15879 continue; 15880 } 15881 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15882 "nfs4_reinstitute_local_lock_state: " 15883 "UNLOCK [%"PRIx64" - %"PRIx64"]", 15884 cur_start, llp->ll_flock.l_start)); 15885 15886 ul_fl.l_start = cur_start; 15887 ul_fl.l_len = end_to_len(cur_start, 15888 (llp->ll_flock.l_start - 1)); 15889 15890 push_reinstate(vp, cmd, &ul_fl, cr, lop); 15891 cur_start = start_check(llp_ll_flock_end); 15892 } 15893 15894 /* 15895 * In the case where the lost lock ends after all intersecting locks, 15896 * unlock the last part of the lost lock range. 15897 */ 15898 if (cur_start != start_check(lost_flp_end)) { 15899 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15900 "nfs4_reinstitute_local_lock_state: UNLOCK end of the " 15901 "lost lock region [%"PRIx64" - %"PRIx64"]", 15902 cur_start, lost_flp->l_start + lost_flp->l_len)); 15903 15904 ul_fl.l_start = cur_start; 15905 /* 15906 * Is it an to-EOF lock? 
if so unlock till the end 15907 */ 15908 if (lost_flp->l_len == 0) 15909 ul_fl.l_len = 0; 15910 else 15911 ul_fl.l_len = start_check(lost_flp_end) - cur_start; 15912 15913 push_reinstate(vp, cmd, &ul_fl, cr, lop); 15914 } 15915 15916 if (locks != NULL) 15917 flk_free_locklist(locks); 15918 15919 /* Free up our newly created locklist */ 15920 for (llp = ri_llp; llp != NULL; ) { 15921 tmp_llp = llp->ll_next; 15922 kmem_free(llp, sizeof (locklist_t)); 15923 llp = tmp_llp; 15924 } 15925 15926 /* 15927 * Now return back to the original calling nfs4frlock() 15928 * and let us naturally drop our seqid syncs. 15929 */ 15930 } 15931 15932 /* 15933 * Create a lost state record for the given lock reinstantiation request 15934 * and push it onto the lost state queue. 15935 */ 15936 static void 15937 push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr, 15938 nfs4_lock_owner_t *lop) 15939 { 15940 nfs4_lost_rqst_t req; 15941 nfs_lock_type4 locktype; 15942 nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS }; 15943 15944 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 15945 15946 locktype = flk_to_locktype(cmd, flk->l_type); 15947 nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype, 15948 NULL, NULL, lop, flk, &req, cr, vp); 15949 (void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 15950 (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ? 15951 &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK, 15952 NULL); 15953 } 15954