/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
 * All Rights Reserved
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/dirent.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/pathconf.h>
#include <sys/utsname.h>
#include <sys/dnlc.h>
#include <sys/acl.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/sdt.h>
#include <sys/list.h>
#include <sys/stat.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>
#include <nfs/nfs4.h>
#include <nfs/nfs4_kprot.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kpm.h>
#include <vm/seg_vn.h>

#include <fs/fs_subr.h>

#include <sys/ddi.h>
#include <sys/int_fmtio.h>

typedef struct {
	nfs4_ga_res_t	*di_garp;
	cred_t		*di_cred;
	hrtime_t	di_time_call;
} dirattr_info_t;

typedef enum nfs4_acl_op {
	NFS4_ACL_GET,
	NFS4_ACL_SET
} nfs4_acl_op_t;

static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *mi);

static void	nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *,
			char *, dirattr_info_t *);

static void	nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *,
			nfs4_open_stream_t *, int *, int *, nfs4_close_type_t,
			nfs4_error_t *, int *);
static int	nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *);
static int	nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
			stable_how4 *);
static int	nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *,
			cred_t *, bool_t, struct uio *);
static int	nfs4setattr(vnode_t *, struct vattr *, int, cred_t *,
			vsecattr_t *);
static int	nfs4openattr(vnode_t *, vnode_t **, int, cred_t *);
static int	nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int);
static int	nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *);
static int	nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *);
static int	nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *);
static int	nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *);
static int	nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **,
			cred_t *, int, int, enum createmode4, int);
static int	nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
			caller_context_t *);
static int	nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *,
			vnode_t *, char *, cred_t *, nfsstat4 *);
static int	nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *,
			vnode_t *, char *, cred_t *, nfsstat4 *);
static int	do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
static void	nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
static int	nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t);
static int	nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static void	nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
			cred_t *);
static int	nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static int	nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static int	nfs4_commit(vnode_t *, offset4, count4, cred_t *);
static void	nfs4_set_mod(vnode_t *);
static void	nfs4_get_commit(vnode_t *);
static void	nfs4_get_commit_range(vnode_t *, u_offset_t, size_t);
static int	nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
static int	nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int);
static int	nfs4_sync_commit(vnode_t *, page_t *, offset3, count3,
			cred_t *);
static void	do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3,
			cred_t *);
static int	nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *,
			hrtime_t, vnode_t *, cred_t *);
static int	nfs4_open_non_reg_file(vnode_t **, int, cred_t *);
static int	nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *);
static void	nfs4_register_lock_locally(vnode_t *, struct flock64 *, int,
			u_offset_t);
static int	nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *);
static int	nfs4_block_and_wait(clock_t *, rnode4_t *);
static cred_t  *state_to_cred(nfs4_open_stream_t *);
static int	vtoname(vnode_t *, char *, ssize_t);
static void	denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *);
static pid_t	lo_to_pid(lock_owner4 *);
static void	nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *,
			cred_t *, nfs4_lock_owner_t *);
static void	push_reinstate(vnode_t *, int, flock64_t *, cred_t *,
			nfs4_lock_owner_t *);
static int	open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **);
static void	nfs4_delmap_callback(struct as *, void *, uint_t);
static void	nfs4_free_delmapcall(nfs4_delmapcall_t *);
static nfs4_delmapcall_t	*nfs4_init_delmapcall();
static int	nfs4_find_and_delete_delmapcall(rnode4_t *, int *);
static int	nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t);
static int	nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *,
			uid_t, gid_t, int);

/*
 * Routines that implement the setting of v4 args for the misc. ops
 */
static void	nfs4args_lock_free(nfs_argop4 *);
static void	nfs4args_lockt_free(nfs_argop4 *);
static void	nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *,
			int, rnode4_t *, cred_t *, bitmap4, int *,
			nfs4_stateid_types_t *);
static void	nfs4args_setattr_free(nfs_argop4 *);
static int	nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4,
			bitmap4);
static void	nfs4args_verify_free(nfs_argop4 *);
static void	nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *,
			WRITE4args **, nfs4_stateid_types_t *);

/*
 * These are the vnode ops functions that implement the vnode interface to
 * the networked file system.  See more comments below at nfs4_vnodeops.
 */
static int	nfs4_open(vnode_t **, int, cred_t *, caller_context_t *);
static int	nfs4_close(vnode_t *, int, int, offset_t, cred_t *,
			caller_context_t *);
static int	nfs4_read(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs4_write(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
			caller_context_t *);
static int	nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *,
			caller_context_t *);
static int	nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *);
static int	nfs4_readlink(vnode_t *, struct uio *, cred_t *,
			caller_context_t *);
static int	nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *);
static int	nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *, int, caller_context_t *,
			vsecattr_t *);
static int	nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *,
			int);
static int	nfs4_link(vnode_t *, vnode_t *, char *, cred_t *,
			caller_context_t *, int);
static int	nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
			caller_context_t *, int);
static int	nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
			cred_t *, caller_context_t *, int, vsecattr_t *);
static int	nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
			caller_context_t *, int);
static int	nfs4_symlink(vnode_t *, char *, struct vattr *, char *,
			cred_t *, caller_context_t *, int);
static int	nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *,
			caller_context_t *, int);
static int	nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
static int	nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *, caller_context_t *);
static int	nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
			caller_context_t *);
static int	nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
static int	nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
static int	nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *);
static int	nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
			struct flk_callback *, cred_t *, caller_context_t *);
static int	nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t,
			cred_t *, caller_context_t *);
static int	nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
			uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
static int	nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *, caller_context_t *);
static void	nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *,
			caller_context_t *);
static int	nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
			caller_context_t *);
/*
 * These vnode ops are required to be called from outside this source file,
 * e.g. by ephemeral mount stub vnode ops, and so may not be declared
 * as static.
 */
int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
	    caller_context_t *);
void	nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
int	nfs4_lookup(vnode_t *, char *, vnode_t **,
	    struct pathname *, int, vnode_t *, cred_t *,
	    caller_context_t *, int *, pathname_t *);
int	nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
int	nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
	    caller_context_t *);
int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
	    caller_context_t *);
int	nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
	    caller_context_t *);

/*
 * Used for nfs4_commit_vp() to indicate if we should
 * wait on pending writes.
 */
#define	NFS4_WRITE_NOWAIT	0
#define	NFS4_WRITE_WAIT		1

#define	NFS4_BASE_WAIT_TIME	1	/* 1 second */

/*
 * Error flags used to pass information about certain special errors
 * which need to be handled specially.
 */
#define	NFS_EOF			-98
#define	NFS_VERF_MISMATCH	-97

/*
 * Flags used to differentiate between which operation drove the
 * potential CLOSE OTW. (see nfs4_close_otw_if_necessary)
 */
#define	NFS4_CLOSE_OP		0x1
#define	NFS4_DELMAP_OP		0x2
#define	NFS4_INACTIVE_OP	0x3

#define	ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO))

/* ALIGN64 aligns the given buffer and adjusts the buffer size to 64 bit */
#define	ALIGN64(x, ptr, sz)					\
	x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);	\
	if (x) {						\
		x = sizeof (uint64_t) - (x);			\
		sz -= (x);					\
		ptr += (x);					\
	}
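/*
 * Worked example, added for clarity (not part of the original source):
 * with a ptr whose address ends in 0x...b and sz = 100, x is first
 * computed as 3, then becomes 8 - 3 = 5, so ptr advances 5 bytes to the
 * next 64-bit boundary and sz shrinks to 95.
 */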
#ifdef DEBUG
int nfs4_client_attr_debug = 0;
int nfs4_client_state_debug = 0;
int nfs4_client_shadow_debug = 0;
int nfs4_client_lock_debug = 0;
int nfs4_seqid_sync = 0;
int nfs4_client_map_debug = 0;
static int nfs4_pageio_debug = 0;
int nfs4_client_inactive_debug = 0;
int nfs4_client_recov_debug = 0;
int nfs4_client_failover_debug = 0;
int nfs4_client_call_debug = 0;
int nfs4_client_lookup_debug = 0;
int nfs4_client_zone_debug = 0;
int nfs4_lost_rqst_debug = 0;
int nfs4_rdattrerr_debug = 0;
int nfs4_open_stream_debug = 0;

int nfs4read_error_inject;

static int nfs4_create_misses = 0;

static int nfs4_readdir_cache_shorts = 0;
static int nfs4_readdir_readahead = 0;

static int nfs4_bio_do_stop = 0;

static int nfs4_lostpage = 0;	/* number of times we lost original page */

int nfs4_mmap_debug = 0;

static int nfs4_pathconf_cache_hits = 0;
static int nfs4_pathconf_cache_misses = 0;

int nfs4close_all_cnt;
int nfs4close_one_debug = 0;
int nfs4close_notw_debug = 0;

int denied_to_flk_debug = 0;
void *lockt_denied_debug;

#endif

/*
 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
 * or NFS4ERR_RESOURCE.
 */
static int confirm_retry_sec = 30;

static int nfs4_lookup_neg_cache = 1;

/*
 * number of pages to read ahead
 * optimized for 100 base-T.
 */
static int nfs4_nra = 4;

static int nfs4_do_symlink_cache = 1;

static int nfs4_pathconf_disable_cache = 0;

/*
 * These are the vnode ops routines which implement the vnode interface to
 * the networked file system.  These routines just take their parameters,
 * make them look networkish by putting the right info into interface structs,
 * and then calling the appropriate remote routine(s) to do the work.
 *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
 * we purge the directory cache relative to that vnode.  This way, the
 * user won't get burned by the cache repeatedly.  See <nfs/rnode4.h> for
 * more details on rnode locking.
 */

struct vnodeops *nfs4_vnodeops;

const fs_operation_def_t nfs4_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs4_open },
	VOPNAME_CLOSE,		{ .vop_close = nfs4_close },
	VOPNAME_READ,		{ .vop_read = nfs4_read },
	VOPNAME_WRITE,		{ .vop_write = nfs4_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = nfs4_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = nfs4_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = nfs4_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = nfs4_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = nfs4_frlock },
	VOPNAME_SPACE,		{ .vop_space = nfs4_space },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = nfs4_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs4_putpage },
	VOPNAME_MAP,		{ .vop_map = nfs4_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = nfs4_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = nfs4_delmap },
	/* no separate nfs4_dump */
	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = nfs4_pageio },
	VOPNAME_DISPOSE,	{ .vop_dispose = nfs4_dispose },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs4_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs4_shrlock },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};

/*
 * The following are subroutines and definitions to set args or get res
 * for the different nfsv4 ops
 */

void
nfs4args_lookup_free(nfs_argop4 *argop, int arglen)
{
	int i;

	for (i = 0; i < arglen; i++) {
		if (argop[i].argop == OP_LOOKUP) {
			kmem_free(
			    argop[i].nfs_argop4_u.oplookup.
			    objname.utf8string_val,
			    argop[i].nfs_argop4_u.oplookup.
			    objname.utf8string_len);
		}
	}
}

static void
nfs4args_lock_free(nfs_argop4 *argop)
{
	locker4 *locker = &argop->nfs_argop4_u.oplock.locker;

	if (locker->new_lock_owner == TRUE) {
		open_to_lock_owner4 *open_owner;

		open_owner = &locker->locker4_u.open_owner;
		if (open_owner->lock_owner.owner_val != NULL) {
			kmem_free(open_owner->lock_owner.owner_val,
			    open_owner->lock_owner.owner_len);
		}
	}
}

static void
nfs4args_lockt_free(nfs_argop4 *argop)
{
	lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner;

	if (lowner->owner_val != NULL) {
		kmem_free(lowner->owner_val, lowner->owner_len);
	}
}

static void
nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags,
    rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error,
    nfs4_stateid_types_t *sid_types)
{
	fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes;
	mntinfo4_t *mi;

	argop->argop = OP_SETATTR;
	/*
	 * The stateid is set to 0 if client is not modifying the size
	 * and otherwise to whatever nfs4_get_stateid() returns.
	 *
	 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no
	 * state struct could be found for the process/file pair.  We may
	 * want to change this in the future (by OPENing the file).  See
	 * bug # 4474852.
	 */
	if (vap->va_mask & AT_SIZE) {

		ASSERT(rp != NULL);
		mi = VTOMI4(RTOV4(rp));

		argop->nfs_argop4_u.opsetattr.stateid =
		    nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
		    OP_SETATTR, sid_types, FALSE);
	} else {
		bzero(&argop->nfs_argop4_u.opsetattr.stateid,
		    sizeof (stateid4));
	}

	*error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp);
	if (*error)
		bzero(attr, sizeof (*attr));
}

static void
nfs4args_setattr_free(nfs_argop4 *argop)
{
	nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes);
}

static int
nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op,
    bitmap4 supp)
{
	fattr4 *attr;
	int error = 0;

	argop->argop = op;
	switch (op) {
	case OP_VERIFY:
		attr = &argop->nfs_argop4_u.opverify.obj_attributes;
		break;
	case OP_NVERIFY:
		attr = &argop->nfs_argop4_u.opnverify.obj_attributes;
		break;
	default:
		return (EINVAL);
	}
	if (!error)
		error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp);
	if (error)
		bzero(attr, sizeof (*attr));
	return (error);
}

static void
nfs4args_verify_free(nfs_argop4 *argop)
{
	switch (argop->argop) {
	case OP_VERIFY:
		nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes);
		break;
	case OP_NVERIFY:
		nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes);
		break;
	default:
		break;
	}
}

static void
nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr,
    WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp)
{
	WRITE4args *wargs = &argop->nfs_argop4_u.opwrite;
	mntinfo4_t *mi = VTOMI4(RTOV4(rp));

	argop->argop = OP_WRITE;
	wargs->stable = stable;
	wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id,
	    mi, OP_WRITE, sid_tp);
	wargs->mblk = NULL;
	*wargs_pp = wargs;
}

void
nfs4args_copen_free(OPEN4cargs *open_args)
{
	if (open_args->owner.owner_val) {
		kmem_free(open_args->owner.owner_val,
		    open_args->owner.owner_len);
	}
	if ((open_args->opentype == OPEN4_CREATE) &&
	    (open_args->mode != EXCLUSIVE4)) {
		nfs4_fattr4_free(&open_args->createhow4_u.createattrs);
	}
}

/*
 * XXX:  This is referenced in modstubs.s
 */
struct vnodeops *
nfs4_getvnodeops(void)
{
	return (nfs4_vnodeops);
}

/*
 * The OPEN operation opens a regular file.
 */
/*ARGSUSED3*/
static int
nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	vnode_t *dvp = NULL;
	rnode4_t *rp, *drp;
	int error;
	int just_been_created;
	char fn[MAXNAMELEN];

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: "));
	if (nfs_zone() != VTOMI4(*vpp)->mi_zone)
		return (EIO);
	rp = VTOR4(*vpp);

	/*
	 * Check to see if opening something besides a regular file;
	 * if so skip the OTW call
	 */
	if ((*vpp)->v_type != VREG) {
		error = nfs4_open_non_reg_file(vpp, flag, cr);
		return (error);
	}

	/*
	 * XXX - would like a check right here to know if the file is
	 * executable or not, so as to skip OTW
	 */

	if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0)
		return (error);

	drp = VTOR4(dvp);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
		return (EINTR);

	if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) {
		nfs_rw_exit(&drp->r_rwlock);
		return (error);
	}

	/*
	 * See if this file has just been CREATEd.
	 * If so, clear the flag and update the dnlc, which was previously
	 * skipped in nfs4_create.
	 * XXX need better serialization on this.
	 * XXX move this into the nfs4open_otw call, after we have
	 * XXX acquired the open owner seqid sync.
	 */
	mutex_enter(&rp->r_statev4_lock);
	if (rp->created_v4) {
		rp->created_v4 = 0;
		mutex_exit(&rp->r_statev4_lock);

		dnlc_update(dvp, fn, *vpp);
		/* This is needed so we don't bump the open ref count */
		just_been_created = 1;
	} else {
		mutex_exit(&rp->r_statev4_lock);
		just_been_created = 0;
	}

	/*
	 * If caller specified O_TRUNC/FTRUNC, then be sure to set
	 * FWRITE (to drive successful setattr(size=0) after open)
	 */
	if (flag & FTRUNC)
		flag |= FWRITE;

	error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0,
	    just_been_created);

	if (!error && !((*vpp)->v_flag & VROOT))
		dnlc_update(dvp, fn, *vpp);

	nfs_rw_exit(&drp->r_rwlock);

	/* release the hold from vtodv */
	VN_RELE(dvp);

	/* exchange the shadow for the master vnode, if needed */

	if (error == 0 && IS_SHADOW(*vpp, rp))
		sv_exchange(vpp);

	return (error);
}

/*
 * See if there's a "lost open" request to be saved and recovered.
 */
static void
nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
    nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp,
    vnode_t *dvp, OPEN4cargs *open_args)
{
	vfs_t *vfsp;
	char *srccfp;

	vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp);

	if (error != ETIMEDOUT && error != EINTR &&
	    !NFS4_FRC_UNMT_ERR(error, vfsp)) {
		lost_rqstp->lr_op = 0;
		return;
	}

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
	    "nfs4open_save_lost_rqst: error %d", error));

	lost_rqstp->lr_op = OP_OPEN;

	/*
	 * The vp (if it is not NULL) and dvp are held and rele'd via
	 * the recovery code.  See nfs4_save_lost_rqst.
	 */
	lost_rqstp->lr_vp = vp;
	lost_rqstp->lr_dvp = dvp;
	lost_rqstp->lr_oop = oop;
	lost_rqstp->lr_osp = NULL;
	lost_rqstp->lr_lop = NULL;
	lost_rqstp->lr_cr = cr;
	lost_rqstp->lr_flk = NULL;
	lost_rqstp->lr_oacc = open_args->share_access;
	lost_rqstp->lr_odeny = open_args->share_deny;
	lost_rqstp->lr_oclaim = open_args->claim;
	if (open_args->claim == CLAIM_DELEGATE_CUR) {
		lost_rqstp->lr_ostateid =
		    open_args->open_claim4_u.delegate_cur_info.delegate_stateid;
		srccfp = open_args->open_claim4_u.delegate_cur_info.cfile;
	} else {
		srccfp = open_args->open_claim4_u.cfile;
	}
	lost_rqstp->lr_ofile.utf8string_len = 0;
	lost_rqstp->lr_ofile.utf8string_val = NULL;
	(void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile);
	lost_rqstp->lr_putfirst = FALSE;
}

struct nfs4_excl_time {
	uint32 seconds;
	uint32 nseconds;
};
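/*
 * Note added for clarity: the two uint32 fields give this struct the same
 * 64-bit size as a createverf4 verifier; nfs4open_otw() below copies it
 * wholesale into the OPEN args via *(uint64_t *)&verf for EXCLUSIVE4
 * creates.
 */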
/*
 * The OPEN operation creates and/or opens a regular file
 *
 * ARGSUSED
 */
static int
nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va,
    vnode_t **vpp, cred_t *cr, int create_flag, int open_flag,
    enum createmode4 createmode, int file_just_been_created)
{
	rnode4_t *rp;
	rnode4_t *drp = VTOR4(dvp);
	vnode_t *vp = NULL;
	vnode_t *vpi = *vpp;
	bool_t needrecov = FALSE;

	int doqueue = 1;

	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	int argoplist_size;
	int idx_open, idx_fattr;

	GETFH4res *gf_res = NULL;
	OPEN4res *op_res = NULL;
	nfs4_ga_res_t *garp;
	fattr4 *attr = NULL;
	struct nfs4_excl_time verf;
	bool_t did_excl_setup = FALSE;
	int created_osp;

	OPEN4cargs *open_args;
	nfs4_open_owner_t	*oop = NULL;
	nfs4_open_stream_t	*osp = NULL;
	seqid4 seqid = 0;
	bool_t retry_open = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_lost_rqst_t lost_rqst;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	hrtime_t t;
	int acc = 0;
	cred_t *cred_otw = NULL;	/* cred used to do the RPC call */
	cred_t *ncr = NULL;

	nfs4_sharedfh_t *otw_sfh;
	nfs4_sharedfh_t *orig_sfh;
	int fh_differs = 0;
	int numops, setgid_flag;
	int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1;

	/*
	 * Make sure we properly deal with setting the right gid on
	 * a newly created file to reflect the parent's setgid bit
	 */
	setgid_flag = 0;
	if (create_flag && in_va) {

		/*
		 * If the parent's directory has the setgid bit set
		 * _and_ the client was able to get a valid mapping
		 * for the parent dir's owner_group, we want to
		 * append NVERIFY(owner_group == dva.va_gid) and
		 * SETATTR to the CREATE compound.
		 */
		mutex_enter(&drp->r_statelock);
		if (drp->r_attr.va_mode & VSGID &&
		    drp->r_attr.va_gid != GID_NOBODY) {
			in_va->va_gid = drp->r_attr.va_gid;
			setgid_flag = 1;
		}
		mutex_exit(&drp->r_statelock);
	}

	/*
	 * Normal/non-create compound:
	 *	PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new)
	 *
	 * Open(create) compound no setgid:
	 *	PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) +
	 *	RESTOREFH + GETATTR
	 *
	 * Open(create) setgid:
	 *	PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) +
	 *	SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH +
	 *	NVERIFY(grp) + SETATTR
	 */
	if (setgid_flag) {
		numops = 10;
		idx_open = 1;
		idx_fattr = 3;
	} else if (create_flag) {
		numops = 7;
		idx_open = 2;
		idx_fattr = 4;
	} else {
		numops = 4;
		idx_open = 1;
		idx_fattr = 3;
	}
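	/*
	 * Index map, added for clarity (derived from the assignments above
	 * and below):
	 *	non-create:    0:PUTFH 1:OPEN 2:GETFH 3:GETATTR(new)
	 *	create:        0:PUTFH 1:SAVEFH 2:OPEN 3:GETFH 4:GETATTR(new)
	 *	               5:RESTOREFH 6:GETATTR(dir)
	 *	create+setgid: 0:PUTFH 1:OPEN 2:GETFH 3:GETATTR(new) 4:SAVEFH
	 *	               5:PUTFH(dir) 6:GETATTR(dir) 7:RESTOREFH
	 *	               8:NVERIFY 9:SETATTR
	 * idx_open and idx_fattr locate the OPEN and GETATTR(new) ops.
	 */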
	args.array_len = numops;
	argoplist_size = numops * sizeof (nfs_argop4);
	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: "
	    "open %s open flag 0x%x cred %p", file_name, open_flag,
	    (void *)cr));

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
	if (create_flag) {
		/*
		 * We are to create a file.  Initialize the passed in vnode
		 * pointer.
		 */
		vpi = NULL;
	} else {
		/*
		 * Check to see if the client owns a read delegation and is
		 * trying to open for write.  If so, then return the delegation
		 * to avoid the server doing a cb_recall and returning DELAY.
		 * NB - we don't use the statev4_lock here because we'd have
		 * to drop the lock anyway and the result would be stale.
		 */
		if ((open_flag & FWRITE) &&
		    VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ)
			(void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN);

		/*
		 * If the file has a delegation, then do an access check up
		 * front.  This avoids having to do an access check later
		 * after we've already done start_op, which could deadlock.
		 */
		if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) {
			if (open_flag & FREAD &&
			    nfs4_access(vpi, VREAD, 0, cr, NULL) == 0)
				acc |= VREAD;
			if (open_flag & FWRITE &&
			    nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0)
				acc |= VWRITE;
		}
	}

	drp = VTOR4(dvp);

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
	cred_otw = cr;

recov_retry:
	fh_differs = 0;
	nfs4_error_zinit(&e);

	e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state);
	if (e.error) {
		if (ncr != NULL)
			crfree(ncr);
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	args.ctag = TAG_OPEN;
	args.array_len = numops;
	args.array = argop;

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

	/* OPEN: either op 1 or op 2 depending upon create/setgid flags */
	argop[idx_open].argop = OP_COPEN;
	open_args = &argop[idx_open].nfs_argop4_u.opcopen;
	open_args->claim = CLAIM_NULL;

	/* name of file */
	open_args->open_claim4_u.cfile = file_name;
	open_args->owner.owner_len = 0;
	open_args->owner.owner_val = NULL;

	if (create_flag) {
		/* CREATE a file */
		open_args->opentype = OPEN4_CREATE;
		open_args->mode = createmode;
		if (createmode == EXCLUSIVE4) {
			if (did_excl_setup == FALSE) {
				verf.seconds = nfs_atoi(hw_serial);
				if (verf.seconds != 0)
					verf.nseconds = newnum();
				else {
					timestruc_t now;

					gethrestime(&now);
					verf.seconds = now.tv_sec;
					verf.nseconds = now.tv_nsec;
				}
				/*
				 * Since the server will use this value for the
				 * mtime, make sure that it can't overflow.
				 * Zero out the MSB.  The actual value does not
				 * matter here, only its uniqueness.
				 */
				verf.seconds &= INT32_MAX;
				did_excl_setup = TRUE;
			}

			/* Now copy over verifier to OPEN4args. */
			open_args->createhow4_u.createverf = *(uint64_t *)&verf;
		} else {
			int v_error;
			bitmap4 supp_attrs;
			servinfo4_t *svp;

			attr = &open_args->createhow4_u.createattrs;

			svp = drp->r_server;
			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			supp_attrs = svp->sv_supp_attrs;
			nfs_rw_exit(&svp->sv_lock);

			/* GUARDED4 or UNCHECKED4 */
			v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN,
			    supp_attrs);
			if (v_error) {
				bzero(attr, sizeof (*attr));
				nfs4args_copen_free(open_args);
				nfs4_end_op(VTOMI4(dvp), dvp, vpi,
				    &recov_state, FALSE);
				if (ncr != NULL)
					crfree(ncr);
				kmem_free(argop, argoplist_size);
				return (v_error);
			}
		}
	} else {
		/* NO CREATE */
		open_args->opentype = OPEN4_NOCREATE;
	}

	if (recov_state.rs_sp != NULL) {
		mutex_enter(&recov_state.rs_sp->s_lock);
		open_args->owner.clientid = recov_state.rs_sp->clientid;
		mutex_exit(&recov_state.rs_sp->s_lock);
	} else {
		/* XXX should we just fail here? */
		open_args->owner.clientid = 0;
	}

	/*
	 * This increments oop's ref count or creates a temporary 'just_created'
	 * open owner that will become valid when this OPEN/OPEN_CONFIRM call
	 * completes.
	 */
	mutex_enter(&VTOMI4(dvp)->mi_lock);

	/* See if a permanent or just created open owner exists */
	oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp));
	if (!oop) {
		/*
		 * This open owner does not exist so create a temporary
		 * just created one.
		 */
		oop = create_open_owner(cr, VTOMI4(dvp));
		ASSERT(oop != NULL);
	}
	mutex_exit(&VTOMI4(dvp)->mi_lock);

	/* this length never changes, do alloc before seqid sync */
	open_args->owner.owner_len = sizeof (oop->oo_name);
	open_args->owner.owner_val =
	    kmem_alloc(open_args->owner.owner_len, KM_SLEEP);

	e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp));
	if (e.error == EAGAIN) {
		open_owner_rele(oop);
		nfs4args_copen_free(open_args);
		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
		if (ncr != NULL) {
			crfree(ncr);
			ncr = NULL;
		}
		goto recov_retry;
	}

	/* Check to see if we need to do the OTW call */
	if (!create_flag) {
		if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi,
		    file_just_been_created, &e.error, acc, &recov_state)) {

			/*
			 * The OTW open is not necessary.  Either
			 * the open can succeed without it (eg.
			 * delegation, error == 0) or the open
			 * must fail due to an access failure
			 * (error != 0).  In either case, tidy
			 * up and return.
			 */

			nfs4_end_open_seqid_sync(oop);
			open_owner_rele(oop);
			nfs4args_copen_free(open_args);
			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE);
			if (ncr != NULL)
				crfree(ncr);
			kmem_free(argop, argoplist_size);
			return (e.error);
		}
	}

	bcopy(&oop->oo_name, open_args->owner.owner_val,
	    open_args->owner.owner_len);

	seqid = nfs4_get_open_seqid(oop) + 1;
	open_args->seqid = seqid;
	open_args->share_access = 0;
	if (open_flag & FREAD)
		open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
	if (open_flag & FWRITE)
		open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
	open_args->share_deny = OPEN4_SHARE_DENY_NONE;


	/*
	 * getfh w/sanity check for idx_open/idx_fattr
	 */
	ASSERT((idx_open + 1) == (idx_fattr - 1));
	argop[idx_open + 1].argop = OP_GETFH;

	/* getattr */
	argop[idx_fattr].argop = OP_GETATTR;
	argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	if (setgid_flag) {
		vattr_t	_v;
		servinfo4_t *svp;
		bitmap4	supp_attrs;

		svp = drp->r_server;
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		supp_attrs = svp->sv_supp_attrs;
		nfs_rw_exit(&svp->sv_lock);

		/*
		 * For setgid case, we need to:
		 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
		 */
		argop[4].argop = OP_SAVEFH;

		argop[5].argop = OP_CPUTFH;
		argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

		argop[6].argop = OP_GETATTR;
		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

		argop[7].argop = OP_RESTOREFH;

		/*
		 * nverify
		 */
		_v.va_mask = AT_GID;
		_v.va_gid = in_va->va_gid;
		if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
		    supp_attrs))) {

			/*
			 * setattr
			 *
			 * We _know_ we're not messing with AT_SIZE or
			 * AT_XTIME, so no need for stateid or flags.
			 * Also we specify NULL rp since we're only
			 * interested in setting owner_group attributes.
			 */
			nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr,
			    supp_attrs, &e.error, 0);
			if (e.error)
				nfs4args_verify_free(&argop[8]);
		}

		if (e.error) {
			/*
			 * XXX - Revisit the last argument to nfs4_end_op()
			 *	 once 5020486 is fixed.
			 */
			nfs4_end_open_seqid_sync(oop);
			open_owner_rele(oop);
			nfs4args_copen_free(open_args);
			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
			if (ncr != NULL)
				crfree(ncr);
			kmem_free(argop, argoplist_size);
			return (e.error);
		}
	} else if (create_flag) {
		/*
		 * For the non-setgid create case:
		 * 1:savefh(dir) 5:restorefh(dir) 6:getattr(dir)
		 */
		argop[1].argop = OP_SAVEFH;

		argop[5].argop = OP_RESTOREFH;

		argop[6].argop = OP_GETATTR;
		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
	}

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4open_otw: %s call, nm %s, rp %s",
	    needrecov ? "recov" : "first", file_name,
	    rnode4info(VTOR4(dvp))));

	t = gethrtime();

	rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e);

	if (!e.error && nfs4_need_to_bump_seqid(&res))
		nfs4_set_open_seqid(seqid, oop, args.ctag);

	needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp);

	if (e.error || needrecov) {
		bool_t abort = FALSE;

		if (needrecov) {
			nfs4_bseqid_entry_t *bsep = NULL;

			nfs4open_save_lost_rqst(e.error, &lost_rqst, oop,
			    cred_otw, vpi, dvp, open_args);

			if (!e.error && res.status == NFS4ERR_BAD_SEQID) {
				bsep = nfs4_create_bseqid_entry(oop, NULL,
				    vpi, 0, args.ctag, open_args->seqid);
				num_bseqid_retry--;
			}

			abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi,
			    NULL, lost_rqst.lr_op == OP_OPEN ?
			    &lost_rqst : NULL, OP_OPEN, bsep);

			if (bsep)
				kmem_free(bsep, sizeof (*bsep));
			/* give up if we keep getting BAD_SEQID */
			if (num_bseqid_retry == 0)
				abort = TRUE;
			if (abort == TRUE && e.error == 0)
				e.error = geterrno4(res.status);
		}
		nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
		nfs4args_copen_free(open_args);
		if (setgid_flag) {
			nfs4args_verify_free(&argop[8]);
			nfs4args_setattr_free(&argop[9]);
		}
		if (!e.error)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		if (ncr != NULL) {
			crfree(ncr);
			ncr = NULL;
		}
		if (!needrecov || abort == TRUE || e.error == EINTR ||
		    NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) {
			kmem_free(argop, argoplist_size);
			return (e.error);
		}
		goto recov_retry;
	}

	/*
	 * Will check and update lease after checking the rflag for
	 * OPEN_CONFIRM in the successful OPEN call.
	 */
	if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {

		/*
		 * XXX what if we're crossing mount points from server1:/drp
		 * to server2:/drp/rp.
		 */

		/* Signal our end of use of the open seqid */
		nfs4_end_open_seqid_sync(oop);

		/*
		 * This will destroy the open owner if it was just created,
		 * and no one else has put a reference on it.
		 */
		open_owner_rele(oop);
		if (create_flag && (createmode != EXCLUSIVE4) &&
		    res.status == NFS4ERR_BADOWNER)
			nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);

		e.error = geterrno4(res.status);
		nfs4args_copen_free(open_args);
		if (setgid_flag) {
			nfs4args_verify_free(&argop[8]);
			nfs4args_setattr_free(&argop[9]);
		}
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
		/*
		 * If the reply is NFS4ERR_ACCESS, it may be because
		 * we are root (no root net access).  If the real uid
		 * is not root, then retry with the real uid instead.
		 */
		if (ncr != NULL) {
			crfree(ncr);
			ncr = NULL;
		}
		if (res.status == NFS4ERR_ACCESS &&
		    (ncr = crnetadjust(cred_otw)) != NULL) {
			cred_otw = ncr;
			goto recov_retry;
		}
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	resop = &res.array[idx_open];  /* open res */
	op_res = &resop->nfs_resop4_u.opopen;

#ifdef DEBUG
	/*
	 * verify attrset bitmap
	 */
	if (create_flag &&
	    (createmode == UNCHECKED4 || createmode == GUARDED4)) {
		/* make sure attrset returned is what we asked for */
		/* XXX Ignore this 'error' for now */
		if (attr->attrmask != op_res->attrset)
			/* EMPTY */;
	}
#endif

	if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) {
		mutex_enter(&VTOMI4(dvp)->mi_lock);
		VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK;
		mutex_exit(&VTOMI4(dvp)->mi_lock);
	}

	resop = &res.array[idx_open + 1];  /* getfh res */
	gf_res = &resop->nfs_resop4_u.opgetfh;

	otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));

	/*
	 * The open stateid has been updated on the server but not
	 * on the client yet.  There is a path: makenfs4node->nfs4_attr_cache->
	 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW
	 * WRITE call.  That, however, will use the old stateid, so go ahead
	 * and update the open stateid now, before any call to makenfs4node.
	 */
	if (vpi) {
		nfs4_open_stream_t	*tmp_osp;
		rnode4_t		*tmp_rp = VTOR4(vpi);

		tmp_osp = find_open_stream(oop, tmp_rp);
		if (tmp_osp) {
			tmp_osp->open_stateid = op_res->stateid;
			mutex_exit(&tmp_osp->os_sync_lock);
			open_stream_rele(tmp_osp, tmp_rp);
		}

		/*
		 * We must determine if the file handle given by the otw open
		 * is the same as the file handle which was passed in with
		 * *vpp.  This case can be reached if the file we are trying
		 * to open has been removed and another file has been created
		 * having the same file name.  The passed in vnode is released
		 * later.
		 */
		orig_sfh = VTOR4(vpi)->r_fh;
		fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh);
	}

	garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res;

	if (create_flag || fh_differs) {
		int rnode_err = 0;

		vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr,
		    dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh));

		if (e.error)
			PURGE_ATTRCACHE4(vp);
		/*
		 * For the newly created vp case, make sure the rnode
		 * isn't bad before using it.
		 */
		mutex_enter(&(VTOR4(vp))->r_statelock);
		if (VTOR4(vp)->r_flags & R4RECOVERR)
			rnode_err = EIO;
		mutex_exit(&(VTOR4(vp))->r_statelock);

		if (rnode_err) {
			nfs4_end_open_seqid_sync(oop);
			nfs4args_copen_free(open_args);
			if (setgid_flag) {
				nfs4args_verify_free(&argop[8]);
				nfs4args_setattr_free(&argop[9]);
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
			    needrecov);
			open_owner_rele(oop);
			VN_RELE(vp);
			if (ncr != NULL)
				crfree(ncr);
			sfh4_rele(&otw_sfh);
			kmem_free(argop, argoplist_size);
			return (EIO);
		}
	} else {
		vp = vpi;
	}
	sfh4_rele(&otw_sfh);

	/*
	 * It seems odd to get a full set of attrs and then not update
	 * the object's attrcache in the non-create case.  Create case uses
	 * the attrs since makenfs4node checks to see if the attrs need to
	 * be updated (and then updates them).  The non-create case should
	 * update attrs also.
	 */
	if (! create_flag && ! fh_differs && !e.error) {
		nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
	}

	nfs4_error_zinit(&e);
	if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
		/* This does not do recovery for vp explicitly. */
		nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE,
		    &retry_open, oop, FALSE, &e, &num_bseqid_retry);

		if (e.error || e.stat) {
			nfs4_end_open_seqid_sync(oop);
			nfs4args_copen_free(open_args);
			if (setgid_flag) {
				nfs4args_verify_free(&argop[8]);
				nfs4args_setattr_free(&argop[9]);
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
			    needrecov);
			open_owner_rele(oop);
			if (create_flag || fh_differs) {
				/* rele the makenfs4node */
				VN_RELE(vp);
			}
			if (ncr != NULL) {
				crfree(ncr);
				ncr = NULL;
			}
			if (retry_open == TRUE) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "nfs4open_otw: retry the open since OPEN "
				    "CONFIRM failed with error %d stat %d",
				    e.error, e.stat));
				if (create_flag && createmode == GUARDED4) {
					NFS4_DEBUG(nfs4_client_recov_debug,
					    (CE_NOTE, "nfs4open_otw: switch "
					    "createmode from GUARDED4 to "
					    "UNCHECKED4"));
					createmode = UNCHECKED4;
				}
				goto recov_retry;
			}
			if (!e.error) {
				if (create_flag && (createmode != EXCLUSIVE4) &&
				    e.stat == NFS4ERR_BADOWNER)
					nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);

				e.error = geterrno4(e.stat);
			}
			kmem_free(argop, argoplist_size);
			return (e.error);
		}
	}

	rp = VTOR4(vp);

	mutex_enter(&rp->r_statev4_lock);
	if (create_flag)
		rp->created_v4 = 1;
	mutex_exit(&rp->r_statev4_lock);

	mutex_enter(&oop->oo_lock);
	/* It doesn't matter if 'oo_just_created' was already set */
	oop->oo_just_created = NFS4_PERM_CREATED;
	if (oop->oo_cred_otw)
		crfree(oop->oo_cred_otw);
	oop->oo_cred_otw = cred_otw;
	crhold(oop->oo_cred_otw);
	mutex_exit(&oop->oo_lock);

	/* returns with 'os_sync_lock' held */
	osp = find_or_create_open_stream(oop, rp, &created_osp);
	if (!osp) {
		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
		    "nfs4open_otw: failed to create an open stream"));
		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: "
		    "signal our end of use of the open seqid"));

		nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
		nfs4args_copen_free(open_args);
		if (setgid_flag) {
			nfs4args_verify_free(&argop[8]);
			nfs4args_setattr_free(&argop[9]);
		}
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
		if (create_flag || fh_differs)
			VN_RELE(vp);
		if (ncr != NULL)
			crfree(ncr);

		kmem_free(argop, argoplist_size);
		return (EINVAL);

	}

	osp->open_stateid = op_res->stateid;

	if (open_flag & FREAD)
		osp->os_share_acc_read++;
	if (open_flag & FWRITE)
		osp->os_share_acc_write++;
	osp->os_share_deny_none++;

	/*
	 * Need to reset this bitfield for the possible case where we were
	 * going to OTW CLOSE the file, got a non-recoverable error, and before
	 * we could retry the CLOSE, OPENed the file again.
	 */
	ASSERT(osp->os_open_owner->oo_seqid_inuse);
	osp->os_final_close = 0;
	osp->os_force_close = 0;
#ifdef DEBUG
	if (osp->os_failed_reopen)
		NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:"
		    " clearing os_failed_reopen for osp %p, cr %p, rp %s",
		    (void *)osp, (void *)cr, rnode4info(rp)));
#endif
	osp->os_failed_reopen = 0;

	mutex_exit(&osp->os_sync_lock);

	nfs4_end_open_seqid_sync(oop);

	if (created_osp && recov_state.rs_sp != NULL) {
		mutex_enter(&recov_state.rs_sp->s_lock);
		nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp));
		mutex_exit(&recov_state.rs_sp->s_lock);
	}

	/* get rid of our reference to find oop */
	open_owner_rele(oop);

	open_stream_rele(osp, rp);

	/* accept delegation, if any */
	nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw);

	nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);

	if (createmode == EXCLUSIVE4 &&
	    (in_va->va_mask & ~(AT_GID | AT_SIZE))) {
		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:"
		    " EXCLUSIVE4: sending a SETATTR"));
		/*
		 * If doing an exclusive create, then generate
		 * a SETATTR to set the initial attributes.
		 * Try to set the mtime and the atime to the
		 * server's current time.  It is somewhat
		 * expected that these fields will be used to
		 * store the exclusive create cookie.  If not,
		 * server implementors will need to know that
		 * a SETATTR will follow an exclusive create
		 * and the cookie should be destroyed if
		 * appropriate.
		 *
		 * The AT_GID and AT_SIZE bits are turned off
		 * so that the SETATTR request will not attempt
		 * to process these.  The gid will be set
		 * separately if appropriate.  The size is turned
		 * off because it is assumed that a new file will
		 * be created empty and if the file wasn't empty,
		 * then the exclusive create will have failed
		 * because the file must have existed already.
		 * Therefore, no truncate operation is needed.
		 */
		in_va->va_mask &= ~(AT_GID | AT_SIZE);
		in_va->va_mask |= (AT_MTIME | AT_ATIME);

		e.error = nfs4setattr(vp, in_va, 0, cr, NULL);
		if (e.error) {
			/*
			 * Couldn't correct the attributes of
			 * the newly created file and the
			 * attributes are wrong.  Remove the
			 * file and return an error to the
			 * application.
			 */
			/* XXX will this take care of client state ? */
			NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
			    "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:"
			    " remove file", e.error));
			VN_RELE(vp);
			(void) nfs4_remove(dvp, file_name, cr, NULL, 0);
			/*
			 * Since we've reled the vnode and removed
			 * the file we now need to return the error.
			 * At this point we don't want to update the
			 * dircaches, call nfs4_waitfor_purge_complete
			 * or set vpp to vp so we need to skip these
			 * as well.
			 */
			goto skip_update_dircaches;
		}
	}

	/*
	 * If we created or found the correct vnode, due to create_flag or
	 * fh_differs being set, then update directory cache attribute, readdir
	 * and dnlc caches.
	 */
	if (create_flag || fh_differs) {
		dirattr_info_t dinfo, *dinfop;

		/*
		 * Make sure getattr succeeded before using results.
		 * note: op 7 is getattr(dir) for both flavors of
		 * open(create).
		 */
		if (create_flag && res.status == NFS4_OK) {
			dinfo.di_time_call = t;
			dinfo.di_cred = cr;
			dinfo.di_garp =
			    &res.array[6].nfs_resop4_u.opgetattr.ga_res;
			dinfop = &dinfo;
		} else {
			dinfop = NULL;
		}

		nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name,
		    dinfop);
	}

	/*
	 * If the page cache for this file was flushed from actions
	 * above, it was done asynchronously and if that is true,
	 * there is a need to wait here for it to complete.  This must
	 * be done outside of start_fop/end_fop.
	 */
	(void) nfs4_waitfor_purge_complete(vp);

	/*
	 * It is implicit that we are in the open case (create_flag == 0) since
	 * fh_differs can only be set to a non-zero value in the open case.
	 */
	if (fh_differs != 0 && vpi != NULL)
		VN_RELE(vpi);

	/*
	 * Be sure to set *vpp to the correct value before returning.
	 */
	*vpp = vp;

skip_update_dircaches:

	nfs4args_copen_free(open_args);
	if (setgid_flag) {
		nfs4args_verify_free(&argop[8]);
		nfs4args_setattr_free(&argop[9]);
	}
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	if (ncr)
		crfree(ncr);
	kmem_free(argop, argoplist_size);
	return (e.error);
}

/*
 * Reopen an open instance.  cf. nfs4open_otw().
 *
 * Errors are returned by the nfs4_error_t parameter.
 * - ep->error contains an errno value or zero.
 * - if it is zero, ep->stat is set to an NFS status code, if any.
 *   If the file could not be reopened, but the caller should continue, the
 *   file is marked dead and no error values are returned.  If the caller
 *   should stop recovering open files and start over, either the ep->error
 *   value or ep->stat will indicate an error (either something that requires
 *   recovery or EAGAIN).  Note that some recovery (e.g., expired volatile
 *   filehandles) may be handled silently by this routine.
 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state
 *   will be started, so the caller should not do it.
 *
 * Gotos:
 * - kill_file : reopen failed in such a fashion to constitute marking the
 *   file dead and setting the open stream's 'os_failed_reopen' as 1.  This
 *   is for cases where recovery is not possible.
 * - failed_reopen : same as above, except that the file has already been
 *   marked dead, so no need to do it again.
 * - bailout : reopen failed but we are able to recover and retry the reopen -
 *   either within this function immediately or via the calling function.
 */
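/*
 * Note added for orientation: the reopen below is driven with one of three
 * claim types: CLAIM_NULL (re-open by component name), CLAIM_PREVIOUS
 * (reclaim during server grace, or forced via frc_use_claim_previous for
 * delegated opens), or CLAIM_DELEGATE_CUR (open by name while holding a
 * current delegation).
 */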
void
nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep,
    open_claim_type4 claim, bool_t frc_use_claim_previous,
    bool_t is_recov)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 argop[4];
	nfs_resop4 *resop;
	OPEN4res *op_res = NULL;
	OPEN4cargs *open_args;
	GETFH4res *gf_res;
	rnode4_t *rp = VTOR4(vp);
	int doqueue = 1;
	cred_t *cr = NULL, *cred_otw = NULL;
	nfs4_open_owner_t *oop = NULL;
	seqid4 seqid;
	nfs4_ga_res_t *garp;
	char fn[MAXNAMELEN];
	nfs4_recov_state_t recov = {NULL, 0};
	nfs4_lost_rqst_t lost_rqst;
	mntinfo4_t *mi = VTOMI4(vp);
	bool_t abort;
	char *failed_msg = "";
	int fh_different;
	hrtime_t t;
	nfs4_bseqid_entry_t *bsep = NULL;

	ASSERT(nfs4_consistent_type(vp));
	ASSERT(nfs_zone() == mi->mi_zone);

	nfs4_error_zinit(ep);

	/* this is the cred used to find the open owner */
	cr = state_to_cred(osp);
	if (cr == NULL) {
		failed_msg = "Couldn't reopen: no cred";
		goto kill_file;
	}
	/* use this cred for OTW operations */
	cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner);

top:
	nfs4_error_zinit(ep);

	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
		/* File system has been unmounted, quit */
		ep->error = EIO;
		failed_msg = "Couldn't reopen: file system has been unmounted";
		goto kill_file;
	}

	oop = osp->os_open_owner;

	ASSERT(oop != NULL);
	if (oop == NULL) {	/* be defensive in non-DEBUG */
		failed_msg = "can't reopen: no open owner";
		goto kill_file;
	}
	open_owner_hold(oop);

	ep->error = nfs4_start_open_seqid_sync(oop, mi);
	if (ep->error) {
		open_owner_rele(oop);
		oop = NULL;
		goto bailout;
	}

	/*
	 * If the rnode has a delegation and the delegation has been
	 * recovered and the server didn't request a recall and the caller
	 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during
	 * recovery) and the rnode hasn't been marked dead, then install
	 * the delegation stateid in the open stream.  Otherwise, proceed
	 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN.
	 */
	mutex_enter(&rp->r_statev4_lock);
	if (rp->r_deleg_type != OPEN_DELEGATE_NONE &&
	    !rp->r_deleg_return_pending &&
	    (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) &&
	    !rp->r_deleg_needs_recall &&
	    claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous &&
	    !(rp->r_flags & R4RECOVERR)) {
		mutex_enter(&osp->os_sync_lock);
		osp->os_delegation = 1;
		osp->open_stateid = rp->r_deleg_stateid;
		mutex_exit(&osp->os_sync_lock);
		mutex_exit(&rp->r_statev4_lock);
		goto bailout;
	}
	mutex_exit(&rp->r_statev4_lock);

	/*
	 * If the file failed recovery, just quit.  This failure need not
	 * affect other reopens, so don't return an error.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERR) {
		mutex_exit(&rp->r_statelock);
		ep->error = 0;
		goto failed_reopen;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * argop is empty here
	 *
	 * PUTFH, OPEN, GETFH, GETATTR
	 */
	args.ctag = TAG_REOPEN;
	args.array_len = 4;
	args.array = argop;

	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
	    "nfs4_reopen: file is type %d, id %s",
	    vp->v_type, rnode4info(VTOR4(vp))));

	argop[0].argop = OP_CPUTFH;

	if (claim != CLAIM_PREVIOUS) {
		/*
		 * if this is a file mount then
		 * use the mntinfo parentfh
		 */
		argop[0].nfs_argop4_u.opcputfh.sfh =
		    (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
		    VTOSV(vp)->sv_dfh;
	} else {
		/* putfh fh to reopen */
		argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
	}

	argop[1].argop = OP_COPEN;
	open_args = &argop[1].nfs_argop4_u.opcopen;
	open_args->claim = claim;

	if (claim == CLAIM_NULL) {

		if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
			nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
			    "failed for vp 0x%p for CLAIM_NULL with %m",
			    (void *)vp);
			failed_msg = "Couldn't reopen: vtoname failed for "
			    "CLAIM_NULL";
			/* nothing allocated yet */
			goto kill_file;
		}

		open_args->open_claim4_u.cfile = fn;
	} else if (claim == CLAIM_PREVIOUS) {

		/*
		 * We have two cases to deal with here:
		 * 1) We're being called to reopen files in order to satisfy
		 *    a lock operation request which requires us to explicitly
		 *    reopen files which were opened under a delegation.  If
		 *    we're in recovery, we *must* use CLAIM_PREVIOUS.  In
		 *    that case, frc_use_claim_previous is TRUE and we must
		 *    use the rnode's current delegation type (r_deleg_type).
		 * 2) We're reopening files during some form of recovery.
		 *    In this case, frc_use_claim_previous is FALSE and we
		 *    use the delegation type appropriate for recovery
		 *    (r_deleg_needs_recovery).
		 */
		mutex_enter(&rp->r_statev4_lock);
		open_args->open_claim4_u.delegate_type =
		    frc_use_claim_previous ?
1825 rp->r_deleg_type : 1826 rp->r_deleg_needs_recovery; 1827 mutex_exit(&rp->r_statev4_lock); 1828 1829 } else if (claim == CLAIM_DELEGATE_CUR) { 1830 1831 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1832 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1833 "failed for vp 0x%p for CLAIM_DELEGATE_CUR " 1834 "with %m", (void *)vp); 1835 failed_msg = "Couldn't reopen: vtoname failed for " 1836 "CLAIM_DELEGATE_CUR"; 1837 /* nothing allocated yet */ 1838 goto kill_file; 1839 } 1840 1841 mutex_enter(&rp->r_statev4_lock); 1842 open_args->open_claim4_u.delegate_cur_info.delegate_stateid = 1843 rp->r_deleg_stateid; 1844 mutex_exit(&rp->r_statev4_lock); 1845 1846 open_args->open_claim4_u.delegate_cur_info.cfile = fn; 1847 } 1848 open_args->opentype = OPEN4_NOCREATE; 1849 open_args->owner.clientid = mi2clientid(mi); 1850 open_args->owner.owner_len = sizeof (oop->oo_name); 1851 open_args->owner.owner_val = 1852 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1853 bcopy(&oop->oo_name, open_args->owner.owner_val, 1854 open_args->owner.owner_len); 1855 open_args->share_access = 0; 1856 open_args->share_deny = 0; 1857 1858 mutex_enter(&osp->os_sync_lock); 1859 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp " 1860 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: " 1861 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ", 1862 (void *)osp, (void *)rp, osp->os_share_acc_read, 1863 osp->os_share_acc_write, osp->os_open_ref_count, 1864 osp->os_mmap_read, osp->os_mmap_write, claim)); 1865 1866 if (osp->os_share_acc_read || osp->os_mmap_read) 1867 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1868 if (osp->os_share_acc_write || osp->os_mmap_write) 1869 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1870 if (osp->os_share_deny_read) 1871 open_args->share_deny |= OPEN4_SHARE_DENY_READ; 1872 if (osp->os_share_deny_write) 1873 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE; 1874 mutex_exit(&osp->os_sync_lock); 1875 1876 seqid = nfs4_get_open_seqid(oop) + 1; 1877 open_args->seqid = seqid; 1878 1879 /* Construct the getfh part of the compound */ 1880 argop[2].argop = OP_GETFH; 1881 1882 /* Construct the getattr part of the compound */ 1883 argop[3].argop = OP_GETATTR; 1884 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1885 argop[3].nfs_argop4_u.opgetattr.mi = mi; 1886 1887 t = gethrtime(); 1888 1889 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 1890 1891 if (ep->error) { 1892 if (!is_recov && !frc_use_claim_previous && 1893 (ep->error == EINTR || ep->error == ETIMEDOUT || 1894 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) { 1895 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop, 1896 cred_otw, vp, NULL, open_args); 1897 abort = nfs4_start_recovery(ep, 1898 VTOMI4(vp), vp, NULL, NULL, 1899 lost_rqst.lr_op == OP_OPEN ? 
1900 &lost_rqst : NULL, OP_OPEN, NULL); 1901 nfs4args_copen_free(open_args); 1902 goto bailout; 1903 } 1904 1905 nfs4args_copen_free(open_args); 1906 1907 if (ep->error == EACCES && cred_otw != cr) { 1908 crfree(cred_otw); 1909 cred_otw = cr; 1910 crhold(cred_otw); 1911 nfs4_end_open_seqid_sync(oop); 1912 open_owner_rele(oop); 1913 oop = NULL; 1914 goto top; 1915 } 1916 if (ep->error == ETIMEDOUT) 1917 goto bailout; 1918 failed_msg = "Couldn't reopen: rpc error"; 1919 goto kill_file; 1920 } 1921 1922 if (nfs4_need_to_bump_seqid(&res)) 1923 nfs4_set_open_seqid(seqid, oop, args.ctag); 1924 1925 switch (res.status) { 1926 case NFS4_OK: 1927 if (recov.rs_flags & NFS4_RS_DELAY_MSG) { 1928 mutex_enter(&rp->r_statelock); 1929 rp->r_delay_interval = 0; 1930 mutex_exit(&rp->r_statelock); 1931 } 1932 break; 1933 case NFS4ERR_BAD_SEQID: 1934 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0, 1935 args.ctag, open_args->seqid); 1936 1937 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, 1938 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst : 1939 NULL, OP_OPEN, bsep); 1940 1941 nfs4args_copen_free(open_args); 1942 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1943 nfs4_end_open_seqid_sync(oop); 1944 open_owner_rele(oop); 1945 oop = NULL; 1946 kmem_free(bsep, sizeof (*bsep)); 1947 1948 goto kill_file; 1949 case NFS4ERR_NO_GRACE: 1950 nfs4args_copen_free(open_args); 1951 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1952 nfs4_end_open_seqid_sync(oop); 1953 open_owner_rele(oop); 1954 oop = NULL; 1955 if (claim == CLAIM_PREVIOUS) { 1956 /* 1957 * Retry as a plain open. We don't need to worry about 1958 * checking the changeinfo: it is acceptable for a 1959 * client to re-open a file and continue processing 1960 * (in the absence of locks). 1961 */ 1962 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1963 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; " 1964 "will retry as CLAIM_NULL")); 1965 claim = CLAIM_NULL; 1966 nfs4_mi_kstat_inc_no_grace(mi); 1967 goto top; 1968 } 1969 failed_msg = 1970 "Couldn't reopen: tried reclaim outside grace period. "; 1971 goto kill_file; 1972 case NFS4ERR_GRACE: 1973 nfs4_set_grace_wait(mi); 1974 nfs4args_copen_free(open_args); 1975 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1976 nfs4_end_open_seqid_sync(oop); 1977 open_owner_rele(oop); 1978 oop = NULL; 1979 ep->error = nfs4_wait_for_grace(mi, &recov); 1980 if (ep->error != 0) 1981 goto bailout; 1982 goto top; 1983 case NFS4ERR_DELAY: 1984 nfs4_set_delay_wait(vp); 1985 nfs4args_copen_free(open_args); 1986 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1987 nfs4_end_open_seqid_sync(oop); 1988 open_owner_rele(oop); 1989 oop = NULL; 1990 ep->error = nfs4_wait_for_delay(vp, &recov); 1991 nfs4_mi_kstat_inc_delay(mi); 1992 if (ep->error != 0) 1993 goto bailout; 1994 goto top; 1995 case NFS4ERR_FHEXPIRED: 1996 /* recover filehandle and retry */ 1997 abort = nfs4_start_recovery(ep, 1998 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL); 1999 nfs4args_copen_free(open_args); 2000 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2001 nfs4_end_open_seqid_sync(oop); 2002 open_owner_rele(oop); 2003 oop = NULL; 2004 if (abort == FALSE) 2005 goto top; 2006 failed_msg = "Couldn't reopen: recovery aborted"; 2007 goto kill_file; 2008 case NFS4ERR_RESOURCE: 2009 case NFS4ERR_STALE_CLIENTID: 2010 case NFS4ERR_WRONGSEC: 2011 case NFS4ERR_EXPIRED: 2012 /* 2013 * Do not mark the file dead and let the calling 2014 * function initiate recovery. 
2015 */ 2016 nfs4args_copen_free(open_args); 2017 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2018 nfs4_end_open_seqid_sync(oop); 2019 open_owner_rele(oop); 2020 oop = NULL; 2021 goto bailout; 2022 case NFS4ERR_ACCESS: 2023 if (cred_otw != cr) { 2024 crfree(cred_otw); 2025 cred_otw = cr; 2026 crhold(cred_otw); 2027 nfs4args_copen_free(open_args); 2028 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2029 nfs4_end_open_seqid_sync(oop); 2030 open_owner_rele(oop); 2031 oop = NULL; 2032 goto top; 2033 } 2034 /* fall through */ 2035 default: 2036 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 2037 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s", 2038 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv, 2039 rnode4info(VTOR4(vp)))); 2040 failed_msg = "Couldn't reopen: NFSv4 error"; 2041 nfs4args_copen_free(open_args); 2042 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2043 goto kill_file; 2044 } 2045 2046 resop = &res.array[1]; /* open res */ 2047 op_res = &resop->nfs_resop4_u.opopen; 2048 2049 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res; 2050 2051 /* 2052 * Check if the path we reopened really is the same 2053 * file. We could end up in a situation where the file 2054 * was removed and a new file created with the same name. 2055 */ 2056 resop = &res.array[2]; 2057 gf_res = &resop->nfs_resop4_u.opgetfh; 2058 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 2059 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0); 2060 if (fh_different) { 2061 if (mi->mi_fh_expire_type == FH4_PERSISTENT || 2062 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) { 2063 /* Oops, we don't have the same file */ 2064 if (mi->mi_fh_expire_type == FH4_PERSISTENT) 2065 failed_msg = "Couldn't reopen: Persistent " 2066 "file handle changed"; 2067 else 2068 failed_msg = "Couldn't reopen: Volatile " 2069 "(no expire on open) file handle changed"; 2070 2071 nfs4args_copen_free(open_args); 2072 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2073 nfs_rw_exit(&mi->mi_fh_lock); 2074 goto kill_file; 2075 2076 } else { 2077 /* 2078 * We have volatile file handles that don't compare. 2079 * If the fids are the same then we assume that the 2080 * file handle expired but the rnode still refers to 2081 * the same file object. 2082 * 2083 * First check whether we have fids at all. 2084 * If we don't, we have a dumb server, so we will 2085 * just assume everything is OK for now. 2086 */ 2087 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID && 2088 rp->r_attr.va_mask & AT_NODEID && 2089 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) { 2090 /* 2091 * We have fids, but they don't 2092 * compare. So kill the file. 2093 */ 2094 failed_msg = 2095 "Couldn't reopen: file handle changed" 2096 " due to mismatched fids"; 2097 nfs4args_copen_free(open_args); 2098 (void) xdr_free(xdr_COMPOUND4res_clnt, 2099 (caddr_t)&res); 2100 nfs_rw_exit(&mi->mi_fh_lock); 2101 goto kill_file; 2102 } else { 2103 /* 2104 * We have volatile file handles that refer 2105 * to the same file (at least they have the 2106 * same fid) or we don't have fids so we 2107 * can't tell. :(. We'll be a kind and accepting 2108 * client so we'll update the rnode's file 2109 * handle with the otw handle. 2110 * 2111 * We need to drop mi->mi_fh_lock since 2112 * sfh4_update acquires it. Since there is 2113 * only one recovery thread there is no 2114 * race.
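 *
 * To summarize the filehandle checks above:
 *	persistent or no-expire-on-open fh changed	-> kill the file
 *	volatile fh changed and the fids differ		-> kill the file
 *	volatile fh changed, fids match or are absent	-> adopt the new fh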
2115 */ 2116 nfs_rw_exit(&mi->mi_fh_lock); 2117 sfh4_update(rp->r_fh, &gf_res->object); 2118 } 2119 } 2120 } else { 2121 nfs_rw_exit(&mi->mi_fh_lock); 2122 } 2123 2124 ASSERT(nfs4_consistent_type(vp)); 2125 2126 /* 2127 * If the server wanted an OPEN_CONFIRM but that fails, just start 2128 * over. Presumably if there is a persistent error it will show up 2129 * when we resend the OPEN. 2130 */ 2131 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 2132 bool_t retry_open = FALSE; 2133 2134 nfs4open_confirm(vp, &seqid, &op_res->stateid, 2135 cred_otw, is_recov, &retry_open, 2136 oop, FALSE, ep, NULL); 2137 if (ep->error || ep->stat) { 2138 nfs4args_copen_free(open_args); 2139 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2140 nfs4_end_open_seqid_sync(oop); 2141 open_owner_rele(oop); 2142 oop = NULL; 2143 goto top; 2144 } 2145 } 2146 2147 mutex_enter(&osp->os_sync_lock); 2148 osp->open_stateid = op_res->stateid; 2149 osp->os_delegation = 0; 2150 /* 2151 * Need to reset this bitfield for the possible case where we were 2152 * going to OTW CLOSE the file, got a non-recoverable error, and before 2153 * we could retry the CLOSE, OPENed the file again. 2154 */ 2155 ASSERT(osp->os_open_owner->oo_seqid_inuse); 2156 osp->os_final_close = 0; 2157 osp->os_force_close = 0; 2158 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS) 2159 osp->os_dc_openacc = open_args->share_access; 2160 mutex_exit(&osp->os_sync_lock); 2161 2162 nfs4_end_open_seqid_sync(oop); 2163 2164 /* accept delegation, if any */ 2165 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw); 2166 2167 nfs4args_copen_free(open_args); 2168 2169 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 2170 2171 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2172 2173 ASSERT(nfs4_consistent_type(vp)); 2174 2175 open_owner_rele(oop); 2176 crfree(cr); 2177 crfree(cred_otw); 2178 return; 2179 2180 kill_file: 2181 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat); 2182 failed_reopen: 2183 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 2184 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s", 2185 (void *)osp, (void *)cr, rnode4info(rp))); 2186 mutex_enter(&osp->os_sync_lock); 2187 osp->os_failed_reopen = 1; 2188 mutex_exit(&osp->os_sync_lock); 2189 bailout: 2190 if (oop != NULL) { 2191 nfs4_end_open_seqid_sync(oop); 2192 open_owner_rele(oop); 2193 } 2194 if (cr != NULL) 2195 crfree(cr); 2196 if (cred_otw != NULL) 2197 crfree(cred_otw); 2198 } 2199 2200 /* for . and .. OPENs */ 2201 /* ARGSUSED */ 2202 static int 2203 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr) 2204 { 2205 rnode4_t *rp; 2206 nfs4_ga_res_t gar; 2207 2208 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone); 2209 2210 /* 2211 * If close-to-open consistency checking is turned off or 2212 * if there is no cached data, we can avoid 2213 * the over the wire getattr. Otherwise, force a 2214 * call to the server to get fresh attributes and to 2215 * check caches. This is required for close-to-open 2216 * consistency. 
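 *
 * For example, if another client modified and closed the file,
 * the over the wire GETATTR done here picks up the new mtime and
 * change attribute, which in turn causes the stale cached data
 * to be invalidated before this open returns.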
2217 */ 2218 rp = VTOR4(*vpp); 2219 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO || 2220 (rp->r_dir == NULL && !nfs4_has_pages(*vpp))) 2221 return (0); 2222 2223 gar.n4g_va.va_mask = AT_ALL; 2224 return (nfs4_getattr_otw(*vpp, &gar, cr, 0)); 2225 } 2226 2227 /* 2228 * CLOSE a file 2229 */ 2230 /* ARGSUSED */ 2231 static int 2232 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 2233 caller_context_t *ct) 2234 { 2235 rnode4_t *rp; 2236 int error = 0; 2237 int r_error = 0; 2238 int n4error = 0; 2239 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 2240 2241 /* 2242 * Remove client state for this (lockowner, file) pair. 2243 * Issue otw v4 call to have the server do the same. 2244 */ 2245 2246 rp = VTOR4(vp); 2247 2248 /* 2249 * zone_enter(2) prevents processes from changing zones with NFS files 2250 * open; if we happen to get here from the wrong zone we can't do 2251 * anything over the wire. 2252 */ 2253 if (VTOMI4(vp)->mi_zone != nfs_zone()) { 2254 /* 2255 * We could attempt to clean up locks, except we're sure 2256 * that the current process didn't acquire any locks on 2257 * the file: any attempt to lock a file belonging to another 2258 * zone will fail, and one can't lock an NFS file and then 2259 * change zones, as that fails too. 2260 * 2261 * Returning an error here is the sane thing to do. A 2262 * subsequent call to VN_RELE() which translates to a 2263 * nfs4_inactive() will clean up state: if the zone of the 2264 * vnode's origin is still alive and kicking, the inactive 2265 * thread will handle the request (from the correct zone), and 2266 * everything (minus the OTW close call) should be OK. If the 2267 * zone is going away nfs4_async_inactive() will throw away 2268 * delegations, open streams and cached pages inline. 2269 */ 2270 return (EIO); 2271 } 2272 2273 /* 2274 * If we are using local locking for this filesystem, then 2275 * release all of the SYSV style record locks. Otherwise, 2276 * we are doing network locking and we need to release all 2277 * of the network locks. All of the locks held by this 2278 * process on this file are released no matter what the 2279 * incoming reference count is. 2280 */ 2281 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) { 2282 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 2283 cleanshares(vp, ttoproc(curthread)->p_pid); 2284 } else 2285 e.error = nfs4_lockrelease(vp, flag, offset, cr); 2286 2287 if (e.error) { 2288 struct lm_sysid *lmsid; 2289 lmsid = nfs4_find_sysid(VTOMI4(vp)); 2290 if (lmsid == NULL) { 2291 DTRACE_PROBE2(unknown__sysid, int, e.error, 2292 vnode_t *, vp); 2293 } else { 2294 cleanlocks(vp, ttoproc(curthread)->p_pid, 2295 (lm_sysidt(lmsid) | LM_SYSID_CLIENT)); 2296 } 2297 return (e.error); 2298 } 2299 2300 if (count > 1) 2301 return (0); 2302 2303 /* 2304 * If the file has been `unlinked', then purge the 2305 * DNLC so that this vnode will get recycled quicker 2306 * and the .nfs* file on the server will get removed. 2307 */ 2308 if (rp->r_unldvp != NULL) 2309 dnlc_purge_vp(vp); 2310 2311 /* 2312 * If the file was open for write and there are pages, 2313 * do a synchronous flush and commit of all of the 2314 * dirty and uncommitted pages. 2315 */ 2316 ASSERT(!e.error); 2317 if ((flag & FWRITE) && nfs4_has_pages(vp)) 2318 error = nfs4_putpage_commit(vp, 0, 0, cr); 2319 2320 mutex_enter(&rp->r_statelock); 2321 r_error = rp->r_error; 2322 rp->r_error = 0; 2323 mutex_exit(&rp->r_statelock); 2324 2325 /* 2326 * If this file type is one for which no explicit 'open' was 2327 * done, then bail now (i.e.
no need for protocol 'close'). If 2328 * there was an error w/the vm subsystem, return _that_ error, 2329 * otherwise, return any errors that may've been reported via 2330 * the rnode. 2331 */ 2332 if (vp->v_type != VREG) 2333 return (error ? error : r_error); 2334 2335 /* 2336 * The sync putpage commit may have failed above, but since 2337 * we're working w/a regular file, we need to do the protocol 2338 * 'close' (nfs4close_one will figure out if an otw close is 2339 * needed or not). Report any errors _after_ doing the protocol 2340 * 'close'. 2341 */ 2342 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0); 2343 n4error = e.error ? e.error : geterrno4(e.stat); 2344 2345 /* 2346 * Error reporting prio (Hi -> Lo) 2347 * 2348 * i) nfs4_putpage_commit (error) 2349 * ii) rnode's (r_error) 2350 * iii) nfs4close_one (n4error) 2351 */ 2352 return (error ? error : (r_error ? r_error : n4error)); 2353 } 2354 2355 /* 2356 * Initialize *lost_rqstp. 2357 */ 2358 2359 static void 2360 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 2361 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr, 2362 vnode_t *vp) 2363 { 2364 if (error != ETIMEDOUT && error != EINTR && 2365 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 2366 lost_rqstp->lr_op = 0; 2367 return; 2368 } 2369 2370 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 2371 "nfs4close_save_lost_rqst: error %d", error)); 2372 2373 lost_rqstp->lr_op = OP_CLOSE; 2374 /* 2375 * The vp is held and rele'd via the recovery code. 2376 * See nfs4_save_lost_rqst. 2377 */ 2378 lost_rqstp->lr_vp = vp; 2379 lost_rqstp->lr_dvp = NULL; 2380 lost_rqstp->lr_oop = oop; 2381 lost_rqstp->lr_osp = osp; 2382 ASSERT(osp != NULL); 2383 ASSERT(mutex_owned(&osp->os_sync_lock)); 2384 osp->os_pending_close = 1; 2385 lost_rqstp->lr_lop = NULL; 2386 lost_rqstp->lr_cr = cr; 2387 lost_rqstp->lr_flk = NULL; 2388 lost_rqstp->lr_putfirst = FALSE; 2389 } 2390 2391 /* 2392 * Assumes you already have the open seqid sync grabbed as well as the 2393 * 'os_sync_lock'. Note: this will release the open seqid sync and 2394 * 'os_sync_lock' if client recovery starts. Calling functions have to 2395 * be prepared to handle this. 2396 * 2397 * 'recov' is returned as 1 if the CLOSE operation detected that client 2398 * recovery was needed and started it, in which case the calling function 2399 * should retry this function; otherwise it is returned as 0. 2400 * 2401 * Errors are returned via the nfs4_error_t parameter.
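 *
 * A minimal caller sketch (illustrative only; error handling and the
 * surrounding setup are omitted, and the local variable names here
 * are hypothetical):
 *
 *	int recov, did_sync, have_lock;
 *	nfs4_error_t e;
 *
 *	(void) nfs4_start_open_seqid_sync(oop, mi);
 *	did_sync = 1;
 *	mutex_enter(&osp->os_sync_lock);
 *	have_lock = 1;
 *	do {
 *		if (!did_sync) {
 *			(void) nfs4_start_open_seqid_sync(oop, mi);
 *			did_sync = 1;
 *		}
 *		if (!have_lock) {
 *			mutex_enter(&osp->os_sync_lock);
 *			have_lock = 1;
 *		}
 *		nfs4close_otw(rp, cred_otw, oop, osp, &recov,
 *		    &did_sync, CLOSE_NORM, &e, &have_lock);
 *	} while (recov);
 *	if (have_lock)
 *		mutex_exit(&osp->os_sync_lock);
 *	if (did_sync)
 *		nfs4_end_open_seqid_sync(oop);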
2402 */ 2403 static void 2404 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop, 2405 nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp, 2406 nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp) 2407 { 2408 COMPOUND4args_clnt args; 2409 COMPOUND4res_clnt res; 2410 CLOSE4args *close_args; 2411 nfs_resop4 *resop; 2412 nfs_argop4 argop[3]; 2413 int doqueue = 1; 2414 mntinfo4_t *mi; 2415 seqid4 seqid; 2416 vnode_t *vp; 2417 bool_t needrecov = FALSE; 2418 nfs4_lost_rqst_t lost_rqst; 2419 hrtime_t t; 2420 2421 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone); 2422 2423 ASSERT(MUTEX_HELD(&osp->os_sync_lock)); 2424 2425 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw")); 2426 2427 /* Only set this to 1 if recovery is started */ 2428 *recov = 0; 2429 2430 /* do the OTW call to close the file */ 2431 2432 if (close_type == CLOSE_RESEND) 2433 args.ctag = TAG_CLOSE_LOST; 2434 else if (close_type == CLOSE_AFTER_RESEND) 2435 args.ctag = TAG_CLOSE_UNDO; 2436 else 2437 args.ctag = TAG_CLOSE; 2438 2439 args.array_len = 3; 2440 args.array = argop; 2441 2442 vp = RTOV4(rp); 2443 2444 mi = VTOMI4(vp); 2445 2446 /* putfh target fh */ 2447 argop[0].argop = OP_CPUTFH; 2448 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 2449 2450 argop[1].argop = OP_GETATTR; 2451 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 2452 argop[1].nfs_argop4_u.opgetattr.mi = mi; 2453 2454 argop[2].argop = OP_CLOSE; 2455 close_args = &argop[2].nfs_argop4_u.opclose; 2456 2457 seqid = nfs4_get_open_seqid(oop) + 1; 2458 2459 close_args->seqid = seqid; 2460 close_args->open_stateid = osp->open_stateid; 2461 2462 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 2463 "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first", 2464 rnode4info(rp))); 2465 2466 t = gethrtime(); 2467 2468 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 2469 2470 if (!ep->error && nfs4_need_to_bump_seqid(&res)) { 2471 nfs4_set_open_seqid(seqid, oop, args.ctag); 2472 } 2473 2474 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp); 2475 if (ep->error && !needrecov) { 2476 /* 2477 * if there was an error and no recovery is to be done 2478 * then set up the file to flush its cache if 2479 * needed for the next caller. 2480 */ 2481 mutex_enter(&rp->r_statelock); 2482 PURGE_ATTRCACHE4_LOCKED(rp); 2483 rp->r_flags &= ~R4WRITEMODIFIED; 2484 mutex_exit(&rp->r_statelock); 2485 return; 2486 } 2487 2488 if (needrecov) { 2489 bool_t abort; 2490 nfs4_bseqid_entry_t *bsep = NULL; 2491 2492 if (close_type != CLOSE_RESEND) 2493 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop, 2494 osp, cred_otw, vp); 2495 2496 if (!ep->error && res.status == NFS4ERR_BAD_SEQID) 2497 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 2498 0, args.ctag, close_args->seqid); 2499 2500 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2501 "nfs4close_otw: initiating recovery. error %d " 2502 "res.status %d", ep->error, res.status)); 2503 2504 /* 2505 * Drop the 'os_sync_lock' here so we don't hit 2506 * a potential recursive mutex_enter via an 2507 * 'open_stream_hold()'. 2508 */ 2509 mutex_exit(&osp->os_sync_lock); 2510 *have_sync_lockp = 0; 2511 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 2512 (close_type != CLOSE_RESEND && 2513 lost_rqst.lr_op == OP_CLOSE) ?
&lost_rqst : NULL, 2514 OP_CLOSE, bsep); 2515 2516 /* drop open seq sync, and let the calling function regrab it */ 2517 nfs4_end_open_seqid_sync(oop); 2518 *did_start_seqid_syncp = 0; 2519 2520 if (bsep) 2521 kmem_free(bsep, sizeof (*bsep)); 2522 /* 2523 * For signals, the caller wants to quit, so don't say to 2524 * retry. For forced unmount, if it's a user thread, it 2525 * wants to quit. If it's a recovery thread, the retry 2526 * will happen higher-up on the call stack. Either way, 2527 * don't say to retry. 2528 */ 2529 if (abort == FALSE && ep->error != EINTR && 2530 !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) && 2531 close_type != CLOSE_RESEND && 2532 close_type != CLOSE_AFTER_RESEND) 2533 *recov = 1; 2534 else 2535 *recov = 0; 2536 2537 if (!ep->error) 2538 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2539 return; 2540 } 2541 2542 if (res.status) { 2543 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2544 return; 2545 } 2546 2547 mutex_enter(&rp->r_statev4_lock); 2548 rp->created_v4 = 0; 2549 mutex_exit(&rp->r_statev4_lock); 2550 2551 resop = &res.array[2]; 2552 osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid; 2553 osp->os_valid = 0; 2554 2555 /* 2556 * This removes the reference obtained at OPEN; i.e., when the 2557 * open stream structure was created. 2558 * 2559 * We don't have to worry about calling 'open_stream_rele' 2560 * since we are currently holding a reference to the open 2561 * stream which means the count cannot go to 0 with this 2562 * decrement. 2563 */ 2564 ASSERT(osp->os_ref_count >= 2); 2565 osp->os_ref_count--; 2566 2567 if (!ep->error) 2568 nfs4_attr_cache(vp, 2569 &res.array[1].nfs_resop4_u.opgetattr.ga_res, 2570 t, cred_otw, TRUE, NULL); 2571 2572 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:" 2573 " returning %d", ep->error)); 2574 2575 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2576 } 2577 2578 /* ARGSUSED */ 2579 static int 2580 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2581 caller_context_t *ct) 2582 { 2583 rnode4_t *rp; 2584 u_offset_t off; 2585 offset_t diff; 2586 uint_t on; 2587 uint_t n; 2588 caddr_t base; 2589 uint_t flags; 2590 int error; 2591 mntinfo4_t *mi; 2592 2593 rp = VTOR4(vp); 2594 2595 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 2596 2597 if (IS_SHADOW(vp, rp)) 2598 vp = RTOV4(rp); 2599 2600 if (vp->v_type != VREG) 2601 return (EISDIR); 2602 2603 mi = VTOMI4(vp); 2604 2605 if (nfs_zone() != mi->mi_zone) 2606 return (EIO); 2607 2608 if (uiop->uio_resid == 0) 2609 return (0); 2610 2611 if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0) 2612 return (EINVAL); 2613 2614 mutex_enter(&rp->r_statelock); 2615 if (rp->r_flags & R4RECOVERRP) 2616 error = (rp->r_error ? rp->r_error : EIO); 2617 else 2618 error = 0; 2619 mutex_exit(&rp->r_statelock); 2620 if (error) 2621 return (error); 2622 2623 /* 2624 * Bypass VM if caching has been disabled (e.g., locking) or if 2625 * using client-side direct I/O and the file is not mmap'd and 2626 * there are no cached pages.
2627 */ 2628 if ((vp->v_flag & VNOCACHE) || 2629 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2630 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) { 2631 size_t resid = 0; 2632 2633 return (nfs4read(vp, NULL, uiop->uio_loffset, 2634 uiop->uio_resid, &resid, cr, FALSE, uiop)); 2635 } 2636 2637 error = 0; 2638 2639 do { 2640 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2641 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2642 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2643 2644 if (error = nfs4_validate_caches(vp, cr)) 2645 break; 2646 2647 mutex_enter(&rp->r_statelock); 2648 while (rp->r_flags & R4INCACHEPURGE) { 2649 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 2650 mutex_exit(&rp->r_statelock); 2651 return (EINTR); 2652 } 2653 } 2654 diff = rp->r_size - uiop->uio_loffset; 2655 mutex_exit(&rp->r_statelock); 2656 if (diff <= 0) 2657 break; 2658 if (diff < n) 2659 n = (uint_t)diff; 2660 2661 if (vpm_enable) { 2662 /* 2663 * Copy data. 2664 */ 2665 error = vpm_data_copy(vp, off + on, n, uiop, 2666 1, NULL, 0, S_READ); 2667 } else { 2668 base = segmap_getmapflt(segkmap, vp, off + on, n, 1, 2669 S_READ); 2670 2671 error = uiomove(base + on, n, UIO_READ, uiop); 2672 } 2673 2674 if (!error) { 2675 /* 2676 * If read a whole block or read to eof, 2677 * won't need this buffer again soon. 2678 */ 2679 mutex_enter(&rp->r_statelock); 2680 if (n + on == MAXBSIZE || 2681 uiop->uio_loffset == rp->r_size) 2682 flags = SM_DONTNEED; 2683 else 2684 flags = 0; 2685 mutex_exit(&rp->r_statelock); 2686 if (vpm_enable) { 2687 error = vpm_sync_pages(vp, off, n, flags); 2688 } else { 2689 error = segmap_release(segkmap, base, flags); 2690 } 2691 } else { 2692 if (vpm_enable) { 2693 (void) vpm_sync_pages(vp, off, n, 0); 2694 } else { 2695 (void) segmap_release(segkmap, base, 0); 2696 } 2697 } 2698 } while (!error && uiop->uio_resid > 0); 2699 2700 return (error); 2701 } 2702 2703 /* ARGSUSED */ 2704 static int 2705 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2706 caller_context_t *ct) 2707 { 2708 rlim64_t limit = uiop->uio_llimit; 2709 rnode4_t *rp; 2710 u_offset_t off; 2711 caddr_t base; 2712 uint_t flags; 2713 int remainder; 2714 size_t n; 2715 int on; 2716 int error; 2717 int resid; 2718 u_offset_t offset; 2719 mntinfo4_t *mi; 2720 uint_t bsize; 2721 2722 rp = VTOR4(vp); 2723 2724 if (IS_SHADOW(vp, rp)) 2725 vp = RTOV4(rp); 2726 2727 if (vp->v_type != VREG) 2728 return (EISDIR); 2729 2730 mi = VTOMI4(vp); 2731 2732 if (nfs_zone() != mi->mi_zone) 2733 return (EIO); 2734 2735 if (uiop->uio_resid == 0) 2736 return (0); 2737 2738 mutex_enter(&rp->r_statelock); 2739 if (rp->r_flags & R4RECOVERRP) 2740 error = (rp->r_error ? rp->r_error : EIO); 2741 else 2742 error = 0; 2743 mutex_exit(&rp->r_statelock); 2744 if (error) 2745 return (error); 2746 2747 if (ioflag & FAPPEND) { 2748 struct vattr va; 2749 2750 /* 2751 * Must serialize if appending. 
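 *
 * Without this, two appending writers could both fetch the same
 * end-of-file offset via nfs4getattr() and then overwrite each
 * other's data. Dropping the shared r_rwlock and retaking it as
 * a writer makes the size lookup and the write atomic with
 * respect to other local writers of this file.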
2752 */ 2753 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) { 2754 nfs_rw_exit(&rp->r_rwlock); 2755 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, 2756 INTR(vp))) 2757 return (EINTR); 2758 } 2759 2760 va.va_mask = AT_SIZE; 2761 error = nfs4getattr(vp, &va, cr); 2762 if (error) 2763 return (error); 2764 uiop->uio_loffset = va.va_size; 2765 } 2766 2767 offset = uiop->uio_loffset + uiop->uio_resid; 2768 2769 if (uiop->uio_loffset < (offset_t)0 || offset < 0) 2770 return (EINVAL); 2771 2772 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 2773 limit = MAXOFFSET_T; 2774 2775 /* 2776 * Check to make sure that the process will not exceed 2777 * its limit on file size. It is okay to write up to 2778 * the limit, but not beyond. Thus, the write which 2779 * reaches the limit will be short and the next write 2780 * will return an error. 2781 */ 2782 remainder = 0; 2783 if (offset > limit) { 2784 remainder = offset - limit; 2785 uiop->uio_resid = limit - uiop->uio_loffset; 2786 if (uiop->uio_resid <= 0) { 2787 proc_t *p = ttoproc(curthread); 2788 2789 uiop->uio_resid += remainder; 2790 mutex_enter(&p->p_lock); 2791 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], 2792 p->p_rctls, p, RCA_UNSAFE_SIGINFO); 2793 mutex_exit(&p->p_lock); 2794 return (EFBIG); 2795 } 2796 } 2797 2798 /* update the change attribute, if we have a write delegation */ 2799 2800 mutex_enter(&rp->r_statev4_lock); 2801 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) 2802 rp->r_deleg_change++; 2803 2804 mutex_exit(&rp->r_statev4_lock); 2805 2806 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) 2807 return (EINTR); 2808 2809 /* 2810 * Bypass VM if caching has been disabled (e.g., locking) or if 2811 * using client-side direct I/O and the file is not mmap'd and 2812 * there are no cached pages.
2813 */ 2814 if ((vp->v_flag & VNOCACHE) || 2815 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2816 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) { 2817 size_t bufsize; 2818 int count; 2819 u_offset_t org_offset; 2820 stable_how4 stab_comm; 2821 nfs4_fwrite: 2822 if (rp->r_flags & R4STALE) { 2823 resid = uiop->uio_resid; 2824 offset = uiop->uio_loffset; 2825 error = rp->r_error; 2826 goto bottom; 2827 } 2828 2829 bufsize = MIN(uiop->uio_resid, mi->mi_stsize); 2830 base = kmem_alloc(bufsize, KM_SLEEP); 2831 do { 2832 if (ioflag & FDSYNC) 2833 stab_comm = DATA_SYNC4; 2834 else 2835 stab_comm = FILE_SYNC4; 2836 resid = uiop->uio_resid; 2837 offset = uiop->uio_loffset; 2838 count = MIN(uiop->uio_resid, bufsize); 2839 org_offset = uiop->uio_loffset; 2840 error = uiomove(base, count, UIO_WRITE, uiop); 2841 if (!error) { 2842 error = nfs4write(vp, base, org_offset, 2843 count, cr, &stab_comm); 2844 if (!error) { 2845 mutex_enter(&rp->r_statelock); 2846 if (rp->r_size < uiop->uio_loffset) 2847 rp->r_size = uiop->uio_loffset; 2848 mutex_exit(&rp->r_statelock); 2849 } 2850 } 2851 } while (!error && uiop->uio_resid > 0); 2852 kmem_free(base, bufsize); 2853 goto bottom; 2854 } 2855 2856 bsize = vp->v_vfsp->vfs_bsize; 2857 2858 do { 2859 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2860 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2861 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2862 2863 resid = uiop->uio_resid; 2864 offset = uiop->uio_loffset; 2865 2866 if (rp->r_flags & R4STALE) { 2867 error = rp->r_error; 2868 break; 2869 } 2870 2871 /* 2872 * Don't create dirty pages faster than they 2873 * can be cleaned so that the system doesn't 2874 * get imbalanced. If the async queue is 2875 * maxed out, then wait for it to drain before 2876 * creating more dirty pages. Also, wait for 2877 * any threads doing pagewalks in the vop_getattr 2878 * entry points so that they don't block for 2879 * long periods. 2880 */ 2881 mutex_enter(&rp->r_statelock); 2882 while ((mi->mi_max_threads != 0 && 2883 rp->r_awcount > 2 * mi->mi_max_threads) || 2884 rp->r_gcount > 0) 2885 cv_wait(&rp->r_cv, &rp->r_statelock); 2886 mutex_exit(&rp->r_statelock); 2887 2888 /* 2889 * Touch the page and fault it in if it is not in core 2890 * before segmap_getmapflt or vpm_data_copy can lock it. 2891 * This avoids a deadlock if the buffer we are copying 2892 * from is mmap'd to the same file that we are writing. 2893 */ 2894 uio_prefaultpages((long)n, uiop); 2895 2896 if (vpm_enable) { 2897 /* 2898 * It will use kpm mappings, so no need to 2899 * pass an address. 2900 */ 2901 error = writerp4(rp, NULL, n, uiop, 0); 2902 } else { 2903 if (segmap_kpm) { 2904 int pon = uiop->uio_loffset & PAGEOFFSET; 2905 size_t pn = MIN(PAGESIZE - pon, 2906 uiop->uio_resid); 2907 int pagecreate; 2908 2909 mutex_enter(&rp->r_statelock); 2910 pagecreate = (pon == 0) && (pn == PAGESIZE || 2911 uiop->uio_loffset + pn >= rp->r_size); 2912 mutex_exit(&rp->r_statelock); 2913 2914 base = segmap_getmapflt(segkmap, vp, off + on, 2915 pn, !pagecreate, S_WRITE); 2916 2917 error = writerp4(rp, base + pon, n, uiop, 2918 pagecreate); 2919 2920 } else { 2921 base = segmap_getmapflt(segkmap, vp, off + on, 2922 n, 0, S_READ); 2923 error = writerp4(rp, base + on, n, uiop, 0); 2924 } 2925 } 2926 2927 if (!error) { 2928 if (mi->mi_flags & MI4_NOAC) 2929 flags = SM_WRITE; 2930 else if ((uiop->uio_loffset % bsize) == 0 || 2931 IS_SWAPVP(vp)) { 2932 /* 2933 * Have written a whole block.
* Start an asynchronous write 2935 * and mark the buffer to 2936 * indicate that it won't be 2937 * needed again soon. 2938 */ 2939 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 2940 } else 2941 flags = 0; 2942 if ((ioflag & (FSYNC|FDSYNC)) || 2943 (rp->r_flags & R4OUTOFSPACE)) { 2944 flags &= ~SM_ASYNC; 2945 flags |= SM_WRITE; 2946 } 2947 if (vpm_enable) { 2948 error = vpm_sync_pages(vp, off, n, flags); 2949 } else { 2950 error = segmap_release(segkmap, base, flags); 2951 } 2952 } else { 2953 if (vpm_enable) { 2954 (void) vpm_sync_pages(vp, off, n, 0); 2955 } else { 2956 (void) segmap_release(segkmap, base, 0); 2957 } 2958 /* 2959 * In the event that we got an access error while 2960 * faulting in a page for a write-only file just 2961 * force a write. 2962 */ 2963 if (error == EACCES) 2964 goto nfs4_fwrite; 2965 } 2966 } while (!error && uiop->uio_resid > 0); 2967 2968 bottom: 2969 if (error) { 2970 uiop->uio_resid = resid + remainder; 2971 uiop->uio_loffset = offset; 2972 } else { 2973 uiop->uio_resid += remainder; 2974 2975 mutex_enter(&rp->r_statev4_lock); 2976 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 2977 gethrestime(&rp->r_attr.va_mtime); 2978 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 2979 } 2980 mutex_exit(&rp->r_statev4_lock); 2981 } 2982 2983 nfs_rw_exit(&rp->r_lkserlock); 2984 2985 return (error); 2986 } 2987 2988 /* 2989 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED} 2990 */ 2991 static int 2992 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 2993 int flags, cred_t *cr) 2994 { 2995 struct buf *bp; 2996 int error; 2997 page_t *savepp; 2998 uchar_t fsdata; 2999 stable_how4 stab_comm; 3000 3001 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3002 bp = pageio_setup(pp, len, vp, flags); 3003 ASSERT(bp != NULL); 3004 3005 /* 3006 * pageio_setup should have set b_addr to 0. This 3007 * is correct since we want to do I/O on a page 3008 * boundary. bp_mapin will use this addr to calculate 3009 * an offset, and then set b_addr to the kernel virtual 3010 * address it allocated for us. 3011 */ 3012 ASSERT(bp->b_un.b_addr == 0); 3013 3014 bp->b_edev = 0; 3015 bp->b_dev = 0; 3016 bp->b_lblkno = lbtodb(off); 3017 bp->b_file = vp; 3018 bp->b_offset = (offset_t)off; 3019 bp_mapin(bp); 3020 3021 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) && 3022 freemem > desfree) 3023 stab_comm = UNSTABLE4; 3024 else 3025 stab_comm = FILE_SYNC4; 3026 3027 error = nfs4_bio(bp, &stab_comm, cr, FALSE); 3028 3029 bp_mapout(bp); 3030 pageio_done(bp); 3031 3032 if (stab_comm == UNSTABLE4) 3033 fsdata = C_DELAYCOMMIT; 3034 else 3035 fsdata = C_NOCOMMIT; 3036 3037 savepp = pp; 3038 do { 3039 pp->p_fsdata = fsdata; 3040 } while ((pp = pp->p_next) != savepp); 3041 3042 return (error); 3043 } 3044 3045 /* * Check whether the given open stream is still usable for read/write: * fail with EIO if the open owner or open stream cannot be found or a * previous reopen failed, and if the stream was opened using a * delegation, reopen the file over the wire first. Returns 0 if I/O * can proceed. 3046 */ 3047 static int 3048 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr) 3049 { 3050 nfs4_open_owner_t *oop; 3051 nfs4_open_stream_t *osp; 3052 rnode4_t *rp = VTOR4(vp); 3053 mntinfo4_t *mi = VTOMI4(vp); 3054 int reopen_needed; 3055 3056 ASSERT(nfs_zone() == mi->mi_zone); 3057 3058 3059 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 3060 if (!oop) 3061 return (EIO); 3062 3063 /* returns with 'os_sync_lock' held */ 3064 osp = find_open_stream(oop, rp); 3065 if (!osp) { 3066 open_owner_rele(oop); 3067 return (EIO); 3068 } 3069 3070 if (osp->os_failed_reopen) { 3071 mutex_exit(&osp->os_sync_lock); 3072 open_stream_rele(osp, rp); 3073 open_owner_rele(oop); 3074 return (EIO); 3075 } 3076 3077 /* 3078 * Determine whether a reopen is needed.
If this 3079 * is a delegation open stream, then the os_delegation bit 3080 * should be set. 3081 */ 3082 3083 reopen_needed = osp->os_delegation; 3084 3085 mutex_exit(&osp->os_sync_lock); 3086 open_owner_rele(oop); 3087 3088 if (reopen_needed) { 3089 nfs4_error_zinit(ep); 3090 nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE); 3091 mutex_enter(&osp->os_sync_lock); 3092 if (ep->error || ep->stat || osp->os_failed_reopen) { 3093 mutex_exit(&osp->os_sync_lock); 3094 open_stream_rele(osp, rp); 3095 return (EIO); 3096 } 3097 mutex_exit(&osp->os_sync_lock); 3098 } 3099 open_stream_rele(osp, rp); 3100 3101 return (0); 3102 } 3103 3104 /* 3105 * Write to file. Writes to remote server in largest size 3106 * chunks that the server can handle. Write is synchronous. 3107 */ 3108 static int 3109 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr, 3110 stable_how4 *stab_comm) 3111 { 3112 mntinfo4_t *mi; 3113 COMPOUND4args_clnt args; 3114 COMPOUND4res_clnt res; 3115 WRITE4args *wargs; 3116 WRITE4res *wres; 3117 nfs_argop4 argop[2]; 3118 nfs_resop4 *resop; 3119 int tsize; 3120 stable_how4 stable; 3121 rnode4_t *rp; 3122 int doqueue = 1; 3123 bool_t needrecov; 3124 nfs4_recov_state_t recov_state; 3125 nfs4_stateid_types_t sid_types; 3126 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3127 int recov; 3128 3129 rp = VTOR4(vp); 3130 mi = VTOMI4(vp); 3131 3132 ASSERT(nfs_zone() == mi->mi_zone); 3133 3134 stable = *stab_comm; 3135 *stab_comm = FILE_SYNC4; 3136 3137 needrecov = FALSE; 3138 recov_state.rs_flags = 0; 3139 recov_state.rs_num_retry_despite_err = 0; 3140 nfs4_init_stateid_types(&sid_types); 3141 3142 /* Is curthread the recovery thread? */ 3143 mutex_enter(&mi->mi_lock); 3144 recov = (mi->mi_recovthread == curthread); 3145 mutex_exit(&mi->mi_lock); 3146 3147 recov_retry: 3148 args.ctag = TAG_WRITE; 3149 args.array_len = 2; 3150 args.array = argop; 3151 3152 if (!recov) { 3153 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3154 &recov_state, NULL); 3155 if (e.error) 3156 return (e.error); 3157 } 3158 3159 /* 0. putfh target fh */ 3160 argop[0].argop = OP_CPUTFH; 3161 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3162 3163 /* 1. write */ 3164 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types); 3165 3166 do { 3167 3168 wargs->offset = (offset4)offset; 3169 wargs->data_val = base; 3170 3171 if (mi->mi_io_kstats) { 3172 mutex_enter(&mi->mi_lock); 3173 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3174 mutex_exit(&mi->mi_lock); 3175 } 3176 3177 if ((vp->v_flag & VNOCACHE) || 3178 (rp->r_flags & R4DIRECTIO) || 3179 (mi->mi_flags & MI4_DIRECTIO)) 3180 tsize = MIN(mi->mi_stsize, count); 3181 else 3182 tsize = MIN(mi->mi_curwrite, count); 3183 wargs->data_len = (uint_t)tsize; 3184 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3185 3186 if (mi->mi_io_kstats) { 3187 mutex_enter(&mi->mi_lock); 3188 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3189 mutex_exit(&mi->mi_lock); 3190 } 3191 3192 if (!recov) { 3193 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3194 if (e.error && !needrecov) { 3195 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3196 &recov_state, needrecov); 3197 return (e.error); 3198 } 3199 } else { 3200 if (e.error) 3201 return (e.error); 3202 } 3203 3204 /* 3205 * Do handling of OLD_STATEID outside 3206 * of the normal recovery framework. 3207 * 3208 * If write receives a BAD stateid error while using a 3209 * delegation stateid, retry using the open stateid (if it 3210 * exists). 
If it doesn't have an open stateid, reopen the 3211 * file first, then retry. 3212 */ 3213 if (!e.error && res.status == NFS4ERR_OLD_STATEID && 3214 sid_types.cur_sid_type != SPEC_SID) { 3215 nfs4_save_stateid(&wargs->stateid, &sid_types); 3216 if (!recov) 3217 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3218 &recov_state, needrecov); 3219 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3220 goto recov_retry; 3221 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3222 sid_types.cur_sid_type == DEL_SID) { 3223 nfs4_save_stateid(&wargs->stateid, &sid_types); 3224 mutex_enter(&rp->r_statev4_lock); 3225 rp->r_deleg_return_pending = TRUE; 3226 mutex_exit(&rp->r_statev4_lock); 3227 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3228 if (!recov) 3229 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3230 &recov_state, needrecov); 3231 (void) xdr_free(xdr_COMPOUND4res_clnt, 3232 (caddr_t)&res); 3233 return (EIO); 3234 } 3235 if (!recov) 3236 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3237 &recov_state, needrecov); 3238 /* hold needed for nfs4delegreturn_thread */ 3239 VN_HOLD(vp); 3240 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3241 NFS4_DR_DISCARD), FALSE); 3242 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3243 goto recov_retry; 3244 } 3245 3246 if (needrecov) { 3247 bool_t abort; 3248 3249 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3250 "nfs4write: client got error %d, res.status %d" 3251 ", so start recovery", e.error, res.status)); 3252 3253 abort = nfs4_start_recovery(&e, 3254 VTOMI4(vp), vp, NULL, &wargs->stateid, 3255 NULL, OP_WRITE, NULL); 3256 if (!e.error) { 3257 e.error = geterrno4(res.status); 3258 (void) xdr_free(xdr_COMPOUND4res_clnt, 3259 (caddr_t)&res); 3260 } 3261 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3262 &recov_state, needrecov); 3263 if (abort == FALSE) 3264 goto recov_retry; 3265 return (e.error); 3266 } 3267 3268 if (res.status) { 3269 e.error = geterrno4(res.status); 3270 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3271 if (!recov) 3272 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3273 &recov_state, needrecov); 3274 return (e.error); 3275 } 3276 3277 resop = &res.array[1]; /* write res */ 3278 wres = &resop->nfs_resop4_u.opwrite; 3279 3280 if ((int)wres->count > tsize) { 3281 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3282 3283 zcmn_err(getzoneid(), CE_WARN, 3284 "nfs4write: server wrote %u, requested was %u", 3285 (int)wres->count, tsize); 3286 if (!recov) 3287 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3288 &recov_state, needrecov); 3289 return (EIO); 3290 } 3291 if (wres->committed == UNSTABLE4) { 3292 *stab_comm = UNSTABLE4; 3293 if (wargs->stable == DATA_SYNC4 || 3294 wargs->stable == FILE_SYNC4) { 3295 (void) xdr_free(xdr_COMPOUND4res_clnt, 3296 (caddr_t)&res); 3297 zcmn_err(getzoneid(), CE_WARN, 3298 "nfs4write: server %s did not commit " 3299 "to stable storage", 3300 rp->r_server->sv_hostname); 3301 if (!recov) 3302 nfs4_end_fop(VTOMI4(vp), vp, NULL, 3303 OH_WRITE, &recov_state, needrecov); 3304 return (EIO); 3305 } 3306 } 3307 3308 tsize = (int)wres->count; 3309 count -= tsize; 3310 base += tsize; 3311 offset += tsize; 3312 if (mi->mi_io_kstats) { 3313 mutex_enter(&mi->mi_lock); 3314 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++; 3315 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten += 3316 tsize; 3317 mutex_exit(&mi->mi_lock); 3318 } 3319 lwp_stat_update(LWP_STAT_OUBLK, 1); 3320 mutex_enter(&rp->r_statelock); 3321 if (rp->r_flags & R4HAVEVERF) { 3322 if (rp->r_writeverf != wres->writeverf) { 3323 nfs4_set_mod(vp); 3324 
rp->r_writeverf = wres->writeverf; 3325 } 3326 } else { 3327 rp->r_writeverf = wres->writeverf; 3328 rp->r_flags |= R4HAVEVERF; 3329 } 3330 PURGE_ATTRCACHE4_LOCKED(rp); 3331 rp->r_flags |= R4WRITEMODIFIED; 3332 gethrestime(&rp->r_attr.va_mtime); 3333 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3334 mutex_exit(&rp->r_statelock); 3335 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3336 } while (count); 3337 3338 if (!recov) 3339 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state, 3340 needrecov); 3341 3342 return (e.error); 3343 } 3344 3345 /* 3346 * Read from a file. Reads data in largest chunks our interface can handle. 3347 */ 3348 static int 3349 nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count, 3350 size_t *residp, cred_t *cr, bool_t async, struct uio *uiop) 3351 { 3352 mntinfo4_t *mi; 3353 COMPOUND4args_clnt args; 3354 COMPOUND4res_clnt res; 3355 READ4args *rargs; 3356 nfs_argop4 argop[2]; 3357 int tsize; 3358 int doqueue; 3359 rnode4_t *rp; 3360 int data_len; 3361 bool_t is_eof; 3362 bool_t needrecov = FALSE; 3363 nfs4_recov_state_t recov_state; 3364 nfs4_stateid_types_t sid_types; 3365 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3366 3367 rp = VTOR4(vp); 3368 mi = VTOMI4(vp); 3369 doqueue = 1; 3370 3371 ASSERT(nfs_zone() == mi->mi_zone); 3372 3373 args.ctag = async ? TAG_READAHEAD : TAG_READ; 3374 3375 args.array_len = 2; 3376 args.array = argop; 3377 3378 nfs4_init_stateid_types(&sid_types); 3379 3380 recov_state.rs_flags = 0; 3381 recov_state.rs_num_retry_despite_err = 0; 3382 3383 recov_retry: 3384 e.error = nfs4_start_fop(mi, vp, NULL, OH_READ, 3385 &recov_state, NULL); 3386 if (e.error) 3387 return (e.error); 3388 3389 /* putfh target fh */ 3390 argop[0].argop = OP_CPUTFH; 3391 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3392 3393 /* read */ 3394 argop[1].argop = OP_READ; 3395 rargs = &argop[1].nfs_argop4_u.opread; 3396 rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 3397 OP_READ, &sid_types, async); 3398 3399 do { 3400 if (mi->mi_io_kstats) { 3401 mutex_enter(&mi->mi_lock); 3402 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3403 mutex_exit(&mi->mi_lock); 3404 } 3405 3406 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3407 "nfs4read: %s call, rp %s", 3408 needrecov ? 
"recov" : "first", 3409 rnode4info(rp))); 3410 3411 if ((vp->v_flag & VNOCACHE) || 3412 (rp->r_flags & R4DIRECTIO) || 3413 (mi->mi_flags & MI4_DIRECTIO)) 3414 tsize = MIN(mi->mi_tsize, count); 3415 else 3416 tsize = MIN(mi->mi_curread, count); 3417 3418 rargs->offset = (offset4)offset; 3419 rargs->count = (count4)tsize; 3420 rargs->res_data_val_alt = NULL; 3421 rargs->res_mblk = NULL; 3422 rargs->res_uiop = NULL; 3423 rargs->res_maxsize = 0; 3424 rargs->wlist = NULL; 3425 3426 if (uiop) 3427 rargs->res_uiop = uiop; 3428 else 3429 rargs->res_data_val_alt = base; 3430 rargs->res_maxsize = tsize; 3431 3432 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3433 #ifdef DEBUG 3434 if (nfs4read_error_inject) { 3435 res.status = nfs4read_error_inject; 3436 nfs4read_error_inject = 0; 3437 } 3438 #endif 3439 3440 if (mi->mi_io_kstats) { 3441 mutex_enter(&mi->mi_lock); 3442 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3443 mutex_exit(&mi->mi_lock); 3444 } 3445 3446 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3447 if (e.error != 0 && !needrecov) { 3448 nfs4_end_fop(mi, vp, NULL, OH_READ, 3449 &recov_state, needrecov); 3450 return (e.error); 3451 } 3452 3453 /* 3454 * Do proper retry for OLD and BAD stateid errors outside 3455 * of the normal recovery framework. There are two differences 3456 * between async and sync reads. The first is that we allow 3457 * retry on BAD_STATEID for async reads, but not sync reads. 3458 * The second is that we mark the file dead for a failed 3459 * attempt with a special stateid for sync reads, but just 3460 * return EIO for async reads. 3461 * 3462 * If a sync read receives a BAD stateid error while using a 3463 * delegation stateid, retry using the open stateid (if it 3464 * exists). If it doesn't have an open stateid, reopen the 3465 * file first, then retry. 
3466 */ 3467 if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID || 3468 res.status == NFS4ERR_BAD_STATEID) && async) { 3469 nfs4_end_fop(mi, vp, NULL, OH_READ, 3470 &recov_state, needrecov); 3471 if (sid_types.cur_sid_type == SPEC_SID) { 3472 (void) xdr_free(xdr_COMPOUND4res_clnt, 3473 (caddr_t)&res); 3474 return (EIO); 3475 } 3476 nfs4_save_stateid(&rargs->stateid, &sid_types); 3477 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3478 goto recov_retry; 3479 } else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3480 !async && sid_types.cur_sid_type != SPEC_SID) { 3481 nfs4_save_stateid(&rargs->stateid, &sid_types); 3482 nfs4_end_fop(mi, vp, NULL, OH_READ, 3483 &recov_state, needrecov); 3484 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3485 goto recov_retry; 3486 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3487 sid_types.cur_sid_type == DEL_SID) { 3488 nfs4_save_stateid(&rargs->stateid, &sid_types); 3489 mutex_enter(&rp->r_statev4_lock); 3490 rp->r_deleg_return_pending = TRUE; 3491 mutex_exit(&rp->r_statev4_lock); 3492 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3493 nfs4_end_fop(mi, vp, NULL, OH_READ, 3494 &recov_state, needrecov); 3495 (void) xdr_free(xdr_COMPOUND4res_clnt, 3496 (caddr_t)&res); 3497 return (EIO); 3498 } 3499 nfs4_end_fop(mi, vp, NULL, OH_READ, 3500 &recov_state, needrecov); 3501 /* hold needed for nfs4delegreturn_thread */ 3502 VN_HOLD(vp); 3503 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3504 NFS4_DR_DISCARD), FALSE); 3505 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3506 goto recov_retry; 3507 } 3508 if (needrecov) { 3509 bool_t abort; 3510 3511 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3512 "nfs4read: initiating recovery\n")); 3513 abort = nfs4_start_recovery(&e, 3514 mi, vp, NULL, &rargs->stateid, 3515 NULL, OP_READ, NULL); 3516 nfs4_end_fop(mi, vp, NULL, OH_READ, 3517 &recov_state, needrecov); 3518 /* 3519 * Do not retry if we got OLD_STATEID using a special 3520 * stateid. This avoids looping with a broken server. 3521 */ 3522 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3523 sid_types.cur_sid_type == SPEC_SID) 3524 abort = TRUE; 3525 3526 if (abort == FALSE) { 3527 /* 3528 * Need to retry all possible stateids in 3529 * case the recovery error wasn't stateid 3530 * related or the stateids have become 3531 * stale (server reboot). 
3532 */ 3533 nfs4_init_stateid_types(&sid_types); 3534 (void) xdr_free(xdr_COMPOUND4res_clnt, 3535 (caddr_t)&res); 3536 goto recov_retry; 3537 } 3538 3539 if (!e.error) { 3540 e.error = geterrno4(res.status); 3541 (void) xdr_free(xdr_COMPOUND4res_clnt, 3542 (caddr_t)&res); 3543 } 3544 return (e.error); 3545 } 3546 3547 if (res.status) { 3548 e.error = geterrno4(res.status); 3549 nfs4_end_fop(mi, vp, NULL, OH_READ, 3550 &recov_state, needrecov); 3551 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3552 return (e.error); 3553 } 3554 3555 data_len = res.array[1].nfs_resop4_u.opread.data_len; 3556 count -= data_len; 3557 if (base) 3558 base += data_len; 3559 offset += data_len; 3560 if (mi->mi_io_kstats) { 3561 mutex_enter(&mi->mi_lock); 3562 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 3563 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len; 3564 mutex_exit(&mi->mi_lock); 3565 } 3566 lwp_stat_update(LWP_STAT_INBLK, 1); 3567 is_eof = res.array[1].nfs_resop4_u.opread.eof; 3568 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3569 3570 } while (count && !is_eof); 3571 3572 *residp = count; 3573 3574 nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov); 3575 3576 return (e.error); 3577 } 3578 3579 /* ARGSUSED */ 3580 static int 3581 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp, 3582 caller_context_t *ct) 3583 { 3584 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3585 return (EIO); 3586 switch (cmd) { 3587 case _FIODIRECTIO: 3588 return (nfs4_directio(vp, (int)arg, cr)); 3589 default: 3590 return (ENOTTY); 3591 } 3592 } 3593 3594 /* ARGSUSED */ 3595 int 3596 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3597 caller_context_t *ct) 3598 { 3599 int error; 3600 rnode4_t *rp = VTOR4(vp); 3601 3602 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3603 return (EIO); 3604 /* 3605 * If it has been specified that the return value will 3606 * just be used as a hint, and we are only being asked 3607 * for size, fsid or rdevid, then return the client's 3608 * notion of these values without checking to make sure 3609 * that the attribute cache is up to date. 3610 * The whole point is to avoid an over the wire GETATTR 3611 * call. 3612 */ 3613 if (flags & ATTR_HINT) { 3614 if (vap->va_mask == 3615 (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) { 3616 mutex_enter(&rp->r_statelock); 3617 if (vap->va_mask & AT_SIZE) 3618 vap->va_size = rp->r_size; 3619 if (vap->va_mask & AT_FSID) 3620 vap->va_fsid = rp->r_attr.va_fsid; 3621 if (vap->va_mask & AT_RDEV) 3622 vap->va_rdev = rp->r_attr.va_rdev; 3623 mutex_exit(&rp->r_statelock); 3624 return (0); 3625 } 3626 } 3627 3628 /* 3629 * Only need to flush pages if asking for the mtime 3630 * and if there are any dirty pages or any outstanding 3631 * asynchronous (write) requests for this file.
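 *
 * For example, after a write(2) that only dirtied cached pages,
 * a stat(2) asking for the mtime must not report the server's
 * pre-write timestamp; flushing the dirty pages first lets the
 * server's mtime reflect the write before we fetch it.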
3632 */ 3633 if (vap->va_mask & AT_MTIME) { 3634 rp = VTOR4(vp); 3635 if (nfs4_has_pages(vp)) { 3636 mutex_enter(&rp->r_statev4_lock); 3637 if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) { 3638 mutex_exit(&rp->r_statev4_lock); 3639 if (rp->r_flags & R4DIRTY || 3640 rp->r_awcount > 0) { 3641 mutex_enter(&rp->r_statelock); 3642 rp->r_gcount++; 3643 mutex_exit(&rp->r_statelock); 3644 error = 3645 nfs4_putpage(vp, (u_offset_t)0, 3646 0, 0, cr, NULL); 3647 mutex_enter(&rp->r_statelock); 3648 if (error && (error == ENOSPC || 3649 error == EDQUOT)) { 3650 if (!rp->r_error) 3651 rp->r_error = error; 3652 } 3653 if (--rp->r_gcount == 0) 3654 cv_broadcast(&rp->r_cv); 3655 mutex_exit(&rp->r_statelock); 3656 } 3657 } else { 3658 mutex_exit(&rp->r_statev4_lock); 3659 } 3660 } 3661 } 3662 return (nfs4getattr(vp, vap, cr)); 3663 } 3664 3665 int 3666 nfs4_compare_modes(mode_t from_server, mode_t on_client) 3667 { 3668 /* 3669 * If these are the only two bits cleared 3670 * on the server then return 0 (OK) else 3671 * return 1 (BAD). 3672 */ 3673 on_client &= ~(S_ISUID|S_ISGID); 3674 if (on_client == from_server) 3675 return (0); 3676 else 3677 return (1); 3678 } 3679 3680 /*ARGSUSED4*/ 3681 static int 3682 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3683 caller_context_t *ct) 3684 { 3685 if (vap->va_mask & AT_NOSET) 3686 return (EINVAL); 3687 3688 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3689 return (EIO); 3690 3691 /* 3692 * Don't call secpolicy_vnode_setattr, the client cannot 3693 * use its cached attributes to make security decisions 3694 * as the server may be faking mode bits or mapping uid/gid. 3695 * Always just let the server do the checking. 3696 * If we provide the ability to remove basic privileges 3697 * to setattr (e.g. basic without chmod) then we will 3698 * need to add a check here before calling the server. 3699 */ 3700 3701 return (nfs4setattr(vp, vap, flags, cr, NULL)); 3702 } 3703 3704 /* 3705 * To replace the "guarded" version 3 setattr, we use two types of compound 3706 * setattr requests: 3707 * 1. The "normal" setattr, used when the size of the file isn't being 3708 * changed - { Putfh <fh>; Setattr; Getattr }. 3709 * 2. If the size is changed, precede Setattr with: Getattr; Verify 3710 * with only ctime as the argument. If the server ctime differs from 3711 * what is cached on the client, the verify will fail, but we would 3712 * already have the ctime from the preceding getattr, so just set it 3713 * and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify; 3714 * Setattr; Getattr }. 3715 * 3716 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in 3717 * this setattr and NULL if they are not.
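 *
 * Illustrative retry flow for the size-change case (a sketch of the
 * loop in the function body, not additional logic):
 *
 *	ctime = rp->r_attr.va_ctime;
 *	do {
 *		send { Putfh; Getattr; Verify(ctime); Setattr; Getattr };
 *		if (the VERIFY failed)
 *			ctime = ctime from the GETATTR reply;
 *	} while (the VERIFY keeps failing);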
3718 */ 3719 static int 3720 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3721 vsecattr_t *vsap) 3722 { 3723 COMPOUND4args_clnt args; 3724 COMPOUND4res_clnt res, *resp = NULL; 3725 nfs4_ga_res_t *garp = NULL; 3726 int numops = 3; /* { Putfh; Setattr; Getattr } */ 3727 nfs_argop4 argop[5]; 3728 int verify_argop = -1; 3729 int setattr_argop = 1; 3730 nfs_resop4 *resop; 3731 vattr_t va; 3732 rnode4_t *rp; 3733 int doqueue = 1; 3734 uint_t mask = vap->va_mask; 3735 mode_t omode; 3736 vsecattr_t *vsp; 3737 timestruc_t ctime; 3738 bool_t needrecov = FALSE; 3739 nfs4_recov_state_t recov_state; 3740 nfs4_stateid_types_t sid_types; 3741 stateid4 stateid; 3742 hrtime_t t; 3743 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3744 servinfo4_t *svp; 3745 bitmap4 supp_attrs; 3746 3747 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3748 rp = VTOR4(vp); 3749 nfs4_init_stateid_types(&sid_types); 3750 3751 /* 3752 * Only need to flush pages if there are any pages and 3753 * if the file is marked as dirty in some fashion. The 3754 * file must be flushed so that we can accurately 3755 * determine the size of the file and the cached data 3756 * after the SETATTR returns. A file is considered to 3757 * be dirty if it is either marked with R4DIRTY, has 3758 * outstanding i/o's active, or is mmap'd. In this 3759 * last case, we can't tell whether there are dirty 3760 * pages, so we flush just to be sure. 3761 */ 3762 if (nfs4_has_pages(vp) && 3763 ((rp->r_flags & R4DIRTY) || 3764 rp->r_count > 0 || 3765 rp->r_mapcnt > 0)) { 3766 ASSERT(vp->v_type != VCHR); 3767 e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL); 3768 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 3769 mutex_enter(&rp->r_statelock); 3770 if (!rp->r_error) 3771 rp->r_error = e.error; 3772 mutex_exit(&rp->r_statelock); 3773 } 3774 } 3775 3776 if (mask & AT_SIZE) { 3777 /* 3778 * Verification setattr compound for non-deleg AT_SIZE: 3779 * { Putfh; Getattr; Verify; Setattr; Getattr } 3780 * Set ctime local here (outside the do_again label) 3781 * so that subsequent retries (after failed VERIFY) 3782 * will use ctime from GETATTR results (from failed 3783 * verify compound) as VERIFY arg. 3784 * If file has delegation, then VERIFY(time_metadata) 3785 * is of little added value, so don't bother. 3786 */ 3787 mutex_enter(&rp->r_statev4_lock); 3788 if (rp->r_deleg_type == OPEN_DELEGATE_NONE || 3789 rp->r_deleg_return_pending) { 3790 numops = 5; 3791 ctime = rp->r_attr.va_ctime; 3792 } 3793 mutex_exit(&rp->r_statev4_lock); 3794 } 3795 3796 recov_state.rs_flags = 0; 3797 recov_state.rs_num_retry_despite_err = 0; 3798 3799 args.ctag = TAG_SETATTR; 3800 do_again: 3801 recov_retry: 3802 setattr_argop = numops - 2; 3803 3804 args.array = argop; 3805 args.array_len = numops; 3806 3807 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 3808 if (e.error) 3809 return (e.error); 3810 3811 3812 /* putfh target fh */ 3813 argop[0].argop = OP_CPUTFH; 3814 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3815 3816 if (numops == 5) { 3817 /* 3818 * We only care about the ctime, but need to get mtime 3819 * and size for proper cache update. 
		 */
		/* getattr */
		argop[1].argop = OP_GETATTR;
		argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

		/* verify - set later in loop */
		verify_argop = 2;
	}

	/* setattr */
	svp = rp->r_server;
	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
	supp_attrs = svp->sv_supp_attrs;
	nfs_rw_exit(&svp->sv_lock);

	nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr,
	    supp_attrs, &e.error, &sid_types);
	stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid;
	if (e.error) {
		/* req time field(s) overflow - return immediately */
		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
		    opsetattr.obj_attributes);
		return (e.error);
	}
	omode = rp->r_attr.va_mode;

	/* getattr */
	argop[numops-1].argop = OP_GETATTR;
	argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	/*
	 * If we are setting the ACL (indicated only by vsap != NULL), request
	 * the ACL in this getattr.  The ACL returned from this getattr will be
	 * used in updating the ACL cache.
	 */
	if (vsap != NULL)
		argop[numops-1].nfs_argop4_u.opgetattr.attr_request |=
		    FATTR4_ACL_MASK;
	argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

	/*
	 * setattr iterates if the object size is set and the cached ctime
	 * does not match the file ctime.  In that case, verify the ctime
	 * first.
	 */

	do {
		if (verify_argop != -1) {
			/*
			 * Verify that the ctime matches before doing setattr.
			 */
			va.va_mask = AT_CTIME;
			va.va_ctime = ctime;
			svp = rp->r_server;
			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			supp_attrs = svp->sv_supp_attrs;
			nfs_rw_exit(&svp->sv_lock);
			e.error = nfs4args_verify(&argop[verify_argop], &va,
			    OP_VERIFY, supp_attrs);
			if (e.error) {
				/* req time field(s) overflow - return */
				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
				    needrecov);
				break;
			}
		}

		doqueue = 1;

		t = gethrtime();

		rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);

		/*
		 * Purge the access cache and ACL cache if changing either the
		 * owner of the file, the group owner, or the mode.  These may
		 * change the access permissions of the file, so purge old
		 * information and start over again.
		 */
		if (mask & (AT_UID | AT_GID | AT_MODE)) {
			(void) nfs4_access_purge_rp(rp);
			if (rp->r_secattr != NULL) {
				mutex_enter(&rp->r_statelock);
				vsp = rp->r_secattr;
				rp->r_secattr = NULL;
				mutex_exit(&rp->r_statelock);
				if (vsp != NULL)
					nfs4_acl_free_cache(vsp);
			}
		}

		/*
		 * If res.array_len == numops, then everything succeeded,
		 * except for possibly the final getattr.  If only the
		 * last getattr failed, give up, and don't try recovery.
		 */
		if (res.array_len == numops) {
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			if (!e.error)
				resp = &res;
			break;
		}

		/*
		 * if either rpc call failed or completely succeeded - done
		 */
		needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
		if (e.error) {
			PURGE_ATTRCACHE4(vp);
			if (!needrecov) {
				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
				    needrecov);
				break;
			}
		}

		/*
		 * Do proper retry for OLD_STATEID outside of the normal
		 * recovery framework.
		 */
		if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
		    sid_types.cur_sid_type != SPEC_SID &&
		    sid_types.cur_sid_type != NO_SID) {
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			nfs4_save_stateid(&stateid, &sid_types);
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}

		if (needrecov) {
			bool_t abort;

			abort = nfs4_start_recovery(&e,
			    VTOMI4(vp), vp, NULL, NULL, NULL,
			    OP_SETATTR, NULL);
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			/*
			 * Do not retry if we failed with OLD_STATEID using
			 * a special stateid.  This is done to avoid looping
			 * with a broken server.
			 */
			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
			    (sid_types.cur_sid_type == SPEC_SID ||
			    sid_types.cur_sid_type == NO_SID))
				abort = TRUE;
			if (!e.error) {
				if (res.status == NFS4ERR_BADOWNER)
					nfs4_log_badowner(VTOMI4(vp),
					    OP_SETATTR);

				e.error = geterrno4(res.status);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			}
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			if (abort == FALSE) {
				/*
				 * Need to retry all possible stateids in
				 * case the recovery error wasn't stateid
				 * related or the stateids have become
				 * stale (server reboot).
				 */
				nfs4_init_stateid_types(&sid_types);
				goto recov_retry;
			}
			return (e.error);
		}

		/*
		 * Need to call nfs4_end_op before nfs4getattr to
		 * avoid potential nfs4_start_op deadlock.  See RFE
		 * 4777612.  Calls to nfs4_invalidate_pages() and
		 * nfs4_purge_stale_fh() might also generate over the
		 * wire calls which may cause nfs4_start_op() deadlock.
		 */
		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);

		/*
		 * Check to update lease.
		 */
		resp = &res;
		if (res.status == NFS4_OK) {
			break;
		}

		/*
		 * Check if the verify failed, to see if we should try again.
		 */
		if ((verify_argop == -1) || (res.array_len != 3)) {
			/*
			 * can't continue...
			 */
			if (res.status == NFS4ERR_BADOWNER)
				nfs4_log_badowner(VTOMI4(vp), OP_SETATTR);

			e.error = geterrno4(res.status);
		} else {
			/*
			 * When the verify request fails, the client ctime is
			 * not in sync with the server.  This is the same as
			 * the version 3 "not synchronized" error, and we
			 * handle it in a similar manner (XXX do we need to???).
			 * Use the ctime returned in the first getattr for
			 * the input to the next verify.
			 * If we couldn't get the attributes, then we give up
			 * because we can't complete the operation as required.
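			 * (For reference: in a clean verify mismatch the
			 * reply array holds exactly three results -- PUTFH
			 * ok, GETATTR ok carrying the server's current
			 * ctime, and the failed VERIFY -- which is why
			 * res.array_len == 3 identifies the retry case in
			 * the test above.  Explanatory note, inferred from
			 * the surrounding code.)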
			 */
			garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
		}
		if (e.error) {
			PURGE_ATTRCACHE4(vp);
			nfs4_purge_stale_fh(e.error, vp, cr);
		} else {
			/*
			 * retry with a new verify value
			 */
			ctime = garp->n4g_va.va_ctime;
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			resp = NULL;
		}
		if (!e.error) {
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto do_again;
		}
	} while (!e.error);

	if (e.error) {
		/*
		 * If we are here, rfs4call has an irrecoverable error - return
		 */
		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
		    opsetattr.obj_attributes);
		if (verify_argop != -1) {
			nfs4args_verify_free(&argop[verify_argop]);
			verify_argop = -1;
		}
		if (resp)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
		return (e.error);
	}



	/*
	 * If changing the size of the file, invalidate
	 * any local cached data which is no longer part
	 * of the file.  We also possibly invalidate the
	 * last page in the file.  We could use
	 * pvn_vpzero(), but this would mark the page as
	 * modified and require it to be written back to
	 * the server for no particularly good reason.
	 * This way, if we access it, then we bring it
	 * back in.  A read should be cheaper than a
	 * write.
	 */
	if (mask & AT_SIZE) {
		nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr);
	}

	/* either no error or one of the postop getattrs failed */

	/*
	 * XXX Perform a simplified version of wcc checking.  Instead of
	 * having another getattr to get pre-op, just purge cache if
	 * any of the ops prior to and including the getattr failed.
	 * If the getattr succeeded then update the attrcache accordingly.
	 */

	garp = NULL;
	if (res.status == NFS4_OK) {
		/*
		 * Last getattr
		 */
		resop = &res.array[numops - 1];
		garp = &resop->nfs_resop4_u.opgetattr.ga_res;
	}
	/*
	 * In certain cases, nfs4_update_attrcache() will purge the attrcache,
	 * rather than filling it.  See the function itself for details.
	 */
	e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
	if (garp != NULL) {
		if (garp->n4g_resbmap & FATTR4_ACL_MASK) {
			nfs4_acl_fill_cache(rp, &garp->n4g_vsa);
			vs_ace4_destroy(&garp->n4g_vsa);
		} else {
			if (vsap != NULL) {
				/*
				 * The ACL was supposed to be set and to be
				 * returned in the last getattr of this
				 * compound, but for some reason the getattr
				 * result doesn't contain the ACL.  In this
				 * case, purge the ACL cache.
				 */
				if (rp->r_secattr != NULL) {
					mutex_enter(&rp->r_statelock);
					vsp = rp->r_secattr;
					rp->r_secattr = NULL;
					mutex_exit(&rp->r_statelock);
					if (vsp != NULL)
						nfs4_acl_free_cache(vsp);
				}
			}
		}
	}

	if (res.status == NFS4_OK && (mask & AT_SIZE)) {
		/*
		 * Set the size, rather than relying on getting it updated
		 * via a GETATTR.  With delegations the client tries to
		 * suppress GETATTR calls.
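		 * (Rationale, inferred: a client holding a delegation may
		 * satisfy stat() purely from cache, so if we waited for a
		 * later GETATTR to learn the new size, a stale r_size
		 * could be served in the meantime.)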
		 */
		mutex_enter(&rp->r_statelock);
		rp->r_size = vap->va_size;
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * Can free up request args and res
	 */
	nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
	    opsetattr.obj_attributes);
	if (verify_argop != -1) {
		nfs4args_verify_free(&argop[verify_argop]);
		verify_argop = -1;
	}
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	/*
	 * Some servers will change the mode to clear the setuid
	 * and setgid bits when changing the uid or gid.  The
	 * client needs to compensate appropriately.
	 */
	if (mask & (AT_UID | AT_GID)) {
		int terror, do_setattr;

		do_setattr = 0;
		va.va_mask = AT_MODE;
		terror = nfs4getattr(vp, &va, cr);
		if (!terror &&
		    (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
		    (!(mask & AT_MODE) && va.va_mode != omode))) {
			va.va_mask = AT_MODE;
			if (mask & AT_MODE) {
				/*
				 * We asked the mode to be changed and what
				 * we just got from the server in getattr is
				 * not what we wanted it to be, so set it now.
				 */
				va.va_mode = vap->va_mode;
				do_setattr = 1;
			} else {
				/*
				 * We did not ask the mode to be changed.
				 * Check to see that the server just cleared
				 * S_ISUID and S_ISGID from it.  If not then
				 * set mode to omode with UID/GID cleared.
				 */
				if (nfs4_compare_modes(va.va_mode, omode)) {
					omode &= ~(S_ISUID|S_ISGID);
					va.va_mode = omode;
					do_setattr = 1;
				}
			}

			if (do_setattr)
				(void) nfs4setattr(vp, &va, 0, cr, NULL);
		}
	}

	return (e.error);
}

/* ARGSUSED */
static int
nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	uint32_t acc, resacc, argacc;
	rnode4_t *rp;
	cred_t *cred, *ncr, *ncrfree = NULL;
	nfs4_access_type_t cacc;
	int num_ops;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	bool_t needrecov = FALSE, do_getattr;
	nfs4_recov_state_t recov_state;
	int rpc_error;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	mntinfo4_t *mi = VTOMI4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	acc = 0;
	if (mode & VREAD)
		acc |= ACCESS4_READ;
	if (mode & VWRITE) {
		if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type))
			return (EROFS);
		if (vp->v_type == VDIR)
			acc |= ACCESS4_DELETE;
		acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
	}
	if (mode & VEXEC) {
		if (vp->v_type == VDIR)
			acc |= ACCESS4_LOOKUP;
		else
			acc |= ACCESS4_EXECUTE;
	}

	if (VTOR4(vp)->r_acache != NULL) {
		e.error = nfs4_validate_caches(vp, cr);
		if (e.error)
			return (e.error);
	}

	rp = VTOR4(vp);
	if (vp->v_type == VDIR)
		argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY |
		    ACCESS4_EXTEND | ACCESS4_LOOKUP;
	else
		argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND |
		    ACCESS4_EXECUTE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	cred = cr;
	/*
	 * ncr and ncrfree both initially
	 * point to the memory area returned
	 * by crnetadjust();
	 * ncrfree being non-NULL when exiting means
	 * that we need to release it.
	 */
	ncr = crnetadjust(cred);
	ncrfree = ncr;

tryagain:
	cacc = nfs4_access_check(rp, acc, cred);
	if (cacc == NFS4_ACCESS_ALLOWED) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (0);
	}
	if (cacc == NFS4_ACCESS_DENIED) {
		/*
		 * If the cred can be adjusted, try again
		 * with the new cred.
		 */
		if (ncr != NULL) {
			cred = ncr;
			ncr = NULL;
			goto tryagain;
		}
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (EACCES);
	}

recov_retry:
	/*
	 * Don't bother taking r_statev4_lock here.  r_deleg_type could
	 * change as soon as the lock is released.  Since it is an int,
	 * there is no atomicity issue.
	 */
	do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE);
	num_ops = do_getattr ? 3 : 2;

	args.ctag = TAG_ACCESS;

	args.array_len = num_ops;
	args.array = argop;

	if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS,
	    &recov_state, NULL)) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (e.error);
	}

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	/* access */
	argop[1].argop = OP_ACCESS;
	argop[1].nfs_argop4_u.opaccess.access = argacc;

	/* getattr */
	if (do_getattr) {
		argop[2].argop = OP_GETATTR;
		argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[2].nfs_argop4_u.opgetattr.mi = mi;
	}

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(VTOR4(vp))));

	doqueue = 1;
	t = gethrtime();
	rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e);
	rpc_error = e.error;

	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4_access: initiating recovery\n"));

		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_ACCESS, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		}
	}
	nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov);

	if (e.error)
		goto out;

	if (res.status) {
		e.error = geterrno4(res.status);
		/*
		 * This might generate over the wire calls through
		 * nfs4_invalidate_pages.  Hence we need to call nfs4_end_op()
		 * here to avoid a deadlock.
		 */
		nfs4_purge_stale_fh(e.error, vp, cr);
		goto out;
	}
	resop = &res.array[1];	/* access res */

	resacc = resop->nfs_resop4_u.opaccess.access;

	if (do_getattr) {
		resop++;	/* getattr res */
		nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);
	}

	if (!e.error) {
		nfs4_access_cache(rp, argacc, resacc, cred);
		/*
		 * we just cached results with cred; if cred is the
		 * adjusted credentials from crnetadjust, we do not want
		 * to release them before exiting: hence setting ncrfree
		 * to NULL
		 */
		if (cred != cr)
			ncrfree = NULL;
		/* XXX check the supported bits too? */
		if ((acc & resacc) != acc) {
			/*
			 * The following code implements the semantic
			 * that a setuid root program has *at least* the
			 * permissions of the user that is running the
			 * program.  See rfs3call() for more portions
			 * of the implementation of this functionality.
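			 * (Sketch of the fallback, assuming crnetadjust()
			 * returns the unprivileged "network user" cred for
			 * a privileged caller: when the server denies bits
			 * to the original cred, the code below re-drives
			 * the request once with the adjusted cred before
			 * giving up with EACCES.)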
			 */
			/* XXX-LP */
			if (ncr != NULL) {
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				cred = ncr;
				ncr = NULL;
				goto tryagain;
			}
			e.error = EACCES;
		}
	}

out:
	if (!rpc_error)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	if (ncrfree != NULL)
		crfree(ncrfree);

	return (e.error);
}

/* ARGSUSED */
static int
nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	rnode4_t *rp;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	READLINK4res *lr_res;
	nfs4_ga_res_t *garp;
	uint_t len;
	char *linkdata;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	/*
	 * Can't readlink anything other than a symbolic link.
	 */
	if (vp->v_type != VLNK)
		return (EINVAL);

	rp = VTOR4(vp);
	if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) {
		e.error = nfs4_validate_caches(vp, cr);
		if (e.error)
			return (e.error);
		mutex_enter(&rp->r_statelock);
		if (rp->r_symlink.contents != NULL) {
			e.error = uiomove(rp->r_symlink.contents,
			    rp->r_symlink.len, UIO_READ, uiop);
			mutex_exit(&rp->r_statelock);
			return (e.error);
		}
		mutex_exit(&rp->r_statelock);
	}
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	args.array_len = 3;
	args.array = argop;
	args.ctag = TAG_READLINK;

	e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
	if (e.error) {
		return (e.error);
	}

	/* 0. putfh symlink fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	/* 1. readlink */
	argop[1].argop = OP_READLINK;

	/* 2. getattr */
	argop[2].argop = OP_GETATTR;
	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);

	doqueue = 1;

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(VTOR4(vp))));

	t = gethrtime();

	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4_readlink: initiating recovery\n"));

		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_READLINK, NULL) == FALSE) {
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);

			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			goto recov_retry;
		}
	}

	nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);

	if (e.error)
		return (e.error);

	/*
	 * There is a path in the code below which calls
	 * nfs4_purge_stale_fh(), which may generate otw calls through
	 * nfs4_invalidate_pages.  Hence we need to call nfs4_end_op()
	 * here to avoid nfs4_start_op() deadlock.
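	 * (The general rule: finish the current nfs4_start_op()/
	 * nfs4_end_op() bracket before calling anything that may itself
	 * start a new over-the-wire operation, otherwise the recovery
	 * framework can deadlock on itself.)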
	 */

	if (res.status && (res.array_len < args.array_len)) {
		/*
		 * either Putfh or Readlink failed
		 */
		e.error = geterrno4(res.status);
		nfs4_purge_stale_fh(e.error, vp, cr);
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return (e.error);
	}

	resop = &res.array[1];	/* readlink res */
	lr_res = &resop->nfs_resop4_u.opreadlink;

	/*
	 * treat symlink names as data
	 */
	linkdata = utf8_to_str(&lr_res->link, &len, NULL);
	if (linkdata != NULL) {
		int uio_len = len - 1;
		/* len includes null byte, which we won't uiomove */
		e.error = uiomove(linkdata, uio_len, UIO_READ, uiop);
		if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_symlink.contents == NULL) {
				rp->r_symlink.contents = linkdata;
				rp->r_symlink.len = uio_len;
				rp->r_symlink.size = len;
				mutex_exit(&rp->r_statelock);
			} else {
				mutex_exit(&rp->r_statelock);
				kmem_free(linkdata, len);
			}
		} else {
			kmem_free(linkdata, len);
		}
	}
	if (res.status == NFS4_OK) {
		resop++;	/* getattr res */
		garp = &resop->nfs_resop4_u.opgetattr.ga_res;
	}
	e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	/*
	 * The over the wire error for attempting to readlink something
	 * other than a symbolic link is ENXIO.  However, we need to
	 * return EINVAL instead of ENXIO, so we map it here.
	 */
	return (e.error == ENXIO ? EINVAL : e.error);
}

/*
 * Flush local dirty pages to stable storage on the server.
 *
 * If FNODSYNC is specified, then there is nothing to do because
 * metadata changes are not cached on the client before being
 * sent to the server.
 */
/* ARGSUSED */
static int
nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
{
	int error;

	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
		return (0);
	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr);
	if (!error)
		error = VTOR4(vp)->r_error;
	return (error);
}

/*
 * Weirdness: if the file was removed or was the target of a rename
 * operation while it was open, it got renamed instead.  Here we
 * remove the renamed file.
 */
/* ARGSUSED */
void
nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	rnode4_t *rp;

	ASSERT(vp != DNLC_NO_VNODE);

	rp = VTOR4(vp);

	if (IS_SHADOW(vp, rp)) {
		sv_inactive(vp);
		return;
	}

	/*
	 * If this is coming from the wrong zone, we let someone in the right
	 * zone take care of it asynchronously.  We can get here due to
	 * VN_RELE() being called from pageout() or fsflush().  This call may
	 * potentially turn into an expensive no-op if, for instance, v_count
	 * gets incremented in the meantime, but it's still correct.
	 */
	if (nfs_zone() != VTOMI4(vp)->mi_zone) {
		nfs4_async_inactive(vp, cr);
		return;
	}

	/*
	 * Some of the cleanup steps might require over-the-wire
	 * operations.
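	 * (Closing cached open streams and returning a held delegation
	 * are the usual cases; both can require CLOSE or DELEGRETURN
	 * calls over the wire, which is why the checks below hand such
	 * vnodes to nfs4_async_inactive().)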
	 * Since VOP_INACTIVE can get called as a result of
	 * other over-the-wire operations (e.g., an attribute cache update
	 * can lead to a DNLC purge), doing those steps now would lead to a
	 * nested call to the recovery framework, which can deadlock.  So
	 * do any over-the-wire cleanups asynchronously, in a separate
	 * thread.
	 */

	mutex_enter(&rp->r_os_lock);
	mutex_enter(&rp->r_statelock);
	mutex_enter(&rp->r_statev4_lock);

	if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) {
		mutex_exit(&rp->r_statev4_lock);
		mutex_exit(&rp->r_statelock);
		mutex_exit(&rp->r_os_lock);
		nfs4_async_inactive(vp, cr);
		return;
	}

	if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
	    rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
		mutex_exit(&rp->r_statev4_lock);
		mutex_exit(&rp->r_statelock);
		mutex_exit(&rp->r_os_lock);
		nfs4_async_inactive(vp, cr);
		return;
	}

	if (rp->r_unldvp != NULL) {
		mutex_exit(&rp->r_statev4_lock);
		mutex_exit(&rp->r_statelock);
		mutex_exit(&rp->r_os_lock);
		nfs4_async_inactive(vp, cr);
		return;
	}
	mutex_exit(&rp->r_statev4_lock);
	mutex_exit(&rp->r_statelock);
	mutex_exit(&rp->r_os_lock);

	rp4_addfree(rp, cr);
}

/*
 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up
 * various bits of state.  The caller must not refer to vp after this call.
 */

void
nfs4_inactive_otw(vnode_t *vp, cred_t *cr)
{
	rnode4_t *rp = VTOR4(vp);
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	vnode_t *unldvp;
	char *unlname;
	cred_t *unlcred;
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp;
	nfs_argop4 argop[2];
	int doqueue;
#ifdef DEBUG
	char *name;
#endif

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT(!IS_SHADOW(vp, rp));

#ifdef DEBUG
	name = fn_name(VTOSV(vp)->sv_name);
	NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: "
	    "release vnode %s", name));
	kmem_free(name, MAXNAMELEN);
#endif

	if (vp->v_type == VREG) {
		bool_t recov_failed = FALSE;

		e.error = nfs4close_all(vp, cr);
		if (e.error) {
			/* Check to see if recovery failed */
			mutex_enter(&(VTOMI4(vp)->mi_lock));
			if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL)
				recov_failed = TRUE;
			mutex_exit(&(VTOMI4(vp)->mi_lock));
			if (!recov_failed) {
				mutex_enter(&rp->r_statelock);
				if (rp->r_flags & R4RECOVERR)
					recov_failed = TRUE;
				mutex_exit(&rp->r_statelock);
			}
			if (recov_failed) {
				NFS4_DEBUG(nfs4_client_recov_debug,
				    (CE_NOTE, "nfs4_inactive_otw: "
				    "close failed (recovery failure)"));
			}
		}
	}

redo:
	if (rp->r_unldvp == NULL) {
		rp4_addfree(rp, cr);
		return;
	}

	/*
	 * Save the vnode pointer for the directory where the
	 * unlinked-open file got renamed, then set it to NULL
	 * to prevent another thread from getting here before
	 * we're done with the remove.  While we have the
	 * statelock, make local copies of the pertinent rnode
	 * fields.  If we didn't do this in an atomic way, the
	 * unl* fields could become inconsistent with respect
	 * to each other due to a race condition between this
	 * code and nfs_remove().  See bug report 1034328.
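	 * (Sketch of the race being closed: if we read r_unldvp but
	 * dropped r_statelock before clearing it, another thread could
	 * read the same r_unldvp and both would issue the REMOVE and
	 * VN_RELE the directory, releasing it twice.  Swapping all
	 * three unl* fields to locals under a single r_statelock hold
	 * makes the handoff atomic.)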
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_unldvp == NULL) {
		mutex_exit(&rp->r_statelock);
		rp4_addfree(rp, cr);
		return;
	}

	unldvp = rp->r_unldvp;
	rp->r_unldvp = NULL;
	unlname = rp->r_unlname;
	rp->r_unlname = NULL;
	unlcred = rp->r_unlcred;
	rp->r_unlcred = NULL;
	mutex_exit(&rp->r_statelock);

	/*
	 * If there are any dirty pages left, then flush
	 * them.  This is unfortunate because they just
	 * may get thrown away during the remove operation,
	 * but we have to do this for correctness.
	 */
	if (nfs4_has_pages(vp) &&
	    ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
		ASSERT(vp->v_type != VCHR);
		e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL);
		if (e.error) {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
		}
	}

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
recov_retry_remove:
	/*
	 * Do the remove operation on the renamed file
	 */
	args.ctag = TAG_INACTIVE;

	/*
	 * Remove ops: putfh dir; remove
	 */
	args.array_len = 2;
	args.array = argop;

	e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state);
	if (e.error) {
		kmem_free(unlname, MAXNAMELEN);
		crfree(unlcred);
		VN_RELE(unldvp);
		/*
		 * Try again; this time around r_unldvp will be NULL, so we'll
		 * just call rp4_addfree() and return.
		 */
		goto redo;
	}

	/* putfh directory */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh;

	/* remove */
	argop[1].argop = OP_CREMOVE;
	argop[1].nfs_argop4_u.opcremove.ctarget = unlname;

	doqueue = 1;
	resp = &res;

#if 0 /* notyet */
	/*
	 * Can't do this yet.  We may be being called from
	 * dnlc_purge_XXX while that routine is holding a
	 * mutex lock to the nc_rele list.  The calls to
	 * nfs3_cache_wcc_data may result in calls to
	 * dnlc_purge_XXX.  This will result in a deadlock.
	 */
	rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
	if (e.error) {
		PURGE_ATTRCACHE4(unldvp);
		resp = NULL;
	} else if (res.status) {
		e.error = geterrno4(res.status);
		PURGE_ATTRCACHE4(unldvp);
		/*
		 * This code is inactive right now
		 * but if made active there should
		 * be a nfs4_end_op() call before
		 * nfs4_purge_stale_fh to avoid start_op()
		 * deadlock.  See BugId: 4948726
		 */
		nfs4_purge_stale_fh(e.error, unldvp, cr);
	} else {
		nfs_resop4 *resop;
		REMOVE4res *rm_res;

		resop = &res.array[1];
		rm_res = &resop->nfs_resop4_u.opremove;
		/*
		 * Update directory cache attribute,
		 * readdir and dnlc caches.
		 */
		nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL);
	}
#else
	rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);

	PURGE_ATTRCACHE4(unldvp);
#endif

	if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) {
		if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL,
		    NULL, NULL, OP_REMOVE, NULL) == FALSE) {
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			nfs4_end_op(VTOMI4(unldvp), unldvp, NULL,
			    &recov_state, TRUE);
			goto recov_retry_remove;
		}
	}
	nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE);

	/*
	 * Release stuff held for the remove
	 */
	VN_RELE(unldvp);
	if (!e.error && resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);

	kmem_free(unlname, MAXNAMELEN);
	crfree(unlcred);
	goto redo;
}

/*
 * Remote file system operations having to do with directory manipulation.
 */
/* ARGSUSED3 */
int
nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
    int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
    int *direntflags, pathname_t *realpnp)
{
	int error;
	vnode_t *vp, *avp = NULL;
	rnode4_t *drp;

	*vpp = NULL;
	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
		return (EPERM);
	/*
	 * if LOOKUP_XATTR, must replace dvp (object) with
	 * object's attrdir before continuing with lookup
	 */
	if (flags & LOOKUP_XATTR) {
		error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr);
		if (error)
			return (error);

		dvp = avp;

		/*
		 * If lookup is for "", just return dvp now.  The attrdir
		 * has already been activated (from nfs4lookup_xattr), and
		 * the caller will RELE the original dvp -- not
		 * the attrdir.  So, set vpp and return.
		 * Currently, when the LOOKUP_XATTR flag is
		 * passed to VOP_LOOKUP, the name is always empty, and
		 * shortcircuiting here avoids 3 unneeded lock/unlock
		 * pairs.
		 *
		 * If a non-empty name was provided, then it is the
		 * attribute name, and it will be looked up below.
		 */
		if (*nm == '\0') {
			*vpp = dvp;
			return (0);
		}

		/*
		 * The vfs layer never sends a name when asking for the
		 * attrdir, so we should never get here (unless of course
		 * name is passed at some time in the future -- at which
		 * time we'll blow up here).
		 */
		ASSERT(0);
	}

	drp = VTOR4(dvp);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
		return (EINTR);

	error = nfs4lookup(dvp, nm, vpp, cr, 0);
	nfs_rw_exit(&drp->r_rwlock);

	/*
	 * If vnode is a device, create special vnode.
	 */
	if (!error && ISVDEV((*vpp)->v_type)) {
		vp = *vpp;
		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
		VN_RELE(vp);
	}

	return (error);
}

/* ARGSUSED */
static int
nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr)
{
	int error;
	rnode4_t *drp;
	int cflag = ((flags & CREATE_XATTR_DIR) != 0);
	mntinfo4_t *mi;

	mi = VTOMI4(dvp);
	if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) &&
	    !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS))
		return (EINVAL);

	drp = VTOR4(dvp);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
		return (EINTR);

	mutex_enter(&drp->r_statelock);
	/*
	 * If the server doesn't support xattrs just return EINVAL
	 */
	if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) {
		mutex_exit(&drp->r_statelock);
		nfs_rw_exit(&drp->r_rwlock);
		return (EINVAL);
	}

	/*
	 * If there is a cached xattr directory entry,
	 * use it as long as the attributes are valid.  If the
	 * attributes are not valid, take the simple approach and
	 * free the cached value and re-fetch a new value.
	 *
	 * We don't cache negative entries for now; if we did, we
	 * would need to check whether the file has changed on every
	 * lookup.  But xattrs don't exist very often, and failing an
	 * openattr is not much more expensive than an NVERIFY or
	 * GETATTR, so do an openattr over the wire for now.
	 */
	if (drp->r_xattr_dir != NULL) {
		if (ATTRCACHE4_VALID(dvp)) {
			VN_HOLD(drp->r_xattr_dir);
			*vpp = drp->r_xattr_dir;
			mutex_exit(&drp->r_statelock);
			nfs_rw_exit(&drp->r_rwlock);
			return (0);
		}
		VN_RELE(drp->r_xattr_dir);
		drp->r_xattr_dir = NULL;
	}
	mutex_exit(&drp->r_statelock);

	error = nfs4openattr(dvp, vpp, cflag, cr);

	nfs_rw_exit(&drp->r_rwlock);

	return (error);
}

static int
nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc)
{
	int error;
	rnode4_t *drp;

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);

	/*
	 * If lookup is for "", just return dvp.  Don't need
	 * to send it over the wire, look it up in the dnlc,
	 * or perform any access checks.
	 */
	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * Can't do lookups in non-directories.
	 */
	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/*
	 * If lookup is for ".", just return dvp.  Don't need
	 * to send it over the wire or look it up in the dnlc,
	 * just need to check access.
	 */
	if (nm[0] == '.' && nm[1] == '\0') {
		error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (error)
			return (error);
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	drp = VTOR4(dvp);
	if (!(drp->r_flags & R4LOOKUP)) {
		mutex_enter(&drp->r_statelock);
		drp->r_flags |= R4LOOKUP;
		mutex_exit(&drp->r_statelock);
	}

	*vpp = NULL;
	/*
	 * Lookup this name in the DNLC.  If there is no entry
	 * lookup over the wire.
	 */
	if (!skipdnlc)
		*vpp = dnlc_lookup(dvp, nm);
	if (*vpp == NULL) {
		/*
		 * We need to go over the wire to lookup the name.
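		 * (nfs4lookupnew_otw, below, piggybacks an NVERIFY of the
		 * directory's change attribute plus a directory GETATTR
		 * and ACCESS onto the LOOKUP, so the same round trip that
		 * resolves the name also revalidates the directory's
		 * caches.)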
		 */
		return (nfs4lookupnew_otw(dvp, nm, vpp, cr));
	}

	/*
	 * We hit on the dnlc
	 */
	if (*vpp != DNLC_NO_VNODE ||
	    (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
		/*
		 * But our attrs may not be valid.
		 */
		if (ATTRCACHE4_VALID(dvp)) {
			error = nfs4_waitfor_purge_complete(dvp);
			if (error) {
				VN_RELE(*vpp);
				*vpp = NULL;
				return (error);
			}

			/*
			 * After the purge completes, check to make sure
			 * our attrs are still valid.
			 */
			if (ATTRCACHE4_VALID(dvp)) {
				/*
				 * If we waited for a purge we may have
				 * lost our vnode so look it up again.
				 */
				VN_RELE(*vpp);
				*vpp = dnlc_lookup(dvp, nm);
				if (*vpp == NULL)
					return (nfs4lookupnew_otw(dvp,
					    nm, vpp, cr));

				/*
				 * The access cache should almost always hit
				 */
				error = nfs4_access(dvp, VEXEC, 0, cr, NULL);

				if (error) {
					VN_RELE(*vpp);
					*vpp = NULL;
					return (error);
				}
				if (*vpp == DNLC_NO_VNODE) {
					VN_RELE(*vpp);
					*vpp = NULL;
					return (ENOENT);
				}
				return (0);
			}
		}
	}

	ASSERT(*vpp != NULL);

	/*
	 * If we have gotten here, we have one of the following cases:
	 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we
	 *    need to validate them.
	 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always
	 *    must validate.
	 *
	 * Go to the server and check if the directory has changed, if
	 * it hasn't we are done and can use the dnlc entry.
	 */
	return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr));
}

/*
 * Go to the server and check if the directory has changed, if
 * it hasn't we are done and can use the dnlc entry.  If it
 * has changed we get a new copy of its attributes and check
 * the access for VEXEC, then relookup the filename and
 * get its filehandle and attributes.
 *
 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR
 *	if the NVERIFY failed we must
 *		purge the caches
 *		cache new attributes (will set r_time_attr_inval)
 *		cache new access
 *		recheck VEXEC access
 *		add name to dnlc, possibly negative
 *		if LOOKUP succeeded
 *			cache new attributes
 *	else
 *		set a new r_time_attr_inval for dvp
 *		check to make sure we have access
 *
 * The vpp returned is the vnode passed in if the directory is valid,
 * a new vnode if successful lookup, or NULL on error.
 */
static int
nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	fattr4 *ver_fattr;
	fattr4_change dchange;
	int32_t *ptr;
	int argoplist_size = 7 * sizeof (nfs_argop4);
	nfs_argop4 *argop;
	int doqueue;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	int isdotdot;
	vnode_t *nvp;
	nfs_fh4 *fhp;
	nfs4_sharedfh_t *sfhp;
	nfs4_access_type_t cacc;
	rnode4_t *nrp;
	rnode4_t *drp = VTOR4(dvp);
	nfs4_ga_res_t *garp = NULL;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
	ASSERT(nm != NULL);
	ASSERT(nm[0] != '\0');
	ASSERT(dvp->v_type == VDIR);
	ASSERT(nm[0] != '.' || nm[1] != '\0');
	ASSERT(*vpp != NULL);

	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
		isdotdot = 1;
		args.ctag = TAG_LOOKUP_VPARENT;
	} else {
		/*
		 * If dvp were a stub, it should have triggered and caused
		 * a mount for us to get this far.
		 */
		ASSERT(!RP_ISSTUB(VTOR4(dvp)));

		isdotdot = 0;
		args.ctag = TAG_LOOKUP_VALID;
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	nvp = NULL;

	/* Save the original mount point security information */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
	    &recov_state, NULL);
	if (e.error) {
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		VN_RELE(*vpp);
		*vpp = NULL;
		return (e.error);
	}

	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	/* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */
	args.array_len = 7;
	args.array = argop;

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;

	/* 1. nverify the change info */
	argop[1].argop = OP_NVERIFY;
	ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes;
	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
	ver_fattr->attrlist4 = (char *)&dchange;
	ptr = (int32_t *)&dchange;
	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
	ver_fattr->attrlist4_len = sizeof (fattr4_change);

	/* 2. getattr directory */
	argop[2].argop = OP_GETATTR;
	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 3. access directory */
	argop[3].argop = OP_ACCESS;
	argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
	    ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;

	/* 4. lookup name */
	if (isdotdot) {
		argop[4].argop = OP_LOOKUPP;
	} else {
		argop[4].argop = OP_CLOOKUP;
		argop[4].nfs_argop4_u.opclookup.cname = nm;
	}

	/* 5. resulting file handle */
	argop[5].argop = OP_GETFH;

	/* 6. resulting file attributes */
	argop[6].argop = OP_GETATTR;
	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	doqueue = 1;
	t = gethrtime();

	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);

	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
		/*
		 * For WRONGSEC of a non-dotdot case, send secinfo directly
		 * from this thread, do not go thru the recovery thread since
		 * we need the nm information.
		 *
		 * Not doing dotdot case because there is no specification
		 * for (PUTFH, SECINFO "..") yet.
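		 * (Sketch of the handling below: fetch the new security
		 * triple with nfs4_secinfo_vnode_otw() using the name we
		 * still hold, end the current fop, and re-drive the whole
		 * compound under the new flavor via recov_retry.)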
		 */
		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
			else
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, TRUE);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			if (!e.error)
				goto recov_retry;
			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
			VN_RELE(*vpp);
			*vpp = NULL;
			return (e.error);
		}

		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
		    OP_LOOKUP, NULL) == FALSE) {
			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
			    &recov_state, TRUE);

			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			goto recov_retry;
		}
	}

	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);

	if (e.error || res.array_len == 0) {
		/*
		 * If e.error isn't set, then reply has no ops (or we couldn't
		 * be here).  The only legal way to reply without an op array
		 * is via NFS4ERR_MINOR_VERS_MISMATCH.  An ops array should
		 * be in the reply for all other status values.
		 *
		 * For valid replies without an ops array, return ENOTSUP
		 * (geterrno4 xlation of VERS_MISMATCH).  For illegal replies,
		 * return EIO -- don't trust status.
		 */
		if (e.error == 0)
			e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
			    ENOTSUP : EIO;
		VN_RELE(*vpp);
		*vpp = NULL;
		kmem_free(argop, argoplist_size);
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	if (res.status != NFS4ERR_SAME) {
		e.error = geterrno4(res.status);

		/*
		 * The NVERIFY "failed" so the directory has changed.
		 * First make sure PUTFH succeeded and NVERIFY "failed"
		 * cleanly.
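		 * (Reminder on NVERIFY semantics: NFS4ERR_SAME means the
		 * change attribute still matched and the compound stopped
		 * early, while NFS4_OK means the directory really did
		 * change and the remaining ops ran.)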
		 */
		if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
		    (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}

		/*
		 * We know the NVERIFY "failed" so we must:
		 *	purge the caches (access and indirectly dnlc if needed)
		 */
		nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);

		if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}

		/*
		 * Install new cached attributes for the directory
		 */
		nfs4_attr_cache(dvp,
		    &res.array[2].nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);

		if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			VN_RELE(*vpp);
			*vpp = NULL;
			e.error = geterrno4(res.status);
			goto exit;
		}

		/*
		 * Now we know the directory is valid,
		 * cache new directory access
		 */
		nfs4_access_cache(drp,
		    args.array[3].nfs_argop4_u.opaccess.access,
		    res.array[3].nfs_resop4_u.opaccess.access, cr);

		/*
		 * recheck VEXEC access
		 */
		cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
		if (cacc != NFS4_ACCESS_ALLOWED) {
			/*
			 * Directory permissions might have been revoked
			 */
			if (cacc == NFS4_ACCESS_DENIED) {
				e.error = EACCES;
				VN_RELE(*vpp);
				*vpp = NULL;
				goto exit;
			}

			/*
			 * Somehow we must not have asked for enough,
			 * so try a singleton ACCESS; this should never
			 * happen.
			 */
			e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
			if (e.error) {
				VN_RELE(*vpp);
				*vpp = NULL;
				goto exit;
			}
		}

		e.error = geterrno4(res.status);
		if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) {
			/*
			 * The lookup failed, probably no entry
			 */
			if (e.error == ENOENT && nfs4_lookup_neg_cache) {
				dnlc_update(dvp, nm, DNLC_NO_VNODE);
			} else {
				/*
				 * Might be some other error, so remove
				 * the dnlc entry to make sure we start all
				 * over again, next time.
				 */
				dnlc_remove(dvp, nm);
			}
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}

		if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) {
			/*
			 * The file exists but we can't get its fh for
			 * some unknown reason.  Remove it from the dnlc
			 * and error out to be safe.
			 */
			dnlc_remove(dvp, nm);
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}
		fhp = &res.array[5].nfs_resop4_u.opgetfh.object;
		if (fhp->nfs_fh4_len == 0) {
			/*
			 * The file exists but we got a bogus fh for
			 * some unknown reason.  Remove it from the dnlc
			 * and error out to be safe.
			 */
			e.error = ENOENT;
			dnlc_remove(dvp, nm);
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}
		sfhp = sfh4_get(fhp, mi);

		if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK)
			garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;

		/*
		 * Make the new rnode
		 */
		if (isdotdot) {
			e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
			if (e.error) {
				sfh4_rele(&sfhp);
				VN_RELE(*vpp);
				*vpp = NULL;
				goto exit;
			}
			/*
			 * XXX if nfs4_make_dotdot uses an existing rnode
			 * XXX it doesn't update the attributes.
			 * XXX for now just save them again to save an OTW
			 */
			nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
		} else {
			nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
			    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
			/*
			 * If v_type == VNON, then garp was NULL because
			 * the last op in the compound failed and makenfs4node
			 * could not find the vnode for sfhp.  It created
			 * a new vnode, so we have nothing to purge here.
			 */
			if (nvp->v_type == VNON) {
				vattr_t vattr;

				vattr.va_mask = AT_TYPE;
				/*
				 * N.B. We've already called nfs4_end_fop above.
				 */
				e.error = nfs4getattr(nvp, &vattr, cr);
				if (e.error) {
					sfh4_rele(&sfhp);
					VN_RELE(*vpp);
					*vpp = NULL;
					VN_RELE(nvp);
					goto exit;
				}
				nvp->v_type = vattr.va_type;
			}
		}
		sfh4_rele(&sfhp);

		nrp = VTOR4(nvp);
		mutex_enter(&nrp->r_statev4_lock);
		if (!nrp->created_v4) {
			mutex_exit(&nrp->r_statev4_lock);
			dnlc_update(dvp, nm, nvp);
		} else
			mutex_exit(&nrp->r_statev4_lock);

		VN_RELE(*vpp);
		*vpp = nvp;
	} else {
		hrtime_t now;
		hrtime_t delta = 0;

		e.error = 0;

		/*
		 * Because the NVERIFY "succeeded" we know that the
		 * directory attributes are still valid
		 * so update r_time_attr_inval
		 */
		now = gethrtime();
		mutex_enter(&drp->r_statelock);
		if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
			delta = now - drp->r_time_attr_saved;
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		}
		drp->r_time_attr_inval = now + delta;
		mutex_exit(&drp->r_statelock);
		dnlc_update(dvp, nm, *vpp);

		/*
		 * Even though we have a valid directory attr cache
		 * and dnlc entry, we may not have access.
		 * This should almost always hit the cache.
		 */
		e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (e.error) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		if (*vpp == DNLC_NO_VNODE) {
			VN_RELE(*vpp);
			*vpp = NULL;
			e.error = ENOENT;
		}
	}

exit:
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	kmem_free(argop, argoplist_size);
	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
	return (e.error);
}

/*
 * We need to go over the wire to lookup the name, but
 * while we are there, verify that the directory has not
 * changed; if it has, get new attributes and check access.
 *
 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH
 *	NVERIFY GETATTR ACCESS
 *
 * With the results:
 *	if the NVERIFY failed we must purge the caches, add new attributes,
 *	and cache new access.
 *	set a new r_time_attr_inval
 *	add name to dnlc, possibly negative
 *	if LOOKUP succeeded
 *		cache new attributes
 */
static int
nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	fattr4 *ver_fattr;
	fattr4_change dchange;
	int32_t *ptr;
	nfs4_ga_res_t *garp = NULL;
	int argoplist_size = 9 * sizeof (nfs_argop4);
	nfs_argop4 *argop;
	int doqueue;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	int isdotdot;
	vnode_t *nvp;
	nfs_fh4 *fhp;
	nfs4_sharedfh_t *sfhp;
	nfs4_access_type_t cacc;
	rnode4_t *nrp;
	rnode4_t *drp = VTOR4(dvp);
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
	ASSERT(nm != NULL);
	ASSERT(nm[0] != '\0');
	ASSERT(dvp->v_type == VDIR);
	ASSERT(nm[0] != '.' || nm[1] != '\0');
	ASSERT(*vpp == NULL);

	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
		isdotdot = 1;
		args.ctag = TAG_LOOKUP_PARENT;
	} else {
		/*
		 * If dvp were a stub, it should have triggered and caused
		 * a mount for us to get this far.
		 */
		ASSERT(!RP_ISSTUB(VTOR4(dvp)));

		isdotdot = 0;
		args.ctag = TAG_LOOKUP;
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	nvp = NULL;

	/* Save the original mount point security information */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
	    &recov_state, NULL);
	if (e.error) {
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	/* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */
	args.array_len = 9;
	args.array = argop;

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;

	/* 1. savefh for the nverify */
	argop[1].argop = OP_SAVEFH;

	/* 2. lookup name */
	if (isdotdot) {
		argop[2].argop = OP_LOOKUPP;
	} else {
		argop[2].argop = OP_CLOOKUP;
		argop[2].nfs_argop4_u.opclookup.cname = nm;
	}

	/* 3. resulting file handle */
	argop[3].argop = OP_GETFH;

	/* 4. resulting file attributes */
	argop[4].argop = OP_GETATTR;
	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 5. restorefh back the directory for the nverify */
	argop[5].argop = OP_RESTOREFH;

	/* 6. nverify the change info */
	argop[6].argop = OP_NVERIFY;
	ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes;
	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
	ver_fattr->attrlist4 = (char *)&dchange;
	ptr = (int32_t *)&dchange;
	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
	ver_fattr->attrlist4_len = sizeof (fattr4_change);

	/* 7. getattr directory */
	argop[7].argop = OP_GETATTR;
	argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 8. access directory */
	argop[8].argop = OP_ACCESS;
	argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
	    ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;

	doqueue = 1;
	t = gethrtime();

	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);

	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
		/*
		 * For WRONGSEC of a non-dotdot case, send secinfo directly
		 * from this thread, do not go thru the recovery thread since
		 * we need the nm information.
		 *
		 * Not doing dotdot case because there is no specification
		 * for (PUTFH, SECINFO "..") yet.
		 */
		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
			else
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, TRUE);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			if (!e.error)
				goto recov_retry;
			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
			return (e.error);
		}

		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
		    OP_LOOKUP, NULL) == FALSE) {
			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
			    &recov_state, TRUE);

			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			goto recov_retry;
		}
	}

	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);

	if (e.error || res.array_len == 0) {
		/*
		 * If e.error isn't set, then reply has no ops (or we couldn't
		 * be here).  The only legal way to reply without an op array
		 * is via NFS4ERR_MINOR_VERS_MISMATCH.  An ops array should
		 * be in the reply for all other status values.
		 *
		 * For valid replies without an ops array, return ENOTSUP
		 * (geterrno4 xlation of VERS_MISMATCH).  For illegal replies,
		 * return EIO -- don't trust status.
		 */
		if (e.error == 0)
			e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
			    ENOTSUP : EIO;

		kmem_free(argop, argoplist_size);
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	e.error = geterrno4(res.status);

	/*
	 * The PUTFH and SAVEFH may have failed.
	 */
	if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
	    (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) {
		nfs4_purge_stale_fh(e.error, dvp, cr);
		goto exit;
	}

	/*
	 * Check if the file exists; if it does, delay entering it
	 * into the dnlc until after we update the directory
	 * attributes, so we don't cause it to get purged immediately.
	 */
	if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) {
		/*
		 * The lookup failed, probably no entry
		 */
		if (e.error == ENOENT && nfs4_lookup_neg_cache)
			dnlc_update(dvp, nm, DNLC_NO_VNODE);
		goto exit;
	}

	if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) {
		/*
		 * The file exists but we can't get its fh for
		 * some unknown reason.  Error out to be safe.
		 */
		goto exit;
	}

	fhp = &res.array[3].nfs_resop4_u.opgetfh.object;
	if (fhp->nfs_fh4_len == 0) {
		/*
		 * The file exists but we got a bogus fh for
		 * some unknown reason.  Error out to be safe.
5813 */ 5814 e.error = EIO; 5815 goto exit; 5816 } 5817 sfhp = sfh4_get(fhp, mi); 5818 5819 if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5820 sfh4_rele(&sfhp); 5821 goto exit; 5822 } 5823 garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 5824 5825 /* 5826 * The RESTOREFH may have failed 5827 */ 5828 if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) { 5829 sfh4_rele(&sfhp); 5830 e.error = EIO; 5831 goto exit; 5832 } 5833 5834 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) { 5835 /* 5836 * First make sure the NVERIFY failed as we expected; 5837 * if it didn't, then be conservative and error out 5838 * as we can't trust the directory. 5839 */ 5840 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) { 5841 sfh4_rele(&sfhp); 5842 e.error = EIO; 5843 goto exit; 5844 } 5845 5846 /* 5847 * We know the NVERIFY "failed" so the directory has changed, 5848 * so we must: 5849 * purge the caches (access and indirectly dnlc if needed) 5850 */ 5851 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5852 5853 if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5854 sfh4_rele(&sfhp); 5855 goto exit; 5856 } 5857 nfs4_attr_cache(dvp, 5858 &res.array[7].nfs_resop4_u.opgetattr.ga_res, 5859 t, cr, FALSE, NULL); 5860 5861 if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) { 5862 nfs4_purge_stale_fh(e.error, dvp, cr); 5863 sfh4_rele(&sfhp); 5864 e.error = geterrno4(res.status); 5865 goto exit; 5866 } 5867 5868 /* 5869 * Now we know the directory is valid, 5870 * cache new directory access 5871 */ 5872 nfs4_access_cache(drp, 5873 args.array[8].nfs_argop4_u.opaccess.access, 5874 res.array[8].nfs_resop4_u.opaccess.access, cr); 5875 5876 /* 5877 * recheck VEXEC access 5878 */ 5879 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr); 5880 if (cacc != NFS4_ACCESS_ALLOWED) { 5881 /* 5882 * Directory permissions might have been revoked 5883 */ 5884 if (cacc == NFS4_ACCESS_DENIED) { 5885 sfh4_rele(&sfhp); 5886 e.error = EACCES; 5887 goto exit; 5888 } 5889 5890 /* 5891 * Somehow we must not have asked for enough, 5892 * so try a singleton ACCESS; this should never happen. 5893 */ 5894 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5895 if (e.error) { 5896 sfh4_rele(&sfhp); 5897 goto exit; 5898 } 5899 } 5900 5901 e.error = geterrno4(res.status); 5902 } else { 5903 hrtime_t now; 5904 hrtime_t delta = 0; 5905 5906 e.error = 0; 5907 5908 /* 5909 * Because the NVERIFY "succeeded" we know that the 5910 * directory attributes are still valid, 5911 * so update r_time_attr_inval 5912 */ 5913 now = gethrtime(); 5914 mutex_enter(&drp->r_statelock); 5915 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5916 delta = now - drp->r_time_attr_saved; 5917 if (delta < mi->mi_acdirmin) 5918 delta = mi->mi_acdirmin; 5919 else if (delta > mi->mi_acdirmax) 5920 delta = mi->mi_acdirmax; 5921 } 5922 drp->r_time_attr_inval = now + delta; 5923 mutex_exit(&drp->r_statelock); 5924 5925 /* 5926 * Even though we have a valid directory attr cache, 5927 * we may not have access. 5928 * This should almost always hit the cache. 5929 */ 5930 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5931 if (e.error) { 5932 sfh4_rele(&sfhp); 5933 goto exit; 5934 } 5935 } 5936 5937 /* 5938 * Now we have successfully completed the lookup; if the 5939 * directory has changed, we now have the valid attributes. 5940 * We also know we have directory access. 5941 * Create the new rnode and insert it in the dnlc.
5942 */ 5943 if (isdotdot) { 5944 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 5945 if (e.error) { 5946 sfh4_rele(&sfhp); 5947 goto exit; 5948 } 5949 /* 5950 * XXX if nfs4_make_dotdot uses an existing rnode 5951 * XXX it doesn't update the attributes. 5952 * XXX for now just save them again to save an OTW 5953 */ 5954 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 5955 } else { 5956 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 5957 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 5958 } 5959 sfh4_rele(&sfhp); 5960 5961 nrp = VTOR4(nvp); 5962 mutex_enter(&nrp->r_statev4_lock); 5963 if (!nrp->created_v4) { 5964 mutex_exit(&nrp->r_statev4_lock); 5965 dnlc_update(dvp, nm, nvp); 5966 } else 5967 mutex_exit(&nrp->r_statev4_lock); 5968 5969 *vpp = nvp; 5970 5971 exit: 5972 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5973 kmem_free(argop, argoplist_size); 5974 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5975 return (e.error); 5976 } 5977 5978 #ifdef DEBUG 5979 void 5980 nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt) 5981 { 5982 uint_t i, len; 5983 zoneid_t zoneid = getzoneid(); 5984 char *s; 5985 5986 zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where); 5987 for (i = 0; i < argcnt; i++) { 5988 nfs_argop4 *op = &argbase[i]; 5989 switch (op->argop) { 5990 case OP_CPUTFH: 5991 case OP_PUTFH: 5992 zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i); 5993 break; 5994 case OP_PUTROOTFH: 5995 zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i); 5996 break; 5997 case OP_CLOOKUP: 5998 s = op->nfs_argop4_u.opclookup.cname; 5999 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s); 6000 break; 6001 case OP_LOOKUP: 6002 s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname, 6003 &len, NULL); 6004 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s); 6005 kmem_free(s, len); 6006 break; 6007 case OP_LOOKUPP: 6008 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i); 6009 break; 6010 case OP_GETFH: 6011 zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i); 6012 break; 6013 case OP_GETATTR: 6014 zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i); 6015 break; 6016 case OP_OPENATTR: 6017 zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i); 6018 break; 6019 default: 6020 zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", i, 6021 op->argop); 6022 break; 6023 } 6024 } 6025 } 6026 #endif 6027 6028 /* 6029 * nfs4lookup_setup - constructs a multi-lookup compound request. 6030 * 6031 * Given the path "nm1/nm2/.../nmn", the following compound requests 6032 * may be created: 6033 * 6034 * Note: Getfh should not be needed because the filehandle attr is mandatory, 6035 * but it is faster, for now. 6036 * 6037 * l4_getattrs indicates the type of compound requested. 6038 * 6039 * LKP4_NO_ATTRIBUTES - no attributes (used by secinfo): 6040 * 6041 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} } 6042 * 6043 * total number of ops is n + 1. 6044 * 6045 * LKP4_LAST_NAMED_ATTR - multi-component path for a named 6046 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR 6047 * before the last component, and only get attributes 6048 * for the last component. Note that the second-to-last 6049 * pathname component is XATTR_RPATH, which does NOT go 6050 * over-the-wire as a lookup. 6051 * 6052 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2}; 6053 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr } 6054 * 6055 * and total number of ops is n + 5.
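 *
 * A worked example (sketch, with a made-up path): for "dir/<xattr>/attr",
 * where <xattr> is XATTR_RPATH, n = 3 and the request becomes
 *
 * compound { Put*fh; Lookup {dir}; Openattr; Getfh; Getattr;
 * Lookup {attr}; Getfh; Getattr }
 *
 * which is 8 ops, matching n + 5.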
6056 * 6057 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named 6058 * attribute directory: create lookups plus an OPENATTR 6059 * replacing the last lookup. Note that the last pathname 6060 * component is XATTR_RPATH, which does NOT go over-the-wire 6061 * as a lookup. 6062 * 6063 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr; 6064 * Openattr; Getfh; Getattr } 6065 * 6066 * and total number of ops is n + 5. 6067 * 6068 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate 6069 * nodes too. 6070 * 6071 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr; 6072 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr } 6073 * 6074 * and total number of ops is 3*n + 1. 6075 * 6076 * All cases: returns the index in the arg array of the final LOOKUP op, or 6077 * -1 if no LOOKUPs were used. 6078 */ 6079 int 6080 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh) 6081 { 6082 enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs; 6083 nfs_argop4 *argbase, *argop; 6084 int arglen, argcnt; 6085 int n = 1; /* number of components */ 6086 int nga = 1; /* number of Getattr's in request */ 6087 char c = '\0', *s, *p; 6088 int lookup_idx = -1; 6089 int argoplist_size; 6090 6091 /* set lookuparg response result to 0 */ 6092 lookupargp->resp->status = NFS4_OK; 6093 6094 /* skip any leading "/" or "." components, e.g. ".//./" */ 6095 for (; ; nm++) { 6096 if (*nm != '/' && *nm != '.') 6097 break; 6098 6099 /* ".." is counted as 1 component */ 6100 if (*nm == '.' && *(nm + 1) == '.') 6101 break; 6102 } 6103 6104 /* 6105 * Find n = number of components - nm must be null terminated 6106 * Skip "." components. 6107 */ 6108 if (*nm != '\0') 6109 for (n = 1, s = nm; *s != '\0'; s++) { 6110 if ((*s == '/') && (*(s + 1) != '/') && 6111 (*(s + 1) != '\0') && 6112 !(*(s + 1) == '.' && (*(s + 2) == '/' || 6113 *(s + 2) == '\0'))) 6114 n++; 6115 } 6116 else 6117 n = 0; 6118 6119 /* 6120 * nga is number of components that need Getfh+Getattr 6121 */ 6122 switch (l4_getattrs) { 6123 case LKP4_NO_ATTRIBUTES: 6124 nga = 0; 6125 break; 6126 case LKP4_ALL_ATTRIBUTES: 6127 nga = n; 6128 /* 6129 * Always have at least 1 getfh, getattr pair 6130 */ 6131 if (nga == 0) 6132 nga++; 6133 break; 6134 case LKP4_LAST_ATTRDIR: 6135 case LKP4_LAST_NAMED_ATTR: 6136 nga = n+1; 6137 break; 6138 } 6139 6140 /* 6141 * If changed to use the filehandle attr instead of getfh, 6142 * the following line can be deleted. 6143 */ 6144 nga *= 2; 6145 6146 /* 6147 * calculate number of ops in request as 6148 * header + trailer + lookups + getattrs 6149 */ 6150 arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga; 6151 6152 argoplist_size = arglen * sizeof (nfs_argop4); 6153 argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP); 6154 lookupargp->argsp->array = argop; 6155 6156 argcnt = lookupargp->header_len; 6157 argop += argcnt; 6158 6159 /* 6160 * loop and create a lookup op and possibly getattr/getfh for 6161 * each component. Skip "." components. 6162 */ 6163 for (s = nm; *s != '\0'; s = p) { 6164 /* 6165 * Set up a pathname struct for each component if needed 6166 */ 6167 while (*s == '/') 6168 s++; 6169 if (*s == '\0') 6170 break; 6171 6172 for (p = s; (*p != '/') && (*p != '\0'); p++) 6173 ; 6174 c = *p; 6175 *p = '\0'; 6176 6177 if (s[0] == '.'
&& s[1] == '\0') { 6178 *p = c; 6179 continue; 6180 } 6181 if (l4_getattrs == LKP4_LAST_ATTRDIR && 6182 strcmp(s, XATTR_RPATH) == 0) { 6183 /* getfh XXX may not be needed in future */ 6184 argop->argop = OP_GETFH; 6185 argop++; 6186 argcnt++; 6187 6188 /* getattr */ 6189 argop->argop = OP_GETATTR; 6190 argop->nfs_argop4_u.opgetattr.attr_request = 6191 lookupargp->ga_bits; 6192 argop->nfs_argop4_u.opgetattr.mi = 6193 lookupargp->mi; 6194 argop++; 6195 argcnt++; 6196 6197 /* openattr */ 6198 argop->argop = OP_OPENATTR; 6199 } else if (l4_getattrs == LKP4_LAST_NAMED_ATTR && 6200 strcmp(s, XATTR_RPATH) == 0) { 6201 /* openattr */ 6202 argop->argop = OP_OPENATTR; 6203 argop++; 6204 argcnt++; 6205 6206 /* getfh XXX may not be needed in future */ 6207 argop->argop = OP_GETFH; 6208 argop++; 6209 argcnt++; 6210 6211 /* getattr */ 6212 argop->argop = OP_GETATTR; 6213 argop->nfs_argop4_u.opgetattr.attr_request = 6214 lookupargp->ga_bits; 6215 argop->nfs_argop4_u.opgetattr.mi = 6216 lookupargp->mi; 6217 argop++; 6218 argcnt++; 6219 *p = c; 6220 continue; 6221 } else if (s[0] == '.' && s[1] == '.' && s[2] == '\0') { 6222 /* lookupp */ 6223 argop->argop = OP_LOOKUPP; 6224 } else { 6225 /* lookup */ 6226 argop->argop = OP_LOOKUP; 6227 (void) str_to_utf8(s, 6228 &argop->nfs_argop4_u.oplookup.objname); 6229 } 6230 lookup_idx = argcnt; 6231 argop++; 6232 argcnt++; 6233 6234 *p = c; 6235 6236 if (l4_getattrs == LKP4_ALL_ATTRIBUTES) { 6237 /* getfh XXX may not be needed in future */ 6238 argop->argop = OP_GETFH; 6239 argop++; 6240 argcnt++; 6241 6242 /* getattr */ 6243 argop->argop = OP_GETATTR; 6244 argop->nfs_argop4_u.opgetattr.attr_request = 6245 lookupargp->ga_bits; 6246 argop->nfs_argop4_u.opgetattr.mi = 6247 lookupargp->mi; 6248 argop++; 6249 argcnt++; 6250 } 6251 } 6252 6253 if ((l4_getattrs != LKP4_NO_ATTRIBUTES) && 6254 ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) { 6255 if (needgetfh) { 6256 /* stick in a post-lookup getfh */ 6257 argop->argop = OP_GETFH; 6258 argcnt++; 6259 argop++; 6260 } 6261 /* post-lookup getattr */ 6262 argop->argop = OP_GETATTR; 6263 argop->nfs_argop4_u.opgetattr.attr_request = 6264 lookupargp->ga_bits; 6265 argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi; 6266 argcnt++; 6267 } 6268 argcnt += lookupargp->trailer_len; /* actual op count */ 6269 lookupargp->argsp->array_len = argcnt; 6270 lookupargp->arglen = arglen; 6271 6272 #ifdef DEBUG 6273 if (nfs4_client_lookup_debug) 6274 nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt); 6275 #endif 6276 6277 return (lookup_idx); 6278 } 6279 6280 static int 6281 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr) 6282 { 6283 COMPOUND4args_clnt args; 6284 COMPOUND4res_clnt res; 6285 GETFH4res *gf_res = NULL; 6286 nfs_argop4 argop[4]; 6287 nfs_resop4 *resop = NULL; 6288 nfs4_sharedfh_t *sfhp; 6289 hrtime_t t; 6290 nfs4_error_t e; 6291 6292 rnode4_t *drp; 6293 int doqueue = 1; 6294 vnode_t *vp; 6295 int needrecov = 0; 6296 nfs4_recov_state_t recov_state; 6297 6298 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 6299 6300 *avp = NULL; 6301 recov_state.rs_flags = 0; 6302 recov_state.rs_num_retry_despite_err = 0; 6303 6304 recov_retry: 6305 /* COMPOUND: putfh, openattr, getfh, getattr */ 6306 args.array_len = 4; 6307 args.array = argop; 6308 args.ctag = TAG_OPENATTR; 6309 6310 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 6311 if (e.error) 6312 return (e.error); 6313 6314 drp = VTOR4(dvp); 6315 6316 /* putfh */ 6317 argop[0].argop = OP_CPUTFH; 6318 argop[0].nfs_argop4_u.opcputfh.sfh = 
drp->r_fh; 6319 6320 /* openattr */ 6321 argop[1].argop = OP_OPENATTR; 6322 argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE); 6323 6324 /* getfh */ 6325 argop[2].argop = OP_GETFH; 6326 6327 /* getattr */ 6328 argop[3].argop = OP_GETATTR; 6329 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6330 argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 6331 6332 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 6333 "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first", 6334 rnode4info(drp))); 6335 6336 t = gethrtime(); 6337 6338 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 6339 6340 needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp); 6341 if (needrecov) { 6342 bool_t abort; 6343 6344 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 6345 "nfs4openattr: initiating recovery\n")); 6346 6347 abort = nfs4_start_recovery(&e, 6348 VTOMI4(dvp), dvp, NULL, NULL, NULL, 6349 OP_OPENATTR, NULL); 6350 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6351 if (!e.error) { 6352 e.error = geterrno4(res.status); 6353 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6354 } 6355 if (abort == FALSE) 6356 goto recov_retry; 6357 return (e.error); 6358 } 6359 6360 if (e.error) { 6361 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6362 return (e.error); 6363 } 6364 6365 if (res.status) { 6366 /* 6367 * If the OTW error is NOTSUPP, then it should be 6368 * translated to EINVAL. All Solaris file system 6369 * implementations return EINVAL to the syscall layer 6370 * when the attrdir cannot be created due to an 6371 * implementation restriction or noxattr mount option. 6372 */ 6373 if (res.status == NFS4ERR_NOTSUPP) { 6374 mutex_enter(&drp->r_statelock); 6375 if (drp->r_xattr_dir) 6376 VN_RELE(drp->r_xattr_dir); 6377 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP); 6378 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP; 6379 mutex_exit(&drp->r_statelock); 6380 6381 e.error = EINVAL; 6382 } else { 6383 e.error = geterrno4(res.status); 6384 } 6385 6386 if (e.error) { 6387 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6388 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 6389 needrecov); 6390 return (e.error); 6391 } 6392 } 6393 6394 resop = &res.array[0]; /* putfh res */ 6395 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK); 6396 6397 resop = &res.array[1]; /* openattr res */ 6398 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK); 6399 6400 resop = &res.array[2]; /* getfh res */ 6401 gf_res = &resop->nfs_resop4_u.opgetfh; 6402 if (gf_res->object.nfs_fh4_len == 0) { 6403 *avp = NULL; 6404 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6405 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6406 return (ENOENT); 6407 } 6408 6409 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp)); 6410 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res, 6411 dvp->v_vfsp, t, cr, dvp, 6412 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp)); 6413 sfh4_rele(&sfhp); 6414 6415 if (e.error) 6416 PURGE_ATTRCACHE4(vp); 6417 6418 mutex_enter(&vp->v_lock); 6419 vp->v_flag |= V_XATTRDIR; 6420 mutex_exit(&vp->v_lock); 6421 6422 *avp = vp; 6423 6424 mutex_enter(&drp->r_statelock); 6425 if (drp->r_xattr_dir) 6426 VN_RELE(drp->r_xattr_dir); 6427 VN_HOLD(vp); 6428 drp->r_xattr_dir = vp; 6429 6430 /* 6431 * Invalidate pathconf4 cache because r_xattr_dir is no longer 6432 * NULL. xattrs could be created at any time, and we have no 6433 * way to update pc4_xattr_exists in the base object if/when 6434 * it happens.
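 * (For example, another client could create the first xattr in the
 * new attrdir without any traffic on the base object that would let
 * us refresh pc4_xattr_exists.)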
6435 */ 6436 drp->r_pathconf.pc4_xattr_valid = 0; 6437 6438 mutex_exit(&drp->r_statelock); 6439 6440 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6441 6442 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6443 6444 return (0); 6445 } 6446 6447 /* ARGSUSED */ 6448 static int 6449 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 6450 int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct, 6451 vsecattr_t *vsecp) 6452 { 6453 int error; 6454 vnode_t *vp = NULL; 6455 rnode4_t *rp; 6456 struct vattr vattr; 6457 rnode4_t *drp; 6458 vnode_t *tempvp; 6459 enum createmode4 createmode; 6460 bool_t must_trunc = FALSE; 6461 int truncating = 0; 6462 6463 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 6464 return (EPERM); 6465 if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) { 6466 return (EINVAL); 6467 } 6468 6469 /* . and .. have special meaning in the protocol, reject them. */ 6470 6471 if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0'))) 6472 return (EISDIR); 6473 6474 drp = VTOR4(dvp); 6475 6476 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 6477 return (EINTR); 6478 6479 top: 6480 /* 6481 * We make a copy of the attributes because the caller does not 6482 * expect us to change what va points to. 6483 */ 6484 vattr = *va; 6485 6486 /* 6487 * If the pathname is "", then dvp is the root vnode of 6488 * a remote file mounted over a local directory. 6489 * All that needs to be done is access 6490 * checking and truncation. Note that we avoid doing 6491 * open w/ create because the parent directory might 6492 * be in pseudo-fs and the open would fail. 6493 */ 6494 if (*nm == '\0') { 6495 error = 0; 6496 VN_HOLD(dvp); 6497 vp = dvp; 6498 must_trunc = TRUE; 6499 } else { 6500 /* 6501 * We need to go over the wire, just to be sure whether the 6502 * file exists or not. Using the DNLC can be dangerous in 6503 * this case when making a decision regarding existence. 6504 */ 6505 error = nfs4lookup(dvp, nm, &vp, cr, 1); 6506 } 6507 6508 if (exclusive) 6509 createmode = EXCLUSIVE4; 6510 else 6511 createmode = GUARDED4; 6512 6513 /* 6514 * error would be set if the file does not exist on the 6515 * server, so let's go create it. 6516 */ 6517 if (error) { 6518 goto create_otw; 6519 } 6520 6521 /* 6522 * File does exist on the server 6523 */ 6524 if (exclusive == EXCL) 6525 error = EEXIST; 6526 else if (vp->v_type == VDIR && (mode & VWRITE)) 6527 error = EISDIR; 6528 else { 6529 /* 6530 * If vnode is a device, create special vnode.
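 * (Sketch of the intent: specvp() returns a specfs vnode bound to
 * the same device, so subsequent I/O is routed to the device driver
 * rather than over the wire, while attribute requests still reach
 * the underlying NFS vnode.)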
6531 */ 6532 if (ISVDEV(vp->v_type)) { 6533 tempvp = vp; 6534 vp = specvp(vp, vp->v_rdev, vp->v_type, cr); 6535 VN_RELE(tempvp); 6536 } 6537 if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) { 6538 if ((vattr.va_mask & AT_SIZE) && 6539 vp->v_type == VREG) { 6540 rp = VTOR4(vp); 6541 /* 6542 * Check here for large file handled 6543 * by LF-unaware process (as 6544 * ufs_create() does) 6545 */ 6546 if (!(flags & FOFFMAX)) { 6547 mutex_enter(&rp->r_statelock); 6548 if (rp->r_size > MAXOFF32_T) 6549 error = EOVERFLOW; 6550 mutex_exit(&rp->r_statelock); 6551 } 6552 6553 /* if error is set then we need to return */ 6554 if (error) { 6555 nfs_rw_exit(&drp->r_rwlock); 6556 VN_RELE(vp); 6557 return (error); 6558 } 6559 6560 if (must_trunc) { 6561 vattr.va_mask = AT_SIZE; 6562 error = nfs4setattr(vp, &vattr, 0, cr, 6563 NULL); 6564 } else { 6565 /* 6566 * we know we have a regular file that already 6567 * exists and we may end up truncating the file 6568 * as a result of the open_otw, so flush out 6569 * any dirty pages for this file first. 6570 */ 6571 if (nfs4_has_pages(vp) && 6572 ((rp->r_flags & R4DIRTY) || 6573 rp->r_count > 0 || 6574 rp->r_mapcnt > 0)) { 6575 error = nfs4_putpage(vp, 6576 (offset_t)0, 0, 0, cr, ct); 6577 if (error && (error == ENOSPC || 6578 error == EDQUOT)) { 6579 mutex_enter( 6580 &rp->r_statelock); 6581 if (!rp->r_error) 6582 rp->r_error = 6583 error; 6584 mutex_exit( 6585 &rp->r_statelock); 6586 } 6587 } 6588 vattr.va_mask = (AT_SIZE | 6589 AT_TYPE | AT_MODE); 6590 vattr.va_type = VREG; 6591 createmode = UNCHECKED4; 6592 truncating = 1; 6593 goto create_otw; 6594 } 6595 } 6596 } 6597 } 6598 nfs_rw_exit(&drp->r_rwlock); 6599 if (error) { 6600 VN_RELE(vp); 6601 } else { 6602 vnode_t *tvp; 6603 rnode4_t *trp; 6604 /* 6605 * existing file got truncated, notify. 6606 */ 6607 tvp = vp; 6608 if (vp->v_type == VREG) { 6609 trp = VTOR4(vp); 6610 if (IS_SHADOW(vp, trp)) 6611 tvp = RTOV4(trp); 6612 } 6613 vnevent_create(tvp, ct); 6614 *vpp = vp; 6615 } 6616 return (error); 6617 6618 create_otw: 6619 dnlc_remove(dvp, nm); 6620 6621 ASSERT(vattr.va_mask & AT_TYPE); 6622 6623 /* 6624 * If not a regular file, let nfs4mknod() handle it. 6625 */ 6626 if (vattr.va_type != VREG) { 6627 error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr); 6628 nfs_rw_exit(&drp->r_rwlock); 6629 return (error); 6630 } 6631 6632 /* 6633 * It _is_ a regular file. 6634 */ 6635 ASSERT(vattr.va_mask & AT_MODE); 6636 if (MANDMODE(vattr.va_mode)) { 6637 nfs_rw_exit(&drp->r_rwlock); 6638 return (EACCES); 6639 } 6640 6641 /* 6642 * If this happens to be a mknod of a regular file, then flags will 6643 * have neither FREAD nor FWRITE. However, we must set at least one 6644 * for the call to nfs4open_otw. If it's open(O_CREAT) driving 6645 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been 6646 * set (based on openmode specified by app). 6647 */ 6648 if ((flags & (FREAD|FWRITE)) == 0) 6649 flags |= (FREAD|FWRITE); 6650 6651 error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0); 6652 6653 if (vp != NULL) { 6654 /* if create was successful, throw away the file's pages */ 6655 if (!error && (vattr.va_mask & AT_SIZE)) 6656 nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK), 6657 cr); 6658 /* release the lookup hold */ 6659 VN_RELE(vp); 6660 vp = NULL; 6661 } 6662 6663 /* 6664 * validate that we opened a regular file. This handles a misbehaving 6665 * server that returns an incorrect FH.
6666 */ 6667 if ((error == 0) && *vpp && (*vpp)->v_type != VREG) { 6668 error = EISDIR; 6669 VN_RELE(*vpp); 6670 } 6671 6672 /* 6673 * If this is not an exclusive create, then the CREATE 6674 * request will be made with the GUARDED mode set. This 6675 * means that the server will return EEXIST if the file 6676 * exists. The file could exist because of a retransmitted 6677 * request. In this case, we recover by starting over and 6678 * checking to see whether the file exists. This second 6679 * time through it should exist, and a CREATE request will 6680 * not be sent. 6681 * 6682 * This handles the problem of a dangling CREATE request 6683 * which contains attributes which indicate that the file 6684 * should be truncated. This retransmitted request could 6685 * possibly truncate valid data in the file if not caught 6686 * by the duplicate request mechanism on the server or if 6687 * not caught by other means. The scenario is: 6688 * 6689 * Client transmits CREATE request with size = 0 6690 * Client times out, retransmits request. 6691 * Response to the first request arrives from the server 6692 * and the client proceeds on. 6693 * Client writes data to the file. 6694 * The server now processes retransmitted CREATE request 6695 * and truncates file. 6696 * 6697 * The use of the GUARDED CREATE request prevents this from 6698 * happening because the retransmitted CREATE would fail 6699 * with EEXIST and would not truncate the file. 6700 */ 6701 if (error == EEXIST && exclusive == NONEXCL) { 6702 #ifdef DEBUG 6703 nfs4_create_misses++; 6704 #endif 6705 goto top; 6706 } 6707 nfs_rw_exit(&drp->r_rwlock); 6708 if (truncating && !error && *vpp) { 6709 vnode_t *tvp; 6710 rnode4_t *trp; 6711 /* 6712 * existing file got truncated, notify. 6713 */ 6714 tvp = *vpp; 6715 trp = VTOR4(tvp); 6716 if (IS_SHADOW(tvp, trp)) 6717 tvp = RTOV4(trp); 6718 vnevent_create(tvp, ct); 6719 } 6720 return (error); 6721 } 6722 6723 /* 6724 * Create compound (for mkdir, mknod, symlink): 6725 * { Putfh <dfh>; Create; Getfh; Getattr } 6726 * It's okay if setattr failed to set gid - this is not considered 6727 * an error, but purge attrs in that case.
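 *
 * A typical invocation (sketch): nfs4mknod() below creates a block
 * device this way, passing NF4BLK and a specdata4 built from the
 * device number:
 *
 * spec.specdata1 = getmajor(va->va_rdev);
 * spec.specdata2 = getminor(va->va_rdev);
 * error = call_nfs4_create_req(dvp, nm, &spec, va, &vp, cr, NF4BLK);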
6728 */ 6729 static int 6730 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va, 6731 vnode_t **vpp, cred_t *cr, nfs_ftype4 type) 6732 { 6733 int need_end_op = FALSE; 6734 COMPOUND4args_clnt args; 6735 COMPOUND4res_clnt res, *resp = NULL; 6736 nfs_argop4 *argop; 6737 nfs_resop4 *resop; 6738 int doqueue; 6739 mntinfo4_t *mi; 6740 rnode4_t *drp = VTOR4(dvp); 6741 change_info4 *cinfo; 6742 GETFH4res *gf_res; 6743 struct vattr vattr; 6744 vnode_t *vp; 6745 fattr4 *crattr; 6746 bool_t needrecov = FALSE; 6747 nfs4_recov_state_t recov_state; 6748 nfs4_sharedfh_t *sfhp = NULL; 6749 hrtime_t t; 6750 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 6751 int numops, argoplist_size, setgid_flag, idx_create, idx_fattr; 6752 dirattr_info_t dinfo, *dinfop; 6753 servinfo4_t *svp; 6754 bitmap4 supp_attrs; 6755 6756 ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK || 6757 type == NF4CHR || type == NF4SOCK || type == NF4FIFO); 6758 6759 mi = VTOMI4(dvp); 6760 6761 /* 6762 * Make sure we properly deal with setting the right gid 6763 * on a new directory to reflect the parent's setgid bit 6764 */ 6765 setgid_flag = 0; 6766 if (type == NF4DIR) { 6767 struct vattr dva; 6768 6769 va->va_mode &= ~VSGID; 6770 dva.va_mask = AT_MODE | AT_GID; 6771 if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) { 6772 6773 /* 6774 * If the parent directory has the setgid bit set 6775 * _and_ the client was able to get a valid mapping 6776 * for the parent dir's owner_group, we want to 6777 * append NVERIFY(owner_group == dva.va_gid) and 6778 * SETATTR to the CREATE compound. 6779 */ 6780 if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) { 6781 setgid_flag = 1; 6782 va->va_mode |= VSGID; 6783 if (dva.va_gid != GID_NOBODY) { 6784 va->va_mask |= AT_GID; 6785 va->va_gid = dva.va_gid; 6786 } 6787 } 6788 } 6789 } 6790 6791 /* 6792 * Create ops: 6793 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new) 6794 * 5:restorefh(dir) 6:getattr(dir) 6795 * 6796 * if (setgid) 6797 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new) 6798 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 6799 * 8:nverify 9:setattr 6800 */ 6801 if (setgid_flag) { 6802 numops = 10; 6803 idx_create = 1; 6804 idx_fattr = 3; 6805 } else { 6806 numops = 7; 6807 idx_create = 2; 6808 idx_fattr = 4; 6809 } 6810 6811 ASSERT(nfs_zone() == mi->mi_zone); 6812 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) { 6813 return (EINTR); 6814 } 6815 recov_state.rs_flags = 0; 6816 recov_state.rs_num_retry_despite_err = 0; 6817 6818 argoplist_size = numops * sizeof (nfs_argop4); 6819 argop = kmem_alloc(argoplist_size, KM_SLEEP); 6820 6821 recov_retry: 6822 if (type == NF4LNK) 6823 args.ctag = TAG_SYMLINK; 6824 else if (type == NF4DIR) 6825 args.ctag = TAG_MKDIR; 6826 else 6827 args.ctag = TAG_MKNOD; 6828 6829 args.array_len = numops; 6830 args.array = argop; 6831 6832 if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) { 6833 nfs_rw_exit(&drp->r_rwlock); 6834 kmem_free(argop, argoplist_size); 6835 return (e.error); 6836 } 6837 need_end_op = TRUE; 6838 6839 6840 /* 0: putfh directory */ 6841 argop[0].argop = OP_CPUTFH; 6842 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6843 6844 /* 1/2: Create object */ 6845 argop[idx_create].argop = OP_CCREATE; 6846 argop[idx_create].nfs_argop4_u.opccreate.cname = nm; 6847 argop[idx_create].nfs_argop4_u.opccreate.type = type; 6848 if (type == NF4LNK) { 6849 /* 6850 * symlink, treat name as data 6851 */ 6852 ASSERT(data != NULL); 6853
argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata = 6854 (char *)data; 6855 } 6856 if (type == NF4BLK || type == NF4CHR) { 6857 ASSERT(data != NULL); 6858 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata = 6859 *((specdata4 *)data); 6860 } 6861 6862 crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs; 6863 6864 svp = drp->r_server; 6865 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 6866 supp_attrs = svp->sv_supp_attrs; 6867 nfs_rw_exit(&svp->sv_lock); 6868 6869 if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) { 6870 nfs_rw_exit(&drp->r_rwlock); 6871 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 6872 e.error = EINVAL; 6873 kmem_free(argop, argoplist_size); 6874 return (e.error); 6875 } 6876 6877 /* 2/3: getfh fh of created object */ 6878 ASSERT(idx_create + 1 == idx_fattr - 1); 6879 argop[idx_create + 1].argop = OP_GETFH; 6880 6881 /* 3/4: getattr of new object */ 6882 argop[idx_fattr].argop = OP_GETATTR; 6883 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6884 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi; 6885 6886 if (setgid_flag) { 6887 vattr_t _v; 6888 6889 argop[4].argop = OP_SAVEFH; 6890 6891 argop[5].argop = OP_CPUTFH; 6892 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6893 6894 argop[6].argop = OP_GETATTR; 6895 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6896 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6897 6898 argop[7].argop = OP_RESTOREFH; 6899 6900 /* 6901 * nverify 6902 * 6903 * XXX - Revisit the last argument to nfs4_end_op() 6904 * once 5020486 is fixed. 6905 */ 6906 _v.va_mask = AT_GID; 6907 _v.va_gid = va->va_gid; 6908 if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 6909 supp_attrs)) { 6910 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6911 nfs_rw_exit(&drp->r_rwlock); 6912 nfs4_fattr4_free(crattr); 6913 kmem_free(argop, argoplist_size); 6914 return (e.error); 6915 } 6916 6917 /* 6918 * setattr 6919 * 6920 * We _know_ we're not messing with AT_SIZE or AT_XTIME, 6921 * so no need for stateid or flags. Also we specify NULL 6922 * rp since we're only interested in setting owner_group 6923 * attributes. 
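 *
 * Sketch of the intended interplay: NVERIFY stops the compound with
 * NFS4ERR_SAME when the new object's owner_group already equals the
 * parent's gid, so this SETATTR only executes when the gid actually
 * needs to be corrected.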
6924 */ 6925 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs, 6926 &e.error, 0); 6927 6928 if (e.error) { 6929 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6930 nfs_rw_exit(&drp->r_rwlock); 6931 nfs4_fattr4_free(crattr); 6932 nfs4args_verify_free(&argop[8]); 6933 kmem_free(argop, argoplist_size); 6934 return (e.error); 6935 } 6936 } else { 6937 argop[1].argop = OP_SAVEFH; 6938 6939 argop[5].argop = OP_RESTOREFH; 6940 6941 argop[6].argop = OP_GETATTR; 6942 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6943 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6944 } 6945 6946 dnlc_remove(dvp, nm); 6947 6948 doqueue = 1; 6949 t = gethrtime(); 6950 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 6951 6952 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 6953 if (e.error) { 6954 PURGE_ATTRCACHE4(dvp); 6955 if (!needrecov) 6956 goto out; 6957 } 6958 6959 if (needrecov) { 6960 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 6961 OP_CREATE, NULL) == FALSE) { 6962 nfs4_end_op(mi, dvp, NULL, &recov_state, 6963 needrecov); 6964 need_end_op = FALSE; 6965 nfs4_fattr4_free(crattr); 6966 if (setgid_flag) { 6967 nfs4args_verify_free(&argop[8]); 6968 nfs4args_setattr_free(&argop[9]); 6969 } 6970 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6971 goto recov_retry; 6972 } 6973 } 6974 6975 resp = &res; 6976 6977 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 6978 6979 if (res.status == NFS4ERR_BADOWNER) 6980 nfs4_log_badowner(mi, OP_CREATE); 6981 6982 e.error = geterrno4(res.status); 6983 6984 /* 6985 * This check is left over from when create was implemented 6986 * using a setattr op (instead of createattrs). If the 6987 * putfh/create/getfh failed, the error was returned. If 6988 * setattr/getattr failed, we keep going. 6989 * 6990 * It might be better to get rid of the GETFH also, and just 6991 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory. 6992 * Then if any of the operations failed, we could return the 6993 * error now, and remove much of the error code below. 6994 */ 6995 if (res.array_len <= idx_fattr) { 6996 /* 6997 * Either Putfh, Create or Getfh failed. 6998 */ 6999 PURGE_ATTRCACHE4(dvp); 7000 /* 7001 * nfs4_purge_stale_fh() may generate otw calls through 7002 * nfs4_invalidate_pages. Hence the need to call 7003 * nfs4_end_op() here to avoid nfs4_start_op() deadlock. 7004 */ 7005 nfs4_end_op(mi, dvp, NULL, &recov_state, 7006 needrecov); 7007 need_end_op = FALSE; 7008 nfs4_purge_stale_fh(e.error, dvp, cr); 7009 goto out; 7010 } 7011 } 7012 7013 resop = &res.array[idx_create]; /* create res */ 7014 cinfo = &resop->nfs_resop4_u.opcreate.cinfo; 7015 7016 resop = &res.array[idx_create + 1]; /* getfh res */ 7017 gf_res = &resop->nfs_resop4_u.opgetfh; 7018 7019 sfhp = sfh4_get(&gf_res->object, mi); 7020 if (e.error) { 7021 *vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp, 7022 fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 7023 if (vp->v_type == VNON) { 7024 vattr.va_mask = AT_TYPE; 7025 /* 7026 * Need to call nfs4_end_op before nfs4getattr to avoid 7027 * potential nfs4_start_op deadlock. See RFE 4777612. 
7028 */ 7029 nfs4_end_op(mi, dvp, NULL, &recov_state, 7030 needrecov); 7031 need_end_op = FALSE; 7032 e.error = nfs4getattr(vp, &vattr, cr); 7033 if (e.error) { 7034 VN_RELE(vp); 7035 *vpp = NULL; 7036 goto out; 7037 } 7038 vp->v_type = vattr.va_type; 7039 } 7040 e.error = 0; 7041 } else { 7042 *vpp = vp = makenfs4node(sfhp, 7043 &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res, 7044 dvp->v_vfsp, t, cr, 7045 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 7046 } 7047 7048 /* 7049 * If compound succeeded, then update dir attrs 7050 */ 7051 if (res.status == NFS4_OK) { 7052 dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 7053 dinfo.di_cred = cr; 7054 dinfo.di_time_call = t; 7055 dinfop = &dinfo; 7056 } else 7057 dinfop = NULL; 7058 7059 /* Update directory cache attribute, readdir and dnlc caches */ 7060 nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop); 7061 7062 out: 7063 if (sfhp != NULL) 7064 sfh4_rele(&sfhp); 7065 nfs_rw_exit(&drp->r_rwlock); 7066 nfs4_fattr4_free(crattr); 7067 if (setgid_flag) { 7068 nfs4args_verify_free(&argop[8]); 7069 nfs4args_setattr_free(&argop[9]); 7070 } 7071 if (resp) 7072 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7073 if (need_end_op) 7074 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 7075 7076 kmem_free(argop, argoplist_size); 7077 return (e.error); 7078 } 7079 7080 /* ARGSUSED */ 7081 static int 7082 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 7083 int mode, vnode_t **vpp, cred_t *cr) 7084 { 7085 int error; 7086 vnode_t *vp; 7087 nfs_ftype4 type; 7088 specdata4 spec, *specp = NULL; 7089 7090 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 7091 7092 switch (va->va_type) { 7093 case VCHR: 7094 case VBLK: 7095 type = (va->va_type == VCHR) ? NF4CHR : NF4BLK; 7096 spec.specdata1 = getmajor(va->va_rdev); 7097 spec.specdata2 = getminor(va->va_rdev); 7098 specp = &spec; 7099 break; 7100 7101 case VFIFO: 7102 type = NF4FIFO; 7103 break; 7104 case VSOCK: 7105 type = NF4SOCK; 7106 break; 7107 7108 default: 7109 return (EINVAL); 7110 } 7111 7112 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type); 7113 if (error) { 7114 return (error); 7115 } 7116 7117 /* 7118 * This might not be needed any more; special case to deal 7119 * with problematic v2/v3 servers. Since create was unable 7120 * to set group correctly, not sure what hope setattr has. 7121 */ 7122 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) { 7123 va->va_mask = AT_GID; 7124 (void) nfs4setattr(vp, va, 0, cr, NULL); 7125 } 7126 7127 /* 7128 * If vnode is a device create special vnode 7129 */ 7130 if (ISVDEV(vp->v_type)) { 7131 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 7132 VN_RELE(vp); 7133 } else { 7134 *vpp = vp; 7135 } 7136 return (error); 7137 } 7138 7139 /* 7140 * Remove requires that the current fh be the target directory. 7141 * After the operation, the current fh is unchanged. 7142 * The compound op structure is: 7143 * PUTFH(targetdir), REMOVE 7144 * 7145 * Weirdness: if the vnode to be removed is open 7146 * we rename it instead of removing it and nfs_inactive 7147 * will remove the new name. 
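 *
 * For example: one process holds "f" open while another removes it;
 * the client renames "f" to a generated ".nfsXXXX" name (see
 * newname()) and records it in r_unldvp/r_unlname, and the temporary
 * name is removed once the last reference goes away.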
7148 */ 7149 /* ARGSUSED */ 7150 static int 7151 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags) 7152 { 7153 COMPOUND4args_clnt args; 7154 COMPOUND4res_clnt res, *resp = NULL; 7155 REMOVE4res *rm_res; 7156 nfs_argop4 argop[3]; 7157 nfs_resop4 *resop; 7158 vnode_t *vp; 7159 char *tmpname; 7160 int doqueue; 7161 mntinfo4_t *mi; 7162 rnode4_t *rp; 7163 rnode4_t *drp; 7164 int needrecov = 0; 7165 nfs4_recov_state_t recov_state; 7166 int isopen; 7167 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7168 dirattr_info_t dinfo; 7169 7170 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 7171 return (EPERM); 7172 drp = VTOR4(dvp); 7173 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 7174 return (EINTR); 7175 7176 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 7177 if (e.error) { 7178 nfs_rw_exit(&drp->r_rwlock); 7179 return (e.error); 7180 } 7181 7182 if (vp->v_type == VDIR) { 7183 VN_RELE(vp); 7184 nfs_rw_exit(&drp->r_rwlock); 7185 return (EISDIR); 7186 } 7187 7188 /* 7189 * First just remove the entry from the name cache, as it 7190 * is most likely the only entry for this vp. 7191 */ 7192 dnlc_remove(dvp, nm); 7193 7194 rp = VTOR4(vp); 7195 7196 /* 7197 * For regular file types, check to see if the file is open by looking 7198 * at the open streams. 7199 * For all other types, check the reference count on the vnode. Since 7200 * they are not opened OTW they never have an open stream. 7201 * 7202 * If the file is open, rename it to .nfsXXXX. 7203 */ 7204 if (vp->v_type != VREG) { 7205 /* 7206 * If the file has a v_count > 1 then there may be more than one 7207 * entry in the name cache due to multiple links or an open file, 7208 * but we don't have the real reference count so flush all 7209 * possible entries. 7210 */ 7211 if (vp->v_count > 1) 7212 dnlc_purge_vp(vp); 7213 7214 /* 7215 * Now we have the real reference count. 7216 */ 7217 isopen = vp->v_count > 1; 7218 } else { 7219 mutex_enter(&rp->r_os_lock); 7220 isopen = list_head(&rp->r_open_streams) != NULL; 7221 mutex_exit(&rp->r_os_lock); 7222 } 7223 7224 mutex_enter(&rp->r_statelock); 7225 if (isopen && 7226 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) { 7227 mutex_exit(&rp->r_statelock); 7228 tmpname = newname(); 7229 e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct); 7230 if (e.error) 7231 kmem_free(tmpname, MAXNAMELEN); 7232 else { 7233 mutex_enter(&rp->r_statelock); 7234 if (rp->r_unldvp == NULL) { 7235 VN_HOLD(dvp); 7236 rp->r_unldvp = dvp; 7237 if (rp->r_unlcred != NULL) 7238 crfree(rp->r_unlcred); 7239 crhold(cr); 7240 rp->r_unlcred = cr; 7241 rp->r_unlname = tmpname; 7242 } else { 7243 kmem_free(rp->r_unlname, MAXNAMELEN); 7244 rp->r_unlname = tmpname; 7245 } 7246 mutex_exit(&rp->r_statelock); 7247 } 7248 VN_RELE(vp); 7249 nfs_rw_exit(&drp->r_rwlock); 7250 return (e.error); 7251 } 7252 /* 7253 * Actually remove the file/dir 7254 */ 7255 mutex_exit(&rp->r_statelock); 7256 7257 /* 7258 * We need to flush any dirty pages which happen to 7259 * be hanging around before removing the file. 7260 * This shouldn't happen very often since in NFSv4 7261 * we should be close to open consistent.
7262 */ 7263 if (nfs4_has_pages(vp) && 7264 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 7265 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct); 7266 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 7267 mutex_enter(&rp->r_statelock); 7268 if (!rp->r_error) 7269 rp->r_error = e.error; 7270 mutex_exit(&rp->r_statelock); 7271 } 7272 } 7273 7274 mi = VTOMI4(dvp); 7275 7276 (void) nfs4delegreturn(rp, NFS4_DR_REOPEN); 7277 recov_state.rs_flags = 0; 7278 recov_state.rs_num_retry_despite_err = 0; 7279 7280 recov_retry: 7281 /* 7282 * Remove ops: putfh dir; remove 7283 */ 7284 args.ctag = TAG_REMOVE; 7285 args.array_len = 3; 7286 args.array = argop; 7287 7288 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 7289 if (e.error) { 7290 nfs_rw_exit(&drp->r_rwlock); 7291 VN_RELE(vp); 7292 return (e.error); 7293 } 7294 7295 /* putfh directory */ 7296 argop[0].argop = OP_CPUTFH; 7297 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 7298 7299 /* remove */ 7300 argop[1].argop = OP_CREMOVE; 7301 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 7302 7303 /* getattr dir */ 7304 argop[2].argop = OP_GETATTR; 7305 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7306 argop[2].nfs_argop4_u.opgetattr.mi = mi; 7307 7308 doqueue = 1; 7309 dinfo.di_time_call = gethrtime(); 7310 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7311 7312 PURGE_ATTRCACHE4(vp); 7313 7314 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7315 if (e.error) 7316 PURGE_ATTRCACHE4(dvp); 7317 7318 if (needrecov) { 7319 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, 7320 NULL, NULL, NULL, OP_REMOVE, NULL) == FALSE) { 7321 if (!e.error) 7322 (void) xdr_free(xdr_COMPOUND4res_clnt, 7323 (caddr_t)&res); 7324 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 7325 needrecov); 7326 goto recov_retry; 7327 } 7328 } 7329 7330 /* 7331 * Matching nfs4_end_op() for start_op() above. 7332 * There is a path in the code below which calls 7333 * nfs4_purge_stale_fh(), which may generate otw calls through 7334 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 7335 * here to avoid nfs4_start_op() deadlock. 7336 */ 7337 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 7338 7339 if (!e.error) { 7340 resp = &res; 7341 7342 if (res.status) { 7343 e.error = geterrno4(res.status); 7344 PURGE_ATTRCACHE4(dvp); 7345 nfs4_purge_stale_fh(e.error, dvp, cr); 7346 } else { 7347 resop = &res.array[1]; /* remove res */ 7348 rm_res = &resop->nfs_resop4_u.opremove; 7349 7350 dinfo.di_garp = 7351 &res.array[2].nfs_resop4_u.opgetattr.ga_res; 7352 dinfo.di_cred = cr; 7353 7354 /* Update directory attr, readdir and dnlc caches */ 7355 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 7356 &dinfo); 7357 } 7358 } 7359 nfs_rw_exit(&drp->r_rwlock); 7360 if (resp) 7361 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7362 7363 if (e.error == 0) { 7364 vnode_t *tvp; 7365 rnode4_t *trp; 7366 trp = VTOR4(vp); 7367 tvp = vp; 7368 if (IS_SHADOW(vp, trp)) 7369 tvp = RTOV4(trp); 7370 vnevent_remove(tvp, dvp, nm, ct); 7371 } 7372 VN_RELE(vp); 7373 return (e.error); 7374 } 7375 7376 /* 7377 * Link requires that the current fh be the target directory and the 7378 * saved fh be the source fh. After the operation, the current fh is unchanged. 
7379 * Thus the compound op structure is: 7380 * PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, RESTOREFH, 7381 * GETATTR(file) 7382 */ 7383 /* ARGSUSED */ 7384 static int 7385 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr, 7386 caller_context_t *ct, int flags) 7387 { 7388 COMPOUND4args_clnt args; 7389 COMPOUND4res_clnt res, *resp = NULL; 7390 LINK4res *ln_res; 7391 int argoplist_size = 7 * sizeof (nfs_argop4); 7392 nfs_argop4 *argop; 7393 nfs_resop4 *resop; 7394 vnode_t *realvp, *nvp; 7395 int doqueue; 7396 mntinfo4_t *mi; 7397 rnode4_t *tdrp; 7398 bool_t needrecov = FALSE; 7399 nfs4_recov_state_t recov_state; 7400 hrtime_t t; 7401 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7402 dirattr_info_t dinfo; 7403 7404 ASSERT(*tnm != '\0'); 7405 ASSERT(tdvp->v_type == VDIR); 7406 ASSERT(nfs4_consistent_type(tdvp)); 7407 ASSERT(nfs4_consistent_type(svp)); 7408 7409 if (nfs_zone() != VTOMI4(tdvp)->mi_zone) 7410 return (EPERM); 7411 if (VOP_REALVP(svp, &realvp, ct) == 0) { 7412 svp = realvp; 7413 ASSERT(nfs4_consistent_type(svp)); 7414 } 7415 7416 tdrp = VTOR4(tdvp); 7417 mi = VTOMI4(svp); 7418 7419 if (!(mi->mi_flags & MI4_LINK)) { 7420 return (EOPNOTSUPP); 7421 } 7422 recov_state.rs_flags = 0; 7423 recov_state.rs_num_retry_despite_err = 0; 7424 7425 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp))) 7426 return (EINTR); 7427 7428 recov_retry: 7429 argop = kmem_alloc(argoplist_size, KM_SLEEP); 7430 7431 args.ctag = TAG_LINK; 7432 7433 /* 7434 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir); 7435 * restorefh; getattr(fl) 7436 */ 7437 args.array_len = 7; 7438 args.array = argop; 7439 7440 e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state); 7441 if (e.error) { 7442 kmem_free(argop, argoplist_size); 7443 nfs_rw_exit(&tdrp->r_rwlock); 7444 return (e.error); 7445 } 7446 7447 /* 0. putfh file */ 7448 argop[0].argop = OP_CPUTFH; 7449 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh; 7450 7451 /* 1. save current fh to free up the space for the dir */ 7452 argop[1].argop = OP_SAVEFH; 7453 7454 /* 2. putfh targetdir */ 7455 argop[2].argop = OP_CPUTFH; 7456 argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh; 7457 7458 /* 3. link: current_fh is targetdir, saved_fh is source */ 7459 argop[3].argop = OP_CLINK; 7460 argop[3].nfs_argop4_u.opclink.cnewname = tnm; 7461 7462 /* 4. Get attributes of dir */ 7463 argop[4].argop = OP_GETATTR; 7464 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7465 argop[4].nfs_argop4_u.opgetattr.mi = mi; 7466 7467 /* 5. If link was successful, restore current vp to file */ 7468 argop[5].argop = OP_RESTOREFH; 7469 7470 /* 6. 
Get attributes of linked object */ 7471 argop[6].argop = OP_GETATTR; 7472 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7473 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7474 7475 dnlc_remove(tdvp, tnm); 7476 7477 doqueue = 1; 7478 t = gethrtime(); 7479 7480 rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e); 7481 7482 needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp); 7483 if (e.error != 0 && !needrecov) { 7484 PURGE_ATTRCACHE4(tdvp); 7485 PURGE_ATTRCACHE4(svp); 7486 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7487 goto out; 7488 } 7489 7490 if (needrecov) { 7491 bool_t abort; 7492 7493 abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp, 7494 NULL, NULL, OP_LINK, NULL); 7495 if (abort == FALSE) { 7496 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, 7497 needrecov); 7498 kmem_free(argop, argoplist_size); 7499 if (!e.error) 7500 (void) xdr_free(xdr_COMPOUND4res_clnt, 7501 (caddr_t)&res); 7502 goto recov_retry; 7503 } else { 7504 if (e.error != 0) { 7505 PURGE_ATTRCACHE4(tdvp); 7506 PURGE_ATTRCACHE4(svp); 7507 nfs4_end_op(VTOMI4(svp), svp, tdvp, 7508 &recov_state, needrecov); 7509 goto out; 7510 } 7511 /* fall through for res.status case */ 7512 } 7513 } 7514 7515 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7516 7517 resp = &res; 7518 if (res.status) { 7519 /* If link succeeded, then don't return error */ 7520 e.error = geterrno4(res.status); 7521 if (res.array_len <= 4) { 7522 /* 7523 * Either Putfh, Savefh, Putfh dir, or Link failed 7524 */ 7525 PURGE_ATTRCACHE4(svp); 7526 PURGE_ATTRCACHE4(tdvp); 7527 if (e.error == EOPNOTSUPP) { 7528 mutex_enter(&mi->mi_lock); 7529 mi->mi_flags &= ~MI4_LINK; 7530 mutex_exit(&mi->mi_lock); 7531 } 7532 /* Remap EISDIR to EPERM for non-root user for SVVS */ 7533 /* XXX-LP */ 7534 if (e.error == EISDIR && crgetuid(cr) != 0) 7535 e.error = EPERM; 7536 goto out; 7537 } 7538 } 7539 7540 /* either no error or one of the postop getattr failed */ 7541 7542 /* 7543 * XXX - if LINK succeeded, but no attrs were returned for link 7544 * file, purge its cache. 7545 * 7546 * XXX Perform a simplified version of wcc checking. Instead of 7547 * having another getattr to get pre-op, just purge cache if 7548 * any of the ops prior to and including the getattr failed. 7549 * If the getattr succeeded then update the attrcache accordingly. 7550 */ 7551 7552 /* 7553 * update cache with link file postattrs. 7554 * Note: at this point resop points to link res. 7555 */ 7556 resop = &res.array[3]; /* link res */ 7557 ln_res = &resop->nfs_resop4_u.oplink; 7558 if (res.status == NFS4_OK) 7559 e.error = nfs4_update_attrcache(res.status, 7560 &res.array[6].nfs_resop4_u.opgetattr.ga_res, 7561 t, svp, cr); 7562 7563 /* 7564 * Call makenfs4node to create the new shadow vp for tnm. 7565 * We pass NULL attrs because we just cached attrs for 7566 * the src object. All we're trying to accomplish is 7567 * to create the new shadow vnode.
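 * (Each link name gets its own shadow vnode over the shared rnode in
 * nfsv4; the vnevent code at the end of nfs4rename() relies on this.)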
7568 */ 7569 nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr, 7570 tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm, VTOR4(svp)->r_fh)); 7571 7572 /* Update target cache attribute, readdir and dnlc caches */ 7573 dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 7574 dinfo.di_time_call = t; 7575 dinfo.di_cred = cr; 7576 7577 nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo); 7578 ASSERT(nfs4_consistent_type(tdvp)); 7579 ASSERT(nfs4_consistent_type(svp)); 7580 ASSERT(nfs4_consistent_type(nvp)); 7581 VN_RELE(nvp); 7582 7583 if (!e.error) { 7584 vnode_t *tvp; 7585 rnode4_t *trp; 7586 /* 7587 * Notify the source file of this link operation. 7588 */ 7589 trp = VTOR4(svp); 7590 tvp = svp; 7591 if (IS_SHADOW(svp, trp)) 7592 tvp = RTOV4(trp); 7593 vnevent_link(tvp, ct); 7594 } 7595 out: 7596 kmem_free(argop, argoplist_size); 7597 if (resp) 7598 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7599 7600 nfs_rw_exit(&tdrp->r_rwlock); 7601 7602 return (e.error); 7603 } 7604 7605 /* ARGSUSED */ 7606 static int 7607 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 7608 caller_context_t *ct, int flags) 7609 { 7610 vnode_t *realvp; 7611 7612 if (nfs_zone() != VTOMI4(odvp)->mi_zone) 7613 return (EPERM); 7614 if (VOP_REALVP(ndvp, &realvp, ct) == 0) 7615 ndvp = realvp; 7616 7617 return (nfs4rename(odvp, onm, ndvp, nnm, cr, ct)); 7618 } 7619 7620 /* 7621 * nfs4rename does the real work of renaming in NFS Version 4. 7622 * 7623 * A file handle is considered volatile for renaming purposes if either 7624 * of the volatile bits are turned on. However, the compound may differ 7625 * based on the likelihood of the filehandle to change during rename. 7626 */ 7627 static int 7628 nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 7629 caller_context_t *ct) 7630 { 7631 int error; 7632 mntinfo4_t *mi; 7633 vnode_t *nvp = NULL; 7634 vnode_t *ovp = NULL; 7635 char *tmpname = NULL; 7636 rnode4_t *rp; 7637 rnode4_t *odrp; 7638 rnode4_t *ndrp; 7639 int did_link = 0; 7640 int do_link = 1; 7641 nfsstat4 stat = NFS4_OK; 7642 7643 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 7644 ASSERT(nfs4_consistent_type(odvp)); 7645 ASSERT(nfs4_consistent_type(ndvp)); 7646 7647 if (onm[0] == '.' && (onm[1] == '\0' || 7648 (onm[1] == '.' && onm[2] == '\0'))) 7649 return (EINVAL); 7650 7651 if (nnm[0] == '.' && (nnm[1] == '\0' || 7652 (nnm[1] == '.' && nnm[2] == '\0'))) 7653 return (EINVAL); 7654 7655 odrp = VTOR4(odvp); 7656 ndrp = VTOR4(ndvp); 7657 if ((intptr_t)odrp < (intptr_t)ndrp) { 7658 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) 7659 return (EINTR); 7660 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) { 7661 nfs_rw_exit(&odrp->r_rwlock); 7662 return (EINTR); 7663 } 7664 } else { 7665 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) 7666 return (EINTR); 7667 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) { 7668 nfs_rw_exit(&ndrp->r_rwlock); 7669 return (EINTR); 7670 } 7671 } 7672 7673 /* 7674 * Lookup the target file. If it exists, it needs to be 7675 * checked to see whether it is a mount point and whether 7676 * it is active (open). 7677 */ 7678 error = nfs4lookup(ndvp, nnm, &nvp, cr, 0); 7679 if (!error) { 7680 int isactive; 7681 7682 ASSERT(nfs4_consistent_type(nvp)); 7683 /* 7684 * If this file has been mounted on, then just 7685 * return busy because renaming to it would remove 7686 * the mounted file system from the name space. 
7687 */ 7688 if (vn_ismntpt(nvp)) { 7689 VN_RELE(nvp); 7690 nfs_rw_exit(&odrp->r_rwlock); 7691 nfs_rw_exit(&ndrp->r_rwlock); 7692 return (EBUSY); 7693 } 7694 7695 /* 7696 * First just remove the entry from the name cache, as it 7697 * is most likely the only entry for this vp. 7698 */ 7699 dnlc_remove(ndvp, nnm); 7700 7701 rp = VTOR4(nvp); 7702 7703 if (nvp->v_type != VREG) { 7704 /* 7705 * Purge the name cache of all references to this vnode 7706 * so that we can check the reference count to infer 7707 * whether it is active or not. 7708 */ 7709 if (nvp->v_count > 1) 7710 dnlc_purge_vp(nvp); 7711 7712 isactive = nvp->v_count > 1; 7713 } else { 7714 mutex_enter(&rp->r_os_lock); 7715 isactive = list_head(&rp->r_open_streams) != NULL; 7716 mutex_exit(&rp->r_os_lock); 7717 } 7718 7719 /* 7720 * If the vnode is active and is not a directory, 7721 * arrange to rename it to a 7722 * temporary file so that it will continue to be 7723 * accessible. This implements the "unlink-open-file" 7724 * semantics for the target of a rename operation. 7725 * Before doing this though, make sure that the 7726 * source and target files are not already the same. 7727 */ 7728 if (isactive && nvp->v_type != VDIR) { 7729 /* 7730 * Lookup the source name. 7731 */ 7732 error = nfs4lookup(odvp, onm, &ovp, cr, 0); 7733 7734 /* 7735 * The source name *should* already exist. 7736 */ 7737 if (error) { 7738 VN_RELE(nvp); 7739 nfs_rw_exit(&odrp->r_rwlock); 7740 nfs_rw_exit(&ndrp->r_rwlock); 7741 return (error); 7742 } 7743 7744 ASSERT(nfs4_consistent_type(ovp)); 7745 7746 /* 7747 * Compare the two vnodes. If they are the same, 7748 * just release all held vnodes and return success. 7749 */ 7750 if (VN_CMP(ovp, nvp)) { 7751 VN_RELE(ovp); 7752 VN_RELE(nvp); 7753 nfs_rw_exit(&odrp->r_rwlock); 7754 nfs_rw_exit(&ndrp->r_rwlock); 7755 return (0); 7756 } 7757 7758 /* 7759 * Can't mix and match directories and non- 7760 * directories in rename operations. We already 7761 * know that the target is not a directory. If 7762 * the source is a directory, return an error. 7763 */ 7764 if (ovp->v_type == VDIR) { 7765 VN_RELE(ovp); 7766 VN_RELE(nvp); 7767 nfs_rw_exit(&odrp->r_rwlock); 7768 nfs_rw_exit(&ndrp->r_rwlock); 7769 return (ENOTDIR); 7770 } 7771 link_call: 7772 /* 7773 * The target file exists, is not the same as 7774 * the source file, and is active. We first 7775 * try to Link it to a temporary filename to 7776 * avoid having the server remove the file 7777 * completely (which could cause data loss from 7778 * the user's POV in the event the Rename fails 7779 * -- see bug 1165874). 7780 */ 7781 /* 7782 * The do_link and did_link booleans are 7783 * introduced in the event we get NFS4ERR_FILE_OPEN 7784 * returned for the Rename. Some servers cannot 7785 * Rename over an Open file, so they return 7786 * this error. The client needs to Remove the 7787 * newly created Link and do two Renames, just 7788 * as if the server didn't support LINK.
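 *
 * Sketch of the fallback: with do_link set, Link the target to a
 * .nfsXXXX name; if the subsequent Rename fails with
 * NFS4ERR_FILE_OPEN, Remove the new link, clear do_link, and come
 * back to link_call to Rename the target to .nfsXXXX instead, just
 * as for servers without LINK support.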
7789 */ 7790 tmpname = newname(); 7791 error = 0; 7792 7793 if (do_link) { 7794 error = nfs4_link(ndvp, nvp, tmpname, cr, 7795 NULL, 0); 7796 } 7797 if (error == EOPNOTSUPP || !do_link) { 7798 error = nfs4_rename(ndvp, nnm, ndvp, tmpname, 7799 cr, NULL, 0); 7800 did_link = 0; 7801 } else { 7802 did_link = 1; 7803 } 7804 if (error) { 7805 kmem_free(tmpname, MAXNAMELEN); 7806 VN_RELE(ovp); 7807 VN_RELE(nvp); 7808 nfs_rw_exit(&odrp->r_rwlock); 7809 nfs_rw_exit(&ndrp->r_rwlock); 7810 return (error); 7811 } 7812 7813 mutex_enter(&rp->r_statelock); 7814 if (rp->r_unldvp == NULL) { 7815 VN_HOLD(ndvp); 7816 rp->r_unldvp = ndvp; 7817 if (rp->r_unlcred != NULL) 7818 crfree(rp->r_unlcred); 7819 crhold(cr); 7820 rp->r_unlcred = cr; 7821 rp->r_unlname = tmpname; 7822 } else { 7823 if (rp->r_unlname) 7824 kmem_free(rp->r_unlname, MAXNAMELEN); 7825 rp->r_unlname = tmpname; 7826 } 7827 mutex_exit(&rp->r_statelock); 7828 } 7829 7830 (void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN); 7831 7832 ASSERT(nfs4_consistent_type(nvp)); 7833 } 7834 7835 if (ovp == NULL) { 7836 /* 7837 * When renaming directories to be a subdirectory of a 7838 * different parent, the dnlc entry for ".." will no 7839 * longer be valid, so it must be removed. 7840 * 7841 * We do a lookup here to determine whether we are renaming 7842 * a directory and we need to check if we are renaming 7843 * an unlinked file. This might have already been done 7844 * in previous code, so we check ovp == NULL to avoid 7845 * doing it twice. 7846 */ 7847 error = nfs4lookup(odvp, onm, &ovp, cr, 0); 7848 /* 7849 * The source name *should* already exist. 7850 */ 7851 if (error) { 7852 nfs_rw_exit(&odrp->r_rwlock); 7853 nfs_rw_exit(&ndrp->r_rwlock); 7854 if (nvp) { 7855 VN_RELE(nvp); 7856 } 7857 return (error); 7858 } 7859 ASSERT(ovp != NULL); 7860 ASSERT(nfs4_consistent_type(ovp)); 7861 } 7862 7863 /* 7864 * Is the object being renamed a dir, and if so, is 7865 * it being renamed to a child of itself? The underlying 7866 * fs should ultimately return EINVAL for this case; 7867 * however, buggy beta non-Solaris NFSv4 servers at 7868 * interop testing events have allowed this behavior, 7869 * and it caused our client to panic due to a recursive 7870 * mutex_enter in fn_move. 7871 * 7872 * The tedious locking in fn_move could be changed to 7873 * deal with this case, and the client could avoid the 7874 * panic; however, the client would just confuse itself 7875 * later and misbehave. A better way to handle the broken 7876 * server is to detect this condition and return EINVAL 7877 * without ever sending the bogus rename to the server. 7878 * We know the rename is invalid -- just fail it now. 7879 */ 7880 if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) { 7881 VN_RELE(ovp); 7882 nfs_rw_exit(&odrp->r_rwlock); 7883 nfs_rw_exit(&ndrp->r_rwlock); 7884 if (nvp) { 7885 VN_RELE(nvp); 7886 } 7887 return (EINVAL); 7888 } 7889 7890 (void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN); 7891 7892 /* 7893 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is 7894 * possible for the filehandle to change due to the rename. 7895 * If neither of these bits is set, but FH4_VOL_MIGRATION is set, 7896 * the fh will not change because of the rename, but we still need 7897 * to update its rnode entry with the new name for 7898 * an eventual fh change due to migration.
The FH4_NOEXPIRE_ON_OPEN bit 7899 * has no effect on these for now, but as a future improvement, 7900 * we might want to use it as well, to simplify handling of files 7901 * that are open with that flag on. (XXX) 7902 */ 7903 mi = VTOMI4(odvp); 7904 if (NFS4_VOLATILE_FH(mi)) 7905 error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr, 7906 &stat); 7907 else 7908 error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr, 7909 &stat); 7910 7911 ASSERT(nfs4_consistent_type(odvp)); 7912 ASSERT(nfs4_consistent_type(ndvp)); 7913 ASSERT(nfs4_consistent_type(ovp)); 7914 7915 if (stat == NFS4ERR_FILE_OPEN && did_link) { 7916 do_link = 0; 7917 /* 7918 * Before the 'link_call' code, we did an nfs4_lookup 7919 * that puts a VN_HOLD on nvp. After the nfs4_link 7920 * call we call VN_RELE to match that hold. We need 7921 * to place an additional VN_HOLD here since we will 7922 * be hitting that VN_RELE again. 7923 */ 7924 VN_HOLD(nvp); 7925 7926 (void) nfs4_remove(ndvp, tmpname, cr, NULL, 0); 7927 7928 /* Undo the unlinked file naming stuff we just did */ 7929 mutex_enter(&rp->r_statelock); 7930 if (rp->r_unldvp) { 7931 VN_RELE(ndvp); 7932 rp->r_unldvp = NULL; 7933 if (rp->r_unlcred != NULL) 7934 crfree(rp->r_unlcred); 7935 rp->r_unlcred = NULL; 7936 /* rp->r_unlname points to tmpname */ 7937 if (rp->r_unlname) 7938 kmem_free(rp->r_unlname, MAXNAMELEN); 7939 rp->r_unlname = NULL; 7940 } 7941 mutex_exit(&rp->r_statelock); 7942 7943 if (nvp) { 7944 VN_RELE(nvp); 7945 } 7946 goto link_call; 7947 } 7948 7949 if (error) { 7950 VN_RELE(ovp); 7951 nfs_rw_exit(&odrp->r_rwlock); 7952 nfs_rw_exit(&ndrp->r_rwlock); 7953 if (nvp) { 7954 VN_RELE(nvp); 7955 } 7956 return (error); 7957 } 7958 7959 /* 7960 * when renaming directories to be a subdirectory of a 7961 * different parent, the dnlc entry for ".." will no 7962 * longer be valid, so it must be removed 7963 */ 7964 rp = VTOR4(ovp); 7965 if (ndvp != odvp) { 7966 if (ovp->v_type == VDIR) { 7967 dnlc_remove(ovp, ".."); 7968 if (rp->r_dir != NULL) 7969 nfs4_purge_rddir_cache(ovp); 7970 } 7971 } 7972 7973 /* 7974 * If we are renaming the unlinked file, update the 7975 * r_unldvp and r_unlname as needed. 7976 */ 7977 mutex_enter(&rp->r_statelock); 7978 if (rp->r_unldvp != NULL) { 7979 if (strcmp(rp->r_unlname, onm) == 0) { 7980 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN); 7981 rp->r_unlname[MAXNAMELEN - 1] = '\0'; 7982 if (ndvp != rp->r_unldvp) { 7983 VN_RELE(rp->r_unldvp); 7984 rp->r_unldvp = ndvp; 7985 VN_HOLD(ndvp); 7986 } 7987 } 7988 } 7989 mutex_exit(&rp->r_statelock); 7990 7991 /* 7992 * Send the rename vnevents to the source vnode, and to the target 7993 * vnode if it already existed. 7994 */ 7995 if (error == 0) { 7996 vnode_t *tvp; 7997 rnode4_t *trp; 7998 /* 7999 * Notify the vnode. Each link is represented by 8000 * a different vnode in NFSv4. 8001 */ 8002 if (nvp) { 8003 trp = VTOR4(nvp); 8004 tvp = nvp; 8005 if (IS_SHADOW(nvp, trp)) 8006 tvp = RTOV4(trp); 8007 vnevent_rename_dest(tvp, ndvp, nnm, ct); 8008 } 8009 8010 /* 8011 * If the source and destination directories are not the 8012 * same, notify the destination directory.
8013 */ 8014 if (VTOR4(odvp) != VTOR4(ndvp)) { 8015 trp = VTOR4(ndvp); 8016 tvp = ndvp; 8017 if (IS_SHADOW(ndvp, trp)) 8018 tvp = RTOV4(trp); 8019 vnevent_rename_dest_dir(tvp, ct); 8020 } 8021 8022 trp = VTOR4(ovp); 8023 tvp = ovp; 8024 if (IS_SHADOW(ovp, trp)) 8025 tvp = RTOV4(trp); 8026 vnevent_rename_src(tvp, odvp, onm, ct); 8027 } 8028 8029 if (nvp) { 8030 VN_RELE(nvp); 8031 } 8032 VN_RELE(ovp); 8033 8034 nfs_rw_exit(&odrp->r_rwlock); 8035 nfs_rw_exit(&ndrp->r_rwlock); 8036 8037 return (error); 8038 } 8039 8040 /* 8041 * nfs4rename_persistent_fh does the otw portion of renaming in NFS Version 4, 8042 * when it is known that the filehandle is persistent through rename. 8043 * 8044 * Rename requires that the current fh be the target directory and the 8045 * saved fh be the source directory. After the operation, the current fh 8046 * is unchanged. 8047 * The compound op structure for persistent fh rename is: 8048 * PUTFH(sourcedir), SAVEFH, PUTFH(targetdir), RENAME, GETATTR(targetdir) 8049 * with PUTFH(sourcedir), GETATTR(sourcedir) appended when the source and 8050 * target directories differ; the getattr results update the cached directory attributes. 8051 */ 8052 static int 8053 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp, 8054 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 8055 { 8056 COMPOUND4args_clnt args; 8057 COMPOUND4res_clnt res, *resp = NULL; 8058 nfs_argop4 *argop; 8059 nfs_resop4 *resop; 8060 int doqueue, argoplist_size; 8061 mntinfo4_t *mi; 8062 rnode4_t *odrp = VTOR4(odvp); 8063 rnode4_t *ndrp = VTOR4(ndvp); 8064 RENAME4res *rn_res; 8065 bool_t needrecov; 8066 nfs4_recov_state_t recov_state; 8067 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8068 dirattr_info_t dinfo, *dinfop; 8069 8070 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 8071 8072 recov_state.rs_flags = 0; 8073 recov_state.rs_num_retry_despite_err = 0; 8074 8075 /* 8076 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir 8077 * 8078 * If source/target are different dirs, then append putfh(src); getattr 8079 */ 8080 args.array_len = (odvp == ndvp) ?
5 : 7; 8081 argoplist_size = args.array_len * sizeof (nfs_argop4); 8082 args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP); 8083 8084 recov_retry: 8085 *statp = NFS4_OK; 8086 8087 /* No need to Lookup the file, persistent fh */ 8088 args.ctag = TAG_RENAME; 8089 8090 mi = VTOMI4(odvp); 8091 e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state); 8092 if (e.error) { 8093 kmem_free(argop, argoplist_size); 8094 return (e.error); 8095 } 8096 8097 /* 0: putfh source directory */ 8098 argop[0].argop = OP_CPUTFH; 8099 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8100 8101 /* 1: Save source fh to free up current for target */ 8102 argop[1].argop = OP_SAVEFH; 8103 8104 /* 2: putfh targetdir */ 8105 argop[2].argop = OP_CPUTFH; 8106 argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8107 8108 /* 3: current_fh is targetdir, saved_fh is sourcedir */ 8109 argop[3].argop = OP_CRENAME; 8110 argop[3].nfs_argop4_u.opcrename.coldname = onm; 8111 argop[3].nfs_argop4_u.opcrename.cnewname = nnm; 8112 8113 /* 4: getattr (targetdir) */ 8114 argop[4].argop = OP_GETATTR; 8115 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8116 argop[4].nfs_argop4_u.opgetattr.mi = mi; 8117 8118 if (ndvp != odvp) { 8119 8120 /* 5: putfh (sourcedir) */ 8121 argop[5].argop = OP_CPUTFH; 8122 argop[5].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8123 8124 /* 6: getattr (sourcedir) */ 8125 argop[6].argop = OP_GETATTR; 8126 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8127 argop[6].nfs_argop4_u.opgetattr.mi = mi; 8128 } 8129 8130 dnlc_remove(odvp, onm); 8131 dnlc_remove(ndvp, nnm); 8132 8133 doqueue = 1; 8134 dinfo.di_time_call = gethrtime(); 8135 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8136 8137 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8138 if (e.error) { 8139 PURGE_ATTRCACHE4(odvp); 8140 PURGE_ATTRCACHE4(ndvp); 8141 } else { 8142 *statp = res.status; 8143 } 8144 8145 if (needrecov) { 8146 if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 8147 OP_RENAME, NULL) == FALSE) { 8148 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 8149 if (!e.error) 8150 (void) xdr_free(xdr_COMPOUND4res_clnt, 8151 (caddr_t)&res); 8152 goto recov_retry; 8153 } 8154 } 8155 8156 if (!e.error) { 8157 resp = &res; 8158 /* 8159 * Return an error if OP_RENAME (index 3) or any earlier op failed, in which case res.array_len <= 4. 8160 */ 8161 if (res.status != NFS4_OK && res.array_len <= 4) { 8162 e.error = geterrno4(res.status); 8163 PURGE_ATTRCACHE4(odvp); 8164 PURGE_ATTRCACHE4(ndvp); 8165 /* 8166 * System V defines rename to return EEXIST, not 8167 * ENOTEMPTY if the target directory is not empty. 8168 * Over the wire, the error is NFSERR_ENOTEMPTY 8169 * which geterrno4 maps to ENOTEMPTY. 8170 */ 8171 if (e.error == ENOTEMPTY) 8172 e.error = EEXIST; 8173 } else { 8174 8175 resop = &res.array[3]; /* rename res */ 8176 rn_res = &resop->nfs_resop4_u.oprename; 8177 8178 if (res.status == NFS4_OK) { 8179 /* 8180 * Update target attribute, readdir and dnlc 8181 * caches. 8182 */ 8183 dinfo.di_garp = 8184 &res.array[4].nfs_resop4_u.opgetattr.ga_res; 8185 dinfo.di_cred = cr; 8186 dinfop = &dinfo; 8187 } else 8188 dinfop = NULL; 8189 8190 nfs4_update_dircaches(&rn_res->target_cinfo, 8191 ndvp, NULL, NULL, dinfop); 8192 8193 /* 8194 * Update source attribute, readdir and dnlc caches 8195 * 8196 */ 8197 if (ndvp != odvp) { 8198 if (dinfop) 8199 dinfo.di_garp = 8200 &(res.array[6].nfs_resop4_u.
8201 opgetattr.ga_res); 8202 8203 nfs4_update_dircaches(&rn_res->source_cinfo, 8204 odvp, NULL, NULL, dinfop); 8205 } 8206 8207 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, 8208 nnm); 8209 } 8210 } 8211 8212 if (resp) 8213 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8214 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 8215 kmem_free(argop, argoplist_size); 8216 8217 return (e.error); 8218 } 8219 8220 /* 8221 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when 8222 * it is possible for the filehandle to change due to the rename. 8223 * 8224 * The compound req in this case includes a post-rename lookup and getattr 8225 * to ensure that we have the correct fh and attributes for the object. 8226 * 8227 * Rename requires that the current fh be the target directory and the 8228 * saved fh be the source directory. After the operation, the current fh 8229 * is unchanged. 8230 * 8231 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can 8232 * update the filehandle for the renamed object. We also get the old 8233 * filehandle for historical reasons; this should be taken out sometime. 8234 * This results in a rather cumbersome compound... 8235 * 8236 * PUTFH(sourcedir), SAVEFH, LOOKUP(src), GETFH(old), 8237 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR 8238 * 8239 */ 8240 static int 8241 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp, 8242 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 8243 { 8244 COMPOUND4args_clnt args; 8245 COMPOUND4res_clnt res, *resp = NULL; 8246 int argoplist_size; 8247 nfs_argop4 *argop; 8248 nfs_resop4 *resop; 8249 int doqueue; 8250 mntinfo4_t *mi; 8251 rnode4_t *odrp = VTOR4(odvp); /* old directory */ 8252 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */ 8253 rnode4_t *orp = VTOR4(ovp); /* object being renamed */ 8254 RENAME4res *rn_res; 8255 GETFH4res *ngf_res; 8256 bool_t needrecov; 8257 nfs4_recov_state_t recov_state; 8258 hrtime_t t; 8259 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8260 dirattr_info_t dinfo, *dinfop = &dinfo; 8261 8262 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 8263 8264 recov_state.rs_flags = 0; 8265 recov_state.rs_num_retry_despite_err = 0; 8266 8267 recov_retry: 8268 *statp = NFS4_OK; 8269 8270 /* 8271 * There is a window between the RPC and updating the path and 8272 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery 8273 * code, so that it doesn't try to use the old path during that 8274 * window. 8275 */ 8276 mutex_enter(&orp->r_statelock); 8277 while (orp->r_flags & R4RECEXPFH) { 8278 klwp_t *lwp = ttolwp(curthread); 8279 8280 if (lwp != NULL) 8281 lwp->lwp_nostop++; 8282 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) { 8283 mutex_exit(&orp->r_statelock); 8284 if (lwp != NULL) 8285 lwp->lwp_nostop--; 8286 return (EINTR); 8287 } 8288 if (lwp != NULL) 8289 lwp->lwp_nostop--; 8290 } 8291 orp->r_flags |= R4RECEXPFH; 8292 mutex_exit(&orp->r_statelock); 8293 8294 mi = VTOMI4(odvp); 8295 8296 args.ctag = TAG_RENAME_VFH; 8297 args.array_len = (odvp == ndvp) ?
10 : 12; 8298 argoplist_size = args.array_len * sizeof (nfs_argop4); 8299 argop = kmem_alloc(argoplist_size, KM_SLEEP); 8300 8301 /* 8302 * Rename ops: 8303 * PUTFH(sourcedir), SAVEFH, LOOKUP(src), GETFH(old), 8304 * PUTFH(targetdir), RENAME, GETATTR(targetdir) 8305 * LOOKUP(trgt), GETFH(new), GETATTR, 8306 * 8307 * if (odvp != ndvp) 8308 * add putfh(sourcedir), getattr(sourcedir) 8309 */ 8310 args.array = argop; 8311 8312 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8313 &recov_state, NULL); 8314 if (e.error) { 8315 kmem_free(argop, argoplist_size); 8316 mutex_enter(&orp->r_statelock); 8317 orp->r_flags &= ~R4RECEXPFH; 8318 cv_broadcast(&orp->r_cv); 8319 mutex_exit(&orp->r_statelock); 8320 return (e.error); 8321 } 8322 8323 /* 0: putfh source directory */ 8324 argop[0].argop = OP_CPUTFH; 8325 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8326 8327 /* 1: Save source fh to free up current for target */ 8328 argop[1].argop = OP_SAVEFH; 8329 8330 /* 2: Lookup pre-rename fh of renamed object */ 8331 argop[2].argop = OP_CLOOKUP; 8332 argop[2].nfs_argop4_u.opclookup.cname = onm; 8333 8334 /* 3: getfh fh of renamed object (before rename) */ 8335 argop[3].argop = OP_GETFH; 8336 8337 /* 4: putfh targetdir */ 8338 argop[4].argop = OP_CPUTFH; 8339 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8340 8341 /* 5: current_fh is targetdir, saved_fh is sourcedir */ 8342 argop[5].argop = OP_CRENAME; 8343 argop[5].nfs_argop4_u.opcrename.coldname = onm; 8344 argop[5].nfs_argop4_u.opcrename.cnewname = nnm; 8345 8346 /* 6: getattr of target dir (post op attrs) */ 8347 argop[6].argop = OP_GETATTR; 8348 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8349 argop[6].nfs_argop4_u.opgetattr.mi = mi; 8350 8351 /* 7: Lookup post-rename fh of renamed object */ 8352 argop[7].argop = OP_CLOOKUP; 8353 argop[7].nfs_argop4_u.opclookup.cname = nnm; 8354 8355 /* 8: getfh fh of renamed object (after rename) */ 8356 argop[8].argop = OP_GETFH; 8357 8358 /* 9: getattr of renamed object */ 8359 argop[9].argop = OP_GETATTR; 8360 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8361 argop[9].nfs_argop4_u.opgetattr.mi = mi; 8362 8363 /* 8364 * If source/target dirs are different, then get new post-op 8365 * attrs for source dir also.
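 * (These two extra ops occupy compound slots 10 and 11, which is why args.array_len is set to 12 rather than 10 above when the directories differ.)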
8366 */ 8367 if (ndvp != odvp) { 8368 /* 10: putfh (sourcedir) */ 8369 argop[10].argop = OP_CPUTFH; 8370 argop[10].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8371 8372 /* 11: getattr (sourcedir) */ 8373 argop[11].argop = OP_GETATTR; 8374 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8375 argop[11].nfs_argop4_u.opgetattr.mi = mi; 8376 } 8377 8378 dnlc_remove(odvp, onm); 8379 dnlc_remove(ndvp, nnm); 8380 8381 doqueue = 1; 8382 t = gethrtime(); 8383 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8384 8385 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8386 if (e.error) { 8387 PURGE_ATTRCACHE4(odvp); 8388 PURGE_ATTRCACHE4(ndvp); 8389 if (!needrecov) { 8390 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8391 &recov_state, needrecov); 8392 goto out; 8393 } 8394 } else { 8395 *statp = res.status; 8396 } 8397 8398 if (needrecov) { 8399 bool_t abort; 8400 8401 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 8402 OP_RENAME, NULL); 8403 if (abort == FALSE) { 8404 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8405 &recov_state, needrecov); 8406 kmem_free(argop, argoplist_size); 8407 if (!e.error) 8408 (void) xdr_free(xdr_COMPOUND4res_clnt, 8409 (caddr_t)&res); 8410 mutex_enter(&orp->r_statelock); 8411 orp->r_flags &= ~R4RECEXPFH; 8412 cv_broadcast(&orp->r_cv); 8413 mutex_exit(&orp->r_statelock); 8414 goto recov_retry; 8415 } else { 8416 if (e.error != 0) { 8417 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8418 &recov_state, needrecov); 8419 goto out; 8420 } 8421 /* fall through for res.status case */ 8422 } 8423 } 8424 8425 resp = &res; 8426 /* 8427 * If OP_RENAME (or any prev op) failed, then return an error. 8428 * OP_RENAME is index 5, so if res.array_len <= 6 we return an error. 8429 */ 8430 if ((res.status != NFS4_OK) && (res.array_len <= 6)) { 8431 /* 8432 * Error in an op other than last Getattr 8433 */ 8434 e.error = geterrno4(res.status); 8435 PURGE_ATTRCACHE4(odvp); 8436 PURGE_ATTRCACHE4(ndvp); 8437 /* 8438 * System V defines rename to return EEXIST, not 8439 * ENOTEMPTY if the target directory is not empty. 8440 * Over the wire, the error is NFSERR_ENOTEMPTY 8441 * which geterrno4 maps to ENOTEMPTY. 8442 */ 8443 if (e.error == ENOTEMPTY) 8444 e.error = EEXIST; 8445 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, 8446 needrecov); 8447 goto out; 8448 } 8449 8450 /* rename results */ 8451 rn_res = &res.array[5].nfs_resop4_u.oprename; 8452 8453 if (res.status == NFS4_OK) { 8454 /* Update target attribute, readdir and dnlc caches */ 8455 dinfo.di_garp = 8456 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 8457 dinfo.di_cred = cr; 8458 dinfo.di_time_call = t; 8459 } else 8460 dinfop = NULL; 8461 8462 /* Update target attribute, readdir and dnlc caches */ 8463 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop); 8464 8465 /* Update source cache attribute, readdir and dnlc caches */ 8466 if (ndvp != odvp) { 8467 8468 /* 8469 * If dinfop is non-NULL, then the compound succeeded, so 8470 * set di_garp to attrs for source dir. dinfop is only 8471 * set to NULL when the compound fails. 8472 */ 8473 if (dinfop) 8474 dinfo.di_garp = 8475 &res.array[11].nfs_resop4_u.opgetattr.ga_res; 8476 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL, 8477 dinfop); 8478 } 8479 8480 /* 8481 * Update the rnode with the new component name and args, 8482 * and if the file handle changed, also update it with the new fh. 8483 * This is only necessary if the target object has an rnode 8484 * entry; there is no need to create one for it.
8485 */ 8486 resop = &res.array[8]; /* getfh new res */ 8487 ngf_res = &resop->nfs_resop4_u.opgetfh; 8488 8489 /* 8490 * Update the path and filehandle for the renamed object. 8491 */ 8492 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm); 8493 8494 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov); 8495 8496 if (res.status == NFS4_OK) { 8497 resop++; /* getattr res */ 8498 e.error = nfs4_update_attrcache(res.status, 8499 &resop->nfs_resop4_u.opgetattr.ga_res, 8500 t, ovp, cr); 8501 } 8502 8503 out: 8504 kmem_free(argop, argoplist_size); 8505 if (resp) 8506 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8507 mutex_enter(&orp->r_statelock); 8508 orp->r_flags &= ~R4RECEXPFH; 8509 cv_broadcast(&orp->r_cv); 8510 mutex_exit(&orp->r_statelock); 8511 8512 return (e.error); 8513 } 8514 8515 /* ARGSUSED */ 8516 static int 8517 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr, 8518 caller_context_t *ct, int flags, vsecattr_t *vsecp) 8519 { 8520 int error; 8521 vnode_t *vp; 8522 8523 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8524 return (EPERM); 8525 /* 8526 * As ".." has special meaning, rather than send a mkdir 8527 * over the wire just to let the server freak out, we 8528 * short circuit it here and return EEXIST 8529 */ 8530 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8531 return (EEXIST); 8532 8533 /* 8534 * Decision to get the right gid and setgid bit of the 8535 * new directory is now made in call_nfs4_create_req. 8536 */ 8537 va->va_mask |= AT_MODE; 8538 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR); 8539 if (error) 8540 return (error); 8541 8542 *vpp = vp; 8543 return (0); 8544 } 8545 8546 8547 /* 8548 * rmdir uses the same remove v4 op as remove does. 8549 * Remove requires that the current fh be the target directory. 8550 * After the operation, the current fh is unchanged. 8551 * The compound op structure is: 8552 * PUTFH(targetdir), REMOVE, GETATTR(targetdir) 8553 */ 8554 /*ARGSUSED4*/ 8555 static int 8556 nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, 8557 caller_context_t *ct, int flags) 8558 { 8559 int need_end_op = FALSE; 8560 COMPOUND4args_clnt args; 8561 COMPOUND4res_clnt res, *resp = NULL; 8562 REMOVE4res *rm_res; 8563 nfs_argop4 argop[3]; 8564 nfs_resop4 *resop; 8565 vnode_t *vp; 8566 int doqueue; 8567 mntinfo4_t *mi; 8568 rnode4_t *drp; 8569 bool_t needrecov = FALSE; 8570 nfs4_recov_state_t recov_state; 8571 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8572 dirattr_info_t dinfo, *dinfop; 8573 8574 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8575 return (EPERM); 8576 /* 8577 * As ".." has special meaning, rather than send a rmdir 8578 * over the wire just to let the server freak out, we 8579 * short circuit it here and return EEXIST 8580 */ 8581 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8582 return (EEXIST); 8583 8584 drp = VTOR4(dvp); 8585 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 8586 return (EINTR); 8587 8588 /* 8589 * Attempt to prevent a rmdir(".") from succeeding. 8590 */ 8591 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 8592 if (e.error) { 8593 nfs_rw_exit(&drp->r_rwlock); 8594 return (e.error); 8595 } 8596 if (vp == cdir) { 8597 VN_RELE(vp); 8598 nfs_rw_exit(&drp->r_rwlock); 8599 return (EINVAL); 8600 } 8601 8602 /* 8603 * Since nfsv4 remove op works on both files and directories, 8604 * check that the removed object is indeed a directory.
8605 */ 8606 if (vp->v_type != VDIR) { 8607 VN_RELE(vp); 8608 nfs_rw_exit(&drp->r_rwlock); 8609 return (ENOTDIR); 8610 } 8611 8612 /* 8613 * First just remove the entry from the name cache, as it 8614 * is most likely an entry for this vp. 8615 */ 8616 dnlc_remove(dvp, nm); 8617 8618 /* 8619 * If the vnode reference count is greater than one, then 8620 * there may be additional references in the DNLC which will 8621 * need to be purged. First, try removing the entry for 8622 * the parent directory and see if that removes the additional 8623 * reference(s). If that doesn't do it, then use dnlc_purge_vp 8624 * to completely remove any references to the directory which 8625 * might still exist in the DNLC. 8626 */ 8627 if (vp->v_count > 1) { 8628 dnlc_remove(vp, ".."); 8629 if (vp->v_count > 1) 8630 dnlc_purge_vp(vp); 8631 } 8632 8633 mi = VTOMI4(dvp); 8634 recov_state.rs_flags = 0; 8635 recov_state.rs_num_retry_despite_err = 0; 8636 8637 recov_retry: 8638 args.ctag = TAG_RMDIR; 8639 8640 /* 8641 * Rmdir ops: putfh dir; remove; getattr 8642 */ 8643 args.array_len = 3; 8644 args.array = argop; 8645 8646 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 8647 if (e.error) { 8648 nfs_rw_exit(&drp->r_rwlock); 8649 return (e.error); 8650 } 8651 need_end_op = TRUE; 8652 8653 /* putfh directory */ 8654 argop[0].argop = OP_CPUTFH; 8655 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 8656 8657 /* remove */ 8658 argop[1].argop = OP_CREMOVE; 8659 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 8660 8661 /* getattr (postop attrs for dir that contained removed dir) */ 8662 argop[2].argop = OP_GETATTR; 8663 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8664 argop[2].nfs_argop4_u.opgetattr.mi = mi; 8665 8666 dinfo.di_time_call = gethrtime(); 8667 doqueue = 1; 8668 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8669 8670 PURGE_ATTRCACHE4(vp); 8671 8672 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8673 if (e.error) { 8674 PURGE_ATTRCACHE4(dvp); 8675 } 8676 8677 if (needrecov) { 8678 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL, 8679 NULL, OP_REMOVE, NULL) == FALSE) { 8680 if (!e.error) 8681 (void) xdr_free(xdr_COMPOUND4res_clnt, 8682 (caddr_t)&res); 8683 8684 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 8685 needrecov); 8686 need_end_op = FALSE; 8687 goto recov_retry; 8688 } 8689 } 8690 8691 if (!e.error) { 8692 resp = &res; 8693 8694 /* 8695 * Only return an error if the first 2 ops (OP_REMOVE or 8696 * earlier) failed. 8697 */ 8698 if (res.status != NFS4_OK && res.array_len <= 2) { 8699 e.error = geterrno4(res.status); 8700 PURGE_ATTRCACHE4(dvp); 8701 nfs4_end_op(VTOMI4(dvp), dvp, NULL, 8702 &recov_state, needrecov); 8703 need_end_op = FALSE; 8704 nfs4_purge_stale_fh(e.error, dvp, cr); 8705 /* 8706 * System V defines rmdir to return EEXIST, not 8707 * ENOTEMPTY if the directory is not empty. Over 8708 * the wire, the error is NFSERR_ENOTEMPTY which 8709 * geterrno4 maps to ENOTEMPTY.
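 * (Thus rmdir(2) of a non-empty directory surfaces as EEXIST here; POSIX permits either EEXIST or ENOTEMPTY for this case.)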
8710 */ 8711 if (e.error == ENOTEMPTY) 8712 e.error = EEXIST; 8713 } else { 8714 resop = &res.array[1]; /* remove res */ 8715 rm_res = &resop->nfs_resop4_u.opremove; 8716 8717 if (res.status == NFS4_OK) { 8718 resop = &res.array[2]; /* dir attrs */ 8719 dinfo.di_garp = 8720 &resop->nfs_resop4_u.opgetattr.ga_res; 8721 dinfo.di_cred = cr; 8722 dinfop = &dinfo; 8723 } else 8724 dinfop = NULL; 8725 8726 /* Update dir attribute, readdir and dnlc caches */ 8727 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 8728 dinfop); 8729 8730 /* destroy rddir cache for dir that was removed */ 8731 if (VTOR4(vp)->r_dir != NULL) 8732 nfs4_purge_rddir_cache(vp); 8733 } 8734 } 8735 8736 if (need_end_op) 8737 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 8738 8739 nfs_rw_exit(&drp->r_rwlock); 8740 8741 if (resp) 8742 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8743 8744 if (e.error == 0) { 8745 vnode_t *tvp; 8746 rnode4_t *trp; 8747 trp = VTOR4(vp); 8748 tvp = vp; 8749 if (IS_SHADOW(vp, trp)) 8750 tvp = RTOV4(trp); 8751 vnevent_rmdir(tvp, dvp, nm, ct); 8752 } 8753 8754 VN_RELE(vp); 8755 8756 return (e.error); 8757 } 8758 8759 /* ARGSUSED */ 8760 static int 8761 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr, 8762 caller_context_t *ct, int flags) 8763 { 8764 int error; 8765 vnode_t *vp; 8766 rnode4_t *rp; 8767 char *contents; 8768 mntinfo4_t *mi = VTOMI4(dvp); 8769 8770 if (nfs_zone() != mi->mi_zone) 8771 return (EPERM); 8772 if (!(mi->mi_flags & MI4_SYMLINK)) 8773 return (EOPNOTSUPP); 8774 8775 error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK); 8776 if (error) 8777 return (error); 8778 8779 ASSERT(nfs4_consistent_type(vp)); 8780 rp = VTOR4(vp); 8781 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 8782 8783 contents = kmem_alloc(MAXPATHLEN, KM_SLEEP); 8784 8785 if (contents != NULL) { 8786 mutex_enter(&rp->r_statelock); 8787 if (rp->r_symlink.contents == NULL) { 8788 rp->r_symlink.len = strlen(tnm); 8789 bcopy(tnm, contents, rp->r_symlink.len); 8790 rp->r_symlink.contents = contents; 8791 rp->r_symlink.size = MAXPATHLEN; 8792 mutex_exit(&rp->r_statelock); 8793 } else { 8794 mutex_exit(&rp->r_statelock); 8795 kmem_free((void *)contents, MAXPATHLEN); 8796 } 8797 } 8798 } 8799 VN_RELE(vp); 8800 8801 return (error); 8802 } 8803 8804 8805 /* 8806 * Read directory entries. 8807 * There are some weird things to look out for here. The uio_loffset 8808 * field is either 0 or it is the offset returned from a previous 8809 * readdir. It is an opaque value used by the server to find the 8810 * correct directory block to read. The count field is the number 8811 * of blocks to read on the server. This is advisory only, the server 8812 * may return only one block's worth of entries. Entries may be compressed 8813 * on the server. 8814 */ 8815 /* ARGSUSED */ 8816 static int 8817 nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, 8818 caller_context_t *ct, int flags) 8819 { 8820 int error; 8821 uint_t count; 8822 rnode4_t *rp; 8823 rddir4_cache *rdc; 8824 rddir4_cache *rrdc; 8825 8826 if (nfs_zone() != VTOMI4(vp)->mi_zone) 8827 return (EIO); 8828 rp = VTOR4(vp); 8829 8830 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 8831 8832 /* 8833 * Make sure that the directory cache is valid. 
8834 */ 8835 if (rp->r_dir != NULL) { 8836 if (nfs_disable_rddir_cache != 0) { 8837 /* 8838 * Setting nfs_disable_rddir_cache in /etc/system 8839 * allows interoperability with servers that do not 8840 * properly update the attributes of directories. 8841 * Any cached information gets purged before an 8842 * access is made to it. 8843 */ 8844 nfs4_purge_rddir_cache(vp); 8845 } 8846 8847 error = nfs4_validate_caches(vp, cr); 8848 if (error) 8849 return (error); 8850 } 8851 8852 count = MIN(uiop->uio_iov->iov_len, MAXBSIZE); 8853 8854 /* 8855 * Short circuit last readdir which always returns 0 bytes. 8856 * This can be done after the directory has been read through 8857 * completely at least once. This will set r_direof which 8858 * can be used to find the value of the last cookie. 8859 */ 8860 mutex_enter(&rp->r_statelock); 8861 if (rp->r_direof != NULL && 8862 uiop->uio_loffset == rp->r_direof->nfs4_ncookie) { 8863 mutex_exit(&rp->r_statelock); 8864 #ifdef DEBUG 8865 nfs4_readdir_cache_shorts++; 8866 #endif 8867 if (eofp) 8868 *eofp = 1; 8869 return (0); 8870 } 8871 8872 /* 8873 * Look for a cache entry. Cache entries are identified 8874 * by the NFS cookie value and the byte count requested. 8875 */ 8876 rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count); 8877 8878 /* 8879 * If rdc is NULL then the lookup resulted in an unrecoverable error. 8880 */ 8881 if (rdc == NULL) { 8882 mutex_exit(&rp->r_statelock); 8883 return (EINTR); 8884 } 8885 8886 /* 8887 * Check to see if we need to fill this entry in. 8888 */ 8889 if (rdc->flags & RDDIRREQ) { 8890 rdc->flags &= ~RDDIRREQ; 8891 rdc->flags |= RDDIR; 8892 mutex_exit(&rp->r_statelock); 8893 8894 /* 8895 * Do the readdir. 8896 */ 8897 nfs4readdir(vp, rdc, cr); 8898 8899 /* 8900 * Reacquire the lock, so that we can continue 8901 */ 8902 mutex_enter(&rp->r_statelock); 8903 /* 8904 * The entry is now complete 8905 */ 8906 rdc->flags &= ~RDDIR; 8907 } 8908 8909 ASSERT(!(rdc->flags & RDDIR)); 8910 8911 /* 8912 * If an error occurred while attempting 8913 * to fill the cache entry, mark the entry invalid and 8914 * just return the error. 8915 */ 8916 if (rdc->error) { 8917 error = rdc->error; 8918 rdc->flags |= RDDIRREQ; 8919 rddir4_cache_rele(rp, rdc); 8920 mutex_exit(&rp->r_statelock); 8921 return (error); 8922 } 8923 8924 /* 8925 * The cache entry is complete and good, 8926 * copyout the dirent structs to the calling 8927 * thread. 8928 */ 8929 error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop); 8930 8931 /* 8932 * If no error occurred during the copyout, 8933 * update the offset in the uio struct to 8934 * contain the value of the next NFS 4 cookie 8935 * and set the eof value appropriately. 8936 */ 8937 if (!error) { 8938 uiop->uio_loffset = rdc->nfs4_ncookie; 8939 if (eofp) 8940 *eofp = rdc->eof; 8941 } 8942 8943 /* 8944 * Decide whether to do readahead. Don't if we 8945 * have already read to the end of directory. 8946 */ 8947 if (rdc->eof) { 8948 /* 8949 * Make the entry the direof only if it is cached 8950 */ 8951 if (rdc->flags & RDDIRCACHED) 8952 rp->r_direof = rdc; 8953 rddir4_cache_rele(rp, rdc); 8954 mutex_exit(&rp->r_statelock); 8955 return (error); 8956 } 8957 8958 /* Determine if a readdir readahead should be done */ 8959 if (!(rp->r_flags & R4LOOKUP)) { 8960 rddir4_cache_rele(rp, rdc); 8961 mutex_exit(&rp->r_statelock); 8962 return (error); 8963 } 8964 8965 /* 8966 * Now look for a readahead entry. 8967 * 8968 * Check to see whether we found an entry for the readahead. 
8969 * If so, we don't need to do anything further, so free the new 8970 * entry if one was allocated. Otherwise, allocate a new entry, add 8971 * it to the cache, and then initiate an asynchronous readdir 8972 * operation to fill it. 8973 */ 8974 rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count); 8975 8976 /* 8977 * A readdir cache entry could not be obtained for the readahead. In 8978 * this case we skip the readahead and return. 8979 */ 8980 if (rrdc == NULL) { 8981 rddir4_cache_rele(rp, rdc); 8982 mutex_exit(&rp->r_statelock); 8983 return (error); 8984 } 8985 8986 /* 8987 * Check to see if we need to fill this entry in. 8988 */ 8989 if (rrdc->flags & RDDIRREQ) { 8990 rrdc->flags &= ~RDDIRREQ; 8991 rrdc->flags |= RDDIR; 8992 rddir4_cache_rele(rp, rdc); 8993 mutex_exit(&rp->r_statelock); 8994 #ifdef DEBUG 8995 nfs4_readdir_readahead++; 8996 #endif 8997 /* 8998 * Do the readdir. 8999 */ 9000 nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir); 9001 return (error); 9002 } 9003 9004 rddir4_cache_rele(rp, rrdc); 9005 rddir4_cache_rele(rp, rdc); 9006 mutex_exit(&rp->r_statelock); 9007 return (error); 9008 } 9009 9010 static int 9011 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 9012 { 9013 int error; 9014 rnode4_t *rp; 9015 9016 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 9017 9018 rp = VTOR4(vp); 9019 9020 /* 9021 * Obtain the readdir results for the caller. 9022 */ 9023 nfs4readdir(vp, rdc, cr); 9024 9025 mutex_enter(&rp->r_statelock); 9026 /* 9027 * The entry is now complete 9028 */ 9029 rdc->flags &= ~RDDIR; 9030 9031 error = rdc->error; 9032 if (error) 9033 rdc->flags |= RDDIRREQ; 9034 rddir4_cache_rele(rp, rdc); 9035 mutex_exit(&rp->r_statelock); 9036 9037 return (error); 9038 } 9039 9040 /* 9041 * Read directory entries. 9042 * There are some weird things to look out for here. The uio_loffset 9043 * field is either 0 or it is the offset returned from a previous 9044 * readdir. It is an opaque value used by the server to find the 9045 * correct directory block to read. The count field is the number 9046 * of blocks to read on the server. This is advisory only, the server 9047 * may return only one block's worth of entries. Entries may be compressed 9048 * on the server. 9049 * 9050 * Generates the following compound request: 9051 * 1. If readdir offset is zero and no dnlc entry for parent exists, 9052 * must include a Lookupp as well. In this case, send: 9053 * { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr } 9054 * 2. Otherwise just do: { Putfh <fh>; Readdir } 9055 * 9056 * Get complete attributes and filehandles for entries if this is the 9057 * first read of the directory. Otherwise, just get fileid's. 
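 * Note that cookie values 0, 1 and 2 all start the over-the-wire read at cookie 0, since the client fabricates the "." and ".." entries itself; see the cookie handling below.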
9058 */ 9059 static void 9060 nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 9061 { 9062 COMPOUND4args_clnt args; 9063 COMPOUND4res_clnt res; 9064 READDIR4args *rargs; 9065 READDIR4res_clnt *rd_res; 9066 bitmap4 rd_bitsval; 9067 nfs_argop4 argop[5]; 9068 nfs_resop4 *resop; 9069 rnode4_t *rp = VTOR4(vp); 9070 mntinfo4_t *mi = VTOMI4(vp); 9071 int doqueue; 9072 u_longlong_t nodeid, pnodeid; /* id's of dir and its parents */ 9073 vnode_t *dvp; 9074 nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie; 9075 int num_ops, res_opcnt; 9076 bool_t needrecov = FALSE; 9077 nfs4_recov_state_t recov_state; 9078 hrtime_t t; 9079 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 9080 9081 ASSERT(nfs_zone() == mi->mi_zone); 9082 ASSERT(rdc->flags & RDDIR); 9083 ASSERT(rdc->entries == NULL); 9084 9085 /* 9086 * If rp were a stub, it should have triggered and caused 9087 * a mount for us to get this far. 9088 */ 9089 ASSERT(!RP_ISSTUB(rp)); 9090 9091 num_ops = 2; 9092 if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) { 9093 /* 9094 * Since nfsv4 readdir may not return entries for "." and "..", 9095 * the client must recreate them: 9096 * To find the correct nodeid, do the following: 9097 * For current node, get nodeid from dnlc. 9098 * - if current node is rootvp, set pnodeid to nodeid. 9099 * - else if parent is in the dnlc, get its nodeid from there. 9100 * - else add LOOKUPP+GETATTR to compound. 9101 */ 9102 nodeid = rp->r_attr.va_nodeid; 9103 if (vp->v_flag & VROOT) { 9104 pnodeid = nodeid; /* root of mount point */ 9105 } else { 9106 dvp = dnlc_lookup(vp, ".."); 9107 if (dvp != NULL && dvp != DNLC_NO_VNODE) { 9108 /* parent in dnlc cache - no need for otw */ 9109 pnodeid = VTOR4(dvp)->r_attr.va_nodeid; 9110 } else { 9111 /* 9112 * parent not in dnlc cache, 9113 * do lookupp to get its id 9114 */ 9115 num_ops = 5; 9116 pnodeid = 0; /* set later by getattr parent */ 9117 } 9118 if (dvp) 9119 VN_RELE(dvp); 9120 } 9121 } 9122 recov_state.rs_flags = 0; 9123 recov_state.rs_num_retry_despite_err = 0; 9124 9125 /* Save the original mount point security flavor */ 9126 (void) save_mnt_secinfo(mi->mi_curr_serv); 9127 9128 recov_retry: 9129 args.ctag = TAG_READDIR; 9130 9131 args.array = argop; 9132 args.array_len = num_ops; 9133 9134 if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 9135 &recov_state, NULL)) { 9136 /* 9137 * If readdir a node that is a stub for a crossed mount point, 9138 * keep the original secinfo flavor for the current file 9139 * system, not the crossed one. 9140 */ 9141 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 9142 rdc->error = e.error; 9143 return; 9144 } 9145 9146 /* 9147 * Determine which attrs to request for dirents. This code 9148 * must be protected by nfs4_start/end_fop because of r_server 9149 * (which will change during failover recovery). 9150 * 9151 */ 9152 if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) { 9153 /* 9154 * Get all vattr attrs plus filehandle and rdattr_error 9155 */ 9156 rd_bitsval = NFS4_VATTR_MASK | 9157 FATTR4_RDATTR_ERROR_MASK | 9158 FATTR4_FILEHANDLE_MASK; 9159 9160 if (rp->r_flags & R4READDIRWATTR) { 9161 mutex_enter(&rp->r_statelock); 9162 rp->r_flags &= ~R4READDIRWATTR; 9163 mutex_exit(&rp->r_statelock); 9164 } 9165 } else { 9166 servinfo4_t *svp = rp->r_server; 9167 9168 /* 9169 * Already read directory. Use readdir with 9170 * no attrs (except for mounted_on_fileid) for updates. 9171 */ 9172 rd_bitsval = FATTR4_RDATTR_ERROR_MASK; 9173 9174 /* 9175 * request mounted on fileid if supported, else request 9176 * fileid. 
Maybe we should verify that fileid is supported 9177 * and request something else if not. 9178 */ 9179 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 9180 if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK) 9181 rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK; 9182 nfs_rw_exit(&svp->sv_lock); 9183 } 9184 9185 /* putfh directory fh */ 9186 argop[0].argop = OP_CPUTFH; 9187 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 9188 9189 argop[1].argop = OP_READDIR; 9190 rargs = &argop[1].nfs_argop4_u.opreaddir; 9191 /* 9192 * Cookie values 1 and 2 are reserved for the client "." and ".." entry offsets. 9193 * Cookie 0 should be used over-the-wire to start reading at 9194 * the beginning of the directory, excluding "." and "..". 9195 */ 9196 if (rdc->nfs4_cookie == 0 || 9197 rdc->nfs4_cookie == 1 || 9198 rdc->nfs4_cookie == 2) { 9199 rargs->cookie = (nfs_cookie4)0; 9200 rargs->cookieverf = 0; 9201 } else { 9202 rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie; 9203 mutex_enter(&rp->r_statelock); 9204 rargs->cookieverf = rp->r_cookieverf4; 9205 mutex_exit(&rp->r_statelock); 9206 } 9207 rargs->dircount = MIN(rdc->buflen, mi->mi_tsize); 9208 rargs->maxcount = mi->mi_tsize; 9209 rargs->attr_request = rd_bitsval; 9210 rargs->rdc = rdc; 9211 rargs->dvp = vp; 9212 rargs->mi = mi; 9213 rargs->cr = cr; 9214 9215 9216 /* 9217 * If count is less than the minimum required, we return no entries 9218 * and fail with EINVAL 9219 */ 9220 if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) { 9221 rdc->error = EINVAL; 9222 goto out; 9223 } 9224 9225 if (args.array_len == 5) { 9226 /* 9227 * Add lookupp and getattr for parent nodeid. 9228 */ 9229 argop[2].argop = OP_LOOKUPP; 9230 9231 argop[3].argop = OP_GETFH; 9232 9233 /* getattr parent */ 9234 argop[4].argop = OP_GETATTR; 9235 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 9236 argop[4].nfs_argop4_u.opgetattr.mi = mi; 9237 } 9238 9239 doqueue = 1; 9240 9241 if (mi->mi_io_kstats) { 9242 mutex_enter(&mi->mi_lock); 9243 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 9244 mutex_exit(&mi->mi_lock); 9245 } 9246 9247 /* capture the time of this call */ 9248 rargs->t = t = gethrtime(); 9249 9250 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 9251 9252 if (mi->mi_io_kstats) { 9253 mutex_enter(&mi->mi_lock); 9254 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 9255 mutex_exit(&mi->mi_lock); 9256 } 9257 9258 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 9259 9260 /* 9261 * If RPC error occurred and it isn't an error that 9262 * triggers recovery, then go ahead and fail now. 9263 */ 9264 if (e.error != 0 && !needrecov) { 9265 rdc->error = e.error; 9266 goto out; 9267 } 9268 9269 if (needrecov) { 9270 bool_t abort; 9271 9272 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 9273 "nfs4readdir: initiating recovery.\n")); 9274 9275 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 9276 NULL, OP_READDIR, NULL); 9277 if (abort == FALSE) { 9278 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 9279 &recov_state, needrecov); 9280 if (!e.error) 9281 (void) xdr_free(xdr_COMPOUND4res_clnt, 9282 (caddr_t)&res); 9283 if (rdc->entries != NULL) { 9284 kmem_free(rdc->entries, rdc->entlen); 9285 rdc->entries = NULL; 9286 } 9287 goto recov_retry; 9288 } 9289 9290 if (e.error != 0) { 9291 rdc->error = e.error; 9292 goto out; 9293 } 9294 9295 /* fall through for res.status case */ 9296 } 9297 9298 res_opcnt = res.array_len; 9299 9300 /* 9301 * If the compound failed in the first 2 ops (PUTFH+READDIR), then return 9302 * failure here.
Subsequent ops are for filling out dot-dot 9303 * dirent, and if they fail, we still want to give the caller 9304 * the dirents returned by (the successful) READDIR op, so we need 9305 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR). 9306 * 9307 * One example where PUTFH+READDIR ops would succeed but 9308 * LOOKUPP+GETATTR would fail would be a dir that has r perm 9309 * but lacks x. In this case, a POSIX server's VOP_READDIR 9310 * would succeed; however, VOP_LOOKUP(..) would fail since there is 9311 * no x perm. We need to come up with a non-vendor-specific way 9312 * for a POSIX server to return d_ino from dotdot's dirent if 9313 * the client only requests mounted_on_fileid, and just say the 9314 * LOOKUPP succeeded and fill out the GETATTR. However, if the 9315 * client requested any mandatory attrs, the server would be required 9316 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR 9317 * for dotdot. 9318 */ 9319 9320 if (res.status) { 9321 if (res_opcnt <= 2) { 9322 e.error = geterrno4(res.status); 9323 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 9324 &recov_state, needrecov); 9325 nfs4_purge_stale_fh(e.error, vp, cr); 9326 rdc->error = e.error; 9327 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 9328 if (rdc->entries != NULL) { 9329 kmem_free(rdc->entries, rdc->entlen); 9330 rdc->entries = NULL; 9331 } 9332 /* 9333 * If readdir a node that is a stub for a 9334 * crossed mount point, keep the original 9335 * secinfo flavor for the current file system, 9336 * not the crossed one. 9337 */ 9338 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 9339 return; 9340 } 9341 } 9342 9343 resop = &res.array[1]; /* readdir res */ 9344 rd_res = &resop->nfs_resop4_u.opreaddirclnt; 9345 9346 mutex_enter(&rp->r_statelock); 9347 rp->r_cookieverf4 = rd_res->cookieverf; 9348 mutex_exit(&rp->r_statelock); 9349 9350 /* 9351 * For "." and ".." entries 9352 * e.g. 9353 * seek(cookie=0) -> "." entry with d_off = 1 9354 * seek(cookie=1) -> ".." entry with d_off = 2 9355 */ 9356 if (cookie == (nfs_cookie4) 0) { 9357 if (rd_res->dotp) 9358 rd_res->dotp->d_ino = nodeid; 9359 if (rd_res->dotdotp) 9360 rd_res->dotdotp->d_ino = pnodeid; 9361 } 9362 if (cookie == (nfs_cookie4) 1) { 9363 if (rd_res->dotdotp) 9364 rd_res->dotdotp->d_ino = pnodeid; 9365 } 9366 9367 9368 /* LOOKUPP+GETATTR attempted */ 9369 if (args.array_len == 5 && rd_res->dotdotp) { 9370 if (res.status == NFS4_OK && res_opcnt == 5) { 9371 nfs_fh4 *fhp; 9372 nfs4_sharedfh_t *sfhp; 9373 vnode_t *pvp; 9374 nfs4_ga_res_t *garp; 9375 9376 resop++; /* lookupp */ 9377 resop++; /* getfh */ 9378 fhp = &resop->nfs_resop4_u.opgetfh.object; 9379 9380 resop++; /* getattr of parent */ 9381 9382 /* 9383 * First, take care of finishing the 9384 * readdir results. 9385 */ 9386 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 9387 /* 9388 * The d_ino of .. must be the inode number 9389 * of the mounted filesystem. 9390 */ 9391 if (garp->n4g_va.va_mask & AT_NODEID) 9392 rd_res->dotdotp->d_ino = 9393 garp->n4g_va.va_nodeid; 9394 9395 9396 /* 9397 * Next, create the ".."
dnlc entry 9398 */ 9399 sfhp = sfh4_get(fhp, mi); 9400 if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) { 9401 dnlc_update(vp, "..", pvp); 9402 VN_RELE(pvp); 9403 } 9404 sfh4_rele(&sfhp); 9405 } 9406 } 9407 9408 if (mi->mi_io_kstats) { 9409 mutex_enter(&mi->mi_lock); 9410 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 9411 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen; 9412 mutex_exit(&mi->mi_lock); 9413 } 9414 9415 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 9416 9417 out: 9418 /* 9419 * If readdir a node that is a stub for a crossed mount point, 9420 * keep the original secinfo flavor for the current file system, 9421 * not the crossed one. 9422 */ 9423 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 9424 9425 nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov); 9426 } 9427 9428 9429 static int 9430 nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead) 9431 { 9432 rnode4_t *rp = VTOR4(bp->b_vp); 9433 int count; 9434 int error; 9435 cred_t *cred_otw = NULL; 9436 offset_t offset; 9437 nfs4_open_stream_t *osp = NULL; 9438 bool_t first_time = TRUE; /* first time getting otw cred */ 9439 bool_t last_time = FALSE; /* last time getting otw cred */ 9440 9441 ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone); 9442 9443 DTRACE_IO1(start, struct buf *, bp); 9444 offset = ldbtob(bp->b_lblkno); 9445 9446 if (bp->b_flags & B_READ) { 9447 read_again: 9448 /* 9449 * Releases the osp, if it is provided. 9450 * Puts a hold on the cred_otw and the new osp (if found). 9451 */ 9452 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 9453 &first_time, &last_time); 9454 error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr, 9455 offset, bp->b_bcount, &bp->b_resid, cred_otw, 9456 readahead, NULL); 9457 crfree(cred_otw); 9458 if (!error) { 9459 if (bp->b_resid) { 9460 /* 9461 * Didn't get it all because we hit EOF, 9462 * zero all the memory beyond the EOF. 9463 */ 9464 9465 bzero(bp->b_un.b_addr + 9466 bp->b_bcount - bp->b_resid, bp->b_resid); 9467 } 9468 mutex_enter(&rp->r_statelock); 9469 if (bp->b_resid == bp->b_bcount && 9470 offset >= rp->r_size) { 9471 /* 9472 * We didn't read anything at all as we are 9473 * past EOF. Return an error indicator back 9474 * but don't destroy the pages (yet). 9475 */ 9476 error = NFS_EOF; 9477 } 9478 mutex_exit(&rp->r_statelock); 9479 } else if (error == EACCES && last_time == FALSE) { 9480 goto read_again; 9481 } 9482 } else { 9483 if (!(rp->r_flags & R4STALE)) { 9484 write_again: 9485 /* 9486 * Releases the osp, if it is provided. 9487 * Puts a hold on the cred_otw and the new 9488 * osp (if found).
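 * On EACCES, the read_again/write_again loops come back here to retry with the next candidate credential, until nfs4_get_otw_cred_by_osp indicates last_time.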
9489 */ 9490 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 9491 &first_time, &last_time); 9492 mutex_enter(&rp->r_statelock); 9493 count = MIN(bp->b_bcount, rp->r_size - offset); 9494 mutex_exit(&rp->r_statelock); 9495 if (count < 0) 9496 cmn_err(CE_PANIC, "nfs4_bio: write count < 0"); 9497 #ifdef DEBUG 9498 if (count == 0) { 9499 zoneid_t zoneid = getzoneid(); 9500 9501 zcmn_err(zoneid, CE_WARN, 9502 "nfs4_bio: zero length write at %lld", 9503 offset); 9504 zcmn_err(zoneid, CE_CONT, "flags=0x%x, " 9505 "b_bcount=%ld, file size=%lld", 9506 rp->r_flags, (long)bp->b_bcount, 9507 rp->r_size); 9508 sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh); 9509 if (nfs4_bio_do_stop) 9510 debug_enter("nfs4_bio"); 9511 } 9512 #endif 9513 error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset, 9514 count, cred_otw, stab_comm); 9515 if (error == EACCES && last_time == FALSE) { 9516 crfree(cred_otw); 9517 goto write_again; 9518 } 9519 bp->b_error = error; 9520 if (error && error != EINTR && 9521 !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) { 9522 /* 9523 * Don't print EDQUOT errors on the console. 9524 * Don't print asynchronous EACCES errors. 9525 * Don't print EFBIG errors. 9526 * Print all other write errors. 9527 */ 9528 if (error != EDQUOT && error != EFBIG && 9529 (error != EACCES || 9530 !(bp->b_flags & B_ASYNC))) 9531 nfs4_write_error(bp->b_vp, 9532 error, cred_otw); 9533 /* 9534 * Update r_error and r_flags as appropriate. 9535 * If the error was ESTALE, then mark the 9536 * rnode as not being writeable and save 9537 * the error status. Otherwise, save any 9538 * errors which occur from asynchronous 9539 * page invalidations. Any errors occurring 9540 * from other operations should be saved 9541 * by the caller. 9542 */ 9543 mutex_enter(&rp->r_statelock); 9544 if (error == ESTALE) { 9545 rp->r_flags |= R4STALE; 9546 if (!rp->r_error) 9547 rp->r_error = error; 9548 } else if (!rp->r_error && 9549 (bp->b_flags & 9550 (B_INVAL|B_FORCE|B_ASYNC)) == 9551 (B_INVAL|B_FORCE|B_ASYNC)) { 9552 rp->r_error = error; 9553 } 9554 mutex_exit(&rp->r_statelock); 9555 } 9556 crfree(cred_otw); 9557 } else 9558 error = rp->r_error; 9559 } 9560 9561 if (error != 0 && error != NFS_EOF) 9562 bp->b_flags |= B_ERROR; 9563 9564 if (osp) 9565 open_stream_rele(osp, rp); 9566 9567 DTRACE_IO1(done, struct buf *, bp); 9568 9569 return (error); 9570 } 9571 9572 /* ARGSUSED */ 9573 int 9574 nfs4_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 9575 { 9576 return (EREMOTE); 9577 } 9578 9579 /* ARGSUSED2 */ 9580 int 9581 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9582 { 9583 rnode4_t *rp = VTOR4(vp); 9584 9585 if (!write_lock) { 9586 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9587 return (V_WRITELOCK_FALSE); 9588 } 9589 9590 if ((rp->r_flags & R4DIRECTIO) || 9591 (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) { 9592 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9593 if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp)) 9594 return (V_WRITELOCK_FALSE); 9595 nfs_rw_exit(&rp->r_rwlock); 9596 } 9597 9598 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE); 9599 return (V_WRITELOCK_TRUE); 9600 } 9601 9602 /* ARGSUSED */ 9603 void 9604 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9605 { 9606 rnode4_t *rp = VTOR4(vp); 9607 9608 nfs_rw_exit(&rp->r_rwlock); 9609 } 9610 9611 /* ARGSUSED */ 9612 static int 9613 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) 9614 { 9615 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9616 return (EIO); 9617 9618 /* 9619 * Because we 
stuff the readdir cookie into the offset field, 9620 * someone may attempt to do an lseek with that cookie, and 9621 * we want such a seek to succeed. 9622 */ 9623 if (vp->v_type == VDIR) 9624 return (0); 9625 if (*noffp < 0) 9626 return (EINVAL); 9627 return (0); 9628 } 9629 9630 9631 /* 9632 * Return all the pages from [off..off+len) in the file 9633 */ 9634 /* ARGSUSED */ 9635 static int 9636 nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 9637 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 9638 enum seg_rw rw, cred_t *cr, caller_context_t *ct) 9639 { 9640 rnode4_t *rp; 9641 int error; 9642 mntinfo4_t *mi; 9643 9644 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9645 return (EIO); 9646 rp = VTOR4(vp); 9647 if (IS_SHADOW(vp, rp)) 9648 vp = RTOV4(rp); 9649 9650 if (vp->v_flag & VNOMAP) 9651 return (ENOSYS); 9652 9653 if (protp != NULL) 9654 *protp = PROT_ALL; 9655 9656 /* 9657 * Now validate that the caches are up to date. 9658 */ 9659 if (error = nfs4_validate_caches(vp, cr)) 9660 return (error); 9661 9662 mi = VTOMI4(vp); 9663 retry: 9664 mutex_enter(&rp->r_statelock); 9665 9666 /* 9667 * Don't create dirty pages faster than they 9668 * can be cleaned so that the system doesn't 9669 * get imbalanced. If the async queue is 9670 * maxed out, then wait for it to drain before 9671 * creating more dirty pages. Also, wait for 9672 * any threads doing pagewalks in the vop_getattr 9673 * entry points so that they don't block for 9674 * long periods. 9675 */ 9676 if (rw == S_CREATE) { 9677 while ((mi->mi_max_threads != 0 && 9678 rp->r_awcount > 2 * mi->mi_max_threads) || 9679 rp->r_gcount > 0) 9680 cv_wait(&rp->r_cv, &rp->r_statelock); 9681 } 9682 9683 /* 9684 * If we are getting called as a side effect of an nfs_write() 9685 * operation, the local file size might not be extended yet. 9686 * In this case we want to be able to return pages of zeroes. 9687 */ 9688 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) { 9689 NFS4_DEBUG(nfs4_pageio_debug, 9690 (CE_NOTE, "getpage beyond EOF: off=%lld, " 9691 "len=%llu, size=%llu, attrsize =%llu", off, 9692 (u_longlong_t)len, rp->r_size, rp->r_attr.va_size)); 9693 mutex_exit(&rp->r_statelock); 9694 return (EFAULT); /* beyond EOF */ 9695 } 9696 9697 mutex_exit(&rp->r_statelock); 9698 9699 if (len <= PAGESIZE) { 9700 error = nfs4_getapage(vp, off, len, protp, pl, plsz, 9701 seg, addr, rw, cr); 9702 NFS4_DEBUG(nfs4_pageio_debug && error, 9703 (CE_NOTE, "getpage error %d; off=%lld, " 9704 "len=%lld", error, off, (u_longlong_t)len)); 9705 } else { 9706 error = pvn_getpages(nfs4_getapage, vp, off, len, protp, 9707 pl, plsz, seg, addr, rw, cr); 9708 NFS4_DEBUG(nfs4_pageio_debug && error, 9709 (CE_NOTE, "getpages error %d; off=%lld, " 9710 "len=%lld", error, off, (u_longlong_t)len)); 9711 } 9712 9713 switch (error) { 9714 case NFS_EOF: 9715 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE); 9716 goto retry; 9717 case ESTALE: 9718 nfs4_purge_stale_fh(error, vp, cr); 9719 } 9720 9721 return (error); 9722 } 9723 9724 /* 9725 * Called from pvn_getpages or nfs4_getpage to get a particular page.
9726 */ 9727 /* ARGSUSED */ 9728 static int 9729 nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp, 9730 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 9731 enum seg_rw rw, cred_t *cr) 9732 { 9733 rnode4_t *rp; 9734 uint_t bsize; 9735 struct buf *bp; 9736 page_t *pp; 9737 u_offset_t lbn; 9738 u_offset_t io_off; 9739 u_offset_t blkoff; 9740 u_offset_t rablkoff; 9741 size_t io_len; 9742 uint_t blksize; 9743 int error; 9744 int readahead; 9745 int readahead_issued = 0; 9746 int ra_window; /* readahead window */ 9747 page_t *pagefound; 9748 page_t *savepp; 9749 9750 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9751 return (EIO); 9752 9753 rp = VTOR4(vp); 9754 ASSERT(!IS_SHADOW(vp, rp)); 9755 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 9756 9757 reread: 9758 bp = NULL; 9759 pp = NULL; 9760 pagefound = NULL; 9761 9762 if (pl != NULL) 9763 pl[0] = NULL; 9764 9765 error = 0; 9766 lbn = off / bsize; 9767 blkoff = lbn * bsize; 9768 9769 /* 9770 * Queueing up the readahead before doing the synchronous read 9771 * results in a significant increase in read throughput because 9772 * of the increased parallelism between the async threads and 9773 * the process context. 9774 */ 9775 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 && 9776 rw != S_CREATE && 9777 !(vp->v_flag & VNOCACHE)) { 9778 mutex_enter(&rp->r_statelock); 9779 9780 /* 9781 * Calculate the number of readaheads to do. 9782 * a) No readaheads at offset = 0. 9783 * b) Do maximum(nfs4_nra) readaheads when the readahead 9784 * window is closed. 9785 * c) Do readaheads between 1 and (nfs4_nra - 1) depending 9786 * upon how far the readahead window is open or closed. 9787 * d) No readaheads if rp->r_nextr is not within the scope 9788 * of the readahead window (random i/o). 9789 */ 9790 9791 if (off == 0) 9792 readahead = 0; 9793 else if (blkoff == rp->r_nextr) 9794 readahead = nfs4_nra; 9795 else if (rp->r_nextr > blkoff && 9796 ((ra_window = (rp->r_nextr - blkoff) / bsize) 9797 <= (nfs4_nra - 1))) 9798 readahead = nfs4_nra - ra_window; 9799 else 9800 readahead = 0; 9801 9802 rablkoff = rp->r_nextr; 9803 while (readahead > 0 && rablkoff + bsize < rp->r_size) { 9804 mutex_exit(&rp->r_statelock); 9805 if (nfs4_async_readahead(vp, rablkoff + bsize, 9806 addr + (rablkoff + bsize - off), 9807 seg, cr, nfs4_readahead) < 0) { 9808 mutex_enter(&rp->r_statelock); 9809 break; 9810 } 9811 readahead--; 9812 rablkoff += bsize; 9813 /* 9814 * Indicate that we did a readahead so 9815 * readahead offset is not updated 9816 * by the synchronous read below. 9817 */ 9818 readahead_issued = 1; 9819 mutex_enter(&rp->r_statelock); 9820 /* 9821 * set readahead offset to 9822 * offset of last async readahead 9823 * request. 9824 */ 9825 rp->r_nextr = rablkoff; 9826 } 9827 mutex_exit(&rp->r_statelock); 9828 } 9829 9830 again: 9831 if ((pagefound = page_exists(vp, off)) == NULL) { 9832 if (pl == NULL) { 9833 (void) nfs4_async_readahead(vp, blkoff, addr, seg, cr, 9834 nfs4_readahead); 9835 } else if (rw == S_CREATE) { 9836 /* 9837 * Block for this page is not allocated, or the offset 9838 * is beyond the current allocation size, or we're 9839 * allocating a swap slot and the page was not found, 9840 * so allocate it and return a zero page.
9841 */ 9842 if ((pp = page_create_va(vp, off, 9843 PAGESIZE, PG_WAIT, seg, addr)) == NULL) 9844 cmn_err(CE_PANIC, "nfs4_getapage: page_create"); 9845 io_len = PAGESIZE; 9846 mutex_enter(&rp->r_statelock); 9847 rp->r_nextr = off + PAGESIZE; 9848 mutex_exit(&rp->r_statelock); 9849 } else { 9850 /* 9851 * Need to go to server to get a block 9852 */ 9853 mutex_enter(&rp->r_statelock); 9854 if (blkoff < rp->r_size && 9855 blkoff + bsize > rp->r_size) { 9856 /* 9857 * If less than a block left in 9858 * file read less than a block. 9859 */ 9860 if (rp->r_size <= off) { 9861 /* 9862 * Trying to access beyond EOF, 9863 * set up to get at least one page. 9864 */ 9865 blksize = off + PAGESIZE - blkoff; 9866 } else 9867 blksize = rp->r_size - blkoff; 9868 } else if ((off == 0) || 9869 (off != rp->r_nextr && !readahead_issued)) { 9870 blksize = PAGESIZE; 9871 blkoff = off; /* block = page here */ 9872 } else 9873 blksize = bsize; 9874 mutex_exit(&rp->r_statelock); 9875 9876 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 9877 &io_len, blkoff, blksize, 0); 9878 9879 /* 9880 * Some other thread has entered the page, 9881 * so just use it. 9882 */ 9883 if (pp == NULL) 9884 goto again; 9885 9886 /* 9887 * Now round the request size up to page boundaries. 9888 * This ensures that the entire page will be 9889 * initialized to zeroes if EOF is encountered. 9890 */ 9891 io_len = ptob(btopr(io_len)); 9892 9893 bp = pageio_setup(pp, io_len, vp, B_READ); 9894 ASSERT(bp != NULL); 9895 9896 /* 9897 * pageio_setup should have set b_addr to 0. This 9898 * is correct since we want to do I/O on a page 9899 * boundary. bp_mapin will use this addr to calculate 9900 * an offset, and then set b_addr to the kernel virtual 9901 * address it allocated for us. 9902 */ 9903 ASSERT(bp->b_un.b_addr == 0); 9904 9905 bp->b_edev = 0; 9906 bp->b_dev = 0; 9907 bp->b_lblkno = lbtodb(io_off); 9908 bp->b_file = vp; 9909 bp->b_offset = (offset_t)off; 9910 bp_mapin(bp); 9911 9912 /* 9913 * If doing a write beyond what we believe is EOF, 9914 * don't bother trying to read the pages from the 9915 * server, we'll just zero the pages here. We 9916 * don't check that the rw flag is S_WRITE here 9917 * because some implementations may attempt a 9918 * read access to the buffer before copying data. 9919 */ 9920 mutex_enter(&rp->r_statelock); 9921 if (io_off >= rp->r_size && seg == segkmap) { 9922 mutex_exit(&rp->r_statelock); 9923 bzero(bp->b_un.b_addr, io_len); 9924 } else { 9925 mutex_exit(&rp->r_statelock); 9926 error = nfs4_bio(bp, NULL, cr, FALSE); 9927 } 9928 9929 /* 9930 * Unmap the buffer before freeing it. 9931 */ 9932 bp_mapout(bp); 9933 pageio_done(bp); 9934 9935 savepp = pp; 9936 do { 9937 pp->p_fsdata = C_NOCOMMIT; 9938 } while ((pp = pp->p_next) != savepp); 9939 9940 if (error == NFS_EOF) { 9941 /* 9942 * If doing a write system call just return 9943 * zeroed pages, else user tried to get pages 9944 * beyond EOF, return error. We don't check 9945 * that the rw flag is S_WRITE here because 9946 * some implementations may attempt a read 9947 * access to the buffer before copying data. 
9948 */ 9949 if (seg == segkmap) 9950 error = 0; 9951 else 9952 error = EFAULT; 9953 } 9954 9955 if (!readahead_issued && !error) { 9956 mutex_enter(&rp->r_statelock); 9957 rp->r_nextr = io_off + io_len; 9958 mutex_exit(&rp->r_statelock); 9959 } 9960 } 9961 } 9962 9963 out: 9964 if (pl == NULL) 9965 return (error); 9966 9967 if (error) { 9968 if (pp != NULL) 9969 pvn_read_done(pp, B_ERROR); 9970 return (error); 9971 } 9972 9973 if (pagefound) { 9974 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED); 9975 9976 /* 9977 * Page exists in the cache, acquire the appropriate lock. 9978 * If this fails, start all over again. 9979 */ 9980 if ((pp = page_lookup(vp, off, se)) == NULL) { 9981 #ifdef DEBUG 9982 nfs4_lostpage++; 9983 #endif 9984 goto reread; 9985 } 9986 pl[0] = pp; 9987 pl[1] = NULL; 9988 return (0); 9989 } 9990 9991 if (pp != NULL) 9992 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 9993 9994 return (error); 9995 } 9996 9997 static void 9998 nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg, 9999 cred_t *cr) 10000 { 10001 int error; 10002 page_t *pp; 10003 u_offset_t io_off; 10004 size_t io_len; 10005 struct buf *bp; 10006 uint_t bsize, blksize; 10007 rnode4_t *rp = VTOR4(vp); 10008 page_t *savepp; 10009 10010 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10011 10012 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 10013 10014 mutex_enter(&rp->r_statelock); 10015 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) { 10016 /* 10017 * If less than a block left in file read less 10018 * than a block. 10019 */ 10020 blksize = rp->r_size - blkoff; 10021 } else 10022 blksize = bsize; 10023 mutex_exit(&rp->r_statelock); 10024 10025 pp = pvn_read_kluster(vp, blkoff, segkmap, addr, 10026 &io_off, &io_len, blkoff, blksize, 1); 10027 /* 10028 * The isra flag passed to the kluster function is 1, we may have 10029 * gotten a return value of NULL for a variety of reasons (# of free 10030 * pages < minfree, someone entered the page on the vnode etc). In all 10031 * cases, we want to punt on the readahead. 10032 */ 10033 if (pp == NULL) 10034 return; 10035 10036 /* 10037 * Now round the request size up to page boundaries. 10038 * This ensures that the entire page will be 10039 * initialized to zeroes if EOF is encountered. 10040 */ 10041 io_len = ptob(btopr(io_len)); 10042 10043 bp = pageio_setup(pp, io_len, vp, B_READ); 10044 ASSERT(bp != NULL); 10045 10046 /* 10047 * pageio_setup should have set b_addr to 0. This is correct since 10048 * we want to do I/O on a page boundary. bp_mapin() will use this addr 10049 * to calculate an offset, and then set b_addr to the kernel virtual 10050 * address it allocated for us. 10051 */ 10052 ASSERT(bp->b_un.b_addr == 0); 10053 10054 bp->b_edev = 0; 10055 bp->b_dev = 0; 10056 bp->b_lblkno = lbtodb(io_off); 10057 bp->b_file = vp; 10058 bp->b_offset = (offset_t)blkoff; 10059 bp_mapin(bp); 10060 10061 /* 10062 * If doing a write beyond what we believe is EOF, don't bother trying 10063 * to read the pages from the server, we'll just zero the pages here. 10064 * We don't check that the rw flag is S_WRITE here because some 10065 * implementations may attempt a read access to the buffer before 10066 * copying data. 
10067 */ 10068 mutex_enter(&rp->r_statelock); 10069 if (io_off >= rp->r_size && seg == segkmap) { 10070 mutex_exit(&rp->r_statelock); 10071 bzero(bp->b_un.b_addr, io_len); 10072 error = 0; 10073 } else { 10074 mutex_exit(&rp->r_statelock); 10075 error = nfs4_bio(bp, NULL, cr, TRUE); 10076 if (error == NFS_EOF) 10077 error = 0; 10078 } 10079 10080 /* 10081 * Unmap the buffer before freeing it. 10082 */ 10083 bp_mapout(bp); 10084 pageio_done(bp); 10085 10086 savepp = pp; 10087 do { 10088 pp->p_fsdata = C_NOCOMMIT; 10089 } while ((pp = pp->p_next) != savepp); 10090 10091 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 10092 10093 /* 10094 * In case of error set readahead offset 10095 * to the lowest offset. 10096 * pvn_read_done() calls VN_DISPOSE to destroy the pages 10097 */ 10098 if (error && rp->r_nextr > io_off) { 10099 mutex_enter(&rp->r_statelock); 10100 if (rp->r_nextr > io_off) 10101 rp->r_nextr = io_off; 10102 mutex_exit(&rp->r_statelock); 10103 } 10104 } 10105 10106 /* 10107 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 10108 * If len == 0, do from off to EOF. 10109 * 10110 * The normal cases should be len == 0 && off == 0 (entire vp list) or 10111 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 10112 * (from pageout). 10113 */ 10114 /* ARGSUSED */ 10115 static int 10116 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, 10117 caller_context_t *ct) 10118 { 10119 int error; 10120 rnode4_t *rp; 10121 10122 ASSERT(cr != NULL); 10123 10124 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 10125 return (EIO); 10126 10127 rp = VTOR4(vp); 10128 if (IS_SHADOW(vp, rp)) 10129 vp = RTOV4(rp); 10130 10131 /* 10132 * XXX - Why should this check be made here? 10133 */ 10134 if (vp->v_flag & VNOMAP) 10135 return (ENOSYS); 10136 10137 if (len == 0 && !(flags & B_INVAL) && 10138 (vp->v_vfsp->vfs_flag & VFS_RDONLY)) 10139 return (0); 10140 10141 mutex_enter(&rp->r_statelock); 10142 rp->r_count++; 10143 mutex_exit(&rp->r_statelock); 10144 error = nfs4_putpages(vp, off, len, flags, cr); 10145 mutex_enter(&rp->r_statelock); 10146 rp->r_count--; 10147 cv_broadcast(&rp->r_cv); 10148 mutex_exit(&rp->r_statelock); 10149 10150 return (error); 10151 } 10152 10153 /* 10154 * Write out a single page, possibly klustering adjacent dirty pages. 10155 */ 10156 int 10157 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, 10158 int flags, cred_t *cr) 10159 { 10160 u_offset_t io_off; 10161 u_offset_t lbn_off; 10162 u_offset_t lbn; 10163 size_t io_len; 10164 uint_t bsize; 10165 int error; 10166 rnode4_t *rp; 10167 10168 ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY)); 10169 ASSERT(pp != NULL); 10170 ASSERT(cr != NULL); 10171 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone); 10172 10173 rp = VTOR4(vp); 10174 ASSERT(rp->r_count > 0); 10175 ASSERT(!IS_SHADOW(vp, rp)); 10176 10177 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 10178 lbn = pp->p_offset / bsize; 10179 lbn_off = lbn * bsize; 10180 10181 /* 10182 * Find a kluster that fits in one block, or in 10183 * one page if pages are bigger than blocks. If 10184 * there is less file space allocated than a whole 10185 * page, we'll shorten the i/o request below. 10186 */ 10187 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off, 10188 roundup(bsize, PAGESIZE), flags); 10189 10190 /* 10191 * pvn_write_kluster shouldn't have returned a page with offset 10192 * behind the original page we were given. Verify that. 
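 *
 * (Editorial aside, not part of the original comment: with a 32K
 * vfs_bsize and 4K pages, a dirty page at offset 40960 yields
 * lbn = 1 and lbn_off = 32768, so pvn_write_kluster() above may
 * gather any adjacent dirty pages in [32768, 65536) into this
 * request; the trimming below then keeps the i/o from spilling
 * past that block.)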
10193 */ 10194 ASSERT((pp->p_offset / bsize) >= lbn); 10195 10196 /* 10197 * Now pp will have the list of kept dirty pages marked for 10198 * write back. It will also handle invalidation and freeing 10199 * of pages that are not dirty. Check for page length rounding 10200 * problems. 10201 */ 10202 if (io_off + io_len > lbn_off + bsize) { 10203 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE); 10204 io_len = lbn_off + bsize - io_off; 10205 } 10206 /* 10207 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10208 * consistent value of r_size. R4MODINPROGRESS is set in writerp4(). 10209 * When R4MODINPROGRESS is set it indicates that a uiomove() is in 10210 * progress and the r_size has not been made consistent with the 10211 * new size of the file. When the uiomove() completes the r_size is 10212 * updated and the R4MODINPROGRESS flag is cleared. 10213 * 10214 * Without this handshaking, it is possible that nfs4_bio() 10215 * picks up the old value of r_size before the uiomove() in 10216 * writerp4() completes. This will result 10217 * in the write through nfs4_bio() being dropped. 10218 * 10220 * More precisely, there is a window between the time the uiomove() 10221 * completes and the time the r_size is updated. If a VOP_PUTPAGE() 10222 * operation intervenes in this window, the page will be picked up, 10223 * because it is dirty (it will be unlocked, unless it was 10224 * pagecreate'd). When the page is picked up as dirty, the dirty 10225 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is 10226 * checked. This will still be the old size. Therefore the page will 10227 * not be written out. When segmap_release() calls VOP_PUTPAGE(), 10228 * the page will be found to be clean and the write will be dropped. 10229 */ 10230 if (rp->r_flags & R4MODINPROGRESS) { 10231 mutex_enter(&rp->r_statelock); 10232 if ((rp->r_flags & R4MODINPROGRESS) && 10233 rp->r_modaddr + MAXBSIZE > io_off && 10234 rp->r_modaddr < io_off + io_len) { 10235 page_t *plist; 10236 /* 10237 * A write is in progress for this region of the file. 10238 * If we did not detect R4MODINPROGRESS here then this 10239 * path through nfs_putapage() would eventually go to 10240 * nfs4_bio() and may not write out all of the data 10241 * in the pages. We end up losing data. So we decide 10242 * to set the modified bit on each page in the page 10243 * list and mark the rnode with R4DIRTY. This write 10244 * will be restarted at some later time.
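 *
 * (Editorial sketch, not part of the original comment: the
 * writer side of the handshake in writerp4() looks roughly like
 *
 *	mutex_enter(&rp->r_statelock);
 *	rp->r_flags |= R4MODINPROGRESS;
 *	rp->r_modaddr = ...offset of the MAXBSIZE chunk...;
 *	mutex_exit(&rp->r_statelock);
 *	error = uiomove(...);
 *	mutex_enter(&rp->r_statelock);
 *	if (...the write extended the file...)
 *		rp->r_size = ...new size...;
 *	rp->r_flags &= ~R4MODINPROGRESS;
 *	mutex_exit(&rp->r_statelock);
 *
 * which is why testing R4MODINPROGRESS and r_modaddr under
 * r_statelock here is sufficient to detect the race.)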
10245 */ 10246 plist = pp; 10247 while (plist != NULL) { 10248 pp = plist; 10249 page_sub(&plist, pp); 10250 hat_setmod(pp); 10251 page_io_unlock(pp); 10252 page_unlock(pp); 10253 } 10254 rp->r_flags |= R4DIRTY; 10255 mutex_exit(&rp->r_statelock); 10256 if (offp) 10257 *offp = io_off; 10258 if (lenp) 10259 *lenp = io_len; 10260 return (0); 10261 } 10262 mutex_exit(&rp->r_statelock); 10263 } 10264 10265 if (flags & B_ASYNC) { 10266 error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr, 10267 nfs4_sync_putapage); 10268 } else 10269 error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr); 10270 10271 if (offp) 10272 *offp = io_off; 10273 if (lenp) 10274 *lenp = io_len; 10275 return (error); 10276 } 10277 10278 static int 10279 nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 10280 int flags, cred_t *cr) 10281 { 10282 int error; 10283 rnode4_t *rp; 10284 10285 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10286 10287 flags |= B_WRITE; 10288 10289 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 10290 10291 rp = VTOR4(vp); 10292 10293 if ((error == ENOSPC || error == EDQUOT || error == EFBIG || 10294 error == EACCES) && 10295 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) { 10296 if (!(rp->r_flags & R4OUTOFSPACE)) { 10297 mutex_enter(&rp->r_statelock); 10298 rp->r_flags |= R4OUTOFSPACE; 10299 mutex_exit(&rp->r_statelock); 10300 } 10301 flags |= B_ERROR; 10302 pvn_write_done(pp, flags); 10303 /* 10304 * If this was not an async thread, then try again to 10305 * write out the pages, but this time, also destroy 10306 * them whether or not the write is successful. This 10307 * will prevent memory from filling up with these 10308 * pages and destroying them is the only alternative 10309 * if they can't be written out. 10310 * 10311 * Don't do this if this is an async thread because 10312 * when the pages are unlocked in pvn_write_done, 10313 * some other thread could have come along, locked 10314 * them, and queued for an async thread. It would be 10315 * possible for all of the async threads to be tied 10316 * up waiting to lock the pages again and they would 10317 * all already be locked and waiting for an async 10318 * thread to handle them. Deadlock. 10319 */ 10320 if (!(flags & B_ASYNC)) { 10321 error = nfs4_putpage(vp, io_off, io_len, 10322 B_INVAL | B_FORCE, cr, NULL); 10323 } 10324 } else { 10325 if (error) 10326 flags |= B_ERROR; 10327 else if (rp->r_flags & R4OUTOFSPACE) { 10328 mutex_enter(&rp->r_statelock); 10329 rp->r_flags &= ~R4OUTOFSPACE; 10330 mutex_exit(&rp->r_statelock); 10331 } 10332 pvn_write_done(pp, flags); 10333 if (freemem < desfree) 10334 (void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr, 10335 NFS4_WRITE_NOWAIT); 10336 } 10337 10338 return (error); 10339 } 10340 10341 #ifdef DEBUG 10342 int nfs4_force_open_before_mmap = 0; 10343 #endif 10344 10345 /* ARGSUSED */ 10346 static int 10347 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 10348 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 10349 caller_context_t *ct) 10350 { 10351 struct segvn_crargs vn_a; 10352 int error = 0; 10353 rnode4_t *rp = VTOR4(vp); 10354 mntinfo4_t *mi = VTOMI4(vp); 10355 10356 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10357 return (EIO); 10358 10359 if (vp->v_flag & VNOMAP) 10360 return (ENOSYS); 10361 10362 if (off < 0 || (off + len) < 0) 10363 return (ENXIO); 10364 10365 if (vp->v_type != VREG) 10366 return (ENODEV); 10367 10368 /* 10369 * If the file is delegated to the client don't do anything. 
10370 * If the file is not delegated, then validate the data cache. 10371 */ 10372 mutex_enter(&rp->r_statev4_lock); 10373 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) { 10374 mutex_exit(&rp->r_statev4_lock); 10375 error = nfs4_validate_caches(vp, cr); 10376 if (error) 10377 return (error); 10378 } else { 10379 mutex_exit(&rp->r_statev4_lock); 10380 } 10381 10382 /* 10383 * Check to see if the vnode is currently marked as not cachable. 10384 * This means portions of the file are locked (through VOP_FRLOCK). 10385 * In this case the map request must be refused. We use 10386 * rp->r_lkserlock to avoid a race with concurrent lock requests. 10387 * 10388 * Atomically increment r_inmap after acquiring r_rwlock. The 10389 * idea here is to acquire r_rwlock to block read/write and 10390 * not to protect r_inmap. r_inmap will inform nfs4_read/write() 10391 * that we are in nfs4_map(). Now, r_rwlock is acquired in order, 10392 * which prevents the deadlock that would have occurred 10393 * had nfs4_addmap() acquired it out of order. 10394 * 10395 * Since we are not protecting r_inmap by any lock, we do not 10396 * hold any lock when we decrement it. We atomically decrement 10397 * r_inmap after we release r_lkserlock. 10398 */ 10399 10400 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR(vp))) 10401 return (EINTR); 10402 atomic_add_int(&rp->r_inmap, 1); 10403 nfs_rw_exit(&rp->r_rwlock); 10404 10405 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) { 10406 atomic_add_int(&rp->r_inmap, -1); 10407 return (EINTR); 10408 } 10409 10410 10411 if (vp->v_flag & VNOCACHE) { 10412 error = EAGAIN; 10413 goto done; 10414 } 10415 10416 /* 10417 * Don't allow concurrent locks and mapping if mandatory locking is 10418 * enabled. 10419 */ 10420 if (flk_has_remote_locks(vp)) { 10421 struct vattr va; 10422 va.va_mask = AT_MODE; 10423 error = nfs4getattr(vp, &va, cr); 10424 if (error != 0) 10425 goto done; 10426 if (MANDLOCK(vp, va.va_mode)) { 10427 error = EAGAIN; 10428 goto done; 10429 } 10430 } 10431 10432 /* 10433 * It is possible that the rnode has a lost lock request that we 10434 * are still trying to recover, and that the request conflicts with 10435 * this map request. 10436 * 10437 * An alternative approach would be for nfs4_safemap() to consider 10438 * queued lock requests when deciding whether to set or clear 10439 * VNOCACHE. This would require the frlock code path to call 10440 * nfs4_safemap() after enqueueing a lost request.
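 *
 * (Editorial summary, not part of the original comment: the
 * locking in this function, from the code above and below, is
 *
 *	r_rwlock (WRITER) -> r_inmap++ -> drop r_rwlock
 *	r_lkserlock (READER) -> ...mapping checks, as_map()...
 *	done: drop r_lkserlock -> r_inmap--
 *
 * so r_rwlock is never held across the over-the-wire work that
 * is done while r_lkserlock is held.)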
10441 */ 10442 if (nfs4_map_lost_lock_conflict(vp)) { 10443 error = EAGAIN; 10444 goto done; 10445 } 10446 10447 as_rangelock(as); 10448 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 10449 if (error != 0) { 10450 as_rangeunlock(as); 10451 goto done; 10452 } 10453 10454 if (vp->v_type == VREG) { 10455 /* 10456 * We need to retrieve the open stream 10457 */ 10458 nfs4_open_stream_t *osp = NULL; 10459 nfs4_open_owner_t *oop = NULL; 10460 10461 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10462 if (oop != NULL) { 10463 /* returns with 'os_sync_lock' held */ 10464 osp = find_open_stream(oop, rp); 10465 open_owner_rele(oop); 10466 } 10467 if (osp == NULL) { 10468 #ifdef DEBUG 10469 if (nfs4_force_open_before_mmap) { 10470 error = EIO; 10471 goto done; 10472 } 10473 #endif 10474 /* returns with 'os_sync_lock' held */ 10475 error = open_and_get_osp(vp, cr, &osp); 10476 if (osp == NULL) { 10477 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10478 "nfs4_map: we tried to OPEN the file " 10479 "but again no osp, so fail with EIO")); 10480 goto done; 10481 } 10482 } 10483 10484 if (osp->os_failed_reopen) { 10485 mutex_exit(&osp->os_sync_lock); 10486 open_stream_rele(osp, rp); 10487 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 10488 "nfs4_map: os_failed_reopen set on " 10489 "osp %p, cr %p, rp %s", (void *)osp, 10490 (void *)cr, rnode4info(rp))); 10491 error = EIO; 10492 goto done; 10493 } 10494 mutex_exit(&osp->os_sync_lock); 10495 open_stream_rele(osp, rp); 10496 } 10497 10498 vn_a.vp = vp; 10499 vn_a.offset = off; 10500 vn_a.type = (flags & MAP_TYPE); 10501 vn_a.prot = (uchar_t)prot; 10502 vn_a.maxprot = (uchar_t)maxprot; 10503 vn_a.flags = (flags & ~MAP_TYPE); 10504 vn_a.cred = cr; 10505 vn_a.amp = NULL; 10506 vn_a.szc = 0; 10507 vn_a.lgrp_mem_policy_flags = 0; 10508 10509 error = as_map(as, *addrp, len, segvn_create, &vn_a); 10510 as_rangeunlock(as); 10511 10512 done: 10513 nfs_rw_exit(&rp->r_lkserlock); 10514 atomic_add_int(&rp->r_inmap, -1); 10515 return (error); 10516 } 10517 10518 /* 10519 * We're most likely dealing with a kernel module that likes to READ 10520 * and mmap without OPENing the file (i.e., lookup/read/mmap), so let's 10521 * officially OPEN the file to create the necessary client state 10522 * for bookkeeping of os_mmap_read/write counts. 10523 * 10524 * Since VOP_MAP only passes in a pointer to the vnode rather than 10525 * a double pointer, we can't handle the case where nfs4open_otw() 10526 * returns a different vnode than the one passed into VOP_MAP (since 10527 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case, 10528 * we return NULL and let nfs4_map() fail. Note: the only case where 10529 * this should happen is if the file got removed and replaced with the 10530 * same name on the server (in addition to the fact that we're trying 10531 * to VOP_MAP without VOP_OPENing the file in the first place).
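 *
 * (Editorial sketch, not part of the original comment: the
 * consumer pattern being described is roughly
 *
 *	error = lookupname("/path/to/file", UIO_SYSSPACE, FOLLOW,
 *	    NULLVPP, &vp);
 *	... vn_rdwr() and/or VOP_MAP() on vp, no VOP_OPEN ...
 *	VN_RELE(vp);
 *
 * so no open stream exists for the rnode until we create one
 * here.)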
10532 */ 10533 static int 10534 open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp) 10535 { 10536 rnode4_t *rp, *drp; 10537 vnode_t *dvp, *open_vp; 10538 char file_name[MAXNAMELEN]; 10539 int just_created; 10540 nfs4_open_stream_t *osp; 10541 nfs4_open_owner_t *oop; 10542 int error; 10543 10544 *ospp = NULL; 10545 open_vp = map_vp; 10546 10547 rp = VTOR4(open_vp); 10548 if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0) 10549 return (error); 10550 drp = VTOR4(dvp); 10551 10552 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) { 10553 VN_RELE(dvp); 10554 return (EINTR); 10555 } 10556 10557 if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) { 10558 nfs_rw_exit(&drp->r_rwlock); 10559 VN_RELE(dvp); 10560 return (error); 10561 } 10562 10563 mutex_enter(&rp->r_statev4_lock); 10564 if (rp->created_v4) { 10565 rp->created_v4 = 0; 10566 mutex_exit(&rp->r_statev4_lock); 10567 10568 dnlc_update(dvp, file_name, open_vp); 10569 /* This is needed so we don't bump the open ref count */ 10570 just_created = 1; 10571 } else { 10572 mutex_exit(&rp->r_statev4_lock); 10573 just_created = 0; 10574 } 10575 10576 VN_HOLD(map_vp); 10577 10578 error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0, 10579 just_created); 10580 if (error) { 10581 nfs_rw_exit(&drp->r_rwlock); 10582 VN_RELE(dvp); 10583 VN_RELE(map_vp); 10584 return (error); 10585 } 10586 10587 nfs_rw_exit(&drp->r_rwlock); 10588 VN_RELE(dvp); 10589 10590 /* 10591 * If nfs4open_otw() returned a different vnode then "undo" 10592 * the open and return failure to the caller. 10593 */ 10594 if (!VN_CMP(open_vp, map_vp)) { 10595 nfs4_error_t e; 10596 10597 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10598 "open returned a different vnode")); 10599 /* 10600 * If there's an error, ignore it, 10601 * and let VOP_INACTIVE handle it. 10602 */ 10603 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10604 CLOSE_NORM, 0, 0, 0); 10605 VN_RELE(map_vp); 10606 return (EIO); 10607 } 10608 10609 VN_RELE(map_vp); 10610 10611 oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp)); 10612 if (!oop) { 10613 nfs4_error_t e; 10614 10615 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10616 "no open owner")); 10617 /* 10618 * If there's an error, ignore it, 10619 * and let VOP_INACTIVE handle it. 10620 */ 10621 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10622 CLOSE_NORM, 0, 0, 0); 10623 return (EIO); 10624 } 10625 osp = find_open_stream(oop, rp); 10626 open_owner_rele(oop); 10627 *ospp = osp; 10628 return (0); 10629 } 10630 10631 /* 10632 * Please be aware that when this function is called, the address space write 10633 * a_lock is held. Do not put over the wire calls in this function. 10634 */ 10635 /* ARGSUSED */ 10636 static int 10637 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10638 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 10639 caller_context_t *ct) 10640 { 10641 rnode4_t *rp; 10642 int error = 0; 10643 mntinfo4_t *mi; 10644 10645 mi = VTOMI4(vp); 10646 rp = VTOR4(vp); 10647 10648 if (nfs_zone() != mi->mi_zone) 10649 return (EIO); 10650 if (vp->v_flag & VNOMAP) 10651 return (ENOSYS); 10652 10653 /* 10654 * Don't need to update the open stream first, since this 10655 * mmap can't add any additional share access that isn't 10656 * already contained in the open stream (for the case where we 10657 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't 10658 * take into account os_mmap_read[write] counts). 
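 *
 * (Editorial aside, not part of the original comment: as a
 * worked example of the accounting below, a MAP_SHARED mapping
 * of three pages with maxprot PROT_READ|PROT_WRITE bumps
 * os_mmap_write by 3, os_mmap_read by 3 and os_mapcnt by 3, so
 * a reopen after a server reboot can reconstruct the share
 * access the mapping still needs.)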
10659 */ 10660 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len)); 10661 10662 if (vp->v_type == VREG) { 10663 /* 10664 * We need to retrieve the open stream and update the counts. 10665 * If there is no open stream here, something is wrong. 10666 */ 10667 nfs4_open_stream_t *osp = NULL; 10668 nfs4_open_owner_t *oop = NULL; 10669 10670 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10671 if (oop != NULL) { 10672 /* returns with 'os_sync_lock' held */ 10673 osp = find_open_stream(oop, rp); 10674 open_owner_rele(oop); 10675 } 10676 if (osp == NULL) { 10677 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10678 "nfs4_addmap: we should have an osp" 10679 "but we don't, so fail with EIO")); 10680 error = EIO; 10681 goto out; 10682 } 10683 10684 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p," 10685 " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot)); 10686 10687 /* 10688 * Update the map count in the open stream. 10689 * This is necessary in the case where we 10690 * open/mmap/close/, then the server reboots, and we 10691 * attempt to reopen. If the mmap doesn't add share 10692 * access then we send an invalid reopen with 10693 * access = NONE. 10694 * 10695 * We need to specifically check each PROT_* so a mmap 10696 * call of (PROT_WRITE | PROT_EXEC) will ensure us both 10697 * read and write access. A simple comparison of prot 10698 * to ~PROT_WRITE to determine read access is insufficient 10699 * since prot can be |= with PROT_USER, etc. 10700 */ 10701 10702 /* 10703 * Unless we're MAP_SHARED, no sense in adding os_mmap_write 10704 */ 10705 if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 10706 osp->os_mmap_write += btopr(len); 10707 if (maxprot & PROT_READ) 10708 osp->os_mmap_read += btopr(len); 10709 if (maxprot & PROT_EXEC) 10710 osp->os_mmap_read += btopr(len); 10711 /* 10712 * Ensure that os_mmap_read gets incremented, even if 10713 * maxprot were to look like PROT_NONE. 10714 */ 10715 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 10716 !(maxprot & PROT_EXEC)) 10717 osp->os_mmap_read += btopr(len); 10718 osp->os_mapcnt += btopr(len); 10719 mutex_exit(&osp->os_sync_lock); 10720 open_stream_rele(osp, rp); 10721 } 10722 10723 out: 10724 /* 10725 * If we got an error, then undo our 10726 * incrementing of 'r_mapcnt'. 10727 */ 10728 10729 if (error) { 10730 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len)); 10731 ASSERT(rp->r_mapcnt >= 0); 10732 } 10733 return (error); 10734 } 10735 10736 /* ARGSUSED */ 10737 static int 10738 nfs4_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) 10739 { 10740 10741 return (VTOR4(vp1) == VTOR4(vp2)); 10742 } 10743 10744 /* ARGSUSED */ 10745 static int 10746 nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10747 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr, 10748 caller_context_t *ct) 10749 { 10750 int rc; 10751 u_offset_t start, end; 10752 rnode4_t *rp; 10753 int error = 0, intr = INTR4(vp); 10754 nfs4_error_t e; 10755 10756 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10757 return (EIO); 10758 10759 /* check for valid cmd parameter */ 10760 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW) 10761 return (EINVAL); 10762 10763 /* Verify l_type. 
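 * (Editorial aside, not part of the original comment: e.g. a
 * descriptor opened O_WRONLY has FREAD clear, so an F_SETLK
 * with l_type = F_RDLCK fails below with EBADF, matching
 * fcntl(2) semantics; F_GETLK is exempt since it only tests.)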
*/ 10764 switch (bfp->l_type) { 10765 case F_RDLCK: 10766 if (cmd != F_GETLK && !(flag & FREAD)) 10767 return (EBADF); 10768 break; 10769 case F_WRLCK: 10770 if (cmd != F_GETLK && !(flag & FWRITE)) 10771 return (EBADF); 10772 break; 10773 case F_UNLCK: 10774 intr = 0; 10775 break; 10776 10777 default: 10778 return (EINVAL); 10779 } 10780 10781 /* check the validity of the lock range */ 10782 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset)) 10783 return (rc); 10784 if (rc = flk_check_lock_data(start, end, MAXEND)) 10785 return (rc); 10786 10787 /* 10788 * If the filesystem is mounted using local locking, pass the 10789 * request off to the local locking code. 10790 */ 10791 if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) { 10792 if (cmd == F_SETLK || cmd == F_SETLKW) { 10793 /* 10794 * For complete safety, we should be holding 10795 * r_lkserlock. However, we can't call 10796 * nfs4_safelock and then fs_frlock while 10797 * holding r_lkserlock, so just invoke 10798 * nfs4_safelock and expect that this will 10799 * catch enough of the cases. 10800 */ 10801 if (!nfs4_safelock(vp, bfp, cr)) 10802 return (EAGAIN); 10803 } 10804 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 10805 } 10806 10807 rp = VTOR4(vp); 10808 10809 /* 10810 * Check whether the given lock request can proceed, given the 10811 * current file mappings. 10812 */ 10813 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr)) 10814 return (EINTR); 10815 if (cmd == F_SETLK || cmd == F_SETLKW) { 10816 if (!nfs4_safelock(vp, bfp, cr)) { 10817 rc = EAGAIN; 10818 goto done; 10819 } 10820 } 10821 10822 /* 10823 * Flush the cache after waiting for async I/O to finish. For new 10824 * locks, this is so that the process gets the latest bits from the 10825 * server. For unlocks, this is so that other clients see the 10826 * latest bits once the file has been unlocked. If currently dirty 10827 * pages can't be flushed, then don't allow a lock to be set. But 10828 * allow unlocks to succeed, to avoid having orphan locks on the 10829 * server. 10830 */ 10831 if (cmd != F_GETLK) { 10832 mutex_enter(&rp->r_statelock); 10833 while (rp->r_count > 0) { 10834 if (intr) { 10835 klwp_t *lwp = ttolwp(curthread); 10836 10837 if (lwp != NULL) 10838 lwp->lwp_nostop++; 10839 if (cv_wait_sig(&rp->r_cv, 10840 &rp->r_statelock) == 0) { 10841 if (lwp != NULL) 10842 lwp->lwp_nostop--; 10843 rc = EINTR; 10844 break; 10845 } 10846 if (lwp != NULL) 10847 lwp->lwp_nostop--; 10848 } else 10849 cv_wait(&rp->r_cv, &rp->r_statelock); 10850 } 10851 mutex_exit(&rp->r_statelock); 10852 if (rc != 0) 10853 goto done; 10854 error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct); 10855 if (error) { 10856 if (error == ENOSPC || error == EDQUOT) { 10857 mutex_enter(&rp->r_statelock); 10858 if (!rp->r_error) 10859 rp->r_error = error; 10860 mutex_exit(&rp->r_statelock); 10861 } 10862 if (bfp->l_type != F_UNLCK) { 10863 rc = ENOLCK; 10864 goto done; 10865 } 10866 } 10867 } 10868 10869 /* 10870 * Call the lock manager to do the real work of contacting 10871 * the server and obtaining the lock. 10872 */ 10873 nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset, 10874 cr, &e, NULL, NULL); 10875 rc = e.error; 10876 10877 if (rc == 0) 10878 nfs4_lockcompletion(vp, cmd); 10879 10880 done: 10881 nfs_rw_exit(&rp->r_lkserlock); 10882 10883 return (rc); 10884 } 10885 10886 /* 10887 * Free storage space associated with the specified vnode. 
The portion 10888 * to be freed is specified by bfp->l_start and bfp->l_len (already 10889 * normalized to a "whence" of 0). 10890 * 10891 * This is an experimental facility whose continued existence is not 10892 * guaranteed. Currently, we only support the special case 10893 * of l_len == 0, meaning free to end of file. 10894 */ 10895 /* ARGSUSED */ 10896 static int 10897 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10898 offset_t offset, cred_t *cr, caller_context_t *ct) 10899 { 10900 int error; 10901 10902 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10903 return (EIO); 10904 ASSERT(vp->v_type == VREG); 10905 if (cmd != F_FREESP) 10906 return (EINVAL); 10907 10908 error = convoff(vp, bfp, 0, offset); 10909 if (!error) { 10910 ASSERT(bfp->l_start >= 0); 10911 if (bfp->l_len == 0) { 10912 struct vattr va; 10913 10914 va.va_mask = AT_SIZE; 10915 va.va_size = bfp->l_start; 10916 error = nfs4setattr(vp, &va, 0, cr, NULL); 10917 } else 10918 error = EINVAL; 10919 } 10920 10921 return (error); 10922 } 10923 10924 /* ARGSUSED */ 10925 int 10926 nfs4_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) 10927 { 10928 rnode4_t *rp; 10929 rp = VTOR4(vp); 10930 10931 if (vp->v_type == VREG && IS_SHADOW(vp, rp)) { 10932 vp = RTOV4(rp); 10933 } 10934 *vpp = vp; 10935 return (0); 10936 } 10937 10938 /* 10939 * Setup and add an address space callback to do the work of the delmap call. 10940 * The callback will (and must be) deleted in the actual callback function. 10941 * 10942 * This is done in order to take care of the problem that we have with holding 10943 * the address space's a_lock for a long period of time (e.g. if the NFS server 10944 * is down). Callbacks will be executed in the address space code while the 10945 * a_lock is not held. Holding the address space's a_lock causes things such 10946 * as ps and fork to hang because they are trying to acquire this lock as well. 10947 */ 10948 /* ARGSUSED */ 10949 static int 10950 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10951 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 10952 caller_context_t *ct) 10953 { 10954 int caller_found; 10955 int error; 10956 rnode4_t *rp; 10957 nfs4_delmap_args_t *dmapp; 10958 nfs4_delmapcall_t *delmap_call; 10959 10960 if (vp->v_flag & VNOMAP) 10961 return (ENOSYS); 10962 10963 /* 10964 * A process may not change zones if it has NFS pages mmap'ed 10965 * in, so we can't legitimately get here from the wrong zone. 10966 */ 10967 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10968 10969 rp = VTOR4(vp); 10970 10971 /* 10972 * The way that the address space of this process deletes its mapping 10973 * of this file is via the following call chains: 10974 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 10975 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 10976 * 10977 * With the use of address space callbacks we are allowed to drop the 10978 * address space lock, a_lock, while executing the NFS operations that 10979 * need to go over the wire. Returning EAGAIN to the caller of this 10980 * function is what drives the execution of the callback that we add 10981 * below. The callback will be executed by the address space code 10982 * after dropping the a_lock. When the callback is finished, since 10983 * we dropped the a_lock, it must be re-acquired and segvn_unmap() 10984 * is called again on the same segment to finish the rest of the work 10985 * that needs to happen during unmapping. 
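 *
 * (Editorial sketch, not part of the original comment: the
 * resulting flow is
 *
 *	as_unmap -> segvn_unmap -> nfs4_delmap		(first call)
 *		add delmap caller, as_add_callback(), return EAGAIN
 *	address space code drops a_lock, runs nfs4_delmap_callback()
 *	as_unmap -> segvn_unmap -> nfs4_delmap		(second call)
 *		find delmap caller, return the callback's error.)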
10986 * 10987 * This action of calling back into the segment driver causes 10988 * nfs4_delmap() to get called again, but since the callback was 10989 * already executed at this point, it already did the work and there 10990 * is nothing left for us to do. 10991 * 10992 * To Summarize: 10993 * - The first time nfs4_delmap is called by the current thread is when 10994 * we add the caller associated with this delmap to the delmap caller 10995 * list, add the callback, and return EAGAIN. 10996 * - The second time in this call chain when nfs4_delmap is called we 10997 * will find this caller in the delmap caller list and realize there 10998 * is no more work to do thus removing this caller from the list and 10999 * returning the error that was set in the callback execution. 11000 */ 11001 caller_found = nfs4_find_and_delete_delmapcall(rp, &error); 11002 if (caller_found) { 11003 /* 11004 * 'error' is from the actual delmap operations. To avoid 11005 * hangs, we need to handle the return of EAGAIN differently 11006 * since this is what drives the callback execution. 11007 * In this case, we don't want to return EAGAIN and do the 11008 * callback execution because there are none to execute. 11009 */ 11010 if (error == EAGAIN) 11011 return (0); 11012 else 11013 return (error); 11014 } 11015 11016 /* current caller was not in the list */ 11017 delmap_call = nfs4_init_delmapcall(); 11018 11019 mutex_enter(&rp->r_statelock); 11020 list_insert_tail(&rp->r_indelmap, delmap_call); 11021 mutex_exit(&rp->r_statelock); 11022 11023 dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP); 11024 11025 dmapp->vp = vp; 11026 dmapp->off = off; 11027 dmapp->addr = addr; 11028 dmapp->len = len; 11029 dmapp->prot = prot; 11030 dmapp->maxprot = maxprot; 11031 dmapp->flags = flags; 11032 dmapp->cr = cr; 11033 dmapp->caller = delmap_call; 11034 11035 error = as_add_callback(as, nfs4_delmap_callback, dmapp, 11036 AS_UNMAP_EVENT, addr, len, KM_SLEEP); 11037 11038 return (error ? error : EAGAIN); 11039 } 11040 11041 static nfs4_delmapcall_t * 11042 nfs4_init_delmapcall() 11043 { 11044 nfs4_delmapcall_t *delmap_call; 11045 11046 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP); 11047 delmap_call->call_id = curthread; 11048 delmap_call->error = 0; 11049 11050 return (delmap_call); 11051 } 11052 11053 static void 11054 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call) 11055 { 11056 kmem_free(delmap_call, sizeof (nfs4_delmapcall_t)); 11057 } 11058 11059 /* 11060 * Searches for the current delmap caller (based on curthread) in the list of 11061 * callers. If it is found, we remove it and free the delmap caller. 11062 * Returns: 11063 * 0 if the caller wasn't found 11064 * 1 if the caller was found, removed and freed. *errp will be set 11065 * to what the result of the delmap was. 11066 */ 11067 static int 11068 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp) 11069 { 11070 nfs4_delmapcall_t *delmap_call; 11071 11072 /* 11073 * If the list doesn't exist yet, we create it and return 11074 * that the caller wasn't found. No list = no callers. 
11075 */ 11076 mutex_enter(&rp->r_statelock); 11077 if (!(rp->r_flags & R4DELMAPLIST)) { 11078 /* The list does not exist */ 11079 list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t), 11080 offsetof(nfs4_delmapcall_t, call_node)); 11081 rp->r_flags |= R4DELMAPLIST; 11082 mutex_exit(&rp->r_statelock); 11083 return (0); 11084 } else { 11085 /* The list exists so search it */ 11086 for (delmap_call = list_head(&rp->r_indelmap); 11087 delmap_call != NULL; 11088 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 11089 if (delmap_call->call_id == curthread) { 11090 /* current caller is in the list */ 11091 *errp = delmap_call->error; 11092 list_remove(&rp->r_indelmap, delmap_call); 11093 mutex_exit(&rp->r_statelock); 11094 nfs4_free_delmapcall(delmap_call); 11095 return (1); 11096 } 11097 } 11098 } 11099 mutex_exit(&rp->r_statelock); 11100 return (0); 11101 } 11102 11103 /* 11104 * Remove some pages from an mmap'd vnode. Just update the 11105 * count of pages. If doing close-to-open, then flush and 11106 * commit all of the pages associated with this file. 11107 * Otherwise, start an asynchronous page flush to write out 11108 * any dirty pages. This will also associate a credential 11109 * with the rnode which can be used to write the pages. 11110 */ 11111 /* ARGSUSED */ 11112 static void 11113 nfs4_delmap_callback(struct as *as, void *arg, uint_t event) 11114 { 11115 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 11116 rnode4_t *rp; 11117 mntinfo4_t *mi; 11118 nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg; 11119 11120 rp = VTOR4(dmapp->vp); 11121 mi = VTOMI4(dmapp->vp); 11122 11123 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len)); 11124 ASSERT(rp->r_mapcnt >= 0); 11125 11126 /* 11127 * Initiate a page flush and potential commit if there are 11128 * pages, the file system was not mounted readonly, the segment 11129 * was mapped shared, and the pages themselves were writeable. 11130 */ 11131 if (nfs4_has_pages(dmapp->vp) && 11132 !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) && 11133 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) { 11134 mutex_enter(&rp->r_statelock); 11135 rp->r_flags |= R4DIRTY; 11136 mutex_exit(&rp->r_statelock); 11137 e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off, 11138 dmapp->len, dmapp->cr); 11139 if (!e.error) { 11140 mutex_enter(&rp->r_statelock); 11141 e.error = rp->r_error; 11142 rp->r_error = 0; 11143 mutex_exit(&rp->r_statelock); 11144 } 11145 } else 11146 e.error = 0; 11147 11148 if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) 11149 (void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len, 11150 B_INVAL, dmapp->cr, NULL); 11151 11152 if (e.error) { 11153 e.stat = puterrno4(e.error); 11154 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 11155 OP_COMMIT, FALSE, NULL, 0, dmapp->vp); 11156 dmapp->caller->error = e.error; 11157 } 11158 11159 /* Check to see if we need to close the file */ 11160 11161 if (dmapp->vp->v_type == VREG) { 11162 nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e, 11163 CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags); 11164 11165 if (e.error != 0 || e.stat != NFS4_OK) { 11166 /* 11167 * Since it is possible that e.error == 0 and 11168 * e.stat != NFS4_OK (and vice versa), 11169 * we do the proper checking in order to get both 11170 * e.error and e.stat reporting the correct info. 
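 *
 * (Editorial aside, not part of the original comment: e.g. if
 * CLOSE failed with e.error == 0 and e.stat == NFS4ERR_STALE,
 * the geterrno4() below yields e.error = ESTALE; conversely, a
 * local e.error == EIO with e.stat == NFS4_OK is mapped through
 * puterrno4() so both fields report the failure.)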
11171 */ 11172 if (e.stat == NFS4_OK) 11173 e.stat = puterrno4(e.error); 11174 if (e.error == 0) 11175 e.error = geterrno4(e.stat); 11176 11177 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 11178 OP_CLOSE, FALSE, NULL, 0, dmapp->vp); 11179 dmapp->caller->error = e.error; 11180 } 11181 } 11182 11183 (void) as_delete_callback(as, arg); 11184 kmem_free(dmapp, sizeof (nfs4_delmap_args_t)); 11185 } 11186 11187 11188 static uint_t 11189 fattr4_maxfilesize_to_bits(uint64_t ll) 11190 { 11191 uint_t l = 1; 11192 11193 if (ll == 0) { 11194 return (0); 11195 } 11196 11197 if (ll & 0xffffffff00000000) { 11198 l += 32; ll >>= 32; 11199 } 11200 if (ll & 0xffff0000) { 11201 l += 16; ll >>= 16; 11202 } 11203 if (ll & 0xff00) { 11204 l += 8; ll >>= 8; 11205 } 11206 if (ll & 0xf0) { 11207 l += 4; ll >>= 4; 11208 } 11209 if (ll & 0xc) { 11210 l += 2; ll >>= 2; 11211 } 11212 if (ll & 0x2) { 11213 l += 1; 11214 } 11215 return (l); 11216 } 11217 11218 static int 11219 nfs4_have_xattrs(vnode_t *vp, ulong_t *valp, cred_t *cr) 11220 { 11221 vnode_t *avp = NULL; 11222 int error; 11223 11224 if ((error = nfs4lookup_xattr(vp, "", &avp, 11225 LOOKUP_XATTR, cr)) == 0) 11226 error = do_xattr_exists_check(avp, valp, cr); 11227 if (avp) 11228 VN_RELE(avp); 11229 11230 return (error); 11231 } 11232 11233 /* ARGSUSED */ 11234 int 11235 nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 11236 caller_context_t *ct) 11237 { 11238 int error; 11239 hrtime_t t; 11240 rnode4_t *rp; 11241 nfs4_ga_res_t gar; 11242 nfs4_ga_ext_res_t ger; 11243 11244 gar.n4g_ext_res = &ger; 11245 11246 if (nfs_zone() != VTOMI4(vp)->mi_zone) 11247 return (EIO); 11248 if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) { 11249 *valp = MAXPATHLEN; 11250 return (0); 11251 } 11252 if (cmd == _PC_ACL_ENABLED) { 11253 *valp = _ACL_ACE_ENABLED; 11254 return (0); 11255 } 11256 11257 rp = VTOR4(vp); 11258 if (cmd == _PC_XATTR_EXISTS) { 11259 /* 11260 * The existence of the xattr directory is not sufficient 11261 * for determining whether generic user attributes exist. 11262 * The attribute directory could only be a transient directory 11263 * used for Solaris sysattr support. Do a small readdir 11264 * to verify if the only entries are sysattrs or not. 11265 * 11266 * pc4_xattr_valid can only be trusted when r_xattr_dir 11267 * is NULL. Once the xadir vp exists, we can create xattrs, 11268 * and we don't have any way to update the "base" object's 11269 * pc4_xattr_exists from the xattr or xadir. Maybe FEM 11270 * could help out.
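 *
 * (Editorial aside, not part of the original comment: the
 * consumer of all this is a pathconf(2) query such as
 *
 *	long r = pathconf("/mnt/file", _PC_XATTR_EXISTS);
 *
 * where r == 1 must mean real user-visible xattrs exist; the
 * readdir check keeps transient sysattr-only directories from
 * turning that answer into a false positive.)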
11271 */ 11272 if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid && 11273 rp->r_xattr_dir == NULL) { 11274 return (nfs4_have_xattrs(vp, valp, cr)); 11275 } 11276 } else { /* OLD CODE */ 11277 if (ATTRCACHE4_VALID(vp)) { 11278 mutex_enter(&rp->r_statelock); 11279 if (rp->r_pathconf.pc4_cache_valid) { 11280 error = 0; 11281 switch (cmd) { 11282 case _PC_FILESIZEBITS: 11283 *valp = 11284 rp->r_pathconf.pc4_filesizebits; 11285 break; 11286 case _PC_LINK_MAX: 11287 *valp = 11288 rp->r_pathconf.pc4_link_max; 11289 break; 11290 case _PC_NAME_MAX: 11291 *valp = 11292 rp->r_pathconf.pc4_name_max; 11293 break; 11294 case _PC_CHOWN_RESTRICTED: 11295 *valp = 11296 rp->r_pathconf.pc4_chown_restricted; 11297 break; 11298 case _PC_NO_TRUNC: 11299 *valp = 11300 rp->r_pathconf.pc4_no_trunc; 11301 break; 11302 default: 11303 error = EINVAL; 11304 break; 11305 } 11306 mutex_exit(&rp->r_statelock); 11307 #ifdef DEBUG 11308 nfs4_pathconf_cache_hits++; 11309 #endif 11310 return (error); 11311 } 11312 mutex_exit(&rp->r_statelock); 11313 } 11314 } 11315 #ifdef DEBUG 11316 nfs4_pathconf_cache_misses++; 11317 #endif 11318 11319 t = gethrtime(); 11320 11321 error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr); 11322 11323 if (error) { 11324 mutex_enter(&rp->r_statelock); 11325 rp->r_pathconf.pc4_cache_valid = FALSE; 11326 rp->r_pathconf.pc4_xattr_valid = FALSE; 11327 mutex_exit(&rp->r_statelock); 11328 return (error); 11329 } 11330 11331 /* interpret the max filesize */ 11332 gar.n4g_ext_res->n4g_pc4.pc4_filesizebits = 11333 fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize); 11334 11335 /* Store the attributes we just received */ 11336 nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL); 11337 11338 switch (cmd) { 11339 case _PC_FILESIZEBITS: 11340 *valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits; 11341 break; 11342 case _PC_LINK_MAX: 11343 *valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max; 11344 break; 11345 case _PC_NAME_MAX: 11346 *valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max; 11347 break; 11348 case _PC_CHOWN_RESTRICTED: 11349 *valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted; 11350 break; 11351 case _PC_NO_TRUNC: 11352 *valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc; 11353 break; 11354 case _PC_XATTR_EXISTS: 11355 if (gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists) { 11356 if (error = nfs4_have_xattrs(vp, valp, cr)) 11357 return (error); 11358 } 11359 break; 11360 default: 11361 return (EINVAL); 11362 } 11363 11364 return (0); 11365 } 11366 11367 /* 11368 * Called by async thread to do synchronous pageio. Do the i/o, wait 11369 * for it to complete, and cleanup the page list when done. 11370 */ 11371 static int 11372 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11373 int flags, cred_t *cr) 11374 { 11375 int error; 11376 11377 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11378 11379 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11380 if (flags & B_READ) 11381 pvn_read_done(pp, (error ? B_ERROR : 0) | flags); 11382 else 11383 pvn_write_done(pp, (error ? 
B_ERROR : 0) | flags); 11384 return (error); 11385 } 11386 11387 /* ARGSUSED */ 11388 static int 11389 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11390 int flags, cred_t *cr, caller_context_t *ct) 11391 { 11392 int error; 11393 rnode4_t *rp; 11394 11395 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 11396 return (EIO); 11397 11398 if (pp == NULL) 11399 return (EINVAL); 11400 11401 rp = VTOR4(vp); 11402 mutex_enter(&rp->r_statelock); 11403 rp->r_count++; 11404 mutex_exit(&rp->r_statelock); 11405 11406 if (flags & B_ASYNC) { 11407 error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr, 11408 nfs4_sync_pageio); 11409 } else 11410 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11411 mutex_enter(&rp->r_statelock); 11412 rp->r_count--; 11413 cv_broadcast(&rp->r_cv); 11414 mutex_exit(&rp->r_statelock); 11415 return (error); 11416 } 11417 11418 /* ARGSUSED */ 11419 static void 11420 nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr, 11421 caller_context_t *ct) 11422 { 11423 int error; 11424 rnode4_t *rp; 11425 page_t *plist; 11426 page_t *pptr; 11427 offset3 offset; 11428 count3 len; 11429 k_sigset_t smask; 11430 11431 /* 11432 * We should get called with fl equal to either B_FREE or 11433 * B_INVAL. Any other value is illegal. 11434 * 11435 * The page that we are either supposed to free or destroy 11436 * should be exclusively locked and its io lock should not 11437 * be held. 11438 */ 11439 ASSERT(fl == B_FREE || fl == B_INVAL); 11440 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 11441 11442 rp = VTOR4(vp); 11443 11444 /* 11445 * If the page doesn't need to be committed or we shouldn't 11446 * even bother attempting to commit it, then just make sure 11447 * that the p_fsdata byte is clear and then either free or 11448 * destroy the page as appropriate. 11449 */ 11450 if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) { 11451 pp->p_fsdata = C_NOCOMMIT; 11452 if (fl == B_FREE) 11453 page_free(pp, dn); 11454 else 11455 page_destroy(pp, dn); 11456 return; 11457 } 11458 11459 /* 11460 * If there is a page invalidation operation going on, then 11461 * if this is one of the pages being destroyed, then just 11462 * clear the p_fsdata byte and then either free or destroy 11463 * the page as appropriate. 11464 */ 11465 mutex_enter(&rp->r_statelock); 11466 if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) { 11467 mutex_exit(&rp->r_statelock); 11468 pp->p_fsdata = C_NOCOMMIT; 11469 if (fl == B_FREE) 11470 page_free(pp, dn); 11471 else 11472 page_destroy(pp, dn); 11473 return; 11474 } 11475 11476 /* 11477 * If we are freeing this page and someone else is already 11478 * waiting to do a commit, then just unlock the page and 11479 * return. That other thread will take care of committing 11480 * this page. The page can be freed sometime after the 11481 * commit has finished. Otherwise, if the page is marked 11482 * as delay commit, then we may be getting called from 11483 * pvn_write_done, one page at a time. This could result 11484 * in one commit per page, so we end up doing lots of small 11485 * commits instead of fewer larger commits. This is bad; 11486 * we want to do as few commits as possible.
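 *
 * (Editorial aside, not part of the original comment: e.g.
 * freeing 256 committable 4K pages one at a time from
 * pvn_write_done() could otherwise cost up to 256 single-page
 * COMMIT calls; marking them C_COMMIT below and letting
 * nfs4_get_commit() gather them batches that into one 1MB
 * COMMIT.)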
11487 */ 11488 if (fl == B_FREE) { 11489 if (rp->r_flags & R4COMMITWAIT) { 11490 page_unlock(pp); 11491 mutex_exit(&rp->r_statelock); 11492 return; 11493 } 11494 if (pp->p_fsdata == C_DELAYCOMMIT) { 11495 pp->p_fsdata = C_COMMIT; 11496 page_unlock(pp); 11497 mutex_exit(&rp->r_statelock); 11498 return; 11499 } 11500 } 11501 11502 /* 11503 * Check to see if there is a signal which would prevent an 11504 * attempt to commit the pages from being successful. If so, 11505 * then don't bother with all of the work to gather pages and 11506 * generate the unsuccessful RPC. Just return from here and 11507 * let the page be committed at some later time. 11508 */ 11509 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT); 11510 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) { 11511 sigunintr(&smask); 11512 page_unlock(pp); 11513 mutex_exit(&rp->r_statelock); 11514 return; 11515 } 11516 sigunintr(&smask); 11517 11518 /* 11519 * We are starting to need to commit pages, so let's try 11520 * to commit as many as possible at once to reduce the 11521 * overhead. 11522 * 11523 * Set the `commit inprogress' state bit. We must 11524 * first wait until any current one finishes. Then 11525 * we initialize the c_pages list with this page. 11526 */ 11527 while (rp->r_flags & R4COMMIT) { 11528 rp->r_flags |= R4COMMITWAIT; 11529 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 11530 rp->r_flags &= ~R4COMMITWAIT; 11531 } 11532 rp->r_flags |= R4COMMIT; 11533 mutex_exit(&rp->r_statelock); 11534 ASSERT(rp->r_commit.c_pages == NULL); 11535 rp->r_commit.c_pages = pp; 11536 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11537 rp->r_commit.c_commlen = PAGESIZE; 11538 11539 /* 11540 * Gather together all other pages which can be committed. 11541 * They will all be chained off r_commit.c_pages. 11542 */ 11543 nfs4_get_commit(vp); 11544 11545 /* 11546 * Clear the `commit inprogress' status and disconnect 11547 * the list of pages to be committed from the rnode. 11548 * At this same time, we also save the starting offset 11549 * and length of data to be committed on the server. 11550 */ 11551 plist = rp->r_commit.c_pages; 11552 rp->r_commit.c_pages = NULL; 11553 offset = rp->r_commit.c_commbase; 11554 len = rp->r_commit.c_commlen; 11555 mutex_enter(&rp->r_statelock); 11556 rp->r_flags &= ~R4COMMIT; 11557 cv_broadcast(&rp->r_commit.c_cv); 11558 mutex_exit(&rp->r_statelock); 11559 11560 if (curproc == proc_pageout || curproc == proc_fsflush || 11561 nfs_zone() != VTOMI4(vp)->mi_zone) { 11562 nfs4_async_commit(vp, plist, offset, len, 11563 cr, do_nfs4_async_commit); 11564 return; 11565 } 11566 11567 /* 11568 * Actually generate the over-the-wire COMMIT operation. 11569 */ 11570 error = nfs4_commit(vp, (offset4)offset, (count4)len, cr); 11571 11572 /* 11573 * If we got an error during the commit, just unlock all 11574 * of the pages. The pages will get retransmitted to the 11575 * server during a putpage operation. 11576 */ 11577 if (error) { 11578 while (plist != NULL) { 11579 pptr = plist; 11580 page_sub(&plist, pptr); 11581 page_unlock(pptr); 11582 } 11583 return; 11584 } 11585 11586 /* 11587 * We've tried as hard as we can to commit the data to stable 11588 * storage on the server. We just unlock the rest of the pages 11589 * and clear the commit required state. They will be put 11590 * onto the tail of the cachelist if they are no longer 11591 * mapped.
11592 */ 11593 while (plist != pp) { 11594 pptr = plist; 11595 page_sub(&plist, pptr); 11596 pptr->p_fsdata = C_NOCOMMIT; 11597 page_unlock(pptr); 11598 } 11599 11600 /* 11601 * It is possible that nfs4_commit didn't return an error, but 11602 * some other thread has modified the page we are going 11603 * to free/destroy. 11604 * In this case we need to rewrite the page. Do an explicit check 11605 * before attempting to free/destroy the page. If it was modified, 11606 * it needs to be rewritten, so unlock the page and return. 11607 */ 11608 if (hat_ismod(pp)) { 11609 pp->p_fsdata = C_NOCOMMIT; 11610 page_unlock(pp); 11611 return; 11612 } 11613 11614 /* 11615 * Now, as appropriate, either free or destroy the page 11616 * that we were called with. 11617 */ 11618 pp->p_fsdata = C_NOCOMMIT; 11619 if (fl == B_FREE) 11620 page_free(pp, dn); 11621 else 11622 page_destroy(pp, dn); 11623 } 11624 11625 /* 11626 * Commit requires that the current fh be the file written to. 11627 * The compound op structure is: 11628 * PUTFH(file), COMMIT 11629 */ 11630 static int 11631 nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr) 11632 { 11633 COMPOUND4args_clnt args; 11634 COMPOUND4res_clnt res; 11635 COMMIT4res *cm_res; 11636 nfs_argop4 argop[2]; 11637 nfs_resop4 *resop; 11638 int doqueue; 11639 mntinfo4_t *mi; 11640 rnode4_t *rp; 11641 cred_t *cred_otw = NULL; 11642 bool_t needrecov = FALSE; 11643 nfs4_recov_state_t recov_state; 11644 nfs4_open_stream_t *osp = NULL; 11645 bool_t first_time = TRUE; /* first time getting OTW cred */ 11646 bool_t last_time = FALSE; /* last time getting OTW cred */ 11647 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 11648 11649 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11650 11651 rp = VTOR4(vp); 11652 11653 mi = VTOMI4(vp); 11654 recov_state.rs_flags = 0; 11655 recov_state.rs_num_retry_despite_err = 0; 11656 get_commit_cred: 11657 /* 11658 * Releases the osp, if a valid open stream is provided. 11659 * Puts a hold on the cred_otw and the new osp (if found).
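 *
 * (Editorial summary, not part of the original comment: the
 * EACCES handling below retries the COMMIT once per candidate
 * credential:
 *
 *	get_commit_cred:
 *		cred_otw = nfs4_get_otw_cred_by_osp(...);
 *	...COMMIT fails with EACCES and last_time == FALSE...
 *		goto get_commit_cred;
 *
 * until last_time is set, at which point the error is returned
 * to the caller.)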
11660 */ 11661 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 11662 &first_time, &last_time); 11663 args.ctag = TAG_COMMIT; 11664 recov_retry: 11665 /* 11666 * Commit ops: putfh file; commit 11667 */ 11668 args.array_len = 2; 11669 args.array = argop; 11670 11671 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11672 &recov_state, NULL); 11673 if (e.error) { 11674 crfree(cred_otw); 11675 if (osp != NULL) 11676 open_stream_rele(osp, rp); 11677 return (e.error); 11678 } 11679 11680 /* putfh file */ 11681 argop[0].argop = OP_CPUTFH; 11682 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 11683 11684 /* commit */ 11685 argop[1].argop = OP_COMMIT; 11686 argop[1].nfs_argop4_u.opcommit.offset = offset; 11687 argop[1].nfs_argop4_u.opcommit.count = count; 11688 11689 doqueue = 1; 11690 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e); 11691 11692 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 11693 if (!needrecov && e.error) { 11694 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, 11695 needrecov); 11696 crfree(cred_otw); 11697 if (e.error == EACCES && last_time == FALSE) 11698 goto get_commit_cred; 11699 if (osp != NULL) 11700 open_stream_rele(osp, rp); 11701 return (e.error); 11702 } 11703 11704 if (needrecov) { 11705 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 11706 NULL, OP_COMMIT, NULL) == FALSE) { 11707 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11708 &recov_state, needrecov); 11709 if (!e.error) 11710 (void) xdr_free(xdr_COMPOUND4res_clnt, 11711 (caddr_t)&res); 11712 goto recov_retry; 11713 } 11714 if (e.error) { 11715 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11716 &recov_state, needrecov); 11717 crfree(cred_otw); 11718 if (osp != NULL) 11719 open_stream_rele(osp, rp); 11720 return (e.error); 11721 } 11722 /* fall through for res.status case */ 11723 } 11724 11725 if (res.status) { 11726 e.error = geterrno4(res.status); 11727 if (e.error == EACCES && last_time == FALSE) { 11728 crfree(cred_otw); 11729 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11730 &recov_state, needrecov); 11731 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11732 goto get_commit_cred; 11733 } 11734 /* 11735 * Can't do an nfs4_purge_stale_fh here because this 11736 * can cause a deadlock. nfs4_commit can 11737 * be called from nfs4_dispose which can be called 11738 * indirectly via pvn_vplist_dirty. nfs4_purge_stale_fh 11739 * can call back to pvn_vplist_dirty.
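 *
 * (Editorial aside, not part of the original comment: the cycle
 * being avoided is roughly
 *
 *	pvn_vplist_dirty -> nfs4_dispose -> nfs4_commit ->
 *	    nfs4_purge_stale_fh -> pvn_vplist_dirty -> ...
 *
 * so the ESTALE case below just marks the rnode R4STALE and
 * lets later operations see r_error.)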
11740 */ 11741 if (e.error == ESTALE) { 11742 mutex_enter(&rp->r_statelock); 11743 rp->r_flags |= R4STALE; 11744 if (!rp->r_error) 11745 rp->r_error = e.error; 11746 mutex_exit(&rp->r_statelock); 11747 PURGE_ATTRCACHE4(vp); 11748 } else { 11749 mutex_enter(&rp->r_statelock); 11750 if (!rp->r_error) 11751 rp->r_error = e.error; 11752 mutex_exit(&rp->r_statelock); 11753 } 11754 } else { 11755 ASSERT(rp->r_flags & R4HAVEVERF); 11756 resop = &res.array[1]; /* commit res */ 11757 cm_res = &resop->nfs_resop4_u.opcommit; 11758 mutex_enter(&rp->r_statelock); 11759 if (cm_res->writeverf == rp->r_writeverf) { 11760 mutex_exit(&rp->r_statelock); 11761 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11762 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11763 &recov_state, needrecov); 11764 crfree(cred_otw); 11765 if (osp != NULL) 11766 open_stream_rele(osp, rp); 11767 return (0); 11768 } 11769 nfs4_set_mod(vp); 11770 rp->r_writeverf = cm_res->writeverf; 11771 mutex_exit(&rp->r_statelock); 11772 e.error = NFS_VERF_MISMATCH; 11773 } 11774 11775 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11776 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov); 11777 crfree(cred_otw); 11778 if (osp != NULL) 11779 open_stream_rele(osp, rp); 11780 11781 return (e.error); 11782 } 11783 11784 static void 11785 nfs4_set_mod(vnode_t *vp) 11786 { 11787 page_t *pp; 11788 kmutex_t *vphm; 11789 rnode4_t *rp; 11790 11791 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11792 11793 /* make sure we're looking at the master vnode, not a shadow */ 11794 11795 rp = VTOR4(vp); 11796 if (IS_SHADOW(vp, rp)) 11797 vp = RTOV4(rp); 11798 11799 vphm = page_vnode_mutex(vp); 11800 mutex_enter(vphm); 11801 /* 11802 * If there are no pages associated with this vnode, then 11803 * just return. 11804 */ 11805 if ((pp = vp->v_pages) == NULL) { 11806 mutex_exit(vphm); 11807 return; 11808 } 11809 11810 do { 11811 if (pp->p_fsdata != C_NOCOMMIT) { 11812 hat_setmod(pp); 11813 pp->p_fsdata = C_NOCOMMIT; 11814 } 11815 } while ((pp = pp->p_vpnext) != vp->v_pages); 11816 mutex_exit(vphm); 11817 } 11818 11819 /* 11820 * This function is used to gather a page list of the pages which 11821 * can be committed on the server. 11822 * 11823 * The calling thread must have set R4COMMIT. This bit is used to 11824 * serialize access to the commit structure in the rnode. As long 11825 * as the thread has set R4COMMIT, then it can manipulate the commit 11826 * structure without requiring any other locks. 11827 * 11828 * When this function is called from nfs4_dispose() the page passed 11829 * into nfs4_dispose() will be SE_EXCL locked, and so this function 11830 * will skip it. This is not a problem since we initially add the 11831 * page to the r_commit page list. 11832 * 11833 */ 11834 static void 11835 nfs4_get_commit(vnode_t *vp) 11836 { 11837 rnode4_t *rp; 11838 page_t *pp; 11839 kmutex_t *vphm; 11840 11841 rp = VTOR4(vp); 11842 11843 ASSERT(rp->r_flags & R4COMMIT); 11844 11845 /* make sure we're looking at the master vnode, not a shadow */ 11846 11847 if (IS_SHADOW(vp, rp)) 11848 vp = RTOV4(rp); 11849 11850 vphm = page_vnode_mutex(vp); 11851 mutex_enter(vphm); 11852 11853 /* 11854 * If there are no pages associated with this vnode, then 11855 * just return. 11856 */ 11857 if ((pp = vp->v_pages) == NULL) { 11858 mutex_exit(vphm); 11859 return; 11860 } 11861 11862 /* 11863 * Step through all of the pages associated with this vnode 11864 * looking for pages which need to be committed. 
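 *
 * (Editorial aside, not part of the original comment: the loop
 * below uses the unlocked-peek / trylock / recheck idiom:
 *
 *	if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
 *		continue;		cheap, racy peek
 *	if (!page_trylock(pp, SE_EXCL))
 *		continue;		never block here
 *	if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
 *		page_unlock(pp);	recheck under the lock
 *		continue;
 *	}
 *
 * blocking on a page lock here could deadlock with a thread
 * that already holds a page locked, e.g. nfs4_dispose().)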
11865 */ 11866 do { 11867 /* 11868 * First, short-cut everything (without the page lock) 11869 * and see if this page does not need to be committed 11870 * or is modified; if so, we'll just skip it. 11871 */ 11872 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) 11873 continue; 11874 11875 /* 11876 * Attempt to lock the page. If we can't, then 11877 * someone else is messing with it or we have been 11878 * called from nfs4_dispose and this is the page that 11879 * nfs4_dispose was called with; either way, just skip it. 11880 */ 11881 if (!page_trylock(pp, SE_EXCL)) 11882 continue; 11883 11884 /* 11885 * Let's check again now that we have the page lock. 11886 */ 11887 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 11888 page_unlock(pp); 11889 continue; 11890 } 11891 11892 /* this had better not be a free page */ 11893 ASSERT(PP_ISFREE(pp) == 0); 11894 11895 /* 11896 * The page needs to be committed and we locked it. 11897 * Update the base and length parameters and add it 11898 * to the r_commit page list. 11899 */ 11900 if (rp->r_commit.c_pages == NULL) { 11901 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11902 rp->r_commit.c_commlen = PAGESIZE; 11903 } else if (pp->p_offset < rp->r_commit.c_commbase) { 11904 rp->r_commit.c_commlen = rp->r_commit.c_commbase - 11905 (offset3)pp->p_offset + rp->r_commit.c_commlen; 11906 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11907 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen) 11908 <= pp->p_offset) { 11909 rp->r_commit.c_commlen = (offset3)pp->p_offset - 11910 rp->r_commit.c_commbase + PAGESIZE; 11911 } 11912 page_add(&rp->r_commit.c_pages, pp); 11913 } while ((pp = pp->p_vpnext) != vp->v_pages); 11914 11915 mutex_exit(vphm); 11916 } 11917 11918 /* 11919 * This routine is used to gather together a page list of the pages 11920 * which are to be committed on the server. This routine must not 11921 * be called if the calling thread holds any locked pages. 11922 * 11923 * The calling thread must have set R4COMMIT. This bit is used to 11924 * serialize access to the commit structure in the rnode. As long 11925 * as the thread has set R4COMMIT, then it can manipulate the commit 11926 * structure without requiring any other locks. 11927 */ 11928 static void 11929 nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len) 11930 { 11931 11932 rnode4_t *rp; 11933 page_t *pp; 11934 u_offset_t end; 11935 u_offset_t off; 11936 ASSERT(len != 0); 11937 rp = VTOR4(vp); 11938 ASSERT(rp->r_flags & R4COMMIT); 11939 11940 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11941 11942 /* make sure we're looking at the master vnode, not a shadow */ 11943 11944 if (IS_SHADOW(vp, rp)) 11945 vp = RTOV4(rp); 11946 11947 /* 11948 * If there are no pages associated with this vnode, then 11949 * just return. 11950 */ 11951 if ((pp = vp->v_pages) == NULL) 11952 return; 11953 /* 11954 * Calculate the ending offset. 11955 */ 11956 end = soff + len; 11957 for (off = soff; off < end; off += PAGESIZE) { 11958 /* 11959 * Lookup each page by vp, offset. 11960 */ 11961 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL) 11962 continue; 11963 /* 11964 * If this page does not need to be committed or is 11965 * modified, then just skip it. 11966 */ 11967 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 11968 page_unlock(pp); 11969 continue; 11970 } 11971 11972 ASSERT(PP_ISFREE(pp) == 0); 11973 /* 11974 * The page needs to be committed and we locked it. 11975 * Update the base and length parameters and add it 11976 * to the r_commit page list.
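 * For example (assuming 8K pages): if the first page found is at
 * offset 8192, the range starts as base 8192, length 8192.  If a
 * later page is found at offset 32768, the length grows to
 * 32768 - 8192 + 8192 = 32768, so the range now spans [8K, 40K).
 * Holes in that range are harmless, since COMMIT simply names one
 * covering byte range.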
11977 */ 11978 if (rp->r_commit.c_pages == NULL) { 11979 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11980 rp->r_commit.c_commlen = PAGESIZE; 11981 } else { 11982 rp->r_commit.c_commlen = (offset3)pp->p_offset - 11983 rp->r_commit.c_commbase + PAGESIZE; 11984 } 11985 page_add(&rp->r_commit.c_pages, pp); 11986 } 11987 } 11988 11989 /* 11990 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap(). 11991 * Flushes and commits data to the server. 11992 */ 11993 static int 11994 nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr) 11995 { 11996 int error; 11997 verifier4 write_verf; 11998 rnode4_t *rp = VTOR4(vp); 11999 12000 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12001 12002 /* 12003 * Flush the data portion of the file and then commit any 12004 * portions which need to be committed. This may need to 12005 * be done twice if the server has changed state since 12006 * data was last written. The data will need to be 12007 * rewritten to the server and then a new commit done. 12008 * 12009 * In fact, this may need to be done several times if the 12010 * server is having problems and crashing while we are 12011 * attempting to do this. 12012 */ 12013 12014 top: 12015 /* 12016 * Do a flush based on the poff and plen arguments. This 12017 * will asynchronously write out any modified pages in the 12018 * range specified by (poff, plen). This starts all of the 12019 * i/o operations which will be waited for in the next 12020 * call to nfs4_putpage. 12021 */ 12022 12023 mutex_enter(&rp->r_statelock); 12024 write_verf = rp->r_writeverf; 12025 mutex_exit(&rp->r_statelock); 12026 12027 error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr, NULL); 12028 if (error == EAGAIN) 12029 error = 0; 12030 12031 /* 12032 * Do a flush based on the poff and plen arguments. This 12033 * will synchronously write out any modified pages in the 12034 * range specified by (poff, plen) and wait until all of 12035 * the asynchronous i/o's in that range are done as well. 12036 */ 12037 if (!error) 12038 error = nfs4_putpage(vp, poff, plen, 0, cr, NULL); 12039 12040 if (error) 12041 return (error); 12042 12043 mutex_enter(&rp->r_statelock); 12044 if (rp->r_writeverf != write_verf) { 12045 mutex_exit(&rp->r_statelock); 12046 goto top; 12047 } 12048 mutex_exit(&rp->r_statelock); 12049 12050 /* 12051 * Now commit any pages which might need to be committed. 12052 * If the error, NFS_VERF_MISMATCH, is returned, then 12053 * start over with the flush operation. 12054 */ 12055 error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT); 12056 12057 if (error == NFS_VERF_MISMATCH) 12058 goto top; 12059 12060 return (error); 12061 } 12062 12063 /* 12064 * nfs4_commit_vp() will wait for other pending commits and 12065 * will either commit the whole file or a range; plen dictates 12066 * which: a value of zero indicates the whole 12067 * file. Called from nfs4_putpage_commit() or nfs4_sync_putapage(). 12068 */ 12069 static int 12070 nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, 12071 cred_t *cr, int wait_on_writes) 12072 { 12073 rnode4_t *rp; 12074 page_t *plist; 12075 offset3 offset; 12076 count3 len; 12077 12078 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12079 12080 rp = VTOR4(vp); 12081 12082 /* 12083 * Before we gather committable pages, make 12084 * sure there are no outstanding async writes. 12085 */ 12086 if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) { 12087 mutex_enter(&rp->r_statelock); 12088 while (rp->r_count > 0) { 12089 cv_wait(&rp->r_cv, &rp->r_statelock); 12090 } 12091 mutex_exit(&rp->r_statelock); 12092 } 12093 12094 /* 12095 * Set the `commit inprogress' state bit. We must 12096 * first wait until any current one finishes. 12097 */ 12098 mutex_enter(&rp->r_statelock); 12099 while (rp->r_flags & R4COMMIT) { 12100 rp->r_flags |= R4COMMITWAIT; 12101 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 12102 rp->r_flags &= ~R4COMMITWAIT; 12103 } 12104 rp->r_flags |= R4COMMIT; 12105 mutex_exit(&rp->r_statelock); 12106 12107 /* 12108 * Gather all of the pages which need to be 12109 * committed. 12110 */ 12111 if (plen == 0) 12112 nfs4_get_commit(vp); 12113 else 12114 nfs4_get_commit_range(vp, poff, plen); 12115 12116 /* 12117 * Clear the `commit inprogress' bit and disconnect the 12118 * page list which was gathered by nfs4_get_commit. 12119 */ 12120 plist = rp->r_commit.c_pages; 12121 rp->r_commit.c_pages = NULL; 12122 offset = rp->r_commit.c_commbase; 12123 len = rp->r_commit.c_commlen; 12124 mutex_enter(&rp->r_statelock); 12125 rp->r_flags &= ~R4COMMIT; 12126 cv_broadcast(&rp->r_commit.c_cv); 12127 mutex_exit(&rp->r_statelock); 12128 12129 /* 12130 * If any pages need to be committed, commit them and 12131 * then unlock them so that they can be freed some 12132 * time later. 12133 */ 12134 if (plist == NULL) 12135 return (0); 12136 12137 /* 12138 * No error occurred during the flush portion 12139 * of this operation, so now attempt to commit 12140 * the data to stable storage on the server. 12141 * 12142 * This will unlock all of the pages on the list. 12143 */ 12144 return (nfs4_sync_commit(vp, plist, offset, len, cr)); 12145 } 12146 12147 static int 12148 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 12149 cred_t *cr) 12150 { 12151 int error; 12152 page_t *pp; 12153 12154 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12155 12156 error = nfs4_commit(vp, (offset4)offset, (count4)count, cr); 12157 12158 /* 12159 * If we got an error, then just unlock all of the pages 12160 * on the list. 12161 */ 12162 if (error) { 12163 while (plist != NULL) { 12164 pp = plist; 12165 page_sub(&plist, pp); 12166 page_unlock(pp); 12167 } 12168 return (error); 12169 } 12170 /* 12171 * We've tried as hard as we can to commit the data to stable 12172 * storage on the server. We just unlock the pages and clear 12173 * the commit required state. They will get freed later.
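 * Setting p_fsdata to C_NOCOMMIT below is what clears the commit
 * required state; the gather passes above skip such pages.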
12174 */ 12175 while (plist != NULL) { 12176 pp = plist; 12177 page_sub(&plist, pp); 12178 pp->p_fsdata = C_NOCOMMIT; 12179 page_unlock(pp); 12180 } 12181 12182 return (error); 12183 } 12184 12185 static void 12186 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 12187 cred_t *cr) 12188 { 12189 12190 (void) nfs4_sync_commit(vp, plist, offset, count, cr); 12191 } 12192 12193 /*ARGSUSED*/ 12194 static int 12195 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 12196 caller_context_t *ct) 12197 { 12198 int error = 0; 12199 mntinfo4_t *mi; 12200 vattr_t va; 12201 vsecattr_t nfsace4_vsap; 12202 12203 mi = VTOMI4(vp); 12204 if (nfs_zone() != mi->mi_zone) 12205 return (EIO); 12206 if (mi->mi_flags & MI4_ACL) { 12207 /* if we have a delegation, return it */ 12208 if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE) 12209 (void) nfs4delegreturn(VTOR4(vp), 12210 NFS4_DR_REOPEN|NFS4_DR_PUSH); 12211 12212 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, 12213 NFS4_ACL_SET); 12214 if (error) /* EINVAL */ 12215 return (error); 12216 12217 if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) { 12218 /* 12219 * These are aclent_t type entries. 12220 */ 12221 error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap, 12222 vp->v_type == VDIR, FALSE); 12223 if (error) 12224 return (error); 12225 } else { 12226 /* 12227 * These are ace_t type entries. 12228 */ 12229 error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap, 12230 FALSE); 12231 if (error) 12232 return (error); 12233 } 12234 bzero(&va, sizeof (va)); 12235 error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap); 12236 vs_ace4_destroy(&nfsace4_vsap); 12237 return (error); 12238 } 12239 return (ENOSYS); 12240 } 12241 12242 /* ARGSUSED */ 12243 int 12244 nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 12245 caller_context_t *ct) 12246 { 12247 int error; 12248 mntinfo4_t *mi; 12249 nfs4_ga_res_t gar; 12250 rnode4_t *rp = VTOR4(vp); 12251 12252 mi = VTOMI4(vp); 12253 if (nfs_zone() != mi->mi_zone) 12254 return (EIO); 12255 12256 bzero(&gar, sizeof (gar)); 12257 gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask; 12258 12259 /* 12260 * vsecattr->vsa_mask holds the original acl request mask. 12261 * This is needed when determining what to return. 12262 * (See: nfs4_create_getsecattr_return()) 12263 */ 12264 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET); 12265 if (error) /* EINVAL */ 12266 return (error); 12267 12268 if (mi->mi_flags & MI4_ACL) { 12269 /* 12270 * Check if the data is cached and the cache is valid. If it 12271 * is we don't go over the wire. 12272 */ 12273 if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) { 12274 mutex_enter(&rp->r_statelock); 12275 if (rp->r_secattr != NULL) { 12276 error = nfs4_create_getsecattr_return( 12277 rp->r_secattr, vsecattr, rp->r_attr.va_uid, 12278 rp->r_attr.va_gid, 12279 vp->v_type == VDIR); 12280 if (!error) { /* error == 0 - Success! */ 12281 mutex_exit(&rp->r_statelock); 12282 return (error); 12283 } 12284 } 12285 mutex_exit(&rp->r_statelock); 12286 } 12287 12288 /* 12289 * The getattr otw call will always get both the acl, in 12290 * the form of a list of nfsace4's, and the number of acl 12291 * entries; independent of the value of gar.n4g_vsa.vsa_mask. 
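 * Fetching more than was asked for is harmless here, since
 * nfs4_create_getsecattr_return() filters the result back down to
 * just what the caller's vsa_mask requested.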
12292 */ 12293 gar.n4g_va.va_mask = AT_ALL; 12294 error = nfs4_getattr_otw(vp, &gar, cr, 1); 12295 if (error) { 12296 vs_ace4_destroy(&gar.n4g_vsa); 12297 if (error == ENOTSUP || error == EOPNOTSUPP) 12298 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12299 return (error); 12300 } 12301 12302 if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) { 12303 /* 12304 * No error was returned, but according to the response 12305 * bitmap, neither was an acl. 12306 */ 12307 vs_ace4_destroy(&gar.n4g_vsa); 12308 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12309 return (error); 12310 } 12311 12312 /* 12313 * Update the cache with the ACL. 12314 */ 12315 nfs4_acl_fill_cache(rp, &gar.n4g_vsa); 12316 12317 error = nfs4_create_getsecattr_return(&gar.n4g_vsa, 12318 vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid, 12319 vp->v_type == VDIR); 12320 vs_ace4_destroy(&gar.n4g_vsa); 12321 if ((error) && (vsecattr->vsa_mask & 12322 (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) && 12323 (error != EACCES)) { 12324 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12325 } 12326 return (error); 12327 } 12328 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12329 return (error); 12330 } 12331 12332 /* 12333 * The function returns: 12334 * - 0 (zero) if the passed in "acl_mask" is a valid request. 12335 * - EINVAL if the passed in "acl_mask" is an invalid request. 12336 * 12337 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if: 12338 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12339 * 12340 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if: 12341 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12342 * - We have a count field set without the corresponding acl field set. (e.g. - 12343 * VSA_ACECNT is set, but VSA_ACE is not) 12344 */ 12345 static int 12346 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op) 12347 { 12348 /* Shortcut the masks that are always valid. */ 12349 if (acl_mask == (VSA_ACE | VSA_ACECNT)) 12350 return (0); 12351 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) 12352 return (0); 12353 12354 if (acl_mask & (VSA_ACE | VSA_ACECNT)) { 12355 /* 12356 * We can't have any VSA_ACL type stuff in the mask now. 12357 */ 12358 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12359 VSA_DFACLCNT)) 12360 return (EINVAL); 12361 12362 if (op == NFS4_ACL_SET) { 12363 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE)) 12364 return (EINVAL); 12365 } 12366 } 12367 12368 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) { 12369 /* 12370 * We can't have any VSA_ACE type stuff in the mask now. 12371 */ 12372 if (acl_mask & (VSA_ACE | VSA_ACECNT)) 12373 return (EINVAL); 12374 12375 if (op == NFS4_ACL_SET) { 12376 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL)) 12377 return (EINVAL); 12378 12379 if ((acl_mask & VSA_DFACLCNT) && 12380 !(acl_mask & VSA_DFACL)) 12381 return (EINVAL); 12382 } 12383 } 12384 return (0); 12385 } 12386 12387 /* 12388 * The theory behind creating the correct getsecattr return is simply this: 12389 * "Don't return anything that the caller is not expecting to have to free." 12390 */ 12391 static int 12392 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap, 12393 uid_t uid, gid_t gid, int isdir) 12394 { 12395 int error = 0; 12396 /* Save the mask since the translators modify it. 
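 * (vs_ace4_to_acet() and vs_ace4_to_aent() below rewrite vsa_mask
 * to reflect what they filled in, so the original is restored
 * before returning.)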
*/ 12397 uint_t orig_mask = vsap->vsa_mask; 12398 12399 if (orig_mask & (VSA_ACE | VSA_ACECNT)) { 12400 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, 12401 FALSE, ((orig_mask & VSA_ACE) ? FALSE : TRUE)); 12402 12403 if (error) 12404 return (error); 12405 12406 /* 12407 * If the caller only asked for the ace count (VSA_ACECNT), 12408 * don't give them the full acl (VSA_ACE); free it. 12409 */ 12410 if (!(orig_mask & VSA_ACE)) { 12411 if (vsap->vsa_aclentp != NULL) { 12412 kmem_free(vsap->vsa_aclentp, 12413 vsap->vsa_aclcnt * sizeof (ace_t)); 12414 vsap->vsa_aclentp = NULL; 12415 } 12416 } 12417 vsap->vsa_mask = orig_mask; 12418 12419 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12420 VSA_DFACLCNT)) { 12421 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid, 12422 isdir, FALSE, 12423 ((orig_mask & (VSA_ACL | VSA_DFACL)) ? FALSE : TRUE)); 12424 12425 if (error) 12426 return (error); 12427 12428 /* 12429 * If the caller only asked for the acl count (VSA_ACLCNT) 12430 * and/or the default acl count (VSA_DFACLCNT), don't give them 12431 * the acl (VSA_ACL) or default acl (VSA_DFACL); free it. 12432 */ 12433 if (!(orig_mask & VSA_ACL)) { 12434 if (vsap->vsa_aclentp != NULL) { 12435 kmem_free(vsap->vsa_aclentp, 12436 vsap->vsa_aclcnt * sizeof (aclent_t)); 12437 vsap->vsa_aclentp = NULL; 12438 } 12439 } 12440 12441 if (!(orig_mask & VSA_DFACL)) { 12442 if (vsap->vsa_dfaclentp != NULL) { 12443 kmem_free(vsap->vsa_dfaclentp, 12444 vsap->vsa_dfaclcnt * sizeof (aclent_t)); 12445 vsap->vsa_dfaclentp = NULL; 12446 } 12447 } 12448 vsap->vsa_mask = orig_mask; 12449 } 12450 return (0); 12451 } 12452 12453 /* ARGSUSED */ 12454 int 12455 nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr, 12456 caller_context_t *ct) 12457 { 12458 int error; 12459 12460 if (nfs_zone() != VTOMI4(vp)->mi_zone) 12461 return (EIO); 12462 /* 12463 * check for valid cmd parameter 12464 */ 12465 if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS) 12466 return (EINVAL); 12467 12468 /* 12469 * Check access permissions 12470 */ 12471 if ((cmd & F_SHARE) && 12472 (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) || 12473 (shr->s_access == F_WRACC && (flag & FWRITE) == 0))) 12474 return (EBADF); 12475 12476 /* 12477 * If the filesystem is mounted using local locking, pass the 12478 * request off to the local share code. 12479 */ 12480 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) 12481 return (fs_shrlock(vp, cmd, shr, flag, cr, ct)); 12482 12483 switch (cmd) { 12484 case F_SHARE: 12485 case F_UNSHARE: 12486 /* 12487 * This will be properly implemented later, 12488 * see RFE 4823948.
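 * Until then, F_SHARE and F_UNSHARE simply fail with EAGAIN.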
12489 */ 12490 error = EAGAIN; 12491 break; 12492 12493 case F_HASREMOTELOCKS: 12494 /* 12495 * NFS client can't store remote locks itself 12496 */ 12497 shr->s_access = 0; 12498 error = 0; 12499 break; 12500 12501 default: 12502 error = EINVAL; 12503 break; 12504 } 12505 12506 return (error); 12507 } 12508 12509 /* 12510 * Common code called by directory ops to update the attrcache 12511 */ 12512 static int 12513 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp, 12514 hrtime_t t, vnode_t *vp, cred_t *cr) 12515 { 12516 int error = 0; 12517 12518 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12519 12520 if (status != NFS4_OK) { 12521 /* getattr not done or failed */ 12522 PURGE_ATTRCACHE4(vp); 12523 return (error); 12524 } 12525 12526 if (garp) { 12527 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL); 12528 } else { 12529 PURGE_ATTRCACHE4(vp); 12530 } 12531 return (error); 12532 } 12533 12534 /* 12535 * Update directory caches for directory modification ops (link, rename, etc.). 12536 * When dinfo is NULL, manage dircaches in the old way. 12537 */ 12538 static void 12539 nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm, 12540 dirattr_info_t *dinfo) 12541 { 12542 rnode4_t *drp = VTOR4(dvp); 12543 12544 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 12545 12546 /* Purge rddir cache for dir since it changed */ 12547 if (drp->r_dir != NULL) 12548 nfs4_purge_rddir_cache(dvp); 12549 12550 /* 12551 * If caller provided dinfo, then use it to manage dir caches. 12552 */ 12553 if (dinfo != NULL) { 12554 if (vp != NULL) { 12555 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12556 if (!VTOR4(vp)->created_v4) { 12557 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12558 dnlc_update(dvp, nm, vp); 12559 } else { 12560 /* 12561 * XXX don't update if the created_v4 flag is 12562 * set 12563 */ 12564 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12565 NFS4_DEBUG(nfs4_client_state_debug, 12566 (CE_NOTE, "nfs4_update_dircaches: " 12567 "don't update dnlc: created_v4 flag")); 12568 } 12569 } 12570 12571 nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call, 12572 dinfo->di_cred, FALSE, cinfo); 12573 12574 return; 12575 } 12576 12577 /* 12578 * The caller didn't provide dinfo, so check change_info4 to update the DNLC. 12579 * Since caller modified dir but didn't receive post-dirmod-op dir 12580 * attrs, the dir's attrs must be purged. 12581 * 12582 * XXX this check and dnlc update/purge should really be atomic, 12583 * XXX but can't use rnode statelock because it'll deadlock in 12584 * XXX dnlc_purge_vp, however, the risk is minimal even if a race 12585 * XXX does occur. 12586 * 12587 * XXX We also may want to check that atomic is true in the 12588 * XXX change_info struct. If it is not, the change_info may 12589 * XXX reflect changes by more than one client, which means that 12590 * XXX our cache may not be valid.
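 * The before/after values in change_info4 bracket the server's
 * directory modification: if our cached r_change still matches
 * cinfo->before, no other change intervened and the DNLC entry for
 * nm can be updated; otherwise the directory's DNLC entries are
 * purged below.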
12591 */ 12592 PURGE_ATTRCACHE4(dvp); 12593 if (drp->r_change == cinfo->before) { 12594 /* no changes took place in the directory prior to our link */ 12595 if (vp != NULL) { 12596 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12597 if (!VTOR4(vp)->created_v4) { 12598 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12599 dnlc_update(dvp, nm, vp); 12600 } else { 12601 /* 12602 * XXX don't update if the created_v4 flag 12603 * is set 12604 */ 12605 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12606 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 12607 "nfs4_update_dircaches: don't" 12608 " update dnlc: created_v4 flag")); 12609 } 12610 } 12611 } else { 12612 /* Another client modified directory - purge its dnlc cache */ 12613 dnlc_purge_vp(dvp); 12614 } 12615 } 12616 12617 /* 12618 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a 12619 * file. 12620 * 12621 * The 'reopening_file' boolean should be set to TRUE if we are reopening this 12622 * file (i.e., client recovery) and otherwise set to FALSE. 12623 * 12624 * 'nfs4_start/end_op' should have been called by the proper (i.e., not 12625 * recovery-initiated) calling functions. 12626 * 12627 * 'resend' is set to TRUE if this is an OPEN_CONFIRM issued as a result 12628 * of resending a 'lost' open request. 12629 * 12630 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken 12631 * server that hands out BAD_SEQID on open confirm. 12632 * 12633 * Errors are returned via the nfs4_error_t parameter. 12634 */ 12635 void 12636 nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr, 12637 bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop, 12638 bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp) 12639 { 12640 COMPOUND4args_clnt args; 12641 COMPOUND4res_clnt res; 12642 nfs_argop4 argop[2]; 12643 nfs_resop4 *resop; 12644 int doqueue = 1; 12645 mntinfo4_t *mi; 12646 OPEN_CONFIRM4args *open_confirm_args; 12647 int needrecov; 12648 12649 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12650 #if DEBUG 12651 mutex_enter(&oop->oo_lock); 12652 ASSERT(oop->oo_seqid_inuse); 12653 mutex_exit(&oop->oo_lock); 12654 #endif 12655 12656 recov_retry_confirm: 12657 nfs4_error_zinit(ep); 12658 *retry_open = FALSE; 12659 12660 if (resend) 12661 args.ctag = TAG_OPEN_CONFIRM_LOST; 12662 else 12663 args.ctag = TAG_OPEN_CONFIRM; 12664 12665 args.array_len = 2; 12666 args.array = argop; 12667 12668 /* putfh target fh */ 12669 argop[0].argop = OP_CPUTFH; 12670 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 12671 12672 argop[1].argop = OP_OPEN_CONFIRM; 12673 open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm; 12674 12675 (*seqid) += 1; 12676 open_confirm_args->seqid = *seqid; 12677 open_confirm_args->open_stateid = *stateid; 12678 12679 mi = VTOMI4(vp); 12680 12681 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep); 12682 12683 if (!ep->error && nfs4_need_to_bump_seqid(&res)) { 12684 nfs4_set_open_seqid((*seqid), oop, args.ctag); 12685 } 12686 12687 needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp); 12688 if (!needrecov && ep->error) 12689 return; 12690 12691 if (needrecov) { 12692 bool_t abort = FALSE; 12693 12694 if (reopening_file == FALSE) { 12695 nfs4_bseqid_entry_t *bsep = NULL; 12696 12697 if (!ep->error && res.status == NFS4ERR_BAD_SEQID) 12698 bsep = nfs4_create_bseqid_entry(oop, NULL, 12699 vp, 0, args.ctag, 12700 open_confirm_args->seqid); 12701 12702 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, 12703 NULL, NULL, NULL, OP_OPEN_CONFIRM, bsep); 12704 if (bsep) { 12705 kmem_free(bsep, sizeof (*bsep));
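/*
 * Give up if we have exhausted the caller's
 * BAD_SEQID retry budget (num_bseqid_retryp,
 * described in the function comment above).
 */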
12706 if (num_bseqid_retryp && 12707 --(*num_bseqid_retryp) == 0) 12708 abort = TRUE; 12709 } 12710 } 12711 if ((ep->error == ETIMEDOUT || 12712 res.status == NFS4ERR_RESOURCE) && 12713 abort == FALSE && resend == FALSE) { 12714 if (!ep->error) 12715 (void) xdr_free(xdr_COMPOUND4res_clnt, 12716 (caddr_t)&res); 12717 12718 delay(SEC_TO_TICK(confirm_retry_sec)); 12719 goto recov_retry_confirm; 12720 } 12721 /* State may have changed so retry the entire OPEN op */ 12722 if (abort == FALSE) 12723 *retry_open = TRUE; 12724 else 12725 *retry_open = FALSE; 12726 if (!ep->error) 12727 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12728 return; 12729 } 12730 12731 if (res.status) { 12732 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12733 return; 12734 } 12735 12736 resop = &res.array[1]; /* open confirm res */ 12737 bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid, 12738 stateid, sizeof (*stateid)); 12739 12740 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12741 } 12742 12743 /* 12744 * Return the credentials associated with a client state object. The 12745 * caller is responsible for freeing the credentials. 12746 */ 12747 12748 static cred_t * 12749 state_to_cred(nfs4_open_stream_t *osp) 12750 { 12751 cred_t *cr; 12752 12753 /* 12754 * It's ok to not lock the open stream and open owner to get 12755 * the oo_cred since this is only written once (upon creation) 12756 * and will not change. 12757 */ 12758 cr = osp->os_open_owner->oo_cred; 12759 crhold(cr); 12760 12761 return (cr); 12762 } 12763 12764 /* 12765 * nfs4_find_sysid 12766 * 12767 * Find the sysid for the knetconfig associated with the given mi. 12768 */ 12769 static struct lm_sysid * 12770 nfs4_find_sysid(mntinfo4_t *mi) 12771 { 12772 ASSERT(nfs_zone() == mi->mi_zone); 12773 12774 /* 12775 * Switch from RDMA knconf to original mount knconf 12776 */ 12777 return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr, 12778 mi->mi_curr_serv->sv_hostname, NULL)); 12779 } 12780 12781 #ifdef DEBUG 12782 /* 12783 * Return a string version of the call type for easy reading. 12784 */ 12785 static char * 12786 nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype) 12787 { 12788 switch (ctype) { 12789 case NFS4_LCK_CTYPE_NORM: 12790 return ("NORMAL"); 12791 case NFS4_LCK_CTYPE_RECLAIM: 12792 return ("RECLAIM"); 12793 case NFS4_LCK_CTYPE_RESEND: 12794 return ("RESEND"); 12795 case NFS4_LCK_CTYPE_REINSTATE: 12796 return ("REINSTATE"); 12797 default: 12798 cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal " 12799 "type %d", ctype); 12800 return (""); 12801 } 12802 } 12803 #endif 12804 12805 /* 12806 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type 12807 * Unlock requests don't have an over-the-wire locktype, so we just return 12808 * something non-threatening. 12809 */ 12810 12811 static nfs_lock_type4 12812 flk_to_locktype(int cmd, int l_type) 12813 { 12814 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK); 12815 12816 switch (l_type) { 12817 case F_UNLCK: 12818 return (READ_LT); 12819 case F_RDLCK: 12820 if (cmd == F_SETLK) 12821 return (READ_LT); 12822 else 12823 return (READW_LT); 12824 case F_WRLCK: 12825 if (cmd == F_SETLK) 12826 return (WRITE_LT); 12827 else 12828 return (WRITEW_LT); 12829 } 12830 panic("flk_to_locktype"); 12831 /*NOTREACHED*/ 12832 } 12833 12834 /* 12835 * Do some preliminary checks for nfs4frlock. 
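 * The checks are: the file must have been opened with a mode that
 * allows the requested lock type, and the flock64 offset must be
 * convertible by convoff().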
12836 */ 12837 static int 12838 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp, 12839 u_offset_t offset) 12840 { 12841 int error = 0; 12842 12843 /* 12844 * If we are setting a lock, check that the file is opened 12845 * with the correct mode. 12846 */ 12847 if (cmd == F_SETLK || cmd == F_SETLKW) { 12848 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) || 12849 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) { 12850 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12851 "nfs4frlock_validate_args: file was opened with " 12852 "incorrect mode")); 12853 return (EBADF); 12854 } 12855 } 12856 12857 /* Convert the offset. It may need to be restored before returning. */ 12858 if (error = convoff(vp, flk, 0, offset)) { 12859 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12860 "nfs4frlock_validate_args: convoff => error= %d\n", 12861 error)); 12862 return (error); 12863 } 12864 12865 return (error); 12866 } 12867 12868 /* 12869 * Set the flock64's lm_sysid for nfs4frlock. 12870 */ 12871 static int 12872 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk) 12873 { 12874 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12875 12876 /* Find the lm_sysid */ 12877 *lspp = nfs4_find_sysid(VTOMI4(vp)); 12878 12879 if (*lspp == NULL) { 12880 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12881 "nfs4frlock_get_sysid: no sysid, return ENOLCK")); 12882 return (ENOLCK); 12883 } 12884 12885 flk->l_sysid = lm_sysidt(*lspp); 12886 12887 return (0); 12888 } 12889 12890 /* 12891 * Do the remaining preliminary setup for nfs4frlock. 12892 */ 12893 static void 12894 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep, 12895 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr, 12896 cred_t **cred_otw) 12897 { 12898 /* 12899 * set tick_delay to the base delay time. 12900 * (NFS4_BASE_WAIT_TIME is in secs) 12901 */ 12902 12903 *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000); 12904 12905 /* 12906 * If lock is relative to EOF, we need the newest length of the 12907 * file. Therefore invalidate the ATTR_CACHE. 12908 */ 12909 12910 *whencep = flk->l_whence; 12911 12912 if (*whencep == 2) /* SEEK_END */ 12913 PURGE_ATTRCACHE4(vp); 12914 12915 recov_statep->rs_flags = 0; 12916 recov_statep->rs_num_retry_despite_err = 0; 12917 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL); 12918 } 12919 12920 /* 12921 * Initialize and allocate the data structures necessary for 12922 * the nfs4frlock call. 12923 * Allocates argsp's op array, frees up the saved_rqstpp if there is one. 
12924 */ 12925 static void 12926 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp, 12927 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd, 12928 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp, 12929 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp) 12930 { 12931 int argoplist_size; 12932 int num_ops = 2; 12933 12934 *retry = FALSE; 12935 *did_start_fop = FALSE; 12936 *skip_get_err = FALSE; 12937 lost_rqstp->lr_op = 0; 12938 argoplist_size = num_ops * sizeof (nfs_argop4); 12939 /* fill array with zero */ 12940 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP); 12941 12942 *argspp = argsp; 12943 *respp = NULL; 12944 12945 argsp->array_len = num_ops; 12946 argsp->array = *argopp; 12947 12948 /* initialize in case of error; will get real value down below */ 12949 argsp->ctag = TAG_NONE; 12950 12951 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) 12952 *op_hintp = OH_LOCKU; 12953 else 12954 *op_hintp = OH_OTHER; 12955 } 12956 12957 /* 12958 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign 12959 * the proper nfs4_server_t for this instance of nfs4frlock. 12960 * Returns 0 (success) or an errno value. 12961 */ 12962 static int 12963 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp, 12964 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep, 12965 bool_t *did_start_fop, bool_t *startrecovp) 12966 { 12967 int error = 0; 12968 rnode4_t *rp; 12969 12970 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12971 12972 if (ctype == NFS4_LCK_CTYPE_NORM) { 12973 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint, 12974 recov_statep, startrecovp); 12975 if (error) 12976 return (error); 12977 *did_start_fop = TRUE; 12978 } else { 12979 *did_start_fop = FALSE; 12980 *startrecovp = FALSE; 12981 } 12982 12983 if (!error) { 12984 rp = VTOR4(vp); 12985 12986 /* If the file failed recovery, just quit. */ 12987 mutex_enter(&rp->r_statelock); 12988 if (rp->r_flags & R4RECOVERR) { 12989 error = EIO; 12990 } 12991 mutex_exit(&rp->r_statelock); 12992 } 12993 12994 return (error); 12995 } 12996 12997 /* 12998 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A 12999 * resend nfs4frlock call is initiated by the recovery framework. 13000 * Acquires the lop and oop seqid synchronization. 13001 */ 13002 static void 13003 nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp, 13004 COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp, 13005 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13006 LOCK4args **lock_argsp, LOCKU4args **locku_argsp) 13007 { 13008 mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp); 13009 int error; 13010 13011 NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug), 13012 (CE_NOTE, 13013 "nfs4frlock_setup_resend_lock_args: have lost lock to resend")); 13014 ASSERT(resend_rqstp != NULL); 13015 ASSERT(resend_rqstp->lr_op == OP_LOCK || 13016 resend_rqstp->lr_op == OP_LOCKU); 13017 13018 *oopp = resend_rqstp->lr_oop; 13019 if (resend_rqstp->lr_oop) { 13020 open_owner_hold(resend_rqstp->lr_oop); 13021 error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi); 13022 ASSERT(error == 0); /* recov thread always succeeds */ 13023 } 13024 13025 /* Must resend this lost lock/locku request. 
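 * Note that an l_len of 0 means "to end of file"; it is mapped
 * below to the all-ones NFSv4 length by complementing zero.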
*/ 13026 ASSERT(resend_rqstp->lr_lop != NULL); 13027 *lopp = resend_rqstp->lr_lop; 13028 lock_owner_hold(resend_rqstp->lr_lop); 13029 error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi); 13030 ASSERT(error == 0); /* recov thread always succeeds */ 13031 13032 *ospp = resend_rqstp->lr_osp; 13033 if (*ospp) 13034 open_stream_hold(resend_rqstp->lr_osp); 13035 13036 if (resend_rqstp->lr_op == OP_LOCK) { 13037 LOCK4args *lock_args; 13038 13039 argop->argop = OP_LOCK; 13040 *lock_argsp = lock_args = &argop->nfs_argop4_u.oplock; 13041 lock_args->locktype = resend_rqstp->lr_locktype; 13042 lock_args->reclaim = 13043 (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM); 13044 lock_args->offset = resend_rqstp->lr_flk->l_start; 13045 lock_args->length = resend_rqstp->lr_flk->l_len; 13046 if (lock_args->length == 0) 13047 lock_args->length = ~lock_args->length; 13048 nfs4_setup_lock_args(*lopp, *oopp, *ospp, 13049 mi2clientid(mi), &lock_args->locker); 13050 13051 switch (resend_rqstp->lr_ctype) { 13052 case NFS4_LCK_CTYPE_RESEND: 13053 argsp->ctag = TAG_LOCK_RESEND; 13054 break; 13055 case NFS4_LCK_CTYPE_REINSTATE: 13056 argsp->ctag = TAG_LOCK_REINSTATE; 13057 break; 13058 case NFS4_LCK_CTYPE_RECLAIM: 13059 argsp->ctag = TAG_LOCK_RECLAIM; 13060 break; 13061 default: 13062 argsp->ctag = TAG_LOCK_UNKNOWN; 13063 break; 13064 } 13065 } else { 13066 LOCKU4args *locku_args; 13067 nfs4_lock_owner_t *lop = resend_rqstp->lr_lop; 13068 13069 argop->argop = OP_LOCKU; 13070 *locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku; 13071 locku_args->locktype = READ_LT; 13072 locku_args->seqid = lop->lock_seqid + 1; 13073 mutex_enter(&lop->lo_lock); 13074 locku_args->lock_stateid = lop->lock_stateid; 13075 mutex_exit(&lop->lo_lock); 13076 locku_args->offset = resend_rqstp->lr_flk->l_start; 13077 locku_args->length = resend_rqstp->lr_flk->l_len; 13078 if (locku_args->length == 0) 13079 locku_args->length = ~locku_args->length; 13080 13081 switch (resend_rqstp->lr_ctype) { 13082 case NFS4_LCK_CTYPE_RESEND: 13083 argsp->ctag = TAG_LOCKU_RESEND; 13084 break; 13085 case NFS4_LCK_CTYPE_REINSTATE: 13086 argsp->ctag = TAG_LOCKU_REINSTATE; 13087 break; 13088 default: 13089 argsp->ctag = TAG_LOCK_UNKNOWN; 13090 break; 13091 } 13092 } 13093 } 13094 13095 /* 13096 * Setup the LOCKT4 arguments. 13097 */ 13098 static void 13099 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 13100 LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk, 13101 rnode4_t *rp) 13102 { 13103 LOCKT4args *lockt_args; 13104 13105 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone); 13106 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13107 argop->argop = OP_LOCKT; 13108 argsp->ctag = TAG_LOCKT; 13109 lockt_args = &argop->nfs_argop4_u.oplockt; 13110 13111 /* 13112 * The locktype will be READ_LT unless it's 13113 * a write lock. We do this because the Solaris 13114 * system call allows the combination of 13115 * F_UNLCK and F_GETLK* and so in that case the 13116 * unlock is mapped to a read. 13117 */ 13118 if (flk->l_type == F_WRLCK) 13119 lockt_args->locktype = WRITE_LT; 13120 else 13121 lockt_args->locktype = READ_LT; 13122 13123 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp))); 13124 /* set the lock owner4 args */ 13125 nfs4_setlockowner_args(&lockt_args->owner, rp, 13126 ctype == NFS4_LCK_CTYPE_NORM ? 
curproc->p_pidp->pid_id : 13127 flk->l_pid); 13128 lockt_args->offset = flk->l_start; 13129 lockt_args->length = flk->l_len; 13130 if (flk->l_len == 0) 13131 lockt_args->length = ~lockt_args->length; 13132 13133 *lockt_argsp = lockt_args; 13134 } 13135 13136 /* 13137 * If the client is holding a delegation, and the open stream to be used 13138 * with this lock request is a delegation open stream, then re-open the stream. 13139 * Sets the nfs4_error_t to all zeros unless the open stream has already 13140 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY 13141 * means the caller should retry (like a recovery retry). 13142 */ 13143 static void 13144 nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt) 13145 { 13146 open_delegation_type4 dt; 13147 bool_t reopen_needed, force; 13148 nfs4_open_stream_t *osp; 13149 open_claim_type4 oclaim; 13150 rnode4_t *rp = VTOR4(vp); 13151 mntinfo4_t *mi = VTOMI4(vp); 13152 13153 ASSERT(nfs_zone() == mi->mi_zone); 13154 13155 nfs4_error_zinit(ep); 13156 13157 mutex_enter(&rp->r_statev4_lock); 13158 dt = rp->r_deleg_type; 13159 mutex_exit(&rp->r_statev4_lock); 13160 13161 if (dt != OPEN_DELEGATE_NONE) { 13162 nfs4_open_owner_t *oop; 13163 13164 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 13165 if (!oop) { 13166 ep->stat = NFS4ERR_IO; 13167 return; 13168 } 13169 /* returns with 'os_sync_lock' held */ 13170 osp = find_open_stream(oop, rp); 13171 if (!osp) { 13172 open_owner_rele(oop); 13173 ep->stat = NFS4ERR_IO; 13174 return; 13175 } 13176 13177 if (osp->os_failed_reopen) { 13178 NFS4_DEBUG((nfs4_open_stream_debug || 13179 nfs4_client_lock_debug), (CE_NOTE, 13180 "nfs4frlock_check_deleg: os_failed_reopen set " 13181 "for osp %p, cr %p, rp %s", (void *)osp, 13182 (void *)cr, rnode4info(rp))); 13183 mutex_exit(&osp->os_sync_lock); 13184 open_stream_rele(osp, rp); 13185 open_owner_rele(oop); 13186 ep->stat = NFS4ERR_IO; 13187 return; 13188 } 13189 13190 /* 13191 * Determine whether a reopen is needed. If this 13192 * is a delegation open stream, then send the open 13193 * to the server to give visibility to the open owner. 13194 * Even if it isn't a delegation open stream, we need 13195 * to check if the previous open CLAIM_DELEGATE_CUR 13196 * was sufficient. 13197 */ 13198 13199 reopen_needed = osp->os_delegation || 13200 ((lt == F_RDLCK && 13201 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) || 13202 (lt == F_WRLCK && 13203 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE))); 13204 13205 mutex_exit(&osp->os_sync_lock); 13206 open_owner_rele(oop); 13207 13208 if (reopen_needed) { 13209 /* 13210 * Always use CLAIM_PREVIOUS after server reboot. 13211 * The server will reject CLAIM_DELEGATE_CUR if 13212 * it is used during the grace period. 13213 */ 13214 mutex_enter(&mi->mi_lock); 13215 if (mi->mi_recovflags & MI4R_SRV_REBOOT) { 13216 oclaim = CLAIM_PREVIOUS; 13217 force = TRUE; 13218 } else { 13219 oclaim = CLAIM_DELEGATE_CUR; 13220 force = FALSE; 13221 } 13222 mutex_exit(&mi->mi_lock); 13223 13224 nfs4_reopen(vp, osp, ep, oclaim, force, FALSE); 13225 if (ep->error == EAGAIN) { 13226 nfs4_error_zinit(ep); 13227 ep->stat = NFS4ERR_DELAY; 13228 } 13229 } 13230 open_stream_rele(osp, rp); 13231 osp = NULL; 13232 } 13233 } 13234 13235 /* 13236 * Setup the LOCKU4 arguments. 13237 * Returns errors via the nfs4_error_t. 13238 * NFS4_OK no problems. *go_otwp is TRUE if call should go 13239 * over-the-wire. The caller must release the 13240 * reference on *lopp. 
13241 * NFS4ERR_DELAY caller should retry (like recovery retry) 13242 * (other) unrecoverable error. 13243 */ 13244 static void 13245 nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 13246 LOCKU4args **locku_argsp, flock64_t *flk, 13247 nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp, 13248 vnode_t *vp, int flag, u_offset_t offset, cred_t *cr, 13249 bool_t *skip_get_err, bool_t *go_otwp) 13250 { 13251 nfs4_lock_owner_t *lop = NULL; 13252 LOCKU4args *locku_args; 13253 pid_t pid; 13254 bool_t is_spec = FALSE; 13255 rnode4_t *rp = VTOR4(vp); 13256 13257 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13258 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13259 13260 nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK); 13261 if (ep->error || ep->stat) 13262 return; 13263 13264 argop->argop = OP_LOCKU; 13265 if (ctype == NFS4_LCK_CTYPE_REINSTATE) 13266 argsp->ctag = TAG_LOCKU_REINSTATE; 13267 else 13268 argsp->ctag = TAG_LOCKU; 13269 locku_args = &argop->nfs_argop4_u.oplocku; 13270 *locku_argsp = locku_args; 13271 13272 /* 13273 * XXX what should locku_args->locktype be? 13274 * setting to ALWAYS be READ_LT so at least 13275 * it is a valid locktype. 13276 */ 13277 13278 locku_args->locktype = READ_LT; 13279 13280 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id : 13281 flk->l_pid; 13282 13283 /* 13284 * Get the lock owner stateid. If no lock owner 13285 * exists, return success. 13286 */ 13287 lop = find_lock_owner(rp, pid, LOWN_ANY); 13288 *lopp = lop; 13289 if (lop && CLNT_ISSPECIAL(&lop->lock_stateid)) 13290 is_spec = TRUE; 13291 if (!lop || is_spec) { 13292 /* 13293 * No lock owner so no locks to unlock. 13294 * Return success. If there was a failed 13295 * reclaim earlier, the lock might still be 13296 * registered with the local locking code, 13297 * so notify it of the unlock. 13298 * 13299 * If the lockowner is using a special stateid, 13300 * then the original lock request (that created 13301 * this lockowner) was never successful, so we 13302 * have no lock to undo OTW. 13303 */ 13304 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13305 "nfs4frlock_setup_locku_args: LOCKU: no lock owner " 13306 "(%ld) so return success", (long)pid)); 13307 13308 if (ctype == NFS4_LCK_CTYPE_NORM) 13309 flk->l_pid = curproc->p_pid; 13310 nfs4_register_lock_locally(vp, flk, flag, offset); 13311 /* 13312 * Release our hold and NULL out so final_cleanup 13313 * doesn't try to end a lock seqid sync we 13314 * never started. 13315 */ 13316 if (is_spec) { 13317 lock_owner_rele(lop); 13318 *lopp = NULL; 13319 } 13320 *skip_get_err = TRUE; 13321 *go_otwp = FALSE; 13322 return; 13323 } 13324 13325 ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp)); 13326 if (ep->error == EAGAIN) { 13327 lock_owner_rele(lop); 13328 *lopp = NULL; 13329 return; 13330 } 13331 13332 mutex_enter(&lop->lo_lock); 13333 locku_args->lock_stateid = lop->lock_stateid; 13334 mutex_exit(&lop->lo_lock); 13335 locku_args->seqid = lop->lock_seqid + 1; 13336 13337 /* leave the ref count on lop, rele after RPC call */ 13338 13339 locku_args->offset = flk->l_start; 13340 locku_args->length = flk->l_len; 13341 if (flk->l_len == 0) 13342 locku_args->length = ~locku_args->length; 13343 13344 *go_otwp = TRUE; 13345 } 13346 13347 /* 13348 * Setup the LOCK4 arguments. 13349 * 13350 * Returns errors via the nfs4_error_t. 
13351 * NFS4_OK no problems 13352 * NFS4ERR_DELAY caller should retry (like recovery retry) 13353 * (other) unrecoverable error 13354 */ 13355 static void 13356 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp, 13357 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13358 nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp, 13359 flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep) 13360 { 13361 LOCK4args *lock_args; 13362 nfs4_open_owner_t *oop = NULL; 13363 nfs4_open_stream_t *osp = NULL; 13364 nfs4_lock_owner_t *lop = NULL; 13365 pid_t pid; 13366 rnode4_t *rp = VTOR4(vp); 13367 13368 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13369 13370 nfs4frlock_check_deleg(vp, ep, cr, flk->l_type); 13371 if (ep->error || ep->stat != NFS4_OK) 13372 return; 13373 13374 argop->argop = OP_LOCK; 13375 if (ctype == NFS4_LCK_CTYPE_NORM) 13376 argsp->ctag = TAG_LOCK; 13377 else if (ctype == NFS4_LCK_CTYPE_RECLAIM) 13378 argsp->ctag = TAG_RELOCK; 13379 else 13380 argsp->ctag = TAG_LOCK_REINSTATE; 13381 lock_args = &argop->nfs_argop4_u.oplock; 13382 lock_args->locktype = flk_to_locktype(cmd, flk->l_type); 13383 lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0; 13384 /* 13385 * Get the lock owner. If no lock owner exists, 13386 * create a 'temporary' one and grab the open seqid 13387 * synchronization (which puts a hold on the open 13388 * owner and open stream). 13389 * This also grabs the lock seqid synchronization. 13390 */ 13391 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : flk->l_pid; 13392 ep->stat = 13393 nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop); 13394 13395 if (ep->stat != NFS4_OK) 13396 goto out; 13397 13398 nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)), 13399 &lock_args->locker); 13400 13401 lock_args->offset = flk->l_start; 13402 lock_args->length = flk->l_len; 13403 if (flk->l_len == 0) 13404 lock_args->length = ~lock_args->length; 13405 *lock_argsp = lock_args; 13406 out: 13407 *oopp = oop; 13408 *ospp = osp; 13409 *lopp = lop; 13410 } 13411 13412 /* 13413 * After we get the reply from the server, record the proper information 13414 * for possible resend lock requests. 13415 * 13416 * Allocates memory for the saved_rqstp if we have a lost lock to save. 13417 */ 13418 static void 13419 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error, 13420 nfs_lock_type4 locktype, nfs4_open_owner_t *oop, 13421 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13422 nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp) 13423 { 13424 bool_t unlock = (flk->l_type == F_UNLCK); 13425 13426 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13427 ASSERT(ctype == NFS4_LCK_CTYPE_NORM || 13428 ctype == NFS4_LCK_CTYPE_REINSTATE); 13429 13430 if (error != 0 && !unlock) { 13431 NFS4_DEBUG((nfs4_lost_rqst_debug || 13432 nfs4_client_lock_debug), (CE_NOTE, 13433 "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 " 13434 " for lop %p", (void *)lop)); 13435 ASSERT(lop != NULL); 13436 mutex_enter(&lop->lo_lock); 13437 lop->lo_pending_rqsts = 1; 13438 mutex_exit(&lop->lo_lock); 13439 } 13440 13441 lost_rqstp->lr_putfirst = FALSE; 13442 lost_rqstp->lr_op = 0; 13443 13444 /* 13445 * For lock/locku requests, we treat EINTR as ETIMEDOUT for 13446 * recovery purposes so that the lock request that was sent 13447 * can be saved and re-issued later. Ditto for EIO from a forced 13448 * unmount. 
This is done to have the client's local locking state 13449 * match the v4 server's state; that is, the request was 13450 * potentially received and accepted by the server but the client 13451 * thinks it was not. 13452 */ 13453 if (error == ETIMEDOUT || error == EINTR || 13454 NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 13455 NFS4_DEBUG((nfs4_lost_rqst_debug || 13456 nfs4_client_lock_debug), (CE_NOTE, 13457 "nfs4frlock_save_lost_rqst: got a lost %s lock for " 13458 "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK", 13459 (void *)lop, (void *)oop, (void *)osp)); 13460 if (unlock) 13461 lost_rqstp->lr_op = OP_LOCKU; 13462 else { 13463 lost_rqstp->lr_op = OP_LOCK; 13464 lost_rqstp->lr_locktype = locktype; 13465 } 13466 /* 13467 * Objects are held and rele'd via the recovery code. 13468 * See nfs4_save_lost_rqst. 13469 */ 13470 lost_rqstp->lr_vp = vp; 13471 lost_rqstp->lr_dvp = NULL; 13472 lost_rqstp->lr_oop = oop; 13473 lost_rqstp->lr_osp = osp; 13474 lost_rqstp->lr_lop = lop; 13475 lost_rqstp->lr_cr = cr; 13476 switch (ctype) { 13477 case NFS4_LCK_CTYPE_NORM: 13478 flk->l_pid = ttoproc(curthread)->p_pid; 13479 lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND; 13480 break; 13481 case NFS4_LCK_CTYPE_REINSTATE: 13482 lost_rqstp->lr_putfirst = TRUE; 13483 lost_rqstp->lr_ctype = ctype; 13484 break; 13485 default: 13486 break; 13487 } 13488 lost_rqstp->lr_flk = flk; 13489 } 13490 } 13491 13492 /* 13493 * Update lop's seqid. Also update the seqid stored in a resend request, 13494 * if any. (Some recovery errors increment the seqid, and we may have to 13495 * send the resend request again.) 13496 */ 13497 13498 static void 13499 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args, 13500 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type) 13501 { 13502 if (lock_args) { 13503 if (lock_args->locker.new_lock_owner == TRUE) 13504 nfs4_get_and_set_next_open_seqid(oop, tag_type); 13505 else { 13506 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13507 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop); 13508 } 13509 } else if (locku_args) { 13510 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13511 nfs4_set_lock_seqid(lop->lock_seqid +1, lop); 13512 } 13513 } 13514 13515 /* 13516 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13517 * COMPOUND4 args/res for calls that need to retry. 13518 * Switches the *cred_otwp to base_cr. 
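 * This prepares a retry of the request with the caller's base
 * credentials after an access check failed with the credentials
 * first chosen for the over-the-wire call.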
13519 */ 13520 static void 13521 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint, 13522 nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop, 13523 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error, 13524 nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp, 13525 nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp) 13526 { 13527 nfs4_open_owner_t *oop = *oopp; 13528 nfs4_open_stream_t *osp = *ospp; 13529 nfs4_lock_owner_t *lop = *lopp; 13530 nfs_argop4 *argop = (*argspp)->array; 13531 13532 if (*did_start_fop) { 13533 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13534 needrecov); 13535 *did_start_fop = FALSE; 13536 } 13537 ASSERT((*argspp)->array_len == 2); 13538 if (argop[1].argop == OP_LOCK) 13539 nfs4args_lock_free(&argop[1]); 13540 else if (argop[1].argop == OP_LOCKT) 13541 nfs4args_lockt_free(&argop[1]); 13542 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13543 if (!error) 13544 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13545 *argspp = NULL; 13546 *respp = NULL; 13547 13548 if (lop) { 13549 nfs4_end_lock_seqid_sync(lop); 13550 lock_owner_rele(lop); 13551 *lopp = NULL; 13552 } 13553 13554 /* need to free up the reference on osp for lock args */ 13555 if (osp != NULL) { 13556 open_stream_rele(osp, VTOR4(vp)); 13557 *ospp = NULL; 13558 } 13559 13560 /* need to free up the reference on oop for lock args */ 13561 if (oop != NULL) { 13562 nfs4_end_open_seqid_sync(oop); 13563 open_owner_rele(oop); 13564 *oopp = NULL; 13565 } 13566 13567 crfree(*cred_otwp); 13568 *cred_otwp = base_cr; 13569 crhold(*cred_otwp); 13570 } 13571 13572 /* 13573 * Function to process the client's recovery for nfs4frlock. 13574 * Returns TRUE if we should retry the lock request; FALSE otherwise. 13575 * 13576 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13577 * COMPOUND4 args/res for calls that need to retry. 13578 * 13579 * Note: the rp's r_lkserlock is *not* dropped during this path. 13580 */ 13581 static bool_t 13582 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep, 13583 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13584 LOCK4args *lock_args, LOCKU4args *locku_args, 13585 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13586 nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp, 13587 nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint, 13588 bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk) 13589 { 13590 nfs4_open_owner_t *oop = *oopp; 13591 nfs4_open_stream_t *osp = *ospp; 13592 nfs4_lock_owner_t *lop = *lopp; 13593 13594 bool_t abort, retry; 13595 13596 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13597 ASSERT((*argspp) != NULL); 13598 ASSERT((*respp) != NULL); 13599 if (lock_args || locku_args) 13600 ASSERT(lop != NULL); 13601 13602 NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug), 13603 (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n")); 13604 13605 retry = TRUE; 13606 abort = FALSE; 13607 if (needrecov) { 13608 nfs4_bseqid_entry_t *bsep = NULL; 13609 nfs_opnum4 op; 13610 13611 op = lock_args ? OP_LOCK : locku_args ? OP_LOCKU : OP_LOCKT; 13612 13613 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) { 13614 seqid4 seqid; 13615 13616 if (lock_args) { 13617 if (lock_args->locker.new_lock_owner == TRUE) 13618 seqid = lock_args->locker.locker4_u. 13619 open_owner.open_seqid; 13620 else 13621 seqid = lock_args->locker.locker4_u. 
13622 lock_owner.lock_seqid; 13623 } else if (locku_args) { 13624 seqid = locku_args->seqid; 13625 } else { 13626 seqid = 0; 13627 } 13628 13629 bsep = nfs4_create_bseqid_entry(oop, lop, vp, 13630 flk->l_pid, (*argspp)->ctag, seqid); 13631 } 13632 13633 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 13634 (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK || 13635 lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp : 13636 NULL, op, bsep); 13637 13638 if (bsep) 13639 kmem_free(bsep, sizeof (*bsep)); 13640 } 13641 13642 /* 13643 * Return that we do not want to retry the request for 3 cases: 13644 * 1. If we received EINTR or are bailing out because of a forced 13645 * unmount, we came into this code path just for the sake of 13646 * initiating recovery, we now need to return the error. 13647 * 2. If we have aborted recovery. 13648 * 3. We received NFS4ERR_BAD_SEQID. 13649 */ 13650 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) || 13651 abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID)) 13652 retry = FALSE; 13653 13654 if (*did_start_fop == TRUE) { 13655 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13656 needrecov); 13657 *did_start_fop = FALSE; 13658 } 13659 13660 if (retry == TRUE) { 13661 nfs_argop4 *argop; 13662 13663 argop = (*argspp)->array; 13664 ASSERT((*argspp)->array_len == 2); 13665 13666 if (argop[1].argop == OP_LOCK) 13667 nfs4args_lock_free(&argop[1]); 13668 else if (argop[1].argop == OP_LOCKT) 13669 nfs4args_lockt_free(&argop[1]); 13670 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13671 if (!ep->error) 13672 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13673 *respp = NULL; 13674 *argspp = NULL; 13675 } 13676 13677 if (lop != NULL) { 13678 nfs4_end_lock_seqid_sync(lop); 13679 lock_owner_rele(lop); 13680 } 13681 13682 *lopp = NULL; 13683 13684 /* need to free up the reference on osp for lock args */ 13685 if (osp != NULL) { 13686 open_stream_rele(osp, rp); 13687 *ospp = NULL; 13688 } 13689 13690 /* need to free up the reference on oop for lock args */ 13691 if (oop != NULL) { 13692 nfs4_end_open_seqid_sync(oop); 13693 open_owner_rele(oop); 13694 *oopp = NULL; 13695 } 13696 13697 return (retry); 13698 } 13699 13700 /* 13701 * Handles the successful reply from the server for nfs4frlock. 13702 */ 13703 static void 13704 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk, 13705 vnode_t *vp, int flag, u_offset_t offset, 13706 nfs4_lost_rqst_t *resend_rqstp) 13707 { 13708 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13709 if ((cmd == F_SETLK || cmd == F_SETLKW) && 13710 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) { 13711 if (ctype == NFS4_LCK_CTYPE_NORM) { 13712 flk->l_pid = ttoproc(curthread)->p_pid; 13713 /* 13714 * We do not register lost locks locally in 13715 * the 'resend' case since the user/application 13716 * doesn't think we have the lock. 13717 */ 13718 ASSERT(!resend_rqstp); 13719 nfs4_register_lock_locally(vp, flk, flag, offset); 13720 } 13721 } 13722 } 13723 13724 /* 13725 * Handle the DENIED reply from the server for nfs4frlock. 13726 * Returns TRUE if we should retry the request; FALSE otherwise. 13727 * 13728 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13729 * COMPOUND4 args/res for calls that need to retry. Can also 13730 * drop and regrab the r_lkserlock. 
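 * The r_lkserlock is only dropped for a blocking request (F_SETLKW)
 * while the thread sleeps in nfs4_block_and_wait(); it is always
 * reacquired before returning.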
13731 */ 13732 static bool_t 13733 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args, 13734 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp, 13735 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd, 13736 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint, 13737 nfs4_recov_state_t *recov_statep, int needrecov, 13738 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13739 clock_t *tick_delayp, short *whencep, int *errorp, 13740 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop, 13741 bool_t *skip_get_err) 13742 { 13743 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13744 13745 if (lock_args) { 13746 nfs4_open_owner_t *oop = *oopp; 13747 nfs4_open_stream_t *osp = *ospp; 13748 nfs4_lock_owner_t *lop = *lopp; 13749 int intr; 13750 13751 /* 13752 * A blocking lock needs to sleep and then retry the request. 13753 * 13754 * Do not block and wait for 'resend' or 'reinstate' 13755 * lock requests, just return the error. 13756 * 13757 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW. 13758 */ 13759 if (cmd == F_SETLKW) { 13760 rnode4_t *rp = VTOR4(vp); 13761 nfs_argop4 *argop = (*argspp)->array; 13762 13763 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13764 13765 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 13766 recov_statep, needrecov); 13767 *did_start_fop = FALSE; 13768 ASSERT((*argspp)->array_len == 2); 13769 if (argop[1].argop == OP_LOCK) 13770 nfs4args_lock_free(&argop[1]); 13771 else if (argop[1].argop == OP_LOCKT) 13772 nfs4args_lockt_free(&argop[1]); 13773 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13774 if (*respp) 13775 (void) xdr_free(xdr_COMPOUND4res_clnt, 13776 (caddr_t)*respp); 13777 *argspp = NULL; 13778 *respp = NULL; 13779 nfs4_end_lock_seqid_sync(lop); 13780 lock_owner_rele(lop); 13781 *lopp = NULL; 13782 if (osp != NULL) { 13783 open_stream_rele(osp, rp); 13784 *ospp = NULL; 13785 } 13786 if (oop != NULL) { 13787 nfs4_end_open_seqid_sync(oop); 13788 open_owner_rele(oop); 13789 *oopp = NULL; 13790 } 13791 13792 nfs_rw_exit(&rp->r_lkserlock); 13793 13794 intr = nfs4_block_and_wait(tick_delayp, rp); 13795 13796 if (intr) { 13797 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13798 RW_WRITER, FALSE); 13799 *errorp = EINTR; 13800 return (FALSE); 13801 } 13802 13803 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13804 RW_WRITER, FALSE); 13805 13806 /* 13807 * Make sure we are still safe to lock with 13808 * regard to mmapping. 13809 */ 13810 if (!nfs4_safelock(vp, flk, cr)) { 13811 *errorp = EAGAIN; 13812 return (FALSE); 13813 } 13814 13815 return (TRUE); 13816 } 13817 if (ctype == NFS4_LCK_CTYPE_NORM) 13818 *errorp = EAGAIN; 13819 *skip_get_err = TRUE; 13820 flk->l_whence = 0; 13821 *whencep = 0; 13822 return (FALSE); 13823 } else if (lockt_args) { 13824 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13825 "nfs4frlock_results_denied: OP_LOCKT DENIED")); 13826 13827 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied, 13828 flk, lockt_args); 13829 13830 /* according to NLM code */ 13831 *errorp = 0; 13832 *whencep = 0; 13833 *skip_get_err = TRUE; 13834 return (FALSE); 13835 } 13836 return (FALSE); 13837 } 13838 13839 /* 13840 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock.
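 *
 * An added note restating the behavior below, for reference: for any
 * status in the recognized list the routine simply returns, leaving
 * *errorp alone so that nfs4frlock_final_cleanup() can map
 * resp->status to an errno via geterrno4(); an unrecognized status is
 * flagged by forcing *errorp to NFS4ERR_INVAL.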
13841 */ 13842 static void 13843 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp) 13844 { 13845 switch (resp->status) { 13846 case NFS4ERR_ACCESS: 13847 case NFS4ERR_ADMIN_REVOKED: 13848 case NFS4ERR_BADHANDLE: 13849 case NFS4ERR_BAD_RANGE: 13850 case NFS4ERR_BAD_SEQID: 13851 case NFS4ERR_BAD_STATEID: 13852 case NFS4ERR_BADXDR: 13853 case NFS4ERR_DEADLOCK: 13854 case NFS4ERR_DELAY: 13855 case NFS4ERR_EXPIRED: 13856 case NFS4ERR_FHEXPIRED: 13857 case NFS4ERR_GRACE: 13858 case NFS4ERR_INVAL: 13859 case NFS4ERR_ISDIR: 13860 case NFS4ERR_LEASE_MOVED: 13861 case NFS4ERR_LOCK_NOTSUPP: 13862 case NFS4ERR_LOCK_RANGE: 13863 case NFS4ERR_MOVED: 13864 case NFS4ERR_NOFILEHANDLE: 13865 case NFS4ERR_NO_GRACE: 13866 case NFS4ERR_OLD_STATEID: 13867 case NFS4ERR_OPENMODE: 13868 case NFS4ERR_RECLAIM_BAD: 13869 case NFS4ERR_RECLAIM_CONFLICT: 13870 case NFS4ERR_RESOURCE: 13871 case NFS4ERR_SERVERFAULT: 13872 case NFS4ERR_STALE: 13873 case NFS4ERR_STALE_CLIENTID: 13874 case NFS4ERR_STALE_STATEID: 13875 return; 13876 default: 13877 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13878 "nfs4frlock_results_default: got unrecognizable " 13879 "res.status %d", resp->status)); 13880 *errorp = NFS4ERR_INVAL; 13881 } 13882 } 13883 13884 /* 13885 * The lock request was successful, so update the client's state. 13886 */ 13887 static void 13888 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args, 13889 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop, 13890 vnode_t *vp, flock64_t *flk, cred_t *cr, 13891 nfs4_lost_rqst_t *resend_rqstp) 13892 { 13893 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13894 13895 if (lock_args) { 13896 LOCK4res *lock_res; 13897 13898 lock_res = &resop->nfs_resop4_u.oplock; 13899 /* update the stateid with server's response */ 13900 13901 if (lock_args->locker.new_lock_owner == TRUE) { 13902 mutex_enter(&lop->lo_lock); 13903 lop->lo_just_created = NFS4_PERM_CREATED; 13904 mutex_exit(&lop->lo_lock); 13905 } 13906 13907 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid); 13908 13909 /* 13910 * If the lock was the result of resending a lost 13911 * request, we've synched up the stateid and seqid 13912 * with the server, but now the server might be out of sync 13913 * with what the application thinks it has for locks. 13914 * Clean that up here. It's unclear whether we should do 13915 * this even if the filesystem has been forcibly unmounted. 13916 * For most servers, it's probably wasted effort, but 13917 * RFC3530 lets servers require that unlocks exactly match 13918 * the locks that are held. 13919 */ 13920 if (resend_rqstp != NULL && 13921 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) { 13922 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop); 13923 } else { 13924 flk->l_whence = 0; 13925 } 13926 } else if (locku_args) { 13927 LOCKU4res *locku_res; 13928 13929 locku_res = &resop->nfs_resop4_u.oplocku; 13930 13931 /* Update the stateid with the server's response */ 13932 nfs4_set_lock_stateid(lop, locku_res->lock_stateid); 13933 } else if (lockt_args) { 13934 /* Switch the lock type to express success, see fcntl */ 13935 flk->l_type = F_UNLCK; 13936 flk->l_whence = 0; 13937 } 13938 } 13939 13940 /* 13941 * Do final cleanup before exiting nfs4frlock. 13942 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13943 * COMPOUND4 args/res for calls that haven't already.
13944 */ 13945 static void 13946 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp, 13947 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint, 13948 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop, 13949 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13950 short whence, u_offset_t offset, struct lm_sysid *ls, 13951 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args, 13952 bool_t did_start_fop, bool_t skip_get_err, 13953 cred_t *cred_otw, cred_t *cred) 13954 { 13955 mntinfo4_t *mi = VTOMI4(vp); 13956 rnode4_t *rp = VTOR4(vp); 13957 int error = *errorp; 13958 nfs_argop4 *argop; 13959 13960 ASSERT(nfs_zone() == mi->mi_zone); 13961 /* 13962 * The client recovery code wants the raw status information, 13963 * so don't map the NFS status code to an errno value for 13964 * non-normal call types. 13965 */ 13966 if (ctype == NFS4_LCK_CTYPE_NORM) { 13967 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE) 13968 *errorp = geterrno4(resp->status); 13969 if (did_start_fop == TRUE) 13970 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep, 13971 needrecov); 13972 13973 /* 13974 * We've established a new lock on the server, so invalidate 13975 * the pages associated with the vnode to get the most up to 13976 * date pages from the server after acquiring the lock. We 13977 * want to be sure that the read operation gets the newest data. 13978 * N.B. 13979 * We used to do this in nfs4frlock_results_ok but that doesn't 13980 * work since VOP_PUTPAGE can call nfs4_commit which calls 13981 * nfs4_start_fop. We flush the pages below after calling 13982 * nfs4_end_fop above 13983 */ 13984 if (!error && resp && resp->status == NFS4_OK) { 13985 int error; 13986 13987 error = VOP_PUTPAGE(vp, (u_offset_t)0, 13988 0, B_INVAL, cred, NULL); 13989 13990 if (error && (error == ENOSPC || error == EDQUOT)) { 13991 rnode4_t *rp = VTOR4(vp); 13992 13993 mutex_enter(&rp->r_statelock); 13994 if (!rp->r_error) 13995 rp->r_error = error; 13996 mutex_exit(&rp->r_statelock); 13997 } 13998 } 13999 } 14000 if (argsp) { 14001 ASSERT(argsp->array_len == 2); 14002 argop = argsp->array; 14003 if (argop[1].argop == OP_LOCK) 14004 nfs4args_lock_free(&argop[1]); 14005 else if (argop[1].argop == OP_LOCKT) 14006 nfs4args_lockt_free(&argop[1]); 14007 kmem_free(argop, 2 * sizeof (nfs_argop4)); 14008 if (resp) 14009 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 14010 } 14011 14012 /* free the reference on the lock owner */ 14013 if (lop != NULL) { 14014 nfs4_end_lock_seqid_sync(lop); 14015 lock_owner_rele(lop); 14016 } 14017 14018 /* need to free up the reference on osp for lock args */ 14019 if (osp != NULL) 14020 open_stream_rele(osp, rp); 14021 14022 /* need to free up the reference on oop for lock args */ 14023 if (oop != NULL) { 14024 nfs4_end_open_seqid_sync(oop); 14025 open_owner_rele(oop); 14026 } 14027 14028 (void) convoff(vp, flk, whence, offset); 14029 14030 lm_rel_sysid(ls); 14031 14032 /* 14033 * Record debug information in the event we get EINVAL. 14034 */ 14035 mutex_enter(&mi->mi_lock); 14036 if (*errorp == EINVAL && (lock_args || locku_args) && 14037 (!(mi->mi_flags & MI4_POSIX_LOCK))) { 14038 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) { 14039 zcmn_err(getzoneid(), CE_NOTE, 14040 "%s operation failed with " 14041 "EINVAL probably since the server, %s," 14042 " doesn't support POSIX style locking", 14043 lock_args ? 
"LOCK" : "LOCKU", 14044 mi->mi_curr_serv->sv_hostname); 14045 mi->mi_flags |= MI4_LOCK_DEBUG; 14046 } 14047 } 14048 mutex_exit(&mi->mi_lock); 14049 14050 if (cred_otw) 14051 crfree(cred_otw); 14052 } 14053 14054 /* 14055 * This calls the server and the local locking code. 14056 * 14057 * Client locks are registerred locally by oring the sysid with 14058 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid. 14059 * We need to distinguish between the two to avoid collision in case one 14060 * machine is used as both client and server. 14061 * 14062 * Blocking lock requests will continually retry to acquire the lock 14063 * forever. 14064 * 14065 * The ctype is defined as follows: 14066 * NFS4_LCK_CTYPE_NORM: normal lock request. 14067 * 14068 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client 14069 * recovery, get the pid from flk instead of curproc, and don't reregister 14070 * the lock locally. 14071 * 14072 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition 14073 * that we will use the information passed in via resend_rqstp to setup the 14074 * lock/locku request. This resend is the exact same request as the 'lost 14075 * lock', and is initiated by the recovery framework. A successful resend 14076 * request can initiate one or more reinstate requests. 14077 * 14078 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it 14079 * does not trigger additional reinstate requests. This lock call type is 14080 * set for setting the v4 server's locking state back to match what the 14081 * client's local locking state is in the event of a received 'lost lock'. 14082 * 14083 * Errors are returned via the nfs4_error_t parameter. 14084 */ 14085 void 14086 nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk, 14087 int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep, 14088 nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp) 14089 { 14090 COMPOUND4args_clnt args, *argsp = NULL; 14091 COMPOUND4res_clnt res, *resp = NULL; 14092 nfs_argop4 *argop; 14093 nfs_resop4 *resop; 14094 rnode4_t *rp; 14095 int doqueue = 1; 14096 clock_t tick_delay; /* delay in clock ticks */ 14097 struct lm_sysid *ls; 14098 LOCK4args *lock_args = NULL; 14099 LOCKU4args *locku_args = NULL; 14100 LOCKT4args *lockt_args = NULL; 14101 nfs4_open_owner_t *oop = NULL; 14102 nfs4_open_stream_t *osp = NULL; 14103 nfs4_lock_owner_t *lop = NULL; 14104 bool_t needrecov = FALSE; 14105 nfs4_recov_state_t recov_state; 14106 short whence; 14107 nfs4_op_hint_t op_hint; 14108 nfs4_lost_rqst_t lost_rqst; 14109 bool_t retry = FALSE; 14110 bool_t did_start_fop = FALSE; 14111 bool_t skip_get_err = FALSE; 14112 cred_t *cred_otw = NULL; 14113 bool_t recovonly; /* just queue request */ 14114 int frc_no_reclaim = 0; 14115 #ifdef DEBUG 14116 char *name; 14117 #endif 14118 14119 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14120 14121 #ifdef DEBUG 14122 name = fn_name(VTOSV(vp)->sv_name); 14123 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: " 14124 "%s: cmd %d, type %d, offset %llu, start %"PRIx64", " 14125 "length %"PRIu64", pid %d, sysid %d, call type %s, " 14126 "resend request %s", name, cmd, flk->l_type, offset, flk->l_start, 14127 flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : 14128 flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype), 14129 resend_rqstp ? 
"TRUE" : "FALSE")); 14130 kmem_free(name, MAXNAMELEN); 14131 #endif 14132 14133 nfs4_error_zinit(ep); 14134 ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset); 14135 if (ep->error) 14136 return; 14137 ep->error = nfs4frlock_get_sysid(&ls, vp, flk); 14138 if (ep->error) 14139 return; 14140 nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence, 14141 vp, cr, &cred_otw); 14142 14143 recov_retry: 14144 nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd, 14145 &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst); 14146 rp = VTOR4(vp); 14147 14148 ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state, 14149 &did_start_fop, &recovonly); 14150 14151 if (ep->error) 14152 goto out; 14153 14154 if (recovonly) { 14155 /* 14156 * Leave the request for the recovery system to deal with. 14157 */ 14158 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 14159 ASSERT(cmd != F_GETLK); 14160 ASSERT(flk->l_type == F_UNLCK); 14161 14162 nfs4_error_init(ep, EINTR); 14163 needrecov = TRUE; 14164 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14165 if (lop != NULL) { 14166 nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT, 14167 NULL, NULL, lop, flk, &lost_rqst, cr, vp); 14168 (void) nfs4_start_recovery(ep, 14169 VTOMI4(vp), vp, NULL, NULL, 14170 (lost_rqst.lr_op == OP_LOCK || 14171 lost_rqst.lr_op == OP_LOCKU) ? 14172 &lost_rqst : NULL, OP_LOCKU, NULL); 14173 lock_owner_rele(lop); 14174 lop = NULL; 14175 } 14176 flk->l_pid = curproc->p_pid; 14177 nfs4_register_lock_locally(vp, flk, flag, offset); 14178 goto out; 14179 } 14180 14181 /* putfh directory fh */ 14182 argop[0].argop = OP_CPUTFH; 14183 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 14184 14185 /* 14186 * Set up the over-the-wire arguments and get references to the 14187 * open owner, etc. 
14188 */ 14189 14190 if (ctype == NFS4_LCK_CTYPE_RESEND || 14191 ctype == NFS4_LCK_CTYPE_REINSTATE) { 14192 nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp, 14193 &argop[1], &lop, &oop, &osp, &lock_args, &locku_args); 14194 } else { 14195 bool_t go_otw = TRUE; 14196 14197 ASSERT(resend_rqstp == NULL); 14198 14199 switch (cmd) { 14200 case F_GETLK: 14201 case F_O_GETLK: 14202 nfs4frlock_setup_lockt_args(ctype, &argop[1], 14203 &lockt_args, argsp, flk, rp); 14204 break; 14205 case F_SETLKW: 14206 case F_SETLK: 14207 if (flk->l_type == F_UNLCK) 14208 nfs4frlock_setup_locku_args(ctype, 14209 &argop[1], &locku_args, flk, 14210 &lop, ep, argsp, 14211 vp, flag, offset, cr, 14212 &skip_get_err, &go_otw); 14213 else 14214 nfs4frlock_setup_lock_args(ctype, 14215 &lock_args, &oop, &osp, &lop, &argop[1], 14216 argsp, flk, cmd, vp, cr, ep); 14217 14218 if (ep->error) 14219 goto out; 14220 14221 switch (ep->stat) { 14222 case NFS4_OK: 14223 break; 14224 case NFS4ERR_DELAY: 14225 /* recov thread never gets this error */ 14226 ASSERT(resend_rqstp == NULL); 14227 ASSERT(did_start_fop); 14228 14229 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 14230 &recov_state, TRUE); 14231 did_start_fop = FALSE; 14232 if (argop[1].argop == OP_LOCK) 14233 nfs4args_lock_free(&argop[1]); 14234 else if (argop[1].argop == OP_LOCKT) 14235 nfs4args_lockt_free(&argop[1]); 14236 kmem_free(argop, 2 * sizeof (nfs_argop4)); 14237 argsp = NULL; 14238 goto recov_retry; 14239 default: 14240 ep->error = EIO; 14241 goto out; 14242 } 14243 break; 14244 default: 14245 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14246 "nfs4_frlock: invalid cmd %d", cmd)); 14247 ep->error = EINVAL; 14248 goto out; 14249 } 14250 14251 if (!go_otw) 14252 goto out; 14253 } 14254 14255 /* XXX should we use the local reclock as a cache ? */ 14256 /* 14257 * Unregister the lock with the local locking code before 14258 * contacting the server. This avoids a potential race where 14259 * another process gets notified that it has been granted a lock 14260 * before we can unregister ourselves locally. 14261 */ 14262 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) { 14263 if (ctype == NFS4_LCK_CTYPE_NORM) 14264 flk->l_pid = ttoproc(curthread)->p_pid; 14265 nfs4_register_lock_locally(vp, flk, flag, offset); 14266 } 14267 14268 /* 14269 * Send the server the lock request. Continually loop with a delay 14270 * if we get the error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE. 14271 */ 14272 resp = &res; 14273 14274 NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug), 14275 (CE_NOTE, 14276 "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first", 14277 rnode4info(rp))); 14278 14279 if (lock_args && frc_no_reclaim) { 14280 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM); 14281 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14282 "nfs4frlock: frc_no_reclaim: clearing reclaim")); 14283 lock_args->reclaim = FALSE; 14284 if (did_reclaimp) 14285 *did_reclaimp = 0; 14286 } 14287 14288 /* 14289 * Do the OTW call.
14290 */ 14291 rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep); 14292 14293 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14294 "nfs4frlock: error %d, status %d", ep->error, resp->status)); 14295 14296 needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp); 14297 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14298 "nfs4frlock: needrecov %d", needrecov)); 14299 14300 if (ep->error == 0 && nfs4_need_to_bump_seqid(resp)) 14301 nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop, 14302 args.ctag); 14303 14304 /* 14305 * Check if one of these mutually exclusive error cases has 14306 * happened: 14307 * need to swap credentials due to access error 14308 * recovery is needed 14309 * different error (only known case is missing Kerberos ticket) 14310 */ 14311 14312 if ((ep->error == EACCES || 14313 (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) && 14314 cred_otw != cr) { 14315 nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov, 14316 &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp, 14317 cr, &cred_otw); 14318 goto recov_retry; 14319 } 14320 14321 if (needrecov) { 14322 /* 14323 * LOCKT requests don't need to recover from lost 14324 * requests since they don't create/modify state. 14325 */ 14326 if ((ep->error == EINTR || 14327 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) && 14328 lockt_args) 14329 goto out; 14330 /* 14331 * Do not attempt recovery for requests initiated by 14332 * the recovery framework. Let the framework redrive them. 14333 */ 14334 if (ctype != NFS4_LCK_CTYPE_NORM) 14335 goto out; 14336 else { 14337 ASSERT(resend_rqstp == NULL); 14338 } 14339 14340 nfs4frlock_save_lost_rqst(ctype, ep->error, 14341 flk_to_locktype(cmd, flk->l_type), 14342 oop, osp, lop, flk, &lost_rqst, cred_otw, vp); 14343 14344 retry = nfs4frlock_recovery(needrecov, ep, &argsp, 14345 &resp, lock_args, locku_args, &oop, &osp, &lop, 14346 rp, vp, &recov_state, op_hint, &did_start_fop, 14347 cmd != F_GETLK ? &lost_rqst : NULL, flk); 14348 14349 if (retry) { 14350 ASSERT(oop == NULL); 14351 ASSERT(osp == NULL); 14352 ASSERT(lop == NULL); 14353 goto recov_retry; 14354 } 14355 goto out; 14356 } 14357 14358 /* 14359 * Bail out if we have reached this point with ep->error set. Can 14360 * happen if (ep->error == EACCES && !needrecov && cred_otw == cr). 14361 * This happens if the Kerberos ticket has expired or has been 14362 * destroyed. 14363 */ 14364 if (ep->error != 0) 14365 goto out; 14366 14367 /* 14368 * Process the reply. 14369 */ 14370 switch (resp->status) { 14371 case NFS4_OK: 14372 resop = &resp->array[1]; 14373 nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset, 14374 resend_rqstp); 14375 /* 14376 * Have a successful lock operation, now update state. 14377 */ 14378 nfs4frlock_update_state(lock_args, locku_args, lockt_args, 14379 resop, lop, vp, flk, cr, resend_rqstp); 14380 break; 14381 14382 case NFS4ERR_DENIED: 14383 resop = &resp->array[1]; 14384 retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args, 14385 &oop, &osp, &lop, cmd, vp, flk, op_hint, 14386 &recov_state, needrecov, &argsp, &resp, 14387 &tick_delay, &whence, &ep->error, resop, cr, 14388 &did_start_fop, &skip_get_err); 14389 14390 if (retry) { 14391 ASSERT(oop == NULL); 14392 ASSERT(osp == NULL); 14393 ASSERT(lop == NULL); 14394 goto recov_retry; 14395 } 14396 break; 14397 /* 14398 * If the server won't let us reclaim, fall back to trying to lock 14399 * the file from scratch. Code elsewhere will check the changeinfo 14400 * to ensure the file hasn't been changed.
14401 */ 14402 case NFS4ERR_NO_GRACE: 14403 if (lock_args && lock_args->reclaim == TRUE) { 14404 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM); 14405 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14406 "nfs4frlock: reclaim: NFS4ERR_NO_GRACE")); 14407 frc_no_reclaim = 1; 14408 /* clean up before retrying */ 14409 needrecov = 0; 14410 (void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp, 14411 lock_args, locku_args, &oop, &osp, &lop, rp, vp, 14412 &recov_state, op_hint, &did_start_fop, NULL, flk); 14413 goto recov_retry; 14414 } 14415 /* FALLTHROUGH */ 14416 14417 default: 14418 nfs4frlock_results_default(resp, &ep->error); 14419 break; 14420 } 14421 out: 14422 /* 14423 * Process and cleanup from error. Make interrupted unlock 14424 * requests look successful, since they will be handled by the 14425 * client recovery code. 14426 */ 14427 nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state, 14428 needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error, 14429 lock_args, locku_args, did_start_fop, 14430 skip_get_err, cred_otw, cr); 14431 14432 if (ep->error == EINTR && flk->l_type == F_UNLCK && 14433 (cmd == F_SETLK || cmd == F_SETLKW)) 14434 ep->error = 0; 14435 } 14436 14437 /* 14438 * nfs4_safelock: 14439 * 14440 * Return non-zero if the given lock request can be handled without 14441 * violating the constraints on concurrent mapping and locking. 14442 */ 14443 14444 static int 14445 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr) 14446 { 14447 rnode4_t *rp = VTOR4(vp); 14448 struct vattr va; 14449 int error; 14450 14451 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14452 ASSERT(rp->r_mapcnt >= 0); 14453 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: " 14454 "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ? 14455 "write" : bfp->l_type == F_RDLCK ? "read" : "unlock", 14456 bfp->l_start, bfp->l_len, rp->r_mapcnt)); 14457 14458 if (rp->r_mapcnt == 0) 14459 return (1); /* always safe if not mapped */ 14460 14461 /* 14462 * If the file is already mapped and there are locks, then they 14463 * should be all safe locks. So adding or removing a lock is safe 14464 * as long as the new request is safe (i.e., whole-file, meaning 14465 * length and starting offset are both zero). 14466 */ 14467 14468 if (bfp->l_start != 0 || bfp->l_len != 0) { 14469 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14470 "cannot lock a memory mapped file unless locking the " 14471 "entire file: start %"PRIx64", len %"PRIx64, 14472 bfp->l_start, bfp->l_len)); 14473 return (0); 14474 } 14475 14476 /* mandatory locking and mapping don't mix */ 14477 va.va_mask = AT_MODE; 14478 error = VOP_GETATTR(vp, &va, 0, cr, NULL); 14479 if (error != 0) { 14480 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14481 "getattr error %d", error)); 14482 return (0); /* treat errors conservatively */ 14483 } 14484 if (MANDLOCK(vp, va.va_mode)) { 14485 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14486 "cannot mandatory lock and mmap a file")); 14487 return (0); 14488 } 14489 14490 return (1); 14491 } 14492 14493 14494 /* 14495 * Register the lock locally within Solaris. 14496 * As the client, we "or" the sysid with LM_SYSID_CLIENT when 14497 * recording locks locally. 14498 * 14499 * This should handle conflicts/cooperation with NFS v2/v3 since all locks 14500 * are registered locally. 
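 *
 * A hypothetical example of the sysid split (values invented purely
 * for illustration): if the lm_sysid for this server maps to 0x10, a
 * lock taken by this client is recorded locally with
 * l_sysid == (0x10 | LM_SYSID_CLIENT), while a lock registered on
 * behalf of the server side uses plain 0x10, so the two never look
 * like the same owner even on a machine acting as both client and
 * server.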
14501 */ 14502 void 14503 nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag, 14504 u_offset_t offset) 14505 { 14506 int oldsysid; 14507 int error; 14508 #ifdef DEBUG 14509 char *name; 14510 #endif 14511 14512 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14513 14514 #ifdef DEBUG 14515 name = fn_name(VTOSV(vp)->sv_name); 14516 NFS4_DEBUG(nfs4_client_lock_debug, 14517 (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, " 14518 "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d", 14519 name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid, 14520 flk->l_sysid)); 14521 kmem_free(name, MAXNAMELEN); 14522 #endif 14523 14524 /* register the lock with local locking */ 14525 oldsysid = flk->l_sysid; 14526 flk->l_sysid |= LM_SYSID_CLIENT; 14527 error = reclock(vp, flk, SETFLCK, flag, offset, NULL); 14528 #ifdef DEBUG 14529 if (error != 0) { 14530 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14531 "nfs4_register_lock_locally: could not register with" 14532 " local locking")); 14533 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14534 "error %d, vp 0x%p, pid %d, sysid 0x%x", 14535 error, (void *)vp, flk->l_pid, flk->l_sysid)); 14536 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14537 "type %d off 0x%" PRIx64 " len 0x%" PRIx64, 14538 flk->l_type, flk->l_start, flk->l_len)); 14539 (void) reclock(vp, flk, 0, flag, offset, NULL); 14540 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14541 "blocked by pid %d sysid 0x%x type %d " 14542 "off 0x%" PRIx64 " len 0x%" PRIx64, 14543 flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start, 14544 flk->l_len)); 14545 } 14546 #endif 14547 flk->l_sysid = oldsysid; 14548 } 14549 14550 /* 14551 * nfs4_lockrelease: 14552 * 14553 * Release any locks on the given vnode that are held by the current 14554 * process. Also removes the lock owner (if one exists) from the rnode's 14555 * list. 14556 */ 14557 static int 14558 nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 14559 { 14560 flock64_t ld; 14561 int ret, error; 14562 rnode4_t *rp; 14563 nfs4_lock_owner_t *lop; 14564 nfs4_recov_state_t recov_state; 14565 mntinfo4_t *mi; 14566 bool_t possible_orphan = FALSE; 14567 bool_t recovonly; 14568 14569 ASSERT((uintptr_t)vp > KERNELBASE); 14570 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14571 14572 rp = VTOR4(vp); 14573 mi = VTOMI4(vp); 14574 14575 /* 14576 * If we have not locked anything then we can 14577 * just return since we have no work to do. 14578 */ 14579 if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) { 14580 return (0); 14581 } 14582 14583 /* 14584 * We need to comprehend that another thread may 14585 * kick off recovery and the lock_owner we have stashed 14586 * in lop might be invalid so we should NOT cache it 14587 * locally! 14588 */ 14589 recov_state.rs_flags = 0; 14590 recov_state.rs_num_retry_despite_err = 0; 14591 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14592 &recovonly); 14593 if (error) { 14594 mutex_enter(&rp->r_statelock); 14595 rp->r_flags |= R4LODANGLERS; 14596 mutex_exit(&rp->r_statelock); 14597 return (error); 14598 } 14599 14600 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14601 14602 /* 14603 * Check if the lock owner might have a lock (request was sent but 14604 * no response was received). Also check if there are any remote 14605 * locks on the file. (In theory we shouldn't have to make this 14606 * second check if there's no lock owner, but for now we'll be 14607 * conservative and do it anyway.) 
If either condition is true, 14608 * send an unlock for the entire file to the server. 14609 * 14610 * Note that no explicit synchronization is needed here. At worst, 14611 * flk_has_remote_locks() will return a false positive, in which case 14612 * the unlock call wastes time but doesn't harm correctness. 14613 */ 14614 14615 if (lop) { 14616 mutex_enter(&lop->lo_lock); 14617 possible_orphan = lop->lo_pending_rqsts; 14618 mutex_exit(&lop->lo_lock); 14619 lock_owner_rele(lop); 14620 } 14621 14622 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14623 14624 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14625 "nfs4_lockrelease: possible orphan %d, remote locks %d, for " 14626 "lop %p.", possible_orphan, flk_has_remote_locks(vp), 14627 (void *)lop)); 14628 14629 if (possible_orphan || flk_has_remote_locks(vp)) { 14630 ld.l_type = F_UNLCK; /* set to unlock entire file */ 14631 ld.l_whence = 0; /* unlock from start of file */ 14632 ld.l_start = 0; 14633 ld.l_len = 0; /* do entire file */ 14634 14635 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, 14636 cr, NULL); 14637 14638 if (ret != 0) { 14639 /* 14640 * If VOP_FRLOCK fails, make sure we unregister 14641 * local locks before we continue. 14642 */ 14643 ld.l_pid = ttoproc(curthread)->p_pid; 14644 nfs4_register_lock_locally(vp, &ld, flag, offset); 14645 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14646 "nfs4_lockrelease: lock release error on vp" 14647 " %p: error %d.\n", (void *)vp, ret)); 14648 } 14649 } 14650 14651 recov_state.rs_flags = 0; 14652 recov_state.rs_num_retry_despite_err = 0; 14653 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14654 &recovonly); 14655 if (error) { 14656 mutex_enter(&rp->r_statelock); 14657 rp->r_flags |= R4LODANGLERS; 14658 mutex_exit(&rp->r_statelock); 14659 return (error); 14660 } 14661 14662 /* 14663 * So, here we need to retrieve the lock-owner 14664 * again (in case recovery has swapped it out) and 14665 * remove it from the rnode's list. 14666 */ 14667 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14668 14669 if (lop) { 14670 nfs4_rnode_remove_lock_owner(rp, lop); 14671 lock_owner_rele(lop); 14672 } 14673 14674 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14675 return (0); 14676 } 14677 14678 /* 14679 * Wait for 'tick_delay' clock ticks. 14680 * Implement exponential backoff until we hit the lease_time of this nfs4_server. 14681 * NOTE: lock_lease_time is in seconds. 14682 * 14683 * XXX For future improvements, should implement a waiting queue scheme.
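 *
 * Illustrative walk-through (assuming, purely for the example, an
 * initial delay of 1 second and a 90 second lease): the delay doubles
 * on each wakeup, 1, 2, 4, ..., 64 seconds, and once a doubling would
 * exceed the lease time it is clamped to the lease time (64 doubles
 * to 128, which is clamped to 90) and stays there for subsequent
 * waits.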
14684 */ 14685 static int 14686 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp) 14687 { 14688 long milliseconds_delay; 14689 time_t lock_lease_time; 14690 14691 /* wait tick_delay clock ticks or siginteruptus */ 14692 if (delay_sig(*tick_delay)) { 14693 return (EINTR); 14694 } 14695 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: " 14696 "reissue the lock request: blocked for %ld clock ticks: %ld " 14697 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000)); 14698 14699 /* get the lease time */ 14700 lock_lease_time = r2lease_time(rp); 14701 14702 /* drv_hztousec converts ticks to microseconds */ 14703 milliseconds_delay = drv_hztousec(*tick_delay) / 1000; 14704 if (milliseconds_delay < lock_lease_time * 1000) { 14705 *tick_delay = 2 * *tick_delay; 14706 if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000) 14707 *tick_delay = drv_usectohz(lock_lease_time*1000*1000); 14708 } 14709 return (0); 14710 } 14711 14712 14713 void 14714 nfs4_vnops_init(void) 14715 { 14716 } 14717 14718 void 14719 nfs4_vnops_fini(void) 14720 { 14721 } 14722 14723 /* 14724 * Return a reference to the directory (parent) vnode for a given vnode, 14725 * using the saved pathname information and the directory file handle. The 14726 * caller is responsible for disposing of the reference. 14727 * Returns zero or an errno value. 14728 * 14729 * Caller should set need_start_op to FALSE if it is the recovery 14730 * thread, or if a start_fop has already been done. Otherwise, TRUE. 14731 */ 14732 int 14733 vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op) 14734 { 14735 svnode_t *svnp; 14736 vnode_t *dvp = NULL; 14737 servinfo4_t *svp; 14738 nfs4_fname_t *mfname; 14739 int error; 14740 14741 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14742 14743 if (vp->v_flag & VROOT) { 14744 nfs4_sharedfh_t *sfh; 14745 nfs_fh4 fh; 14746 mntinfo4_t *mi; 14747 14748 ASSERT(vp->v_type == VREG); 14749 14750 mi = VTOMI4(vp); 14751 svp = mi->mi_curr_serv; 14752 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14753 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len; 14754 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf; 14755 sfh = sfh4_get(&fh, VTOMI4(vp)); 14756 nfs_rw_exit(&svp->sv_lock); 14757 mfname = mi->mi_fname; 14758 fn_hold(mfname); 14759 dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0); 14760 sfh4_rele(&sfh); 14761 14762 if (dvp->v_type == VNON) 14763 dvp->v_type = VDIR; 14764 *dvpp = dvp; 14765 return (0); 14766 } 14767 14768 svnp = VTOSV(vp); 14769 14770 if (svnp == NULL) { 14771 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14772 "shadow node is NULL")); 14773 return (EINVAL); 14774 } 14775 14776 if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) { 14777 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14778 "shadow node name or dfh val == NULL")); 14779 return (EINVAL); 14780 } 14781 14782 error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp, 14783 (int)need_start_op); 14784 if (error != 0) { 14785 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14786 "nfs4_make_dotdot returned %d", error)); 14787 return (error); 14788 } 14789 if (!dvp) { 14790 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14791 "nfs4_make_dotdot returned a NULL dvp")); 14792 return (EIO); 14793 } 14794 if (dvp->v_type == VNON) 14795 dvp->v_type = VDIR; 14796 ASSERT(dvp->v_type == VDIR); 14797 if (VTOR4(vp)->r_flags & R4ISXATTR) { 14798 mutex_enter(&dvp->v_lock); 14799 dvp->v_flag |= V_XATTRDIR; 14800 mutex_exit(&dvp->v_lock); 14801 } 14802 *dvpp = dvp; 14803 
return (0); 14804 } 14805 14806 /* 14807 * Copy the (final) component name of vp to fnamep. maxlen is the maximum 14808 * length that fnamep can accept, including the trailing null. 14809 * Returns 0 if okay, returns an errno value if there was a problem. 14810 */ 14811 14812 int 14813 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen) 14814 { 14815 char *fn; 14816 int err = 0; 14817 servinfo4_t *svp; 14818 svnode_t *shvp; 14819 14820 /* 14821 * If the file being opened has VROOT set, then this is 14822 * a "file" mount. sv_name will not be interesting, so 14823 * go back to the servinfo4 to get the original mount 14824 * path and strip off all but the final edge. Otherwise 14825 * just return the name from the shadow vnode. 14826 */ 14827 14828 if (vp->v_flag & VROOT) { 14829 14830 svp = VTOMI4(vp)->mi_curr_serv; 14831 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14832 14833 fn = strrchr(svp->sv_path, '/'); 14834 if (fn == NULL) 14835 err = EINVAL; 14836 else 14837 fn++; 14838 } else { 14839 shvp = VTOSV(vp); 14840 fn = fn_name(shvp->sv_name); 14841 } 14842 14843 if (err == 0) 14844 if (strlen(fn) < maxlen) 14845 (void) strcpy(fnamep, fn); 14846 else 14847 err = ENAMETOOLONG; 14848 14849 if (vp->v_flag & VROOT) 14850 nfs_rw_exit(&svp->sv_lock); 14851 else 14852 kmem_free(fn, MAXNAMELEN); 14853 14854 return (err); 14855 } 14856 14857 /* 14858 * Bookkeeping for a close that doesn't need to go over the wire. 14859 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise 14860 * it is left at 1. 14861 */ 14862 void 14863 nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp) 14864 { 14865 rnode4_t *rp; 14866 mntinfo4_t *mi; 14867 14868 mi = VTOMI4(vp); 14869 rp = VTOR4(vp); 14870 14871 NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: " 14872 "rp=%p osp=%p", (void *)rp, (void *)osp)); 14873 ASSERT(nfs_zone() == mi->mi_zone); 14874 ASSERT(mutex_owned(&osp->os_sync_lock)); 14875 ASSERT(*have_lockp); 14876 14877 if (!osp->os_valid || 14878 osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) { 14879 return; 14880 } 14881 14882 /* 14883 * This removes the reference obtained at OPEN; i.e., 14884 * when the open stream structure was created. 14885 * 14886 * We don't have to worry about calling 'open_stream_rele' 14887 * since we are currently holding a reference to this 14888 * open stream which means the count cannot go to 0 with 14889 * this decrement. 14890 */ 14891 ASSERT(osp->os_ref_count >= 2); 14892 osp->os_ref_count--; 14893 osp->os_valid = 0; 14894 mutex_exit(&osp->os_sync_lock); 14895 *have_lockp = 0; 14896 14897 nfs4_dec_state_ref_count(mi); 14898 } 14899 14900 /* 14901 * Close all remaining open streams on the rnode. These open streams 14902 * could be here because: 14903 * - The close attempted at either close or delmap failed 14904 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE 14905 * - Someone did mknod on a regular file but never opened it 14906 */ 14907 int 14908 nfs4close_all(vnode_t *vp, cred_t *cr) 14909 { 14910 nfs4_open_stream_t *osp; 14911 int error; 14912 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 14913 rnode4_t *rp; 14914 14915 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14916 14917 error = 0; 14918 rp = VTOR4(vp); 14919 14920 /* 14921 * At this point, all we know is that the last time 14922 * someone called vn_rele, the count was 1. Since then, 14923 * the vnode could have been re-activated.
We want to 14924 * loop through the open streams and close each one, but 14925 * we have to be careful since once we release the rnode 14926 * hash bucket lock, someone else is free to come in and 14927 * re-activate the rnode and add new open streams. The 14928 * strategy is to take the rnode hash bucket lock, verify that 14929 * the count is still 1, grab the open stream off the 14930 * head of the list and mark it invalid, then release the 14931 * rnode hash bucket lock and proceed with that open stream. 14932 * This is ok because nfs4close_one() will acquire the proper 14933 * open/create to close/destroy synchronization for open 14934 * streams, and will ensure that if someone has reopened 14935 * the open stream after we've dropped the hash bucket lock 14936 * then we'll simply return without destroying the 14937 * open stream. 14938 * Repeat until the list is empty. 14939 */ 14940 14941 for (;;) { 14942 14943 /* make sure vnode hasn't been reactivated */ 14944 rw_enter(&rp->r_hashq->r_lock, RW_READER); 14945 mutex_enter(&vp->v_lock); 14946 if (vp->v_count > 1) { 14947 mutex_exit(&vp->v_lock); 14948 rw_exit(&rp->r_hashq->r_lock); 14949 break; 14950 } 14951 /* 14952 * Grabbing r_os_lock before releasing v_lock prevents 14953 * a window where the rnode/open stream could get 14954 * reactivated (and os_force_close set to 0) before we 14955 * had a chance to set os_force_close to 1. 14956 */ 14957 mutex_enter(&rp->r_os_lock); 14958 mutex_exit(&vp->v_lock); 14959 14960 osp = list_head(&rp->r_open_streams); 14961 if (!osp) { 14962 /* nothing left to CLOSE OTW, so return */ 14963 mutex_exit(&rp->r_os_lock); 14964 rw_exit(&rp->r_hashq->r_lock); 14965 break; 14966 } 14967 14968 mutex_enter(&rp->r_statev4_lock); 14969 /* the file can't still be mem mapped */ 14970 ASSERT(rp->r_mapcnt == 0); 14971 if (rp->created_v4) 14972 rp->created_v4 = 0; 14973 mutex_exit(&rp->r_statev4_lock); 14974 14975 /* 14976 * Grab a ref on this open stream; nfs4close_one 14977 * will mark it as invalid 14978 */ 14979 mutex_enter(&osp->os_sync_lock); 14980 osp->os_ref_count++; 14981 osp->os_force_close = 1; 14982 mutex_exit(&osp->os_sync_lock); 14983 mutex_exit(&rp->r_os_lock); 14984 rw_exit(&rp->r_hashq->r_lock); 14985 14986 nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0); 14987 14988 /* Update error if it isn't already non-zero */ 14989 if (error == 0) { 14990 if (e.error) 14991 error = e.error; 14992 else if (e.stat) 14993 error = geterrno4(e.stat); 14994 } 14995 14996 #ifdef DEBUG 14997 nfs4close_all_cnt++; 14998 #endif 14999 /* Release the ref on osp acquired above. */ 15000 open_stream_rele(osp, rp); 15001 15002 /* Proceed to the next open stream, if any */ 15003 } 15004 return (error); 15005 } 15006 15007 /* 15008 * nfs4close_one - close one open stream for a file if needed. 15009 * 15010 * "close_type" indicates which close path this is: 15011 * CLOSE_NORM: close initiated via VOP_CLOSE. 15012 * CLOSE_DELMAP: close initiated via VOP_DELMAP. 15013 * CLOSE_FORCE: close initiated via VOP_INACTIVE. This path forces 15014 * the close and release of client state for this open stream 15015 * (unless someone else has the open stream open). 15016 * CLOSE_RESEND: indicates the request is a replay of an earlier request 15017 * (e.g., due to abort because of a signal). 15018 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN. 15019 * 15020 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client 15021 * recovery.
Instead, the caller is expected to deal with retries. 15022 * 15023 * The caller can either pass in the osp ('provided_osp') or not. 15024 * 15025 * 'access_bits' represents the access we are closing/downgrading. 15026 * 15027 * 'len', 'maxprot', and 'mmap_flags' are used for CLOSE_DELMAP. 'len' is the 15028 * number of bytes we are unmapping, 'maxprot' is the mmap protection, and 15029 * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED). 15030 * 15031 * Errors are returned via the nfs4_error_t. 15032 */ 15033 void 15034 nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr, 15035 int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep, 15036 nfs4_close_type_t close_type, size_t len, uint_t maxprot, 15037 uint_t mmap_flags) 15038 { 15039 nfs4_open_owner_t *oop; 15040 nfs4_open_stream_t *osp = NULL; 15041 int retry = 0; 15042 int num_retries = NFS4_NUM_RECOV_RETRIES; 15043 rnode4_t *rp; 15044 mntinfo4_t *mi; 15045 nfs4_recov_state_t recov_state; 15046 cred_t *cred_otw = NULL; 15047 bool_t recovonly = FALSE; 15048 int isrecov; 15049 int force_close; 15050 int close_failed = 0; 15051 int did_dec_count = 0; 15052 int did_start_op = 0; 15053 int did_force_recovlock = 0; 15054 int did_start_seqid_sync = 0; 15055 int have_sync_lock = 0; 15056 15057 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 15058 15059 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, " 15060 "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x", 15061 (void *)vp, (void *)provided_osp, (void *)lrp, close_type, 15062 len, maxprot, mmap_flags, access_bits)); 15063 15064 nfs4_error_zinit(ep); 15065 rp = VTOR4(vp); 15066 mi = VTOMI4(vp); 15067 isrecov = (close_type == CLOSE_RESEND || 15068 close_type == CLOSE_AFTER_RESEND); 15069 15070 /* 15071 * First get the open owner. 15072 */ 15073 if (!provided_osp) { 15074 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 15075 } else { 15076 oop = provided_osp->os_open_owner; 15077 ASSERT(oop != NULL); 15078 open_owner_hold(oop); 15079 } 15080 15081 if (!oop) { 15082 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15083 "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, " 15084 "close type %d", (void *)rp, (void *)mi, (void *)cr, 15085 (void *)provided_osp, close_type)); 15086 ep->error = EIO; 15087 goto out; 15088 } 15089 15090 cred_otw = nfs4_get_otw_cred(cr, mi, oop); 15091 recov_retry: 15092 osp = NULL; 15093 close_failed = 0; 15094 force_close = (close_type == CLOSE_FORCE); 15095 retry = 0; 15096 did_start_op = 0; 15097 did_force_recovlock = 0; 15098 did_start_seqid_sync = 0; 15099 have_sync_lock = 0; 15100 recovonly = FALSE; 15101 recov_state.rs_flags = 0; 15102 recov_state.rs_num_retry_despite_err = 0; 15103 15104 /* 15105 * Second, synchronize with recovery. 15106 */ 15107 if (!isrecov) { 15108 ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE, 15109 &recov_state, &recovonly); 15110 if (!ep->error) { 15111 did_start_op = 1; 15112 } else { 15113 close_failed = 1; 15114 /* 15115 * If we couldn't get start_fop, but have to 15116 * clean up state, then at least acquire the 15117 * mi_recovlock so we can synchronize with 15118 * recovery. 15119 */ 15120 if (close_type == CLOSE_FORCE) { 15121 (void) nfs_rw_enter_sig(&mi->mi_recovlock, 15122 RW_READER, FALSE); 15123 did_force_recovlock = 1; 15124 } else 15125 goto out; 15126 } 15127 } 15128 15129 /* 15130 * We cannot attempt to get the open seqid sync if nfs4_start_fop 15131 * set 'recovonly' to TRUE since most likely this is due to 15132 * recovery being active (MI4_RECOV_ACTIV).
If recovery is active, 15133 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us 15134 * to retry, causing us to loop until recovery finishes. Plus we 15135 * don't need protection over the open seqid since we're not going 15136 * OTW, hence don't need to use the seqid. 15137 */ 15138 if (recovonly == FALSE) { 15139 /* need to grab the open owner sync before 'os_sync_lock' */ 15140 ep->error = nfs4_start_open_seqid_sync(oop, mi); 15141 if (ep->error == EAGAIN) { 15142 ASSERT(!isrecov); 15143 if (did_start_op) 15144 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15145 &recov_state, TRUE); 15146 if (did_force_recovlock) 15147 nfs_rw_exit(&mi->mi_recovlock); 15148 goto recov_retry; 15149 } 15150 did_start_seqid_sync = 1; 15151 } 15152 15153 /* 15154 * Third, get an open stream and acquire 'os_sync_lock' to 15155 * synchronize the opening/creating of an open stream with the 15156 * closing/destroying of an open stream. 15157 */ 15158 if (!provided_osp) { 15159 /* returns with 'os_sync_lock' held */ 15160 osp = find_open_stream(oop, rp); 15161 if (!osp) { 15162 ep->error = EIO; 15163 goto out; 15164 } 15165 } else { 15166 osp = provided_osp; 15167 open_stream_hold(osp); 15168 mutex_enter(&osp->os_sync_lock); 15169 } 15170 have_sync_lock = 1; 15171 15172 ASSERT(oop == osp->os_open_owner); 15173 15174 /* 15175 * Fourth, do any special pre-OTW CLOSE processing 15176 * based on the specific close type. 15177 */ 15178 if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) && 15179 !did_dec_count) { 15180 ASSERT(osp->os_open_ref_count > 0); 15181 osp->os_open_ref_count--; 15182 did_dec_count = 1; 15183 if (osp->os_open_ref_count == 0) 15184 osp->os_final_close = 1; 15185 } 15186 15187 if (close_type == CLOSE_FORCE) { 15188 /* see if somebody reopened the open stream. */ 15189 if (!osp->os_force_close) { 15190 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, 15191 "nfs4close_one: skip CLOSE_FORCE as osp %p " 15192 "was reopened, vp %p", (void *)osp, (void *)vp)); 15193 ep->error = 0; 15194 ep->stat = NFS4_OK; 15195 goto out; 15196 } 15197 15198 if (!osp->os_final_close && !did_dec_count) { 15199 osp->os_open_ref_count--; 15200 did_dec_count = 1; 15201 } 15202 15203 /* 15204 * We can't depend on os_open_ref_count being 0 due to the 15205 * way executables are opened (VN_RELE to match a VOP_OPEN). 15206 */ 15207 #ifdef NOTYET 15208 ASSERT(osp->os_open_ref_count == 0); 15209 #endif 15210 if (osp->os_open_ref_count != 0) { 15211 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, 15212 "nfs4close_one: should panic here on an " 15213 "ASSERT(osp->os_open_ref_count == 0). Ignoring " 15214 "since this is probably the exec problem.")); 15215 15216 osp->os_open_ref_count = 0; 15217 } 15218 15219 /* 15220 * There is the possibility that nfs4close_one() 15221 * for close_type == CLOSE_DELMAP couldn't find the 15222 * open stream, thus couldn't decrement its os_mapcnt; 15223 * therefore we can't use this ASSERT yet.
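 *
 * (A worked example for the CLOSE_DELMAP accounting a little further
 * below, with an assumed 8K page size: unmapping a 20K MAP_SHARED
 * segment whose maxprot is PROT_READ|PROT_WRITE decrements
 * os_mmap_write, os_mmap_read, and os_mapcnt by btopr(20K) == 3
 * pages each.)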
15224 */ 15225 #ifdef NOTYET 15226 ASSERT(osp->os_mapcnt == 0); 15227 #endif 15228 osp->os_mapcnt = 0; 15229 } 15230 15231 if (close_type == CLOSE_DELMAP && !did_dec_count) { 15232 ASSERT(osp->os_mapcnt >= btopr(len)); 15233 15234 if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 15235 osp->os_mmap_write -= btopr(len); 15236 if (maxprot & PROT_READ) 15237 osp->os_mmap_read -= btopr(len); 15238 if (maxprot & PROT_EXEC) 15239 osp->os_mmap_read -= btopr(len); 15240 /* mirror the PROT_NONE check in nfs4_addmap() */ 15241 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 15242 !(maxprot & PROT_EXEC)) 15243 osp->os_mmap_read -= btopr(len); 15244 osp->os_mapcnt -= btopr(len); 15245 did_dec_count = 1; 15246 } 15247 15248 if (recovonly) { 15249 nfs4_lost_rqst_t lost_rqst; 15250 15251 /* request should not already be in recovery queue */ 15252 ASSERT(lrp == NULL); 15253 nfs4_error_init(ep, EINTR); 15254 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop, 15255 osp, cred_otw, vp); 15256 mutex_exit(&osp->os_sync_lock); 15257 have_sync_lock = 0; 15258 (void) nfs4_start_recovery(ep, mi, vp, NULL, NULL, 15259 lost_rqst.lr_op == OP_CLOSE ? 15260 &lost_rqst : NULL, OP_CLOSE, NULL); 15261 close_failed = 1; 15262 force_close = 0; 15263 goto close_cleanup; 15264 } 15265 15266 /* 15267 * If a previous OTW call got NFS4ERR_BAD_SEQID, then 15268 * we stopped operating on the open owner's <old oo_name, old seqid> 15269 * space, which means we stopped operating on the open stream 15270 * too. So don't go OTW (as the seqid is likely bad, and the 15271 * stateid could be stale, potentially triggering a false 15272 * setclientid), and just clean up the client's internal state. 15273 */ 15274 if (osp->os_orig_oo_name != oop->oo_name) { 15275 NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug, 15276 (CE_NOTE, "nfs4close_one: skip OTW close for osp %p " 15277 "oop %p due to bad seqid (orig oo_name %" PRIx64 " current " 15278 "oo_name %" PRIx64")", 15279 (void *)osp, (void *)oop, osp->os_orig_oo_name, 15280 oop->oo_name)); 15281 close_failed = 1; 15282 } 15283 15284 /* If the file failed recovery, just quit. */ 15285 mutex_enter(&rp->r_statelock); 15286 if (rp->r_flags & R4RECOVERR) { 15287 close_failed = 1; 15288 } 15289 mutex_exit(&rp->r_statelock); 15290 15291 /* 15292 * If the force close path failed to obtain start_fop 15293 * then skip the OTW close and just remove the state. 15294 */ 15295 if (close_failed) 15296 goto close_cleanup; 15297 15298 /* 15299 * Fifth, check to see if there are still mapped pages or other 15300 * opens using this open stream. If there are then we can't 15301 * close yet but we can see if an OPEN_DOWNGRADE is necessary. 15302 */ 15303 if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) { 15304 nfs4_lost_rqst_t new_lost_rqst; 15305 bool_t needrecov = FALSE; 15306 cred_t *odg_cred_otw = NULL; 15307 seqid4 open_dg_seqid = 0; 15308 15309 if (osp->os_delegation) { 15310 /* 15311 * If this open stream was never OPENed OTW then we 15312 * surely can't DOWNGRADE it (especially since the 15313 * osp->open_stateid is really a delegation stateid 15314 * when os_delegation is 1). 
15315 */ 15316 if (access_bits & FREAD) 15317 osp->os_share_acc_read--; 15318 if (access_bits & FWRITE) 15319 osp->os_share_acc_write--; 15320 osp->os_share_deny_none--; 15321 nfs4_error_zinit(ep); 15322 goto out; 15323 } 15324 nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr, 15325 lrp, ep, &odg_cred_otw, &open_dg_seqid); 15326 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp); 15327 if (needrecov && !isrecov) { 15328 bool_t abort; 15329 nfs4_bseqid_entry_t *bsep = NULL; 15330 15331 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) 15332 bsep = nfs4_create_bseqid_entry(oop, NULL, 15333 vp, 0, 15334 lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG, 15335 open_dg_seqid); 15336 15337 nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst, 15338 oop, osp, odg_cred_otw, vp, access_bits, 0); 15339 mutex_exit(&osp->os_sync_lock); 15340 have_sync_lock = 0; 15341 abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL, 15342 new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ? 15343 &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE, 15344 bsep); 15345 if (odg_cred_otw) 15346 crfree(odg_cred_otw); 15347 if (bsep) 15348 kmem_free(bsep, sizeof (*bsep)); 15349 15350 if (abort == TRUE) 15351 goto out; 15352 15353 if (did_start_seqid_sync) { 15354 nfs4_end_open_seqid_sync(oop); 15355 did_start_seqid_sync = 0; 15356 } 15357 open_stream_rele(osp, rp); 15358 15359 if (did_start_op) 15360 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15361 &recov_state, FALSE); 15362 if (did_force_recovlock) 15363 nfs_rw_exit(&mi->mi_recovlock); 15364 15365 goto recov_retry; 15366 } else { 15367 if (odg_cred_otw) 15368 crfree(odg_cred_otw); 15369 } 15370 goto out; 15371 } 15372 15373 /* 15374 * If this open stream was created as the result of an open 15375 * while holding a delegation, then just release it; no need 15376 * to do an OTW close. Otherwise do a "normal" OTW close. 15377 */ 15378 if (osp->os_delegation) { 15379 nfs4close_notw(vp, osp, &have_sync_lock); 15380 nfs4_error_zinit(ep); 15381 goto out; 15382 } 15383 15384 /* 15385 * If this stream is not valid, we're done. 15386 */ 15387 if (!osp->os_valid) { 15388 nfs4_error_zinit(ep); 15389 goto out; 15390 } 15391 15392 /* 15393 * Last open or mmap ref has vanished, need to do an OTW close. 15394 * First check to see if a close is still necessary. 15395 */ 15396 if (osp->os_failed_reopen) { 15397 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15398 "don't close OTW osp %p since reopen failed.", 15399 (void *)osp)); 15400 /* 15401 * Reopen of the open stream failed, hence the 15402 * stateid of the open stream is invalid/stale, and 15403 * sending this OTW would incorrectly cause another 15404 * round of recovery. In this case, we need to set 15405 * the 'os_valid' bit to 0 so another thread doesn't 15406 * come in and re-open this open stream before 15407 * this "closing" thread cleans up state (decrementing 15408 * the nfs4_server_t's state_ref_count and decrementing 15409 * the os_ref_count). 15410 */ 15411 osp->os_valid = 0; 15412 /* 15413 * This removes the reference obtained at OPEN; i.e., 15414 * when the open stream structure was created. 15415 * 15416 * We don't have to worry about calling 'open_stream_rele' 15417 * since we are currently holding a reference to this 15418 * open stream which means the count cannot go to 0 with 15419 * this decrement. 15420 */ 15421 ASSERT(osp->os_ref_count >= 2); 15422 osp->os_ref_count--; 15423 nfs4_error_zinit(ep); 15424 close_failed = 0; 15425 goto close_cleanup; 15426 } 15427 15428 ASSERT(osp->os_ref_count > 1); 15429 15430 /* 15431 * Sixth, try the CLOSE OTW.
15432 */ 15433 nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync, 15434 close_type, ep, &have_sync_lock); 15435 15436 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) { 15437 /* 15438 * Let the recovery thread be responsible for 15439 * removing the state for CLOSE. 15440 */ 15441 close_failed = 1; 15442 force_close = 0; 15443 retry = 0; 15444 } 15445 15446 /* See if we need to retry with a different cred */ 15447 if ((ep->error == EACCES || 15448 (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) && 15449 cred_otw != cr) { 15450 crfree(cred_otw); 15451 cred_otw = cr; 15452 crhold(cred_otw); 15453 retry = 1; 15454 } 15455 15456 if (ep->error || ep->stat) 15457 close_failed = 1; 15458 15459 if (retry && !isrecov && num_retries-- > 0) { 15460 if (have_sync_lock) { 15461 mutex_exit(&osp->os_sync_lock); 15462 have_sync_lock = 0; 15463 } 15464 if (did_start_seqid_sync) { 15465 nfs4_end_open_seqid_sync(oop); 15466 did_start_seqid_sync = 0; 15467 } 15468 open_stream_rele(osp, rp); 15469 15470 if (did_start_op) 15471 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15472 &recov_state, FALSE); 15473 if (did_force_recovlock) 15474 nfs_rw_exit(&mi->mi_recovlock); 15475 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15476 "nfs4close_one: need to retry the close " 15477 "operation")); 15478 goto recov_retry; 15479 } 15480 close_cleanup: 15481 /* 15482 * Seventh and lastly, process our results. 15483 */ 15484 if (close_failed && force_close) { 15485 /* 15486 * It's ok to drop and regrab the 'os_sync_lock' since 15487 * nfs4close_notw() will recheck to make sure the 15488 * "close"/removal of state should happen. 15489 */ 15490 if (!have_sync_lock) { 15491 mutex_enter(&osp->os_sync_lock); 15492 have_sync_lock = 1; 15493 } 15494 /* 15495 * This is last call, remove the ref on the open 15496 * stream created by open and clean everything up. 15497 */ 15498 osp->os_pending_close = 0; 15499 nfs4close_notw(vp, osp, &have_sync_lock); 15500 nfs4_error_zinit(ep); 15501 } 15502 15503 if (!close_failed) { 15504 if (have_sync_lock) { 15505 osp->os_pending_close = 0; 15506 mutex_exit(&osp->os_sync_lock); 15507 have_sync_lock = 0; 15508 } else { 15509 mutex_enter(&osp->os_sync_lock); 15510 osp->os_pending_close = 0; 15511 mutex_exit(&osp->os_sync_lock); 15512 } 15513 if (did_start_op && recov_state.rs_sp != NULL) { 15514 mutex_enter(&recov_state.rs_sp->s_lock); 15515 nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi); 15516 mutex_exit(&recov_state.rs_sp->s_lock); 15517 } else { 15518 nfs4_dec_state_ref_count(mi); 15519 } 15520 nfs4_error_zinit(ep); 15521 } 15522 15523 out: 15524 if (have_sync_lock) 15525 mutex_exit(&osp->os_sync_lock); 15526 if (did_start_op) 15527 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state, 15528 recovonly ? TRUE : FALSE); 15529 if (did_force_recovlock) 15530 nfs_rw_exit(&mi->mi_recovlock); 15531 if (cred_otw) 15532 crfree(cred_otw); 15533 if (osp) 15534 open_stream_rele(osp, rp); 15535 if (oop) { 15536 if (did_start_seqid_sync) 15537 nfs4_end_open_seqid_sync(oop); 15538 open_owner_rele(oop); 15539 } 15540 } 15541 15542 /* 15543 * Convert information returned by the server in the LOCK4denied 15544 * structure to the form required by fcntl. 
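 *
 * Restating the mapping performed below, for reference:
 *
 *	LOCK4denied.locktype -> l_type (READ_LT becomes F_RDLCK,
 *				anything else F_WRLCK)
 *	LOCK4denied.offset   -> l_start, with l_whence set to 0
 *				(SEEK_SET)
 *	LOCK4denied.length   -> l_len
 *	LOCK4denied.owner    -> l_pid, taken directly from the
 *				lockowner if this client built it,
 *				otherwise fabricated via lo_to_pid().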
/*
 * Convert information returned by the server in the LOCK4denied
 * structure to the form required by fcntl.
 */
static void
denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args)
{
	nfs4_lo_name_t *lo;

#ifdef	DEBUG
	if (denied_to_flk_debug) {
		lockt_denied_debug = lockt_denied;
		debug_enter("lockt_denied");
	}
#endif

	flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK;
	flk->l_whence = 0;	/* aka SEEK_SET */
	flk->l_start = lockt_denied->offset;
	flk->l_len = lockt_denied->length;

	/*
	 * If the blocking clientid matches our client id, then we can
	 * interpret the lockowner (since we built it).  If not, then
	 * fabricate a sysid and pid.  Note that the l_sysid field
	 * in *flk already has the local sysid.
	 */

	if (lockt_denied->owner.clientid == lockt_args->owner.clientid) {

		if (lockt_denied->owner.owner_len == sizeof (*lo)) {
			lo = (nfs4_lo_name_t *)
			    lockt_denied->owner.owner_val;

			flk->l_pid = lo->ln_pid;
		} else {
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "denied_to_flk: bad lock owner length\n"));

			flk->l_pid = lo_to_pid(&lockt_denied->owner);
		}
	} else {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "denied_to_flk: foreign clientid\n"));

		/*
		 * Construct a new sysid which should be different from
		 * sysids of other systems.
		 */

		flk->l_sysid++;
		flk->l_pid = lo_to_pid(&lockt_denied->owner);
	}
}

/*
 * Fabricate a pid for a foreign lock owner by summing the bytes of
 * its clientid and owner string.
 */
static pid_t
lo_to_pid(lock_owner4 *lop)
{
	pid_t pid = 0;
	uchar_t *cp;
	int i;

	cp = (uchar_t *)&lop->clientid;

	for (i = 0; i < sizeof (lop->clientid); i++)
		pid += (pid_t)*cp++;

	cp = (uchar_t *)lop->owner_val;

	for (i = 0; i < lop->owner_len; i++)
		pid += (pid_t)*cp++;

	return (pid);
}

/*
 * Given a lock, return the last locked offset ("end") that the lock's
 * "l_len" covers from the start of the lock.  An l_len of 0 means the
 * lock extends to end-of-file, so MAXEND is returned.
 */
static off64_t
lock_to_end(flock64_t *lock)
{
	off64_t lock_end;

	if (lock->l_len == 0)
		lock_end = (off64_t)MAXEND;
	else
		lock_end = lock->l_start + lock->l_len - 1;

	return (lock_end);
}

/*
 * Given the start and end offsets of a lock, return the corresponding
 * length "l_len" for that lock.  An end of MAXEND maps back to an
 * l_len of 0 (a to-EOF lock).
 */
static off64_t
end_to_len(off64_t start, off64_t end)
{
	off64_t lock_len;

	ASSERT(end >= start);
	if (end == MAXEND)
		lock_len = 0;
	else
		lock_len = end - start + 1;

	return (lock_len);
}

/*
 * Given the end offset of a lock, return the first offset past that
 * lock (the next valid start).  If the end is MAXEND (a to-EOF lock),
 * it is returned unchanged.
 */
static off64_t
start_check(off64_t x)
{
	if (x == MAXEND)
		return (x);
	else
		return (x + 1);
}
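/*
 * Worked example (illustrative): for a lock with l_start == 100 and
 * l_len == 50, lock_to_end() returns 149 (the last locked offset),
 * end_to_len(100, 149) maps back to a length of 50, and
 * start_check(149) returns 150, the first offset past the lock.  For
 * a to-EOF lock (l_len == 0), lock_to_end() returns MAXEND,
 * end_to_len(start, MAXEND) returns 0, and start_check(MAXEND) leaves
 * MAXEND unchanged, which is what lets the range sweep in
 * nfs4_reinstitute_local_lock_state() terminate cleanly.
 */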
/*
 * See if these two locks overlap, and if so return 1;
 * otherwise, return 0.
 */
static int
locks_intersect(flock64_t *llfp, flock64_t *curfp)
{
	off64_t llfp_end, curfp_end;

	llfp_end = lock_to_end(llfp);
	curfp_end = lock_to_end(curfp);

	if (((llfp_end >= curfp->l_start) &&
	    (llfp->l_start <= curfp->l_start)) ||
	    ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start)))
		return (1);
	return (0);
}

/*
 * Determine what the intersecting lock region is, and add that to the
 * 'nl_llpp' locklist in increasing order (by l_start).
 */
static void
nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp,
    locklist_t **nl_llpp, vnode_t *vp)
{
	locklist_t *intersect_llp, *tmp_fllp, *cur_fllp;
	off64_t lost_flp_end, local_flp_end, len, start;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:"));

	if (!locks_intersect(lost_flp, local_flp))
		return;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
	    "locks intersect"));

	lost_flp_end = lock_to_end(lost_flp);
	local_flp_end = lock_to_end(local_flp);

	/* Find the starting point of the intersecting region */
	if (local_flp->l_start > lost_flp->l_start)
		start = local_flp->l_start;
	else
		start = lost_flp->l_start;

	/* Find the length of the intersecting region */
	if (lost_flp_end < local_flp_end)
		len = end_to_len(start, lost_flp_end);
	else
		len = end_to_len(start, local_flp_end);

	/*
	 * Prepare the flock structure for the intersection found and insert
	 * it into the new list in increasing l_start order. This list contains
	 * intersections of locks registered by the client with the local host
	 * and the lost lock.
	 * The lock type of this lock is the same as that of the local_flp.
	 */
	intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP);
	intersect_llp->ll_flock.l_start = start;
	intersect_llp->ll_flock.l_len = len;
	intersect_llp->ll_flock.l_type = local_flp->l_type;
	intersect_llp->ll_flock.l_pid = local_flp->l_pid;
	intersect_llp->ll_flock.l_sysid = local_flp->l_sysid;
	intersect_llp->ll_flock.l_whence = 0;	/* aka SEEK_SET */
	intersect_llp->ll_vp = vp;

	tmp_fllp = *nl_llpp;
	cur_fllp = NULL;
	while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start <
	    intersect_llp->ll_flock.l_start) {
		cur_fllp = tmp_fllp;
		tmp_fllp = tmp_fllp->ll_next;
	}
	if (cur_fllp == NULL) {
		/* first on the list */
		intersect_llp->ll_next = *nl_llpp;
		*nl_llpp = intersect_llp;
	} else {
		intersect_llp->ll_next = cur_fllp->ll_next;
		cur_fllp->ll_next = intersect_llp;
	}

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
	    "created lock region: start %"PRIx64" end %"PRIx64" : %s\n",
	    intersect_llp->ll_flock.l_start,
	    intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len,
	    intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE"));
}
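/*
 * Worked example (illustrative): a lost lock covering offsets
 * [100, 299] and a local F_RDLCK covering [200, 399] intersect, so
 * nfs4_add_lock_range() inserts the region with l_start == 200 and
 * l_len == 100 (i.e., [200, 299]) into the ri_llp list, inheriting
 * F_RDLCK, l_pid and l_sysid from the local lock.  Entries are kept
 * sorted by l_start so the unlock sweep below can walk the list in a
 * single pass.
 */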
/*
 * Our current local locking state is potentially different from what
 * the NFSv4 server thinks we have due to a lost lock that was resent
 * and then received.  We need to reset our "NFSv4" locking state to
 * match the current local locking state for this pid, since that is
 * what the user/application sees as what the world is.
 *
 * We cannot afford to drop the open/lock seqid sync since then we can
 * get confused about what the current local locking state "is" versus
 * "was".
 *
 * If we are unable to fix up the locks, we send SIGLOST to the affected
 * process.  This is not done if the filesystem has been forcibly
 * unmounted, in case the process has already exited and a new process
 * exists with the same pid.
 */
static void
nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr,
    nfs4_lock_owner_t *lop)
{
	locklist_t *locks, *llp, *ri_llp, *tmp_llp;
	mntinfo4_t *mi = VTOMI4(vp);
	const int cmd = F_SETLK;
	off64_t cur_start, llp_ll_flock_end, lost_flp_end;
	flock64_t ul_fl;

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
	    "nfs4_reinstitute_local_lock_state"));

	/*
	 * Find active locks for this vp from the local locking code.
	 * Scan through this list and find the locks that intersect with
	 * the lost lock.  Once we find a lock that intersects, add the
	 * intersection area as a new lock to a new list, "ri_llp".  The
	 * lock type of the intersection region lock added to ri_llp is
	 * the same as that found in the active lock list, "locks".  The
	 * intersecting region locks are added to ri_llp in increasing
	 * l_start order.
	 */
	ASSERT(nfs_zone() == mi->mi_zone);

	locks = flk_active_locks_for_vp(vp);
	ri_llp = NULL;

	for (llp = locks; llp != NULL; llp = llp->ll_next) {
		ASSERT(llp->ll_vp == vp);
		/*
		 * Pick locks that belong to this pid/lockowner
		 */
		if (llp->ll_flock.l_pid != lost_flp->l_pid)
			continue;

		nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp);
	}

	/*
	 * Now we have the list of intersections with the lost lock.  These
	 * are the locks that were/are active before the server replied to
	 * the last/lost lock.  Issue these locks to the server here.
	 * Playing these locks to the server will re-establish our current
	 * local locking state with the v4 server.
	 * If we get an error, send SIGLOST to the application for that lock.
	 */

	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: need to issue "
		    "flock: [%"PRIx64" - %"PRIx64"] : %s",
		    llp->ll_flock.l_start,
		    llp->ll_flock.l_start + llp->ll_flock.l_len,
		    llp->ll_flock.l_type == F_RDLCK ? "READ" :
		    llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID"));
		/*
		 * No need to relock what we already have
		 */
		if (llp->ll_flock.l_type == lost_flp->l_type)
			continue;

		push_reinstate(vp, cmd, &llp->ll_flock, cr, lop);
	}
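	/*
	 * Worked example (illustrative) of the unlock sweep below:
	 * suppose the lost lock covers [0, 999] and ri_llp holds the
	 * intersections [100, 199] and [500, 599].  The sweep unlocks
	 * [0, 99], then [200, 499], and finally [600, 999], leaving
	 * exactly the ri_llp ranges locked on the server, in agreement
	 * with the local state.
	 */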
	/*
	 * Now, keeping the start of the lost lock as our reference, parse
	 * the newly created ri_llp locklist to find the ranges that we have
	 * locked with the v4 server but that are not in the current local
	 * locking state.  We need to unlock these ranges.
	 * These are the ranges where the lost lock does not overlap with
	 * the locks in ri_llp but that have been locked since the server
	 * replied to the lost lock.
	 */
	cur_start = lost_flp->l_start;
	lost_flp_end = lock_to_end(lost_flp);

	ul_fl.l_type = F_UNLCK;
	ul_fl.l_whence = 0;	/* aka SEEK_SET */
	ul_fl.l_sysid = lost_flp->l_sysid;
	ul_fl.l_pid = lost_flp->l_pid;

	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
		llp_ll_flock_end = lock_to_end(&llp->ll_flock);

		if (llp->ll_flock.l_start <= cur_start) {
			cur_start = start_check(llp_ll_flock_end);
			continue;
		}
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: "
		    "UNLOCK [%"PRIx64" - %"PRIx64"]",
		    cur_start, llp->ll_flock.l_start));

		ul_fl.l_start = cur_start;
		ul_fl.l_len = end_to_len(cur_start,
		    (llp->ll_flock.l_start - 1));

		push_reinstate(vp, cmd, &ul_fl, cr, lop);
		cur_start = start_check(llp_ll_flock_end);
	}

	/*
	 * In the case where the lost lock ends after all intersecting locks,
	 * unlock the last part of the lost lock range.
	 */
	if (cur_start != start_check(lost_flp_end)) {
		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
		    "nfs4_reinstitute_local_lock_state: UNLOCK end of the "
		    "lost lock region [%"PRIx64" - %"PRIx64"]",
		    cur_start, lost_flp->l_start + lost_flp->l_len));

		ul_fl.l_start = cur_start;
		/*
		 * Is it a to-EOF lock?  If so, unlock to the end.
		 */
		if (lost_flp->l_len == 0)
			ul_fl.l_len = 0;
		else
			ul_fl.l_len = start_check(lost_flp_end) - cur_start;

		push_reinstate(vp, cmd, &ul_fl, cr, lop);
	}

	if (locks != NULL)
		flk_free_locklist(locks);

	/* Free up our newly created locklist */
	for (llp = ri_llp; llp != NULL; ) {
		tmp_llp = llp->ll_next;
		kmem_free(llp, sizeof (locklist_t));
		llp = tmp_llp;
	}

	/*
	 * Now return to the original caller, nfs4frlock(), and let it
	 * naturally drop our seqid syncs.
	 */
}

/*
 * Create a lost state record for the given lock reinstatement request
 * and push it onto the lost state queue.
 */
static void
push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr,
    nfs4_lock_owner_t *lop)
{
	nfs4_lost_rqst_t req;
	nfs_lock_type4 locktype;
	nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	locktype = flk_to_locktype(cmd, flk->l_type);
	nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype,
	    NULL, NULL, lop, flk, &req, cr, vp);
	(void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
	    (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ?
	    &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK,
	    NULL);
}
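/*
 * Illustrative caller sketch (hypothetical, not from this file):
 * push_reinstate() does not perform the lock or unlock itself; it
 * records the request as a "lost" operation with an EINTR error so
 * that the recovery framework will replay it.  Reinstating a
 * whole-file unlock might look like:
 *
 *	flock64_t fl;
 *
 *	fl.l_type = F_UNLCK;
 *	fl.l_whence = 0;	(aka SEEK_SET)
 *	fl.l_start = 0;
 *	fl.l_len = 0;		(to EOF)
 *	fl.l_sysid = sysid;	(assumed: the local sysid)
 *	fl.l_pid = pid;		(assumed: the owning process)
 *	push_reinstate(vp, F_SETLK, &fl, cr, lop);
 *
 * The recovery thread then sends the F_UNLCK to the server as an
 * OP_LOCKU.
 */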