/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
 * All Rights Reserved
 */

#include <sys/param.h>
#include <sys/types.h>
#include <sys/systm.h>
#include <sys/cred.h>
#include <sys/time.h>
#include <sys/vnode.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/file.h>
#include <sys/filio.h>
#include <sys/uio.h>
#include <sys/buf.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/dirent.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/swap.h>
#include <sys/errno.h>
#include <sys/strsubr.h>
#include <sys/sysmacros.h>
#include <sys/kmem.h>
#include <sys/cmn_err.h>
#include <sys/pathconf.h>
#include <sys/utsname.h>
#include <sys/dnlc.h>
#include <sys/acl.h>
#include <sys/systeminfo.h>
#include <sys/policy.h>
#include <sys/sdt.h>
#include <sys/list.h>
#include <sys/stat.h>
#include <sys/zone.h>

#include <rpc/types.h>
#include <rpc/auth.h>
#include <rpc/clnt.h>

#include <nfs/nfs.h>
#include <nfs/nfs_clnt.h>
#include <nfs/nfs_acl.h>
#include <nfs/lm.h>
#include <nfs/nfs4.h>
#include <nfs/nfs4_kprot.h>
#include <nfs/rnode4.h>
#include <nfs/nfs4_clnt.h>

#include <vm/hat.h>
#include <vm/as.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kpm.h>
#include <vm/seg_vn.h>

#include <fs/fs_subr.h>

#include <sys/ddi.h>
#include <sys/int_fmtio.h>

typedef struct {
	nfs4_ga_res_t	*di_garp;
	cred_t		*di_cred;
	hrtime_t	di_time_call;
} dirattr_info_t;

typedef enum nfs4_acl_op {
	NFS4_ACL_GET,
	NFS4_ACL_SET
} nfs4_acl_op_t;

static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *mi);

static void	nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *,
			char *, dirattr_info_t *);

static void	nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *,
			nfs4_open_stream_t *, int *, int *, nfs4_close_type_t,
			nfs4_error_t *, int *);
static int	nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *);
static int	nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
			stable_how4 *);
static int	nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *,
			cred_t *, bool_t, struct uio *);
static int	nfs4setattr(vnode_t *, struct vattr *, int, cred_t *,
			vsecattr_t *);
static int	nfs4openattr(vnode_t *, vnode_t **, int, cred_t *);
static int	nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int);
static int	nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *);
static int	nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *);
static int	nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *);
static int	nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *);
static int	nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **,
			cred_t *, int, int, enum createmode4, int);
static int	nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
			caller_context_t *);
static int	nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *,
			vnode_t *, char *, cred_t *, nfsstat4 *);
static int	nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *,
			vnode_t *, char *, cred_t *, nfsstat4 *);
static int	do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
static void	nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
static int	nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t);
static int	nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *);
static void	nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
			cred_t *);
static int	nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static int	nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
			int, cred_t *);
static int	nfs4_commit(vnode_t *, offset4, count4, cred_t *);
static void	nfs4_set_mod(vnode_t *);
static void	nfs4_get_commit(vnode_t *);
static void	nfs4_get_commit_range(vnode_t *, u_offset_t, size_t);
static int	nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
static int	nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int);
static int	nfs4_sync_commit(vnode_t *, page_t *, offset3, count3,
			cred_t *);
static void	do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3,
			cred_t *);
static int	nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *,
			hrtime_t, vnode_t *, cred_t *);
static int	nfs4_open_non_reg_file(vnode_t **, int, cred_t *);
static int	nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *);
static void	nfs4_register_lock_locally(vnode_t *, struct flock64 *, int,
			u_offset_t);
static int	nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *);
static int	nfs4_block_and_wait(clock_t *, rnode4_t *);
static cred_t	*state_to_cred(nfs4_open_stream_t *);
static int	vtoname(vnode_t *, char *, ssize_t);
static void	denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *);
static pid_t	lo_to_pid(lock_owner4 *);
static void	nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *,
			cred_t *, nfs4_lock_owner_t *);
static void	push_reinstate(vnode_t *, int, flock64_t *, cred_t *,
			nfs4_lock_owner_t *);
static int	open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **);
static void	nfs4_delmap_callback(struct as *, void *, uint_t);
static void	nfs4_free_delmapcall(nfs4_delmapcall_t *);
static nfs4_delmapcall_t	*nfs4_init_delmapcall();
static int	nfs4_find_and_delete_delmapcall(rnode4_t *, int *);
static int	nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t);
static int	nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *,
			uid_t, gid_t, int);

/*
 * Routines that implement the setting of v4 args for the misc. ops
 */
static void	nfs4args_lock_free(nfs_argop4 *);
static void	nfs4args_lockt_free(nfs_argop4 *);
static void	nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *,
			int, rnode4_t *, cred_t *, bitmap4, int *,
			nfs4_stateid_types_t *);
static void	nfs4args_setattr_free(nfs_argop4 *);
static int	nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4,
			bitmap4);
static void	nfs4args_verify_free(nfs_argop4 *);
static void	nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *,
			WRITE4args **, nfs4_stateid_types_t *);

/*
 * These are the vnode ops functions that implement the vnode interface to
 * the networked file system.  See more comments below at nfs4_vnodeops.
 */
static int	nfs4_open(vnode_t **, int, cred_t *, caller_context_t *);
static int	nfs4_close(vnode_t *, int, int, offset_t, cred_t *,
			caller_context_t *);
static int	nfs4_read(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs4_write(vnode_t *, struct uio *, int, cred_t *,
			caller_context_t *);
static int	nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
			caller_context_t *);
static int	nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *,
			caller_context_t *);
static int	nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *);
static int	nfs4_readlink(vnode_t *, struct uio *, cred_t *,
			caller_context_t *);
static int	nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *);
static int	nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl,
			int, vnode_t **, cred_t *, int, caller_context_t *,
			vsecattr_t *);
static int	nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *,
			int);
static int	nfs4_link(vnode_t *, vnode_t *, char *, cred_t *,
			caller_context_t *, int);
static int	nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
			caller_context_t *, int);
static int	nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
			cred_t *, caller_context_t *, int, vsecattr_t *);
static int	nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
			caller_context_t *, int);
static int	nfs4_symlink(vnode_t *, char *, struct vattr *, char *,
			cred_t *, caller_context_t *, int);
static int	nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *,
			caller_context_t *, int);
static int	nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
static int	nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *,
			page_t *[], size_t, struct seg *, caddr_t,
			enum seg_rw, cred_t *, caller_context_t *);
static int	nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
			caller_context_t *);
static int	nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
static int	nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
static int	nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *);
static int	nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
			struct flk_callback *, cred_t *, caller_context_t *);
static int	nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t,
			cred_t *, caller_context_t *);
static int	nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
			uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
static int	nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
			cred_t *, caller_context_t *);
static void	nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *,
			caller_context_t *);
static int	nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
			caller_context_t *);

/*
 * These vnode ops are required to be called from outside this source file,
 * e.g. by ephemeral mount stub vnode ops, and so may not be declared
 * as static.
 */
int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
	    caller_context_t *);
void	nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
int	nfs4_lookup(vnode_t *, char *, vnode_t **,
	    struct pathname *, int, vnode_t *, cred_t *,
	    caller_context_t *, int *, pathname_t *);
int	nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
int	nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
	    caller_context_t *);
int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
	    caller_context_t *);
int	nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
	    caller_context_t *);

/*
 * Used for nfs4_commit_vp() to indicate if we should
 * wait on pending writes.
 */
#define	NFS4_WRITE_NOWAIT	0
#define	NFS4_WRITE_WAIT		1

#define	NFS4_BASE_WAIT_TIME	1	/* 1 second */

/*
 * Error flags used to pass information about certain special errors
 * which need to be handled specially.
 */
#define	NFS_EOF			-98
#define	NFS_VERF_MISMATCH	-97

/*
 * Flags used to differentiate between which operation drove the
 * potential CLOSE OTW. (see nfs4_close_otw_if_necessary)
 */
#define	NFS4_CLOSE_OP		0x1
#define	NFS4_DELMAP_OP		0x2
#define	NFS4_INACTIVE_OP	0x3

#define	ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO))

/* ALIGN64 aligns the given buffer and adjusts the buffer size to 64 bit */
#define	ALIGN64(x, ptr, sz)					\
	x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);	\
	if (x) {						\
		x = sizeof (uint64_t) - (x);			\
		sz -= (x);					\
		ptr += (x);					\
	}

#ifdef DEBUG
int nfs4_client_attr_debug = 0;
int nfs4_client_state_debug = 0;
int nfs4_client_shadow_debug = 0;
int nfs4_client_lock_debug = 0;
int nfs4_seqid_sync = 0;
int nfs4_client_map_debug = 0;
static int nfs4_pageio_debug = 0;
int nfs4_client_inactive_debug = 0;
int nfs4_client_recov_debug = 0;
int nfs4_client_failover_debug = 0;
int nfs4_client_call_debug = 0;
int nfs4_client_lookup_debug = 0;
int nfs4_client_zone_debug = 0;
int nfs4_lost_rqst_debug = 0;
int nfs4_rdattrerr_debug = 0;
int nfs4_open_stream_debug = 0;

int nfs4read_error_inject;

static int nfs4_create_misses = 0;

static int nfs4_readdir_cache_shorts = 0;
static int nfs4_readdir_readahead = 0;

static int nfs4_bio_do_stop = 0;

static int nfs4_lostpage = 0;	/* number of times we lost original page */

int nfs4_mmap_debug = 0;

static int nfs4_pathconf_cache_hits = 0;
static int nfs4_pathconf_cache_misses = 0;

int nfs4close_all_cnt;
int nfs4close_one_debug = 0;
int nfs4close_notw_debug = 0;

int denied_to_flk_debug = 0;
void *lockt_denied_debug;

#endif

/*
 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
 * or NFS4ERR_RESOURCE.
 */
static int confirm_retry_sec = 30;

static int nfs4_lookup_neg_cache = 1;

/*
 * number of pages to read ahead
 * optimized for 100 base-T.
 */
static int nfs4_nra = 4;

static int nfs4_do_symlink_cache = 1;

static int nfs4_pathconf_disable_cache = 0;

/*
 * These are the vnode ops routines which implement the vnode interface to
 * the networked file system.  These routines just take their parameters,
 * make them look networkish by putting the right info into interface structs,
 * and then calling the appropriate remote routine(s) to do the work.
 *
 * Note on directory name lookup caching:  If we detect a stale fhandle,
 * we purge the directory cache relative to that vnode.  This way, the
 * user won't get burned by the cache repeatedly.  See <nfs/rnode4.h> for
 * more details on rnode locking.
 */

struct vnodeops *nfs4_vnodeops;

const fs_operation_def_t nfs4_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs4_open },
	VOPNAME_CLOSE,		{ .vop_close = nfs4_close },
	VOPNAME_READ,		{ .vop_read = nfs4_read },
	VOPNAME_WRITE,		{ .vop_write = nfs4_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = nfs4_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = nfs4_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = nfs4_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = nfs4_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = nfs4_frlock },
	VOPNAME_SPACE,		{ .vop_space = nfs4_space },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = nfs4_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs4_putpage },
	VOPNAME_MAP,		{ .vop_map = nfs4_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = nfs4_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = nfs4_delmap },
	/* no separate nfs4_dump */
	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = nfs4_pageio },
	VOPNAME_DISPOSE,	{ .vop_dispose = nfs4_dispose },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs4_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs4_shrlock },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
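
/*
 * A minimal sketch (not from this file) of how a template like the one
 * above is typically turned into live vnodeops during client setup,
 * using the generic vn_make_ops() interface:
 *
 *	if (vn_make_ops("nfs4", nfs4_vnodeops_template, &nfs4_vnodeops))
 *		cmn_err(CE_WARN, "nfs4: bad vnode ops template");
 */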

/*
 * The following are subroutines and definitions to set args or get res
 * for the different nfsv4 ops
 */

void
nfs4args_lookup_free(nfs_argop4 *argop, int arglen)
{
	int i;

	for (i = 0; i < arglen; i++) {
		if (argop[i].argop == OP_LOOKUP) {
			kmem_free(
			    argop[i].nfs_argop4_u.oplookup.
			    objname.utf8string_val,
			    argop[i].nfs_argop4_u.oplookup.
			    objname.utf8string_len);
		}
	}
}

static void
nfs4args_lock_free(nfs_argop4 *argop)
{
	locker4 *locker = &argop->nfs_argop4_u.oplock.locker;

	if (locker->new_lock_owner == TRUE) {
		open_to_lock_owner4 *open_owner;

		open_owner = &locker->locker4_u.open_owner;
		if (open_owner->lock_owner.owner_val != NULL) {
			kmem_free(open_owner->lock_owner.owner_val,
			    open_owner->lock_owner.owner_len);
		}
	}
}

static void
nfs4args_lockt_free(nfs_argop4 *argop)
{
	lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner;

	if (lowner->owner_val != NULL) {
		kmem_free(lowner->owner_val, lowner->owner_len);
	}
}

static void
nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags,
    rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error,
    nfs4_stateid_types_t *sid_types)
{
	fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes;
	mntinfo4_t *mi;

	argop->argop = OP_SETATTR;
	/*
	 * The stateid is set to 0 if client is not modifying the size
	 * and otherwise to whatever nfs4_get_stateid() returns.
	 *
	 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no
	 * state struct could be found for the process/file pair.  We may
	 * want to change this in the future (by OPENing the file).  See
	 * bug # 4474852.
	 */
	if (vap->va_mask & AT_SIZE) {

		ASSERT(rp != NULL);
		mi = VTOMI4(RTOV4(rp));

		argop->nfs_argop4_u.opsetattr.stateid =
		    nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
		    OP_SETATTR, sid_types, FALSE);
	} else {
		bzero(&argop->nfs_argop4_u.opsetattr.stateid,
		    sizeof (stateid4));
	}

	*error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp);
	if (*error)
		bzero(attr, sizeof (*attr));
}

static void
nfs4args_setattr_free(nfs_argop4 *argop)
{
	nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes);
}

static int
nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op,
    bitmap4 supp)
{
	fattr4 *attr;
	int error = 0;

	argop->argop = op;
	switch (op) {
	case OP_VERIFY:
		attr = &argop->nfs_argop4_u.opverify.obj_attributes;
		break;
	case OP_NVERIFY:
		attr = &argop->nfs_argop4_u.opnverify.obj_attributes;
		break;
	default:
		return (EINVAL);
	}
	if (!error)
		error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp);
	if (error)
		bzero(attr, sizeof (*attr));
	return (error);
}

static void
nfs4args_verify_free(nfs_argop4 *argop)
{
	switch (argop->argop) {
	case OP_VERIFY:
		nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes);
		break;
	case OP_NVERIFY:
		nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes);
		break;
	default:
		break;
	}
}

static void
nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr,
    WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp)
{
	WRITE4args *wargs = &argop->nfs_argop4_u.opwrite;
	mntinfo4_t *mi = VTOMI4(RTOV4(rp));

	argop->argop = OP_WRITE;
	wargs->stable = stable;
	wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id,
	    mi, OP_WRITE, sid_tp);
	wargs->mblk = NULL;
	*wargs_pp = wargs;
}

void
nfs4args_copen_free(OPEN4cargs *open_args)
{
	if (open_args->owner.owner_val) {
		kmem_free(open_args->owner.owner_val,
		    open_args->owner.owner_len);
	}
	if ((open_args->opentype == OPEN4_CREATE) &&
	    (open_args->mode != EXCLUSIVE4)) {
		nfs4_fattr4_free(&open_args->createhow4_u.createattrs);
	}
}

/*
 * XXX:  This is referenced in modstubs.s
 */
struct vnodeops *
nfs4_getvnodeops(void)
{
	return (nfs4_vnodeops);
}

/*
 * The OPEN operation opens a regular file.
 */
/*ARGSUSED3*/
static int
nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	vnode_t *dvp = NULL;
	rnode4_t *rp, *drp;
	int error;
	int just_been_created;
	char fn[MAXNAMELEN];

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: "));
	if (nfs_zone() != VTOMI4(*vpp)->mi_zone)
		return (EIO);
	rp = VTOR4(*vpp);

	/*
	 * Check to see if opening something besides a regular file;
	 * if so skip the OTW call
	 */
	if ((*vpp)->v_type != VREG) {
		error = nfs4_open_non_reg_file(vpp, flag, cr);
		return (error);
	}

	/*
	 * XXX - would like a check right here to know if the file is
	 * executable or not, so as to skip OTW
	 */

	if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0)
		return (error);

	drp = VTOR4(dvp);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
		return (EINTR);

	if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) {
		nfs_rw_exit(&drp->r_rwlock);
		return (error);
	}

	/*
	 * See if this file has just been CREATEd.
	 * If so, clear the flag and update the dnlc, which was previously
	 * skipped in nfs4_create.
	 * XXX need better serialization on this.
	 * XXX move this into the nfs4open_otw call, after we have
	 * XXX acquired the open owner seqid sync.
	 */
	mutex_enter(&rp->r_statev4_lock);
	if (rp->created_v4) {
		rp->created_v4 = 0;
		mutex_exit(&rp->r_statev4_lock);

		dnlc_update(dvp, fn, *vpp);
		/* This is needed so we don't bump the open ref count */
		just_been_created = 1;
	} else {
		mutex_exit(&rp->r_statev4_lock);
		just_been_created = 0;
	}

	/*
	 * If caller specified O_TRUNC/FTRUNC, then be sure to set
	 * FWRITE (to drive successful setattr(size=0) after open)
	 */
	if (flag & FTRUNC)
		flag |= FWRITE;

	error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0,
	    just_been_created);

	if (!error && !((*vpp)->v_flag & VROOT))
		dnlc_update(dvp, fn, *vpp);

	nfs_rw_exit(&drp->r_rwlock);

	/* release the hold from vtodv */
	VN_RELE(dvp);

	/* exchange the shadow for the master vnode, if needed */

	if (error == 0 && IS_SHADOW(*vpp, rp))
		sv_exchange(vpp);

	return (error);
}

/*
 * See if there's a "lost open" request to be saved and recovered.
 */
static void
nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
    nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp,
    vnode_t *dvp, OPEN4cargs *open_args)
{
	vfs_t *vfsp;
	char *srccfp;

	vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp);

	if (error != ETIMEDOUT && error != EINTR &&
	    !NFS4_FRC_UNMT_ERR(error, vfsp)) {
		lost_rqstp->lr_op = 0;
		return;
	}

	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
	    "nfs4open_save_lost_rqst: error %d", error));

	lost_rqstp->lr_op = OP_OPEN;

	/*
	 * The vp (if it is not NULL) and dvp are held and rele'd via
	 * the recovery code.  See nfs4_save_lost_rqst.
	 */
	lost_rqstp->lr_vp = vp;
	lost_rqstp->lr_dvp = dvp;
	lost_rqstp->lr_oop = oop;
	lost_rqstp->lr_osp = NULL;
	lost_rqstp->lr_lop = NULL;
	lost_rqstp->lr_cr = cr;
	lost_rqstp->lr_flk = NULL;
	lost_rqstp->lr_oacc = open_args->share_access;
	lost_rqstp->lr_odeny = open_args->share_deny;
	lost_rqstp->lr_oclaim = open_args->claim;
	if (open_args->claim == CLAIM_DELEGATE_CUR) {
		lost_rqstp->lr_ostateid =
		    open_args->open_claim4_u.delegate_cur_info.delegate_stateid;
		srccfp = open_args->open_claim4_u.delegate_cur_info.cfile;
	} else {
		srccfp = open_args->open_claim4_u.cfile;
	}
	lost_rqstp->lr_ofile.utf8string_len = 0;
	lost_rqstp->lr_ofile.utf8string_val = NULL;
	(void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile);
	lost_rqstp->lr_putfirst = FALSE;
}

struct nfs4_excl_time {
	uint32 seconds;
	uint32 nseconds;
};

/*
 * The OPEN operation creates and/or opens a regular file
 *
 * ARGSUSED
 */
static int
nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va,
    vnode_t **vpp, cred_t *cr, int create_flag, int open_flag,
    enum createmode4 createmode, int file_just_been_created)
{
	rnode4_t *rp;
	rnode4_t *drp = VTOR4(dvp);
	vnode_t *vp = NULL;
	vnode_t *vpi = *vpp;
	bool_t needrecov = FALSE;

	int doqueue = 1;

	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	int argoplist_size;
	int idx_open, idx_fattr;

	GETFH4res *gf_res = NULL;
	OPEN4res *op_res = NULL;
	nfs4_ga_res_t *garp;
	fattr4 *attr = NULL;
	struct nfs4_excl_time verf;
	bool_t did_excl_setup = FALSE;
	int created_osp;

	OPEN4cargs *open_args;
	nfs4_open_owner_t	*oop = NULL;
	nfs4_open_stream_t	*osp = NULL;
	seqid4 seqid = 0;
	bool_t retry_open = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_lost_rqst_t lost_rqst;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	hrtime_t t;
	int acc = 0;
	cred_t *cred_otw = NULL;	/* cred used to do the RPC call */
	cred_t *ncr = NULL;

	nfs4_sharedfh_t *otw_sfh;
	nfs4_sharedfh_t *orig_sfh;
	int fh_differs = 0;
	int numops, setgid_flag;
	int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1;

	/*
	 * Make sure we properly deal with setting the right gid on
	 * a newly created file to reflect the parent's setgid bit
	 */
	setgid_flag = 0;
	if (create_flag && in_va) {

		/*
		 * If the parent directory has the setgid bit set
		 * _and_ the client was able to get a valid mapping
		 * for the parent dir's owner_group, we want to
		 * append NVERIFY(owner_group == dva.va_gid) and
		 * SETATTR to the CREATE compound.
		 */
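		/*
		 * Note: NVERIFY (rather than VERIFY) fails with
		 * NFS4ERR_SAME when the attribute already matches, so
		 * the trailing SETATTR only executes when the new
		 * file's group actually needs to be changed to the
		 * parent's gid; if the server already created the file
		 * with that gid, the compound ends harmlessly at the
		 * NVERIFY.
		 */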
		mutex_enter(&drp->r_statelock);
		if (drp->r_attr.va_mode & VSGID &&
		    drp->r_attr.va_gid != GID_NOBODY) {
			in_va->va_gid = drp->r_attr.va_gid;
			setgid_flag = 1;
		}
		mutex_exit(&drp->r_statelock);
	}

	/*
	 * Normal/non-create compound:
	 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new)
	 *
	 * Open(create) compound no setgid:
	 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) +
	 * RESTOREFH + GETATTR
	 *
	 * Open(create) setgid:
	 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) +
	 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH +
	 * NVERIFY(grp) + SETATTR
	 */
	if (setgid_flag) {
		numops = 10;
		idx_open = 1;
		idx_fattr = 3;
	} else if (create_flag) {
		numops = 7;
		idx_open = 2;
		idx_fattr = 4;
	} else {
		numops = 4;
		idx_open = 1;
		idx_fattr = 3;
	}

	args.array_len = numops;
	argoplist_size = numops * sizeof (nfs_argop4);
	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: "
	    "open %s open flag 0x%x cred %p", file_name, open_flag,
	    (void *)cr));

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
	if (create_flag) {
		/*
		 * We are to create a file.  Initialize the passed in vnode
		 * pointer.
		 */
		vpi = NULL;
	} else {
		/*
		 * Check to see if the client owns a read delegation and is
		 * trying to open for write.  If so, then return the delegation
		 * to avoid the server doing a cb_recall and returning DELAY.
		 * NB - we don't use the statev4_lock here because we'd have
		 * to drop the lock anyway and the result would be stale.
		 */
		if ((open_flag & FWRITE) &&
		    VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ)
			(void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN);

		/*
		 * If the file has a delegation, then do an access check up
		 * front.  This avoids having to do an access check later
		 * after we've already done start_op, which could deadlock.
		 */
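		/*
		 * The access bits granted here accumulate in 'acc', which
		 * is later handed to nfs4_is_otw_open_necessary() so that
		 * the result of this early check can be factored into the
		 * decision of whether an over-the-wire OPEN is needed.
		 */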
		if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) {
			if (open_flag & FREAD &&
			    nfs4_access(vpi, VREAD, 0, cr, NULL) == 0)
				acc |= VREAD;
			if (open_flag & FWRITE &&
			    nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0)
				acc |= VWRITE;
		}
	}

	drp = VTOR4(dvp);

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
	cred_otw = cr;

recov_retry:
	fh_differs = 0;
	nfs4_error_zinit(&e);

	e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state);
	if (e.error) {
		if (ncr != NULL)
			crfree(ncr);
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	args.ctag = TAG_OPEN;
	args.array_len = numops;
	args.array = argop;

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

	/* OPEN: either op 1 or op 2 depending upon create/setgid flags */
	argop[idx_open].argop = OP_COPEN;
	open_args = &argop[idx_open].nfs_argop4_u.opcopen;
	open_args->claim = CLAIM_NULL;

	/* name of file */
	open_args->open_claim4_u.cfile = file_name;
	open_args->owner.owner_len = 0;
	open_args->owner.owner_val = NULL;

	if (create_flag) {
		/* CREATE a file */
		open_args->opentype = OPEN4_CREATE;
		open_args->mode = createmode;
		if (createmode == EXCLUSIVE4) {
			if (did_excl_setup == FALSE) {
				verf.seconds = zone_get_hostid(NULL);
				if (verf.seconds != 0)
					verf.nseconds = newnum();
				else {
					timestruc_t now;

					gethrestime(&now);
					verf.seconds = now.tv_sec;
					verf.nseconds = now.tv_nsec;
				}
				/*
				 * Since the server will use this value for the
				 * mtime, make sure that it can't overflow. Zero
				 * out the MSB. The actual value does not matter
				 * here, only its uniqueness.
				 */
				verf.seconds &= INT32_MAX;
				did_excl_setup = TRUE;
			}

			/* Now copy over verifier to OPEN4args. */
			open_args->createhow4_u.createverf = *(uint64_t *)&verf;
		} else {
			int v_error;
			bitmap4 supp_attrs;
			servinfo4_t *svp;

			attr = &open_args->createhow4_u.createattrs;

			svp = drp->r_server;
			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
			supp_attrs = svp->sv_supp_attrs;
			nfs_rw_exit(&svp->sv_lock);

			/* GUARDED4 or UNCHECKED4 */
			v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN,
			    supp_attrs);
			if (v_error) {
				bzero(attr, sizeof (*attr));
				nfs4args_copen_free(open_args);
				nfs4_end_op(VTOMI4(dvp), dvp, vpi,
				    &recov_state, FALSE);
				if (ncr != NULL)
					crfree(ncr);
				kmem_free(argop, argoplist_size);
				return (v_error);
			}
		}
	} else {
		/* NO CREATE */
		open_args->opentype = OPEN4_NOCREATE;
	}

	if (recov_state.rs_sp != NULL) {
		mutex_enter(&recov_state.rs_sp->s_lock);
		open_args->owner.clientid = recov_state.rs_sp->clientid;
		mutex_exit(&recov_state.rs_sp->s_lock);
	} else {
		/* XXX should we just fail here? */
		open_args->owner.clientid = 0;
	}

	/*
	 * This increments oop's ref count or creates a temporary 'just_created'
	 * open owner that will become valid when this OPEN/OPEN_CONFIRM call
	 * completes.
	 */
	mutex_enter(&VTOMI4(dvp)->mi_lock);

	/* See if a permanent or just created open owner exists */
	oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp));
	if (!oop) {
		/*
		 * This open owner does not exist so create a temporary
		 * just created one.
		 */
		oop = create_open_owner(cr, VTOMI4(dvp));
		ASSERT(oop != NULL);
	}
	mutex_exit(&VTOMI4(dvp)->mi_lock);

	/* this length never changes, do alloc before seqid sync */
	open_args->owner.owner_len = sizeof (oop->oo_name);
	open_args->owner.owner_val =
	    kmem_alloc(open_args->owner.owner_len, KM_SLEEP);

	e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp));
	if (e.error == EAGAIN) {
		open_owner_rele(oop);
		nfs4args_copen_free(open_args);
		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
		if (ncr != NULL) {
			crfree(ncr);
			ncr = NULL;
		}
		goto recov_retry;
	}

	/* Check to see if we need to do the OTW call */
	if (!create_flag) {
		if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi,
		    file_just_been_created, &e.error, acc, &recov_state)) {

			/*
			 * The OTW open is not necessary.  Either
			 * the open can succeed without it (eg.
			 * delegation, error == 0) or the open
			 * must fail due to an access failure
			 * (error != 0).  In either case, tidy
			 * up and return.
			 */

			nfs4_end_open_seqid_sync(oop);
			open_owner_rele(oop);
			nfs4args_copen_free(open_args);
			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE);
			if (ncr != NULL)
				crfree(ncr);
			kmem_free(argop, argoplist_size);
			return (e.error);
		}
	}

	bcopy(&oop->oo_name, open_args->owner.owner_val,
	    open_args->owner.owner_len);

	seqid = nfs4_get_open_seqid(oop) + 1;
	open_args->seqid = seqid;
	open_args->share_access = 0;
	if (open_flag & FREAD)
		open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
	if (open_flag & FWRITE)
		open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
	open_args->share_deny = OPEN4_SHARE_DENY_NONE;

	/*
	 * getfh w/sanity check for idx_open/idx_fattr
	 */
	ASSERT((idx_open + 1) == (idx_fattr - 1));
	argop[idx_open + 1].argop = OP_GETFH;

	/* getattr */
	argop[idx_fattr].argop = OP_GETATTR;
	argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	if (setgid_flag) {
		vattr_t	_v;
		servinfo4_t *svp;
		bitmap4	supp_attrs;

		svp = drp->r_server;
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		supp_attrs = svp->sv_supp_attrs;
		nfs_rw_exit(&svp->sv_lock);

		/*
		 * For setgid case, we need to:
		 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
		 */
		argop[4].argop = OP_SAVEFH;

		argop[5].argop = OP_CPUTFH;
		argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

		argop[6].argop = OP_GETATTR;
		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

		argop[7].argop = OP_RESTOREFH;

		/*
		 * nverify
		 */
		_v.va_mask = AT_GID;
		_v.va_gid = in_va->va_gid;
		if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
		    supp_attrs))) {

			/*
			 * setattr
			 *
			 * We _know_ we're not messing with AT_SIZE or
			 * AT_XTIME, so no need for stateid or flags.
			 * Also we specify NULL rp since we're only
			 * interested in setting owner_group attributes.
			 */
			nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr,
			    supp_attrs, &e.error, 0);
			if (e.error)
				nfs4args_verify_free(&argop[8]);
		}

		if (e.error) {
			/*
			 * XXX - Revisit the last argument to nfs4_end_op()
			 * once 5020486 is fixed.
			 */
			nfs4_end_open_seqid_sync(oop);
			open_owner_rele(oop);
			nfs4args_copen_free(open_args);
			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
			if (ncr != NULL)
				crfree(ncr);
			kmem_free(argop, argoplist_size);
			return (e.error);
		}
	} else if (create_flag) {
		/*
		 * Open(create), no setgid:
		 * 1:savefh(dir) 5:restorefh(dir) 6:getattr(dir)
		 */
		argop[1].argop = OP_SAVEFH;

		argop[5].argop = OP_RESTOREFH;

		argop[6].argop = OP_GETATTR;
		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
	}

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4open_otw: %s call, nm %s, rp %s",
	    needrecov ? "recov" : "first", file_name,
	    rnode4info(VTOR4(dvp))));

	t = gethrtime();

	rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e);

	if (!e.error && nfs4_need_to_bump_seqid(&res))
		nfs4_set_open_seqid(seqid, oop, args.ctag);

	needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp);

	if (e.error || needrecov) {
		bool_t abort = FALSE;

		if (needrecov) {
			nfs4_bseqid_entry_t *bsep = NULL;

			nfs4open_save_lost_rqst(e.error, &lost_rqst, oop,
			    cred_otw, vpi, dvp, open_args);

			if (!e.error && res.status == NFS4ERR_BAD_SEQID) {
				bsep = nfs4_create_bseqid_entry(oop, NULL,
				    vpi, 0, args.ctag, open_args->seqid);
				num_bseqid_retry--;
			}

			abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi,
			    NULL, lost_rqst.lr_op == OP_OPEN ?
			    &lost_rqst : NULL, OP_OPEN, bsep);

			if (bsep)
				kmem_free(bsep, sizeof (*bsep));
			/* give up if we keep getting BAD_SEQID */
			if (num_bseqid_retry == 0)
				abort = TRUE;
			if (abort == TRUE && e.error == 0)
				e.error = geterrno4(res.status);
		}
		nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
		nfs4args_copen_free(open_args);
		if (setgid_flag) {
			nfs4args_verify_free(&argop[8]);
			nfs4args_setattr_free(&argop[9]);
		}
		if (!e.error)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		if (ncr != NULL) {
			crfree(ncr);
			ncr = NULL;
		}
		if (!needrecov || abort == TRUE || e.error == EINTR ||
		    NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) {
			kmem_free(argop, argoplist_size);
			return (e.error);
		}
		goto recov_retry;
	}

	/*
	 * Will check and update lease after checking the rflag for
	 * OPEN_CONFIRM in the successful OPEN call.
	 */
	if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {

		/*
		 * XXX what if we're crossing mount points from server1:/drp
		 * to server2:/drp/rp.
		 */

		/* Signal our end of use of the open seqid */
		nfs4_end_open_seqid_sync(oop);

		/*
		 * This will destroy the open owner if it was just created,
		 * and no one else has put a reference on it.
		 */
		open_owner_rele(oop);
		if (create_flag && (createmode != EXCLUSIVE4) &&
		    res.status == NFS4ERR_BADOWNER)
			nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);

		e.error = geterrno4(res.status);
		nfs4args_copen_free(open_args);
		if (setgid_flag) {
			nfs4args_verify_free(&argop[8]);
			nfs4args_setattr_free(&argop[9]);
		}
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
		/*
		 * If the reply is NFS4ERR_ACCESS, it may be because
		 * we are root (no root net access).  If the real uid
		 * is not root, then retry with the real uid instead.
		 */
		if (ncr != NULL) {
			crfree(ncr);
			ncr = NULL;
		}
		if (res.status == NFS4ERR_ACCESS &&
		    (ncr = crnetadjust(cred_otw)) != NULL) {
			cred_otw = ncr;
			goto recov_retry;
		}
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	resop = &res.array[idx_open];	/* open res */
	op_res = &resop->nfs_resop4_u.opopen;

#ifdef DEBUG
	/*
	 * verify attrset bitmap
	 */
	if (create_flag &&
	    (createmode == UNCHECKED4 || createmode == GUARDED4)) {
		/* make sure attrset returned is what we asked for */
		/* XXX Ignore this 'error' for now */
		if (attr->attrmask != op_res->attrset)
			/* EMPTY */;
	}
#endif

	if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) {
		mutex_enter(&VTOMI4(dvp)->mi_lock);
		VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK;
		mutex_exit(&VTOMI4(dvp)->mi_lock);
	}

	resop = &res.array[idx_open + 1];	/* getfh res */
	gf_res = &resop->nfs_resop4_u.opgetfh;

	otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));

	/*
	 * The open stateid has been updated on the server but not
	 * on the client yet.  There is a path: makenfs4node->nfs4_attr_cache->
	 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW
	 * WRITE call.  That, however, will use the old stateid, so go ahead
	 * and update the open stateid now, before any call to makenfs4node.
	 */
	if (vpi) {
		nfs4_open_stream_t	*tmp_osp;
		rnode4_t		*tmp_rp = VTOR4(vpi);

		tmp_osp = find_open_stream(oop, tmp_rp);
		if (tmp_osp) {
			tmp_osp->open_stateid = op_res->stateid;
			mutex_exit(&tmp_osp->os_sync_lock);
			open_stream_rele(tmp_osp, tmp_rp);
		}

		/*
		 * We must determine if the file handle given by the otw open
		 * is the same as the file handle which was passed in with
		 * *vpp.  This case can be reached if the file we are trying
		 * to open has been removed and another file has been created
		 * having the same file name.  The passed in vnode is released
		 * later.
		 */
		orig_sfh = VTOR4(vpi)->r_fh;
		fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh);
	}

	garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res;

	if (create_flag || fh_differs) {
		int rnode_err = 0;

		vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr,
		    dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh));

		if (e.error)
			PURGE_ATTRCACHE4(vp);
		/*
		 * For the newly created vp case, make sure the rnode
		 * isn't bad before using it.
		 */
		mutex_enter(&(VTOR4(vp))->r_statelock);
		if (VTOR4(vp)->r_flags & R4RECOVERR)
			rnode_err = EIO;
		mutex_exit(&(VTOR4(vp))->r_statelock);

		if (rnode_err) {
			nfs4_end_open_seqid_sync(oop);
			nfs4args_copen_free(open_args);
			if (setgid_flag) {
				nfs4args_verify_free(&argop[8]);
				nfs4args_setattr_free(&argop[9]);
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
			    needrecov);
			open_owner_rele(oop);
			VN_RELE(vp);
			if (ncr != NULL)
				crfree(ncr);
			sfh4_rele(&otw_sfh);
			kmem_free(argop, argoplist_size);
			return (EIO);
		}
	} else {
		vp = vpi;
	}
	sfh4_rele(&otw_sfh);

	/*
	 * It seems odd to get a full set of attrs and then not update
	 * the object's attrcache in the non-create case.  Create case uses
	 * the attrs since makenfs4node checks to see if the attrs need to
	 * be updated (and then updates them).  The non-create case should
	 * update attrs also.
	 */
	if (!create_flag && !fh_differs && !e.error) {
		nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
	}

	nfs4_error_zinit(&e);
	if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
		/* This does not do recovery for vp explicitly. */
		nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE,
		    &retry_open, oop, FALSE, &e, &num_bseqid_retry);

		if (e.error || e.stat) {
			nfs4_end_open_seqid_sync(oop);
			nfs4args_copen_free(open_args);
			if (setgid_flag) {
				nfs4args_verify_free(&argop[8]);
				nfs4args_setattr_free(&argop[9]);
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
			    needrecov);
			open_owner_rele(oop);
			if (create_flag || fh_differs) {
				/* rele the makenfs4node */
				VN_RELE(vp);
			}
			if (ncr != NULL) {
				crfree(ncr);
				ncr = NULL;
			}
			if (retry_open == TRUE) {
				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
				    "nfs4open_otw: retry the open since OPEN "
				    "CONFIRM failed with error %d stat %d",
				    e.error, e.stat));
				if (create_flag && createmode == GUARDED4) {
					NFS4_DEBUG(nfs4_client_recov_debug,
					    (CE_NOTE, "nfs4open_otw: switch "
					    "createmode from GUARDED4 to "
					    "UNCHECKED4"));
					createmode = UNCHECKED4;
				}
				goto recov_retry;
			}
			if (!e.error) {
				if (create_flag && (createmode != EXCLUSIVE4) &&
				    e.stat == NFS4ERR_BADOWNER)
					nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);

				e.error = geterrno4(e.stat);
			}
			kmem_free(argop, argoplist_size);
			return (e.error);
		}
	}

	rp = VTOR4(vp);

	mutex_enter(&rp->r_statev4_lock);
	if (create_flag)
		rp->created_v4 = 1;
	mutex_exit(&rp->r_statev4_lock);

	mutex_enter(&oop->oo_lock);
	/* Doesn't matter if 'oo_just_created' was already set; make it permanent */
	oop->oo_just_created = NFS4_PERM_CREATED;
	if (oop->oo_cred_otw)
		crfree(oop->oo_cred_otw);
	oop->oo_cred_otw = cred_otw;
	crhold(oop->oo_cred_otw);
	mutex_exit(&oop->oo_lock);

	/* returns with 'os_sync_lock' held */
	osp = find_or_create_open_stream(oop, rp, &created_osp);
	if (!osp) {
		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
		    "nfs4open_otw: failed to create an open stream"));
		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: "
		    "signal our end of use of the open seqid"));

		nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
		nfs4args_copen_free(open_args);
		if (setgid_flag) {
			nfs4args_verify_free(&argop[8]);
			nfs4args_setattr_free(&argop[9]);
		}
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
		if (create_flag || fh_differs)
			VN_RELE(vp);
		if (ncr != NULL)
			crfree(ncr);

		kmem_free(argop, argoplist_size);
		return (EINVAL);

	}

	osp->open_stateid = op_res->stateid;

	if (open_flag & FREAD)
		osp->os_share_acc_read++;
	if (open_flag & FWRITE)
		osp->os_share_acc_write++;
	osp->os_share_deny_none++;

	/*
	 * Need to reset this bitfield for the possible case where we were
	 * going to OTW CLOSE the file, got a non-recoverable error, and before
	 * we could retry the CLOSE, OPENed the file again.
	 */
	ASSERT(osp->os_open_owner->oo_seqid_inuse);
	osp->os_final_close = 0;
	osp->os_force_close = 0;
#ifdef DEBUG
	if (osp->os_failed_reopen)
		NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:"
		    " clearing os_failed_reopen for osp %p, cr %p, rp %s",
		    (void *)osp, (void *)cr, rnode4info(rp)));
#endif
	osp->os_failed_reopen = 0;

	mutex_exit(&osp->os_sync_lock);

	nfs4_end_open_seqid_sync(oop);

	if (created_osp && recov_state.rs_sp != NULL) {
		mutex_enter(&recov_state.rs_sp->s_lock);
		nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp));
		mutex_exit(&recov_state.rs_sp->s_lock);
	}

	/* get rid of our reference to find oop */
	open_owner_rele(oop);

	open_stream_rele(osp, rp);

	/* accept delegation, if any */
	nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw);

	nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);

	if (createmode == EXCLUSIVE4 &&
	    (in_va->va_mask & ~(AT_GID | AT_SIZE))) {
		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:"
		    " EXCLUSIVE4: sending a SETATTR"));
		/*
		 * If doing an exclusive create, then generate
		 * a SETATTR to set the initial attributes.
		 * Try to set the mtime and the atime to the
		 * server's current time.  It is somewhat
		 * expected that these fields will be used to
		 * store the exclusive create cookie.  If not,
		 * server implementors will need to know that
		 * a SETATTR will follow an exclusive create
		 * and the cookie should be destroyed if
		 * appropriate.
		 *
		 * The AT_GID and AT_SIZE bits are turned off
		 * so that the SETATTR request will not attempt
		 * to process these.  The gid will be set
		 * separately if appropriate.  The size is turned
		 * off because it is assumed that a new file will
		 * be created empty and if the file wasn't empty,
		 * then the exclusive create will have failed
		 * because the file must have existed already.
		 * Therefore, no truncate operation is needed.
		 */
		in_va->va_mask &= ~(AT_GID | AT_SIZE);
		in_va->va_mask |= (AT_MTIME | AT_ATIME);

		e.error = nfs4setattr(vp, in_va, 0, cr, NULL);
		if (e.error) {
			/*
			 * Couldn't correct the attributes of
			 * the newly created file and the
			 * attributes are wrong.  Remove the
			 * file and return an error to the
			 * application.
			 */
			/* XXX will this take care of client state? */
			NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
			    "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:"
			    " remove file", e.error));
			VN_RELE(vp);
			(void) nfs4_remove(dvp, file_name, cr, NULL, 0);
			/*
			 * Since we've reled the vnode and removed
			 * the file we now need to return the error.
			 * At this point we don't want to update the
			 * dircaches, call nfs4_waitfor_purge_complete
			 * or set vpp to vp so we need to skip these
			 * as well.
			 */
			goto skip_update_dircaches;
		}
	}

	/*
	 * If we created or found the correct vnode, due to create_flag or
	 * fh_differs being set, then update directory cache attribute, readdir
	 * and dnlc caches.
	 */
	if (create_flag || fh_differs) {
		dirattr_info_t dinfo, *dinfop;

		/*
		 * Make sure getattr succeeded before using results.
		 * note: op 7 (res.array[6]) is getattr(dir) for both
		 * flavors of open(create).
		 */
		if (create_flag && res.status == NFS4_OK) {
			dinfo.di_time_call = t;
			dinfo.di_cred = cr;
			dinfo.di_garp =
			    &res.array[6].nfs_resop4_u.opgetattr.ga_res;
			dinfop = &dinfo;
		} else {
			dinfop = NULL;
		}

		nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name,
		    dinfop);
	}

	/*
	 * If the page cache for this file was flushed from actions
	 * above, it was done asynchronously and if that is true,
	 * there is a need to wait here for it to complete.  This must
	 * be done outside of start_fop/end_fop.
	 */
	(void) nfs4_waitfor_purge_complete(vp);

	/*
	 * It is implicit that we are in the open case (create_flag == 0) since
	 * fh_differs can only be set to a non-zero value in the open case.
	 */
	if (fh_differs != 0 && vpi != NULL)
		VN_RELE(vpi);

	/*
	 * Be sure to set *vpp to the correct value before returning.
	 */
	*vpp = vp;

skip_update_dircaches:

	nfs4args_copen_free(open_args);
	if (setgid_flag) {
		nfs4args_verify_free(&argop[8]);
		nfs4args_setattr_free(&argop[9]);
	}
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	if (ncr)
		crfree(ncr);
	kmem_free(argop, argoplist_size);
	return (e.error);
}

/*
 * Reopen an open instance.  cf. nfs4open_otw().
 *
 * Errors are returned via the nfs4_error_t parameter.
 * - ep->error contains an errno value or zero.
 * - if it is zero, ep->stat is set to an NFS status code, if any.
 *   If the file could not be reopened, but the caller should continue, the
 *   file is marked dead and no error values are returned.  If the caller
 *   should stop recovering open files and start over, either the ep->error
 *   value or ep->stat will indicate an error (either something that requires
 *   recovery or EAGAIN).  Note that some recovery (e.g., expired volatile
 *   filehandles) may be handled silently by this routine.
 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state
 *   will be started, so the caller should not do it.
 *
 * Gotos:
 * - kill_file : reopen failed in such a way that the file must be marked
 *   dead and the open stream's 'os_failed_reopen' set to 1.  This is for
 *   cases where recovery is not possible.
 * - failed_reopen : same as above, except that the file has already been
 *   marked dead, so no need to do it again.
 * - bailout : reopen failed but we are able to recover and retry the reopen -
 *   either within this function immediately or via the calling function.
 */
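/*
 * A minimal sketch (hypothetical caller, not from this file) of the error
 * contract described above:
 *
 *	nfs4_error_t e;
 *
 *	nfs4_reopen(vp, osp, &e, CLAIM_NULL, FALSE, FALSE);
 *	if (e.error == 0 && e.stat == NFS4_OK) {
 *		continue: either the reopen succeeded, or the file was
 *		marked dead and should simply be skipped;
 *	} else {
 *		stop recovering open files and start over;
 *	}
 */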

void
nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep,
    open_claim_type4 claim, bool_t frc_use_claim_previous,
    bool_t is_recov)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	nfs_argop4 argop[4];
	nfs_resop4 *resop;
	OPEN4res *op_res = NULL;
	OPEN4cargs *open_args;
	GETFH4res *gf_res;
	rnode4_t *rp = VTOR4(vp);
	int doqueue = 1;
	cred_t *cr = NULL, *cred_otw = NULL;
	nfs4_open_owner_t *oop = NULL;
	seqid4 seqid;
	nfs4_ga_res_t *garp;
	char fn[MAXNAMELEN];
	nfs4_recov_state_t recov = {NULL, 0};
	nfs4_lost_rqst_t lost_rqst;
	mntinfo4_t *mi = VTOMI4(vp);
	bool_t abort;
	char *failed_msg = "";
	int fh_different;
	hrtime_t t;
	nfs4_bseqid_entry_t *bsep = NULL;

	ASSERT(nfs4_consistent_type(vp));
	ASSERT(nfs_zone() == mi->mi_zone);

	nfs4_error_zinit(ep);

	/* this is the cred used to find the open owner */
	cr = state_to_cred(osp);
	if (cr == NULL) {
		failed_msg = "Couldn't reopen: no cred";
		goto kill_file;
	}
	/* use this cred for OTW operations */
	cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner);

top:
	nfs4_error_zinit(ep);

	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
		/* File system has been unmounted, quit */
		ep->error = EIO;
		failed_msg = "Couldn't reopen: file system has been unmounted";
		goto kill_file;
	}

	oop = osp->os_open_owner;

	ASSERT(oop != NULL);
	if (oop == NULL) {	/* be defensive in non-DEBUG */
		failed_msg = "can't reopen: no open owner";
		goto kill_file;
	}
	open_owner_hold(oop);

	ep->error = nfs4_start_open_seqid_sync(oop, mi);
	if (ep->error) {
		open_owner_rele(oop);
		oop = NULL;
		goto bailout;
	}

	/*
	 * If the rnode has a delegation and the delegation has been
	 * recovered and the server didn't request a recall and the caller
	 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during
	 * recovery) and the rnode hasn't been marked dead, then install
	 * the delegation stateid in the open stream.  Otherwise, proceed
	 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN.
	 */
	mutex_enter(&rp->r_statev4_lock);
	if (rp->r_deleg_type != OPEN_DELEGATE_NONE &&
	    !rp->r_deleg_return_pending &&
	    (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) &&
	    !rp->r_deleg_needs_recall &&
	    claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous &&
	    !(rp->r_flags & R4RECOVERR)) {
		mutex_enter(&osp->os_sync_lock);
		osp->os_delegation = 1;
		osp->open_stateid = rp->r_deleg_stateid;
		mutex_exit(&osp->os_sync_lock);
		mutex_exit(&rp->r_statev4_lock);
		goto bailout;
	}
	mutex_exit(&rp->r_statev4_lock);

	/*
	 * If the file failed recovery, just quit.  This failure need not
	 * affect other reopens, so don't return an error.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERR) {
		mutex_exit(&rp->r_statelock);
		ep->error = 0;
		goto failed_reopen;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * argop is empty here
	 *
	 * PUTFH, OPEN, GETFH, GETATTR
	 */
	args.ctag = TAG_REOPEN;
	args.array_len = 4;
	args.array = argop;

	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
	    "nfs4_reopen: file is type %d, id %s",
	    vp->v_type, rnode4info(VTOR4(vp))));

	argop[0].argop = OP_CPUTFH;

	if (claim != CLAIM_PREVIOUS) {
		/*
		 * if this is a file mount then
		 * use the mntinfo parentfh
		 */
		argop[0].nfs_argop4_u.opcputfh.sfh =
		    (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
		    VTOSV(vp)->sv_dfh;
	} else {
		/* putfh fh to reopen */
		argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
	}

	argop[1].argop = OP_COPEN;
	open_args = &argop[1].nfs_argop4_u.opcopen;
	open_args->claim = claim;

	if (claim == CLAIM_NULL) {

		if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
			nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
			    "failed for vp 0x%p for CLAIM_NULL with %m",
			    (void *)vp);
			failed_msg = "Couldn't reopen: vtoname failed for "
			    "CLAIM_NULL";
			/* nothing allocated yet */
			goto kill_file;
		}

		open_args->open_claim4_u.cfile = fn;
	} else if (claim == CLAIM_PREVIOUS) {

		/*
		 * We have two cases to deal with here:
		 * 1) We're being called to reopen files in order to satisfy
		 *    a lock operation request which requires us to explicitly
		 *    reopen files which were opened under a delegation.  If
		 *    we're in recovery, we *must* use CLAIM_PREVIOUS.  In
		 *    that case, frc_use_claim_previous is TRUE and we must
		 *    use the rnode's current delegation type (r_deleg_type).
		 * 2) We're reopening files during some form of recovery.
		 *    In this case, frc_use_claim_previous is FALSE and we
		 *    use the delegation type appropriate for recovery
		 *    (r_deleg_needs_recovery).
		 */
		mutex_enter(&rp->r_statev4_lock);
		open_args->open_claim4_u.delegate_type =
		    frc_use_claim_previous ?
		    rp->r_deleg_type :
		    rp->r_deleg_needs_recovery;
		mutex_exit(&rp->r_statev4_lock);
1826 rp->r_deleg_type : 1827 rp->r_deleg_needs_recovery; 1828 mutex_exit(&rp->r_statev4_lock); 1829 1830 } else if (claim == CLAIM_DELEGATE_CUR) { 1831 1832 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1833 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1834 "failed for vp 0x%p for CLAIM_DELEGATE_CUR " 1835 "with %m", (void *)vp); 1836 failed_msg = "Couldn't reopen: vtoname failed for " 1837 "CLAIM_DELEGATE_CUR"; 1838 /* nothing allocated yet */ 1839 goto kill_file; 1840 } 1841 1842 mutex_enter(&rp->r_statev4_lock); 1843 open_args->open_claim4_u.delegate_cur_info.delegate_stateid = 1844 rp->r_deleg_stateid; 1845 mutex_exit(&rp->r_statev4_lock); 1846 1847 open_args->open_claim4_u.delegate_cur_info.cfile = fn; 1848 } 1849 open_args->opentype = OPEN4_NOCREATE; 1850 open_args->owner.clientid = mi2clientid(mi); 1851 open_args->owner.owner_len = sizeof (oop->oo_name); 1852 open_args->owner.owner_val = 1853 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1854 bcopy(&oop->oo_name, open_args->owner.owner_val, 1855 open_args->owner.owner_len); 1856 open_args->share_access = 0; 1857 open_args->share_deny = 0; 1858 1859 mutex_enter(&osp->os_sync_lock); 1860 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp " 1861 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: " 1862 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ", 1863 (void *)osp, (void *)rp, osp->os_share_acc_read, 1864 osp->os_share_acc_write, osp->os_open_ref_count, 1865 osp->os_mmap_read, osp->os_mmap_write, claim)); 1866 1867 if (osp->os_share_acc_read || osp->os_mmap_read) 1868 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1869 if (osp->os_share_acc_write || osp->os_mmap_write) 1870 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1871 if (osp->os_share_deny_read) 1872 open_args->share_deny |= OPEN4_SHARE_DENY_READ; 1873 if (osp->os_share_deny_write) 1874 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE; 1875 mutex_exit(&osp->os_sync_lock); 1876 1877 seqid = nfs4_get_open_seqid(oop) + 1; 1878 open_args->seqid = seqid; 1879 1880 /* Construct the getfh part of the compound */ 1881 argop[2].argop = OP_GETFH; 1882 1883 /* Construct the getattr part of the compound */ 1884 argop[3].argop = OP_GETATTR; 1885 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1886 argop[3].nfs_argop4_u.opgetattr.mi = mi; 1887 1888 t = gethrtime(); 1889 1890 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 1891 1892 if (ep->error) { 1893 if (!is_recov && !frc_use_claim_previous && 1894 (ep->error == EINTR || ep->error == ETIMEDOUT || 1895 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) { 1896 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop, 1897 cred_otw, vp, NULL, open_args); 1898 abort = nfs4_start_recovery(ep, 1899 VTOMI4(vp), vp, NULL, NULL, 1900 lost_rqst.lr_op == OP_OPEN ? 
1901 &lost_rqst : NULL, OP_OPEN, NULL); 1902 nfs4args_copen_free(open_args); 1903 goto bailout; 1904 } 1905 1906 nfs4args_copen_free(open_args); 1907 1908 if (ep->error == EACCES && cred_otw != cr) { 1909 crfree(cred_otw); 1910 cred_otw = cr; 1911 crhold(cred_otw); 1912 nfs4_end_open_seqid_sync(oop); 1913 open_owner_rele(oop); 1914 oop = NULL; 1915 goto top; 1916 } 1917 if (ep->error == ETIMEDOUT) 1918 goto bailout; 1919 failed_msg = "Couldn't reopen: rpc error"; 1920 goto kill_file; 1921 } 1922 1923 if (nfs4_need_to_bump_seqid(&res)) 1924 nfs4_set_open_seqid(seqid, oop, args.ctag); 1925 1926 switch (res.status) { 1927 case NFS4_OK: 1928 if (recov.rs_flags & NFS4_RS_DELAY_MSG) { 1929 mutex_enter(&rp->r_statelock); 1930 rp->r_delay_interval = 0; 1931 mutex_exit(&rp->r_statelock); 1932 } 1933 break; 1934 case NFS4ERR_BAD_SEQID: 1935 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0, 1936 args.ctag, open_args->seqid); 1937 1938 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, 1939 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst : 1940 NULL, OP_OPEN, bsep); 1941 1942 nfs4args_copen_free(open_args); 1943 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1944 nfs4_end_open_seqid_sync(oop); 1945 open_owner_rele(oop); 1946 oop = NULL; 1947 kmem_free(bsep, sizeof (*bsep)); 1948 1949 goto kill_file; 1950 case NFS4ERR_NO_GRACE: 1951 nfs4args_copen_free(open_args); 1952 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1953 nfs4_end_open_seqid_sync(oop); 1954 open_owner_rele(oop); 1955 oop = NULL; 1956 if (claim == CLAIM_PREVIOUS) { 1957 /* 1958 * Retry as a plain open. We don't need to worry about 1959 * checking the changeinfo: it is acceptable for a 1960 * client to re-open a file and continue processing 1961 * (in the absence of locks). 1962 */ 1963 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1964 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; " 1965 "will retry as CLAIM_NULL")); 1966 claim = CLAIM_NULL; 1967 nfs4_mi_kstat_inc_no_grace(mi); 1968 goto top; 1969 } 1970 failed_msg = 1971 "Couldn't reopen: tried reclaim outside grace period. "; 1972 goto kill_file; 1973 case NFS4ERR_GRACE: 1974 nfs4_set_grace_wait(mi); 1975 nfs4args_copen_free(open_args); 1976 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1977 nfs4_end_open_seqid_sync(oop); 1978 open_owner_rele(oop); 1979 oop = NULL; 1980 ep->error = nfs4_wait_for_grace(mi, &recov); 1981 if (ep->error != 0) 1982 goto bailout; 1983 goto top; 1984 case NFS4ERR_DELAY: 1985 nfs4_set_delay_wait(vp); 1986 nfs4args_copen_free(open_args); 1987 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1988 nfs4_end_open_seqid_sync(oop); 1989 open_owner_rele(oop); 1990 oop = NULL; 1991 ep->error = nfs4_wait_for_delay(vp, &recov); 1992 nfs4_mi_kstat_inc_delay(mi); 1993 if (ep->error != 0) 1994 goto bailout; 1995 goto top; 1996 case NFS4ERR_FHEXPIRED: 1997 /* recover filehandle and retry */ 1998 abort = nfs4_start_recovery(ep, 1999 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL); 2000 nfs4args_copen_free(open_args); 2001 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2002 nfs4_end_open_seqid_sync(oop); 2003 open_owner_rele(oop); 2004 oop = NULL; 2005 if (abort == FALSE) 2006 goto top; 2007 failed_msg = "Couldn't reopen: recovery aborted"; 2008 goto kill_file; 2009 case NFS4ERR_RESOURCE: 2010 case NFS4ERR_STALE_CLIENTID: 2011 case NFS4ERR_WRONGSEC: 2012 case NFS4ERR_EXPIRED: 2013 /* 2014 * Do not mark the file dead and let the calling 2015 * function initiate recovery. 
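 * ('bailout' returns with ep->error/ep->stat still set, so the caller can drive recovery and then retry the reopen, per the contract described at the top of this function.)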
2016 */ 2017 nfs4args_copen_free(open_args); 2018 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2019 nfs4_end_open_seqid_sync(oop); 2020 open_owner_rele(oop); 2021 oop = NULL; 2022 goto bailout; 2023 case NFS4ERR_ACCESS: 2024 if (cred_otw != cr) { 2025 crfree(cred_otw); 2026 cred_otw = cr; 2027 crhold(cred_otw); 2028 nfs4args_copen_free(open_args); 2029 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2030 nfs4_end_open_seqid_sync(oop); 2031 open_owner_rele(oop); 2032 oop = NULL; 2033 goto top; 2034 } 2035 /* fall through */ 2036 default: 2037 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 2038 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s", 2039 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv, 2040 rnode4info(VTOR4(vp)))); 2041 failed_msg = "Couldn't reopen: NFSv4 error"; 2042 nfs4args_copen_free(open_args); 2043 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2044 goto kill_file; 2045 } 2046 2047 resop = &res.array[1]; /* open res */ 2048 op_res = &resop->nfs_resop4_u.opopen; 2049 2050 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res; 2051 2052 /* 2053 * Check if the path we reopened really is the same 2054 * file. We could end up in a situation where the file 2055 * was removed and a new file created with the same name. 2056 */ 2057 resop = &res.array[2]; 2058 gf_res = &resop->nfs_resop4_u.opgetfh; 2059 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 2060 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0); 2061 if (fh_different) { 2062 if (mi->mi_fh_expire_type == FH4_PERSISTENT || 2063 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) { 2064 /* Oops, we don't have the same file */ 2065 if (mi->mi_fh_expire_type == FH4_PERSISTENT) 2066 failed_msg = "Couldn't reopen: Persistent " 2067 "file handle changed"; 2068 else 2069 failed_msg = "Couldn't reopen: Volatile " 2070 "(no expire on open) file handle changed"; 2071 2072 nfs4args_copen_free(open_args); 2073 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2074 nfs_rw_exit(&mi->mi_fh_lock); 2075 goto kill_file; 2076 2077 } else { 2078 /* 2079 * We have volatile file handles that don't compare. 2080 * If the fids are the same then we assume that the 2081 * file handle expired but the rnode still refers to 2082 * the same file object. 2083 * 2084 * First check whether we have fids at all. 2085 * If we don't, we have a dumb server, so we will 2086 * just assume everything is OK for now. 2087 */ 2088 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID && 2089 rp->r_attr.va_mask & AT_NODEID && 2090 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) { 2091 /* 2092 * We have fids, but they don't 2093 * compare. So kill the file. 2094 */ 2095 failed_msg = 2096 "Couldn't reopen: file handle changed" 2097 " due to mismatched fids"; 2098 nfs4args_copen_free(open_args); 2099 (void) xdr_free(xdr_COMPOUND4res_clnt, 2100 (caddr_t)&res); 2101 nfs_rw_exit(&mi->mi_fh_lock); 2102 goto kill_file; 2103 } else { 2104 /* 2105 * We have volatile file handles that refer 2106 * to the same file (at least they have the 2107 * same fid) or we don't have fids so we 2108 * can't tell. :( We'll be a kind and accepting 2109 * client so we'll update the rnode's file 2110 * handle with the otw handle. 2111 * 2112 * We need to drop mi->mi_fh_lock since 2113 * sfh4_update acquires it. Since there is 2114 * only one recovery thread there is no 2115 * race.
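 * (sfh4_update installs the new over-the-wire filehandle in the shared filehandle object, so every holder of rp->r_fh sees the updated handle.)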
2116 */ 2117 nfs_rw_exit(&mi->mi_fh_lock); 2118 sfh4_update(rp->r_fh, &gf_res->object); 2119 } 2120 } 2121 } else { 2122 nfs_rw_exit(&mi->mi_fh_lock); 2123 } 2124 2125 ASSERT(nfs4_consistent_type(vp)); 2126 2127 /* 2128 * If the server wanted an OPEN_CONFIRM but that fails, just start 2129 * over. Presumably if there is a persistent error it will show up 2130 * when we resend the OPEN. 2131 */ 2132 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 2133 bool_t retry_open = FALSE; 2134 2135 nfs4open_confirm(vp, &seqid, &op_res->stateid, 2136 cred_otw, is_recov, &retry_open, 2137 oop, FALSE, ep, NULL); 2138 if (ep->error || ep->stat) { 2139 nfs4args_copen_free(open_args); 2140 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2141 nfs4_end_open_seqid_sync(oop); 2142 open_owner_rele(oop); 2143 oop = NULL; 2144 goto top; 2145 } 2146 } 2147 2148 mutex_enter(&osp->os_sync_lock); 2149 osp->open_stateid = op_res->stateid; 2150 osp->os_delegation = 0; 2151 /* 2152 * Need to reset this bitfield for the possible case where we were 2153 * going to OTW CLOSE the file, got a non-recoverable error, and before 2154 * we could retry the CLOSE, OPENed the file again. 2155 */ 2156 ASSERT(osp->os_open_owner->oo_seqid_inuse); 2157 osp->os_final_close = 0; 2158 osp->os_force_close = 0; 2159 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS) 2160 osp->os_dc_openacc = open_args->share_access; 2161 mutex_exit(&osp->os_sync_lock); 2162 2163 nfs4_end_open_seqid_sync(oop); 2164 2165 /* accept delegation, if any */ 2166 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw); 2167 2168 nfs4args_copen_free(open_args); 2169 2170 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 2171 2172 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2173 2174 ASSERT(nfs4_consistent_type(vp)); 2175 2176 open_owner_rele(oop); 2177 crfree(cr); 2178 crfree(cred_otw); 2179 return; 2180 2181 kill_file: 2182 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat); 2183 failed_reopen: 2184 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 2185 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s", 2186 (void *)osp, (void *)cr, rnode4info(rp))); 2187 mutex_enter(&osp->os_sync_lock); 2188 osp->os_failed_reopen = 1; 2189 mutex_exit(&osp->os_sync_lock); 2190 bailout: 2191 if (oop != NULL) { 2192 nfs4_end_open_seqid_sync(oop); 2193 open_owner_rele(oop); 2194 } 2195 if (cr != NULL) 2196 crfree(cr); 2197 if (cred_otw != NULL) 2198 crfree(cred_otw); 2199 } 2200 2201 /* for . and .. OPENs */ 2202 /* ARGSUSED */ 2203 static int 2204 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr) 2205 { 2206 rnode4_t *rp; 2207 nfs4_ga_res_t gar; 2208 2209 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone); 2210 2211 /* 2212 * If close-to-open consistency checking is turned off or 2213 * if there is no cached data, we can avoid 2214 * the over the wire getattr. Otherwise, force a 2215 * call to the server to get fresh attributes and to 2216 * check caches. This is required for close-to-open 2217 * consistency. 
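 * (MI4_NOCTO is set when the filesystem is mounted with the 'nocto' option.)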
2218 */ 2219 rp = VTOR4(*vpp); 2220 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO || 2221 (rp->r_dir == NULL && !nfs4_has_pages(*vpp))) 2222 return (0); 2223 2224 gar.n4g_va.va_mask = AT_ALL; 2225 return (nfs4_getattr_otw(*vpp, &gar, cr, 0)); 2226 } 2227 2228 /* 2229 * CLOSE a file 2230 */ 2231 /* ARGSUSED */ 2232 static int 2233 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 2234 caller_context_t *ct) 2235 { 2236 rnode4_t *rp; 2237 int error = 0; 2238 int r_error = 0; 2239 int n4error = 0; 2240 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 2241 2242 /* 2243 * Remove client state for this (lockowner, file) pair. 2244 * Issue otw v4 call to have the server do the same. 2245 */ 2246 2247 rp = VTOR4(vp); 2248 2249 /* 2250 * zone_enter(2) prevents processes from changing zones with NFS files 2251 * open; if we happen to get here from the wrong zone we can't do 2252 * anything over the wire. 2253 */ 2254 if (VTOMI4(vp)->mi_zone != nfs_zone()) { 2255 /* 2256 * We could attempt to clean up locks, except we're sure 2257 * that the current process didn't acquire any locks on 2258 * the file: any attempt to lock a file belonging to another 2259 * zone will fail, and one can't lock an NFS file and then 2260 * change zones, as that fails too. 2261 * 2262 * Returning an error here is the sane thing to do. A 2263 * subsequent call to VN_RELE() which translates to an 2264 * nfs4_inactive() will clean up state: if the zone of the 2265 * vnode's origin is still alive and kicking, the inactive 2266 * thread will handle the request (from the correct zone), and 2267 * everything (minus the OTW close call) should be OK. If the 2268 * zone is going away nfs4_async_inactive() will throw away 2269 * delegations, open streams and cached pages inline. 2270 */ 2271 return (EIO); 2272 } 2273 2274 /* 2275 * If we are using local locking for this filesystem, then 2276 * release all of the SYSV style record locks. Otherwise, 2277 * we are doing network locking and we need to release all 2278 * of the network locks. All of the locks held by this 2279 * process on this file are released no matter what the 2280 * incoming reference count is. 2281 */ 2282 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) { 2283 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 2284 cleanshares(vp, ttoproc(curthread)->p_pid); 2285 } else 2286 e.error = nfs4_lockrelease(vp, flag, offset, cr); 2287 2288 if (e.error) { 2289 struct lm_sysid *lmsid; 2290 lmsid = nfs4_find_sysid(VTOMI4(vp)); 2291 if (lmsid == NULL) { 2292 DTRACE_PROBE2(unknown__sysid, int, e.error, 2293 vnode_t *, vp); 2294 } else { 2295 cleanlocks(vp, ttoproc(curthread)->p_pid, 2296 (lm_sysidt(lmsid) | LM_SYSID_CLIENT)); 2297 } 2298 return (e.error); 2299 } 2300 2301 if (count > 1) 2302 return (0); 2303 2304 /* 2305 * If the file has been `unlinked', then purge the 2306 * DNLC so that this vnode will get recycled quicker 2307 * and the .nfs* file on the server will get removed. 2308 */ 2309 if (rp->r_unldvp != NULL) 2310 dnlc_purge_vp(vp); 2311 2312 /* 2313 * If the file was open for write and there are pages, 2314 * do a synchronous flush and commit of all of the 2315 * dirty and uncommitted pages. 2316 */ 2317 ASSERT(!e.error); 2318 if ((flag & FWRITE) && nfs4_has_pages(vp)) 2319 error = nfs4_putpage_commit(vp, 0, 0, cr); 2320 2321 mutex_enter(&rp->r_statelock); 2322 r_error = rp->r_error; 2323 rp->r_error = 0; 2324 mutex_exit(&rp->r_statelock); 2325 2326 /* 2327 * If this file type is one for which no explicit 'open' was 2328 * done, then bail now (i.e.
no need for protocol 'close'). If 2329 * there was an error w/the vm subsystem, return _that_ error, 2330 * otherwise, return any errors that may've been reported via 2331 * the rnode. 2332 */ 2333 if (vp->v_type != VREG) 2334 return (error ? error : r_error); 2335 2336 /* 2337 * The sync putpage commit may have failed above, but since 2338 * we're working w/a regular file, we need to do the protocol 2339 * 'close' (nfs4close_one will figure out if an otw close is 2340 * needed or not). Report any errors _after_ doing the protocol 2341 * 'close'. 2342 */ 2343 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0); 2344 n4error = e.error ? e.error : geterrno4(e.stat); 2345 2346 /* 2347 * Error reporting prio (Hi -> Lo) 2348 * 2349 * i) nfs4_putpage_commit (error) 2350 * ii) rnode's (r_error) 2351 * iii) nfs4close_one (n4error) 2352 */ 2353 return (error ? error : (r_error ? r_error : n4error)); 2354 } 2355 2356 /* 2357 * Initialize *lost_rqstp. 2358 */ 2359 2360 static void 2361 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 2362 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr, 2363 vnode_t *vp) 2364 { 2365 if (error != ETIMEDOUT && error != EINTR && 2366 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 2367 lost_rqstp->lr_op = 0; 2368 return; 2369 } 2370 2371 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 2372 "nfs4close_save_lost_rqst: error %d", error)); 2373 2374 lost_rqstp->lr_op = OP_CLOSE; 2375 /* 2376 * The vp is held and rele'd via the recovery code. 2377 * See nfs4_save_lost_rqst. 2378 */ 2379 lost_rqstp->lr_vp = vp; 2380 lost_rqstp->lr_dvp = NULL; 2381 lost_rqstp->lr_oop = oop; 2382 lost_rqstp->lr_osp = osp; 2383 ASSERT(osp != NULL); 2384 ASSERT(mutex_owned(&osp->os_sync_lock)); 2385 osp->os_pending_close = 1; 2386 lost_rqstp->lr_lop = NULL; 2387 lost_rqstp->lr_cr = cr; 2388 lost_rqstp->lr_flk = NULL; 2389 lost_rqstp->lr_putfirst = FALSE; 2390 } 2391 2392 /* 2393 * Assumes you already have the open seqid sync grabbed as well as the 2394 * 'os_sync_lock'. Note: this will release the open seqid sync and 2395 * 'os_sync_lock' if client recovery starts. Calling functions have to 2396 * be prepared to handle this. 2397 * 2398 * 'recov' is returned as 1 if the CLOSE operation detected that client 2399 * recovery was needed and started it; in that case the calling function 2400 * should retry this function. Otherwise 'recov' is returned as 0. 2401 * 2402 * Errors are returned via the nfs4_error_t parameter.
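 * As an illustration only (a hedged sketch, not a verbatim caller): a caller is expected to loop on 'recov', re-acquiring the open seqid sync and 'os_sync_lock' each time recovery forces a retry, roughly: do { recov = 0; ...grab open seqid sync, set did_start_seqid_sync...; mutex_enter(&osp->os_sync_lock); have_sync_lock = 1; nfs4close_otw(rp, cred_otw, oop, osp, &recov, &did_start_seqid_sync, close_type, &e, &have_sync_lock); } while (recov);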
2403 */ 2404 static void 2405 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop, 2406 nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp, 2407 nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp) 2408 { 2409 COMPOUND4args_clnt args; 2410 COMPOUND4res_clnt res; 2411 CLOSE4args *close_args; 2412 nfs_resop4 *resop; 2413 nfs_argop4 argop[3]; 2414 int doqueue = 1; 2415 mntinfo4_t *mi; 2416 seqid4 seqid; 2417 vnode_t *vp; 2418 bool_t needrecov = FALSE; 2419 nfs4_lost_rqst_t lost_rqst; 2420 hrtime_t t; 2421 2422 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone); 2423 2424 ASSERT(MUTEX_HELD(&osp->os_sync_lock)); 2425 2426 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw")); 2427 2428 /* Only set this to 1 if recovery is started */ 2429 *recov = 0; 2430 2431 /* do the OTW call to close the file */ 2432 2433 if (close_type == CLOSE_RESEND) 2434 args.ctag = TAG_CLOSE_LOST; 2435 else if (close_type == CLOSE_AFTER_RESEND) 2436 args.ctag = TAG_CLOSE_UNDO; 2437 else 2438 args.ctag = TAG_CLOSE; 2439 2440 args.array_len = 3; 2441 args.array = argop; 2442 2443 vp = RTOV4(rp); 2444 2445 mi = VTOMI4(vp); 2446 2447 /* putfh target fh */ 2448 argop[0].argop = OP_CPUTFH; 2449 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 2450 2451 argop[1].argop = OP_GETATTR; 2452 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 2453 argop[1].nfs_argop4_u.opgetattr.mi = mi; 2454 2455 argop[2].argop = OP_CLOSE; 2456 close_args = &argop[2].nfs_argop4_u.opclose; 2457 2458 seqid = nfs4_get_open_seqid(oop) + 1; 2459 2460 close_args->seqid = seqid; 2461 close_args->open_stateid = osp->open_stateid; 2462 2463 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 2464 "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first", 2465 rnode4info(rp))); 2466 2467 t = gethrtime(); 2468 2469 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 2470 2471 if (!ep->error && nfs4_need_to_bump_seqid(&res)) { 2472 nfs4_set_open_seqid(seqid, oop, args.ctag); 2473 } 2474 2475 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp); 2476 if (ep->error && !needrecov) { 2477 /* 2478 * if there was an error and no recovery is to be done, 2479 * then set up the file to flush its cache if 2480 * needed for the next caller. 2481 */ 2482 mutex_enter(&rp->r_statelock); 2483 PURGE_ATTRCACHE4_LOCKED(rp); 2484 rp->r_flags &= ~R4WRITEMODIFIED; 2485 mutex_exit(&rp->r_statelock); 2486 return; 2487 } 2488 2489 if (needrecov) { 2490 bool_t abort; 2491 nfs4_bseqid_entry_t *bsep = NULL; 2492 2493 if (close_type != CLOSE_RESEND) 2494 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop, 2495 osp, cred_otw, vp); 2496 2497 if (!ep->error && res.status == NFS4ERR_BAD_SEQID) 2498 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 2499 0, args.ctag, close_args->seqid); 2500 2501 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2502 "nfs4close_otw: initiating recovery. error %d " 2503 "res.status %d", ep->error, res.status)); 2504 2505 /* 2506 * Drop the 'os_sync_lock' here so we don't hit 2507 * a potential recursive mutex_enter via an 2508 * 'open_stream_hold()'. 2509 */ 2510 mutex_exit(&osp->os_sync_lock); 2511 *have_sync_lockp = 0; 2512 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 2513 (close_type != CLOSE_RESEND && 2514 lost_rqst.lr_op == OP_CLOSE) ?
&lost_rqst : NULL, 2515 OP_CLOSE, bsep); 2516 2517 /* drop open seq sync, and let the calling function regrab it */ 2518 nfs4_end_open_seqid_sync(oop); 2519 *did_start_seqid_syncp = 0; 2520 2521 if (bsep) 2522 kmem_free(bsep, sizeof (*bsep)); 2523 /* 2524 * For signals, the caller wants to quit, so don't say to 2525 * retry. For forced unmount, if it's a user thread, it 2526 * wants to quit. If it's a recovery thread, the retry 2527 * will happen higher-up on the call stack. Either way, 2528 * don't say to retry. 2529 */ 2530 if (abort == FALSE && ep->error != EINTR && 2531 !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) && 2532 close_type != CLOSE_RESEND && 2533 close_type != CLOSE_AFTER_RESEND) 2534 *recov = 1; 2535 else 2536 *recov = 0; 2537 2538 if (!ep->error) 2539 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2540 return; 2541 } 2542 2543 if (res.status) { 2544 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2545 return; 2546 } 2547 2548 mutex_enter(&rp->r_statev4_lock); 2549 rp->created_v4 = 0; 2550 mutex_exit(&rp->r_statev4_lock); 2551 2552 resop = &res.array[2]; 2553 osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid; 2554 osp->os_valid = 0; 2555 2556 /* 2557 * This removes the reference obtained at OPEN; i.e., when the 2558 * open stream structure was created. 2559 * 2560 * We don't have to worry about calling 'open_stream_rele' 2561 * since we are currently holding a reference to the open 2562 * stream which means the count cannot go to 0 with this 2563 * decrement. 2564 */ 2565 ASSERT(osp->os_ref_count >= 2); 2566 osp->os_ref_count--; 2567 2568 if (!ep->error) 2569 nfs4_attr_cache(vp, 2570 &res.array[1].nfs_resop4_u.opgetattr.ga_res, 2571 t, cred_otw, TRUE, NULL); 2572 2573 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:" 2574 " returning %d", ep->error)); 2575 2576 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2577 } 2578 2579 /* ARGSUSED */ 2580 static int 2581 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2582 caller_context_t *ct) 2583 { 2584 rnode4_t *rp; 2585 u_offset_t off; 2586 offset_t diff; 2587 uint_t on; 2588 uint_t n; 2589 caddr_t base; 2590 uint_t flags; 2591 int error; 2592 mntinfo4_t *mi; 2593 2594 rp = VTOR4(vp); 2595 2596 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 2597 2598 if (IS_SHADOW(vp, rp)) 2599 vp = RTOV4(rp); 2600 2601 if (vp->v_type != VREG) 2602 return (EISDIR); 2603 2604 mi = VTOMI4(vp); 2605 2606 if (nfs_zone() != mi->mi_zone) 2607 return (EIO); 2608 2609 if (uiop->uio_resid == 0) 2610 return (0); 2611 2612 if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0) 2613 return (EINVAL); 2614 2615 mutex_enter(&rp->r_statelock); 2616 if (rp->r_flags & R4RECOVERRP) 2617 error = (rp->r_error ? rp->r_error : EIO); 2618 else 2619 error = 0; 2620 mutex_exit(&rp->r_statelock); 2621 if (error) 2622 return (error); 2623 2624 /* 2625 * Bypass VM if caching has been disabled (e.g., locking) or if 2626 * using client-side direct I/O and the file is not mmap'd and 2627 * there are no cached pages.
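 * (For illustration: an application can request the direct I/O path per file via directio(3C), which reaches this client as the _FIODIRECTIO ioctl handled in nfs4_ioctl() below, e.g. directio(fd, DIRECTIO_ON); this hedged sketch assumes a file descriptor on an NFSv4 mount.)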
2628 */ 2629 if ((vp->v_flag & VNOCACHE) || 2630 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2631 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) { 2632 size_t resid = 0; 2633 2634 return (nfs4read(vp, NULL, uiop->uio_loffset, 2635 uiop->uio_resid, &resid, cr, FALSE, uiop)); 2636 } 2637 2638 error = 0; 2639 2640 do { 2641 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2642 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2643 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2644 2645 if (error = nfs4_validate_caches(vp, cr)) 2646 break; 2647 2648 mutex_enter(&rp->r_statelock); 2649 while (rp->r_flags & R4INCACHEPURGE) { 2650 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 2651 mutex_exit(&rp->r_statelock); 2652 return (EINTR); 2653 } 2654 } 2655 diff = rp->r_size - uiop->uio_loffset; 2656 mutex_exit(&rp->r_statelock); 2657 if (diff <= 0) 2658 break; 2659 if (diff < n) 2660 n = (uint_t)diff; 2661 2662 if (vpm_enable) { 2663 /* 2664 * Copy data. 2665 */ 2666 error = vpm_data_copy(vp, off + on, n, uiop, 2667 1, NULL, 0, S_READ); 2668 } else { 2669 base = segmap_getmapflt(segkmap, vp, off + on, n, 1, 2670 S_READ); 2671 2672 error = uiomove(base + on, n, UIO_READ, uiop); 2673 } 2674 2675 if (!error) { 2676 /* 2677 * If read a whole block or read to eof, 2678 * won't need this buffer again soon. 2679 */ 2680 mutex_enter(&rp->r_statelock); 2681 if (n + on == MAXBSIZE || 2682 uiop->uio_loffset == rp->r_size) 2683 flags = SM_DONTNEED; 2684 else 2685 flags = 0; 2686 mutex_exit(&rp->r_statelock); 2687 if (vpm_enable) { 2688 error = vpm_sync_pages(vp, off, n, flags); 2689 } else { 2690 error = segmap_release(segkmap, base, flags); 2691 } 2692 } else { 2693 if (vpm_enable) { 2694 (void) vpm_sync_pages(vp, off, n, 0); 2695 } else { 2696 (void) segmap_release(segkmap, base, 0); 2697 } 2698 } 2699 } while (!error && uiop->uio_resid > 0); 2700 2701 return (error); 2702 } 2703 2704 /* ARGSUSED */ 2705 static int 2706 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2707 caller_context_t *ct) 2708 { 2709 rlim64_t limit = uiop->uio_llimit; 2710 rnode4_t *rp; 2711 u_offset_t off; 2712 caddr_t base; 2713 uint_t flags; 2714 int remainder; 2715 size_t n; 2716 int on; 2717 int error; 2718 int resid; 2719 u_offset_t offset; 2720 mntinfo4_t *mi; 2721 uint_t bsize; 2722 2723 rp = VTOR4(vp); 2724 2725 if (IS_SHADOW(vp, rp)) 2726 vp = RTOV4(rp); 2727 2728 if (vp->v_type != VREG) 2729 return (EISDIR); 2730 2731 mi = VTOMI4(vp); 2732 2733 if (nfs_zone() != mi->mi_zone) 2734 return (EIO); 2735 2736 if (uiop->uio_resid == 0) 2737 return (0); 2738 2739 mutex_enter(&rp->r_statelock); 2740 if (rp->r_flags & R4RECOVERRP) 2741 error = (rp->r_error ? rp->r_error : EIO); 2742 else 2743 error = 0; 2744 mutex_exit(&rp->r_statelock); 2745 if (error) 2746 return (error); 2747 2748 if (ioflag & FAPPEND) { 2749 struct vattr va; 2750 2751 /* 2752 * Must serialize if appending. 
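 * (With direct I/O enabled the caller may have taken r_rwlock only as a reader; appending needs the writer lock so that fetching the size via nfs4getattr() below and the write itself are atomic with respect to other local writers.)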
2753 */ 2754 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) { 2755 nfs_rw_exit(&rp->r_rwlock); 2756 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, 2757 INTR4(vp))) 2758 return (EINTR); 2759 } 2760 2761 va.va_mask = AT_SIZE; 2762 error = nfs4getattr(vp, &va, cr); 2763 if (error) 2764 return (error); 2765 uiop->uio_loffset = va.va_size; 2766 } 2767 2768 offset = uiop->uio_loffset + uiop->uio_resid; 2769 2770 if (uiop->uio_loffset < (offset_t)0 || offset < 0) 2771 return (EINVAL); 2772 2773 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 2774 limit = MAXOFFSET_T; 2775 2776 /* 2777 * Check to make sure that the process will not exceed 2778 * its limit on file size. It is okay to write up to 2779 * the limit, but not beyond. Thus, the write which 2780 * reaches the limit will be short and the next write 2781 * will return an error. 2782 */ 2783 remainder = 0; 2784 if (offset > uiop->uio_llimit) { 2785 remainder = offset - uiop->uio_llimit; 2786 uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset; 2787 if (uiop->uio_resid <= 0) { 2788 proc_t *p = ttoproc(curthread); 2789 2790 uiop->uio_resid += remainder; 2791 mutex_enter(&p->p_lock); 2792 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], 2793 p->p_rctls, p, RCA_UNSAFE_SIGINFO); 2794 mutex_exit(&p->p_lock); 2795 return (EFBIG); 2796 } 2797 } 2798 2799 /* update the change attribute, if we have a write delegation */ 2800 2801 mutex_enter(&rp->r_statev4_lock); 2802 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) 2803 rp->r_deleg_change++; 2804 2805 mutex_exit(&rp->r_statev4_lock); 2806 2807 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) 2808 return (EINTR); 2809 2810 /* 2811 * Bypass VM if caching has been disabled (e.g., locking) or if 2812 * using client-side direct I/O and the file is not mmap'd and 2813 * there are no cached pages. 
2814 */ 2815 if ((vp->v_flag & VNOCACHE) || 2816 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2817 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) { 2818 size_t bufsize; 2819 int count; 2820 u_offset_t org_offset; 2821 stable_how4 stab_comm; 2822 nfs4_fwrite: 2823 if (rp->r_flags & R4STALE) { 2824 resid = uiop->uio_resid; 2825 offset = uiop->uio_loffset; 2826 error = rp->r_error; 2827 /* 2828 * A close may have cleared r_error; if so, 2829 * propagate the ESTALE error return properly 2830 */ 2831 if (error == 0) 2832 error = ESTALE; 2833 goto bottom; 2834 } 2835 2836 bufsize = MIN(uiop->uio_resid, mi->mi_stsize); 2837 base = kmem_alloc(bufsize, KM_SLEEP); 2838 do { 2839 if (ioflag & FDSYNC) 2840 stab_comm = DATA_SYNC4; 2841 else 2842 stab_comm = FILE_SYNC4; 2843 resid = uiop->uio_resid; 2844 offset = uiop->uio_loffset; 2845 count = MIN(uiop->uio_resid, bufsize); 2846 org_offset = uiop->uio_loffset; 2847 error = uiomove(base, count, UIO_WRITE, uiop); 2848 if (!error) { 2849 error = nfs4write(vp, base, org_offset, 2850 count, cr, &stab_comm); 2851 if (!error) { 2852 mutex_enter(&rp->r_statelock); 2853 if (rp->r_size < uiop->uio_loffset) 2854 rp->r_size = uiop->uio_loffset; 2855 mutex_exit(&rp->r_statelock); 2856 } 2857 } 2858 } while (!error && uiop->uio_resid > 0); 2859 kmem_free(base, bufsize); 2860 goto bottom; 2861 } 2862 2863 bsize = vp->v_vfsp->vfs_bsize; 2864 2865 do { 2866 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2867 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2868 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2869 2870 resid = uiop->uio_resid; 2871 offset = uiop->uio_loffset; 2872 2873 if (rp->r_flags & R4STALE) { 2874 error = rp->r_error; 2875 /* 2876 * A close may have cleared r_error; if so, 2877 * propagate the ESTALE error return properly 2878 */ 2879 if (error == 0) 2880 error = ESTALE; 2881 break; 2882 } 2883 2884 /* 2885 * Don't create dirty pages faster than they 2886 * can be cleaned so that the system doesn't 2887 * get imbalanced. If the async queue is 2888 * maxed out, then wait for it to drain before 2889 * creating more dirty pages. Also, wait for 2890 * any threads doing pagewalks in the vop_getattr 2891 * entry points so that they don't block for 2892 * long periods. 2893 */ 2894 mutex_enter(&rp->r_statelock); 2895 while ((mi->mi_max_threads != 0 && 2896 rp->r_awcount > 2 * mi->mi_max_threads) || 2897 rp->r_gcount > 0) { 2898 if (INTR4(vp)) { 2899 klwp_t *lwp = ttolwp(curthread); 2900 2901 if (lwp != NULL) 2902 lwp->lwp_nostop++; 2903 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 2904 mutex_exit(&rp->r_statelock); 2905 if (lwp != NULL) 2906 lwp->lwp_nostop--; 2907 error = EINTR; 2908 goto bottom; 2909 } 2910 if (lwp != NULL) 2911 lwp->lwp_nostop--; 2912 } else 2913 cv_wait(&rp->r_cv, &rp->r_statelock); 2914 } 2915 mutex_exit(&rp->r_statelock); 2916 2917 /* 2918 * Touch the page and fault it in if it is not in core 2919 * before segmap_getmapflt or vpm_data_copy can lock it. 2920 * This avoids a deadlock when the buffer is mapped 2921 * through mmap to the same file that we want to write. 2922 */ 2923 uio_prefaultpages((long)n, uiop); 2924 2925 if (vpm_enable) { 2926 /* 2927 * It will use kpm mappings, so no need to 2928 * pass an address.
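 * (writerp4() with a NULL address goes through the VPM page mapping interfaces; the segmap_getmapflt() path below is used when VPM is not enabled.)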
2929 */ 2930 error = writerp4(rp, NULL, n, uiop, 0); 2931 } else { 2932 if (segmap_kpm) { 2933 int pon = uiop->uio_loffset & PAGEOFFSET; 2934 size_t pn = MIN(PAGESIZE - pon, 2935 uiop->uio_resid); 2936 int pagecreate; 2937 2938 mutex_enter(&rp->r_statelock); 2939 pagecreate = (pon == 0) && (pn == PAGESIZE || 2940 uiop->uio_loffset + pn >= rp->r_size); 2941 mutex_exit(&rp->r_statelock); 2942 2943 base = segmap_getmapflt(segkmap, vp, off + on, 2944 pn, !pagecreate, S_WRITE); 2945 2946 error = writerp4(rp, base + pon, n, uiop, 2947 pagecreate); 2948 2949 } else { 2950 base = segmap_getmapflt(segkmap, vp, off + on, 2951 n, 0, S_READ); 2952 error = writerp4(rp, base + on, n, uiop, 0); 2953 } 2954 } 2955 2956 if (!error) { 2957 if (mi->mi_flags & MI4_NOAC) 2958 flags = SM_WRITE; 2959 else if ((uiop->uio_loffset % bsize) == 0 || 2960 IS_SWAPVP(vp)) { 2961 /* 2962 * Have written a whole block. 2963 * Start an asynchronous write 2964 * and mark the buffer to 2965 * indicate that it won't be 2966 * needed again soon. 2967 */ 2968 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 2969 } else 2970 flags = 0; 2971 if ((ioflag & (FSYNC|FDSYNC)) || 2972 (rp->r_flags & R4OUTOFSPACE)) { 2973 flags &= ~SM_ASYNC; 2974 flags |= SM_WRITE; 2975 } 2976 if (vpm_enable) { 2977 error = vpm_sync_pages(vp, off, n, flags); 2978 } else { 2979 error = segmap_release(segkmap, base, flags); 2980 } 2981 } else { 2982 if (vpm_enable) { 2983 (void) vpm_sync_pages(vp, off, n, 0); 2984 } else { 2985 (void) segmap_release(segkmap, base, 0); 2986 } 2987 /* 2988 * In the event that we got an access error while 2989 * faulting in a page for a write-only file just 2990 * force a write. 2991 */ 2992 if (error == EACCES) 2993 goto nfs4_fwrite; 2994 } 2995 } while (!error && uiop->uio_resid > 0); 2996 2997 bottom: 2998 if (error) { 2999 uiop->uio_resid = resid + remainder; 3000 uiop->uio_loffset = offset; 3001 } else { 3002 uiop->uio_resid += remainder; 3003 3004 mutex_enter(&rp->r_statev4_lock); 3005 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 3006 gethrestime(&rp->r_attr.va_mtime); 3007 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3008 } 3009 mutex_exit(&rp->r_statev4_lock); 3010 } 3011 3012 nfs_rw_exit(&rp->r_lkserlock); 3013 3014 return (error); 3015 } 3016 3017 /* 3018 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED} 3019 */ 3020 static int 3021 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 3022 int flags, cred_t *cr) 3023 { 3024 struct buf *bp; 3025 int error; 3026 page_t *savepp; 3027 uchar_t fsdata; 3028 stable_how4 stab_comm; 3029 3030 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3031 bp = pageio_setup(pp, len, vp, flags); 3032 ASSERT(bp != NULL); 3033 3034 /* 3035 * pageio_setup should have set b_addr to 0. This 3036 * is correct since we want to do I/O on a page 3037 * boundary. bp_mapin will use this addr to calculate 3038 * an offset, and then set b_addr to the kernel virtual 3039 * address it allocated for us. 
3040 */ 3041 ASSERT(bp->b_un.b_addr == 0); 3042 3043 bp->b_edev = 0; 3044 bp->b_dev = 0; 3045 bp->b_lblkno = lbtodb(off); 3046 bp->b_file = vp; 3047 bp->b_offset = (offset_t)off; 3048 bp_mapin(bp); 3049 3050 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) && 3051 freemem > desfree) 3052 stab_comm = UNSTABLE4; 3053 else 3054 stab_comm = FILE_SYNC4; 3055 3056 error = nfs4_bio(bp, &stab_comm, cr, FALSE); 3057 3058 bp_mapout(bp); 3059 pageio_done(bp); 3060 3061 if (stab_comm == UNSTABLE4) 3062 fsdata = C_DELAYCOMMIT; 3063 else 3064 fsdata = C_NOCOMMIT; 3065 3066 savepp = pp; 3067 do { 3068 pp->p_fsdata = fsdata; 3069 } while ((pp = pp->p_next) != savepp); 3070 3071 return (error); 3072 } 3073 3074 /* 3075 * If the open stream for this (cred, rnode) pair is a delegation open stream, reopen the file over the wire so that subsequent I/O can use a real open stateid; used by nfs4read/nfs4write after a stateid error on a delegation stateid. Returns 0 on success and EIO if no usable open stream exists or the reopen fails. */ 3076 static int 3077 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr) 3078 { 3079 nfs4_open_owner_t *oop; 3080 nfs4_open_stream_t *osp; 3081 rnode4_t *rp = VTOR4(vp); 3082 mntinfo4_t *mi = VTOMI4(vp); 3083 int reopen_needed; 3084 3085 ASSERT(nfs_zone() == mi->mi_zone); 3086 3087 3088 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 3089 if (!oop) 3090 return (EIO); 3091 3092 /* returns with 'os_sync_lock' held */ 3093 osp = find_open_stream(oop, rp); 3094 if (!osp) { 3095 open_owner_rele(oop); 3096 return (EIO); 3097 } 3098 3099 if (osp->os_failed_reopen) { 3100 mutex_exit(&osp->os_sync_lock); 3101 open_stream_rele(osp, rp); 3102 open_owner_rele(oop); 3103 return (EIO); 3104 } 3105 3106 /* 3107 * Determine whether a reopen is needed. If this 3108 * is a delegation open stream, then the os_delegation bit 3109 * should be set. 3110 */ 3111 3112 reopen_needed = osp->os_delegation; 3113 3114 mutex_exit(&osp->os_sync_lock); 3115 open_owner_rele(oop); 3116 3117 if (reopen_needed) { 3118 nfs4_error_zinit(ep); 3119 nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE); 3120 mutex_enter(&osp->os_sync_lock); 3121 if (ep->error || ep->stat || osp->os_failed_reopen) { 3122 mutex_exit(&osp->os_sync_lock); 3123 open_stream_rele(osp, rp); 3124 return (EIO); 3125 } 3126 mutex_exit(&osp->os_sync_lock); 3127 } 3128 open_stream_rele(osp, rp); 3129 3130 return (0); 3131 } 3132 3133 /* 3134 * Write to file. Writes to remote server in largest size 3135 * chunks that the server can handle. Write is synchronous. 3136 */ 3137 static int 3138 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr, 3139 stable_how4 *stab_comm) 3140 { 3141 mntinfo4_t *mi; 3142 COMPOUND4args_clnt args; 3143 COMPOUND4res_clnt res; 3144 WRITE4args *wargs; 3145 WRITE4res *wres; 3146 nfs_argop4 argop[2]; 3147 nfs_resop4 *resop; 3148 int tsize; 3149 stable_how4 stable; 3150 rnode4_t *rp; 3151 int doqueue = 1; 3152 bool_t needrecov; 3153 nfs4_recov_state_t recov_state; 3154 nfs4_stateid_types_t sid_types; 3155 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3156 int recov; 3157 3158 rp = VTOR4(vp); 3159 mi = VTOMI4(vp); 3160 3161 ASSERT(nfs_zone() == mi->mi_zone); 3162 3163 stable = *stab_comm; 3164 *stab_comm = FILE_SYNC4; 3165 3166 needrecov = FALSE; 3167 recov_state.rs_flags = 0; 3168 recov_state.rs_num_retry_despite_err = 0; 3169 nfs4_init_stateid_types(&sid_types); 3170 3171 /* Is curthread the recovery thread? */ 3172 mutex_enter(&mi->mi_lock); 3173 recov = (mi->mi_recovthread == curthread); 3174 mutex_exit(&mi->mi_lock); 3175 3176 recov_retry: 3177 args.ctag = TAG_WRITE; 3178 args.array_len = 2; 3179 args.array = argop; 3180 3181 if (!recov) { 3182 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3183 &recov_state, NULL); 3184 if (e.error) 3185 return (e.error); 3186 } 3187 3188 /* 0.
putfh target fh */ 3189 argop[0].argop = OP_CPUTFH; 3190 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3191 3192 /* 1. write */ 3193 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types); 3194 3195 do { 3196 3197 wargs->offset = (offset4)offset; 3198 wargs->data_val = base; 3199 3200 if (mi->mi_io_kstats) { 3201 mutex_enter(&mi->mi_lock); 3202 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3203 mutex_exit(&mi->mi_lock); 3204 } 3205 3206 if ((vp->v_flag & VNOCACHE) || 3207 (rp->r_flags & R4DIRECTIO) || 3208 (mi->mi_flags & MI4_DIRECTIO)) 3209 tsize = MIN(mi->mi_stsize, count); 3210 else 3211 tsize = MIN(mi->mi_curwrite, count); 3212 wargs->data_len = (uint_t)tsize; 3213 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3214 3215 if (mi->mi_io_kstats) { 3216 mutex_enter(&mi->mi_lock); 3217 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3218 mutex_exit(&mi->mi_lock); 3219 } 3220 3221 if (!recov) { 3222 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3223 if (e.error && !needrecov) { 3224 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3225 &recov_state, needrecov); 3226 return (e.error); 3227 } 3228 } else { 3229 if (e.error) 3230 return (e.error); 3231 } 3232 3233 /* 3234 * Do handling of OLD_STATEID outside 3235 * of the normal recovery framework. 3236 * 3237 * If write receives a BAD stateid error while using a 3238 * delegation stateid, retry using the open stateid (if it 3239 * exists). If it doesn't have an open stateid, reopen the 3240 * file first, then retry. 3241 */ 3242 if (!e.error && res.status == NFS4ERR_OLD_STATEID && 3243 sid_types.cur_sid_type != SPEC_SID) { 3244 nfs4_save_stateid(&wargs->stateid, &sid_types); 3245 if (!recov) 3246 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3247 &recov_state, needrecov); 3248 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3249 goto recov_retry; 3250 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3251 sid_types.cur_sid_type == DEL_SID) { 3252 nfs4_save_stateid(&wargs->stateid, &sid_types); 3253 mutex_enter(&rp->r_statev4_lock); 3254 rp->r_deleg_return_pending = TRUE; 3255 mutex_exit(&rp->r_statev4_lock); 3256 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3257 if (!recov) 3258 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3259 &recov_state, needrecov); 3260 (void) xdr_free(xdr_COMPOUND4res_clnt, 3261 (caddr_t)&res); 3262 return (EIO); 3263 } 3264 if (!recov) 3265 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3266 &recov_state, needrecov); 3267 /* hold needed for nfs4delegreturn_thread */ 3268 VN_HOLD(vp); 3269 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3270 NFS4_DR_DISCARD), FALSE); 3271 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3272 goto recov_retry; 3273 } 3274 3275 if (needrecov) { 3276 bool_t abort; 3277 3278 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3279 "nfs4write: client got error %d, res.status %d" 3280 ", so start recovery", e.error, res.status)); 3281 3282 abort = nfs4_start_recovery(&e, 3283 VTOMI4(vp), vp, NULL, &wargs->stateid, 3284 NULL, OP_WRITE, NULL); 3285 if (!e.error) { 3286 e.error = geterrno4(res.status); 3287 (void) xdr_free(xdr_COMPOUND4res_clnt, 3288 (caddr_t)&res); 3289 } 3290 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3291 &recov_state, needrecov); 3292 if (abort == FALSE) 3293 goto recov_retry; 3294 return (e.error); 3295 } 3296 3297 if (res.status) { 3298 e.error = geterrno4(res.status); 3299 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3300 if (!recov) 3301 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3302 &recov_state, needrecov); 3303 return (e.error); 
3304 } 3305 3306 resop = &res.array[1]; /* write res */ 3307 wres = &resop->nfs_resop4_u.opwrite; 3308 3309 if ((int)wres->count > tsize) { 3310 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3311 3312 zcmn_err(getzoneid(), CE_WARN, 3313 "nfs4write: server wrote %u, requested was %u", 3314 (int)wres->count, tsize); 3315 if (!recov) 3316 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3317 &recov_state, needrecov); 3318 return (EIO); 3319 } 3320 if (wres->committed == UNSTABLE4) { 3321 *stab_comm = UNSTABLE4; 3322 if (wargs->stable == DATA_SYNC4 || 3323 wargs->stable == FILE_SYNC4) { 3324 (void) xdr_free(xdr_COMPOUND4res_clnt, 3325 (caddr_t)&res); 3326 zcmn_err(getzoneid(), CE_WARN, 3327 "nfs4write: server %s did not commit " 3328 "to stable storage", 3329 rp->r_server->sv_hostname); 3330 if (!recov) 3331 nfs4_end_fop(VTOMI4(vp), vp, NULL, 3332 OH_WRITE, &recov_state, needrecov); 3333 return (EIO); 3334 } 3335 } 3336 3337 tsize = (int)wres->count; 3338 count -= tsize; 3339 base += tsize; 3340 offset += tsize; 3341 if (mi->mi_io_kstats) { 3342 mutex_enter(&mi->mi_lock); 3343 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++; 3344 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten += 3345 tsize; 3346 mutex_exit(&mi->mi_lock); 3347 } 3348 lwp_stat_update(LWP_STAT_OUBLK, 1); 3349 mutex_enter(&rp->r_statelock); 3350 if (rp->r_flags & R4HAVEVERF) { 3351 if (rp->r_writeverf != wres->writeverf) { 3352 nfs4_set_mod(vp); 3353 rp->r_writeverf = wres->writeverf; 3354 } 3355 } else { 3356 rp->r_writeverf = wres->writeverf; 3357 rp->r_flags |= R4HAVEVERF; 3358 } 3359 PURGE_ATTRCACHE4_LOCKED(rp); 3360 rp->r_flags |= R4WRITEMODIFIED; 3361 gethrestime(&rp->r_attr.va_mtime); 3362 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3363 mutex_exit(&rp->r_statelock); 3364 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3365 } while (count); 3366 3367 if (!recov) 3368 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state, 3369 needrecov); 3370 3371 return (e.error); 3372 } 3373 3374 /* 3375 * Read from a file. Reads data in largest chunks our interface can handle. 3376 */ 3377 static int 3378 nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count, 3379 size_t *residp, cred_t *cr, bool_t async, struct uio *uiop) 3380 { 3381 mntinfo4_t *mi; 3382 COMPOUND4args_clnt args; 3383 COMPOUND4res_clnt res; 3384 READ4args *rargs; 3385 nfs_argop4 argop[2]; 3386 int tsize; 3387 int doqueue; 3388 rnode4_t *rp; 3389 int data_len; 3390 bool_t is_eof; 3391 bool_t needrecov = FALSE; 3392 nfs4_recov_state_t recov_state; 3393 nfs4_stateid_types_t sid_types; 3394 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3395 3396 rp = VTOR4(vp); 3397 mi = VTOMI4(vp); 3398 doqueue = 1; 3399 3400 ASSERT(nfs_zone() == mi->mi_zone); 3401 3402 args.ctag = async ? 
TAG_READAHEAD : TAG_READ; 3403 3404 args.array_len = 2; 3405 args.array = argop; 3406 3407 nfs4_init_stateid_types(&sid_types); 3408 3409 recov_state.rs_flags = 0; 3410 recov_state.rs_num_retry_despite_err = 0; 3411 3412 recov_retry: 3413 e.error = nfs4_start_fop(mi, vp, NULL, OH_READ, 3414 &recov_state, NULL); 3415 if (e.error) 3416 return (e.error); 3417 3418 /* putfh target fh */ 3419 argop[0].argop = OP_CPUTFH; 3420 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3421 3422 /* read */ 3423 argop[1].argop = OP_READ; 3424 rargs = &argop[1].nfs_argop4_u.opread; 3425 rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 3426 OP_READ, &sid_types, async); 3427 3428 do { 3429 if (mi->mi_io_kstats) { 3430 mutex_enter(&mi->mi_lock); 3431 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3432 mutex_exit(&mi->mi_lock); 3433 } 3434 3435 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3436 "nfs4read: %s call, rp %s", 3437 needrecov ? "recov" : "first", 3438 rnode4info(rp))); 3439 3440 if ((vp->v_flag & VNOCACHE) || 3441 (rp->r_flags & R4DIRECTIO) || 3442 (mi->mi_flags & MI4_DIRECTIO)) 3443 tsize = MIN(mi->mi_tsize, count); 3444 else 3445 tsize = MIN(mi->mi_curread, count); 3446 3447 rargs->offset = (offset4)offset; 3448 rargs->count = (count4)tsize; 3449 rargs->res_data_val_alt = NULL; 3450 rargs->res_mblk = NULL; 3451 rargs->res_uiop = NULL; 3452 rargs->res_maxsize = 0; 3453 rargs->wlist = NULL; 3454 3455 if (uiop) 3456 rargs->res_uiop = uiop; 3457 else 3458 rargs->res_data_val_alt = base; 3459 rargs->res_maxsize = tsize; 3460 3461 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3462 #ifdef DEBUG 3463 if (nfs4read_error_inject) { 3464 res.status = nfs4read_error_inject; 3465 nfs4read_error_inject = 0; 3466 } 3467 #endif 3468 3469 if (mi->mi_io_kstats) { 3470 mutex_enter(&mi->mi_lock); 3471 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3472 mutex_exit(&mi->mi_lock); 3473 } 3474 3475 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3476 if (e.error != 0 && !needrecov) { 3477 nfs4_end_fop(mi, vp, NULL, OH_READ, 3478 &recov_state, needrecov); 3479 return (e.error); 3480 } 3481 3482 /* 3483 * Do proper retry for OLD and BAD stateid errors outside 3484 * of the normal recovery framework. There are two differences 3485 * between async and sync reads. The first is that we allow 3486 * retry on BAD_STATEID for async reads, but not sync reads. 3487 * The second is that we mark the file dead for a failed 3488 * attempt with a special stateid for sync reads, but just 3489 * return EIO for async reads. 3490 * 3491 * If a sync read receives a BAD stateid error while using a 3492 * delegation stateid, retry using the open stateid (if it 3493 * exists). If it doesn't have an open stateid, reopen the 3494 * file first, then retry. 
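 * (nfs4_save_stateid() records the stateid type that just failed, so the nfs4_get_stateid() call on the retry falls back to the next stateid type; a failure with SPEC_SID, the last resort, is not retried.)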
3495 */ 3496 if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID || 3497 res.status == NFS4ERR_BAD_STATEID) && async) { 3498 nfs4_end_fop(mi, vp, NULL, OH_READ, 3499 &recov_state, needrecov); 3500 if (sid_types.cur_sid_type == SPEC_SID) { 3501 (void) xdr_free(xdr_COMPOUND4res_clnt, 3502 (caddr_t)&res); 3503 return (EIO); 3504 } 3505 nfs4_save_stateid(&rargs->stateid, &sid_types); 3506 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3507 goto recov_retry; 3508 } else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3509 !async && sid_types.cur_sid_type != SPEC_SID) { 3510 nfs4_save_stateid(&rargs->stateid, &sid_types); 3511 nfs4_end_fop(mi, vp, NULL, OH_READ, 3512 &recov_state, needrecov); 3513 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3514 goto recov_retry; 3515 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3516 sid_types.cur_sid_type == DEL_SID) { 3517 nfs4_save_stateid(&rargs->stateid, &sid_types); 3518 mutex_enter(&rp->r_statev4_lock); 3519 rp->r_deleg_return_pending = TRUE; 3520 mutex_exit(&rp->r_statev4_lock); 3521 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3522 nfs4_end_fop(mi, vp, NULL, OH_READ, 3523 &recov_state, needrecov); 3524 (void) xdr_free(xdr_COMPOUND4res_clnt, 3525 (caddr_t)&res); 3526 return (EIO); 3527 } 3528 nfs4_end_fop(mi, vp, NULL, OH_READ, 3529 &recov_state, needrecov); 3530 /* hold needed for nfs4delegreturn_thread */ 3531 VN_HOLD(vp); 3532 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3533 NFS4_DR_DISCARD), FALSE); 3534 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3535 goto recov_retry; 3536 } 3537 if (needrecov) { 3538 bool_t abort; 3539 3540 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3541 "nfs4read: initiating recovery\n")); 3542 abort = nfs4_start_recovery(&e, 3543 mi, vp, NULL, &rargs->stateid, 3544 NULL, OP_READ, NULL); 3545 nfs4_end_fop(mi, vp, NULL, OH_READ, 3546 &recov_state, needrecov); 3547 /* 3548 * Do not retry if we got OLD_STATEID using a special 3549 * stateid. This avoids looping with a broken server. 3550 */ 3551 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3552 sid_types.cur_sid_type == SPEC_SID) 3553 abort = TRUE; 3554 3555 if (abort == FALSE) { 3556 /* 3557 * Need to retry all possible stateids in 3558 * case the recovery error wasn't stateid 3559 * related or the stateids have become 3560 * stale (server reboot). 
3561 */ 3562 nfs4_init_stateid_types(&sid_types); 3563 (void) xdr_free(xdr_COMPOUND4res_clnt, 3564 (caddr_t)&res); 3565 goto recov_retry; 3566 } 3567 3568 if (!e.error) { 3569 e.error = geterrno4(res.status); 3570 (void) xdr_free(xdr_COMPOUND4res_clnt, 3571 (caddr_t)&res); 3572 } 3573 return (e.error); 3574 } 3575 3576 if (res.status) { 3577 e.error = geterrno4(res.status); 3578 nfs4_end_fop(mi, vp, NULL, OH_READ, 3579 &recov_state, needrecov); 3580 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3581 return (e.error); 3582 } 3583 3584 data_len = res.array[1].nfs_resop4_u.opread.data_len; 3585 count -= data_len; 3586 if (base) 3587 base += data_len; 3588 offset += data_len; 3589 if (mi->mi_io_kstats) { 3590 mutex_enter(&mi->mi_lock); 3591 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 3592 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len; 3593 mutex_exit(&mi->mi_lock); 3594 } 3595 lwp_stat_update(LWP_STAT_INBLK, 1); 3596 is_eof = res.array[1].nfs_resop4_u.opread.eof; 3597 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3598 3599 } while (count && !is_eof); 3600 3601 *residp = count; 3602 3603 nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov); 3604 3605 return (e.error); 3606 } 3607 3608 /* ARGSUSED */ 3609 static int 3610 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp, 3611 caller_context_t *ct) 3612 { 3613 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3614 return (EIO); 3615 switch (cmd) { 3616 case _FIODIRECTIO: 3617 return (nfs4_directio(vp, (int)arg, cr)); 3618 default: 3619 return (ENOTTY); 3620 } 3621 } 3622 3623 /* ARGSUSED */ 3624 int 3625 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3626 caller_context_t *ct) 3627 { 3628 int error; 3629 rnode4_t *rp = VTOR4(vp); 3630 3631 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3632 return (EIO); 3633 /* 3634 * If it has been specified that the return value will 3635 * just be used as a hint, and we are only being asked 3636 * for size, fsid or rdevid, then return the client's 3637 * notion of these values without checking to make sure 3638 * that the attribute cache is up to date. 3639 * The whole point is to avoid an over the wire GETATTR 3640 * call. 3641 */ 3642 if (flags & ATTR_HINT) { 3643 if (vap->va_mask == 3644 (vap->va_mask & (AT_SIZE | AT_FSID | AT_RDEV))) { 3645 mutex_enter(&rp->r_statelock); 3646 if (vap->va_mask & AT_SIZE) 3647 vap->va_size = rp->r_size; 3648 if (vap->va_mask & AT_FSID) 3649 vap->va_fsid = rp->r_attr.va_fsid; 3650 if (vap->va_mask & AT_RDEV) 3651 vap->va_rdev = rp->r_attr.va_rdev; 3652 mutex_exit(&rp->r_statelock); 3653 return (0); 3654 } 3655 } 3656 3657 /* 3658 * Only need to flush pages if asking for the mtime 3659 * and if there are any dirty pages or any outstanding 3660 * asynchronous (write) requests for this file.
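 * With a write delegation the client's cached mtime is authoritative (nfs4_write updates it locally), so the flush is skipped in that case; hence the r_deleg_type check below.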
3661 */ 3662 if (vap->va_mask & AT_MTIME) { 3663 rp = VTOR4(vp); 3664 if (nfs4_has_pages(vp)) { 3665 mutex_enter(&rp->r_statev4_lock); 3666 if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) { 3667 mutex_exit(&rp->r_statev4_lock); 3668 if (rp->r_flags & R4DIRTY || 3669 rp->r_awcount > 0) { 3670 mutex_enter(&rp->r_statelock); 3671 rp->r_gcount++; 3672 mutex_exit(&rp->r_statelock); 3673 error = 3674 nfs4_putpage(vp, (u_offset_t)0, 3675 0, 0, cr, NULL); 3676 mutex_enter(&rp->r_statelock); 3677 if (error && (error == ENOSPC || 3678 error == EDQUOT)) { 3679 if (!rp->r_error) 3680 rp->r_error = error; 3681 } 3682 if (--rp->r_gcount == 0) 3683 cv_broadcast(&rp->r_cv); 3684 mutex_exit(&rp->r_statelock); 3685 } 3686 } else { 3687 mutex_exit(&rp->r_statev4_lock); 3688 } 3689 } 3690 } 3691 return (nfs4getattr(vp, vap, cr)); 3692 } 3693 3694 int 3695 nfs4_compare_modes(mode_t from_server, mode_t on_client) 3696 { 3697 /* 3698 * If S_ISUID and S_ISGID are the only bits the server 3699 * cleared relative to the client then return 0 (OK) else 3700 * return 1 (BAD). 3701 */ 3702 on_client &= ~(S_ISUID|S_ISGID); 3703 if (on_client == from_server) 3704 return (0); 3705 else 3706 return (1); 3707 } 3708 3709 /*ARGSUSED4*/ 3710 static int 3711 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3712 caller_context_t *ct) 3713 { 3714 if (vap->va_mask & AT_NOSET) 3715 return (EINVAL); 3716 3717 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3718 return (EIO); 3719 3720 /* 3721 * Don't call secpolicy_vnode_setattr, the client cannot 3722 * use its cached attributes to make security decisions 3723 * as the server may be faking mode bits or mapping uid/gid. 3724 * Always just let the server do the checking. 3725 * If we provide the ability to remove basic privileges 3726 * from setattr (e.g. basic without chmod) then we will 3727 * need to add a check here before calling the server. 3728 */ 3729 3730 return (nfs4setattr(vp, vap, flags, cr, NULL)); 3731 } 3732 3733 /* 3734 * To replace the "guarded" version 3 setattr, we use two types of compound 3735 * setattr requests: 3736 * 1. The "normal" setattr, used when the size of the file isn't being 3737 * changed - { Putfh <fh>; Setattr; Getattr }. 3738 * 2. If the size is changed, precede Setattr with: Getattr; Verify 3739 * with only ctime as the argument. If the server ctime differs from 3740 * what is cached on the client, the verify will fail, but we would 3741 * already have the ctime from the preceding getattr, so just set it 3742 * and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify; 3743 * Setattr; Getattr }. 3744 * 3745 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in 3746 * this setattr and NULL if they are not.
3747 */ 3748 static int 3749 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3750 vsecattr_t *vsap) 3751 { 3752 COMPOUND4args_clnt args; 3753 COMPOUND4res_clnt res, *resp = NULL; 3754 nfs4_ga_res_t *garp = NULL; 3755 int numops = 3; /* { Putfh; Setattr; Getattr } */ 3756 nfs_argop4 argop[5]; 3757 int verify_argop = -1; 3758 int setattr_argop = 1; 3759 nfs_resop4 *resop; 3760 vattr_t va; 3761 rnode4_t *rp; 3762 int doqueue = 1; 3763 uint_t mask = vap->va_mask; 3764 mode_t omode; 3765 vsecattr_t *vsp; 3766 timestruc_t ctime; 3767 bool_t needrecov = FALSE; 3768 nfs4_recov_state_t recov_state; 3769 nfs4_stateid_types_t sid_types; 3770 stateid4 stateid; 3771 hrtime_t t; 3772 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3773 servinfo4_t *svp; 3774 bitmap4 supp_attrs; 3775 3776 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3777 rp = VTOR4(vp); 3778 nfs4_init_stateid_types(&sid_types); 3779 3780 /* 3781 * Only need to flush pages if there are any pages and 3782 * if the file is marked as dirty in some fashion. The 3783 * file must be flushed so that we can accurately 3784 * determine the size of the file and the cached data 3785 * after the SETATTR returns. A file is considered to 3786 * be dirty if it is either marked with R4DIRTY, has 3787 * outstanding i/o's active, or is mmap'd. In this 3788 * last case, we can't tell whether there are dirty 3789 * pages, so we flush just to be sure. 3790 */ 3791 if (nfs4_has_pages(vp) && 3792 ((rp->r_flags & R4DIRTY) || 3793 rp->r_count > 0 || 3794 rp->r_mapcnt > 0)) { 3795 ASSERT(vp->v_type != VCHR); 3796 e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL); 3797 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 3798 mutex_enter(&rp->r_statelock); 3799 if (!rp->r_error) 3800 rp->r_error = e.error; 3801 mutex_exit(&rp->r_statelock); 3802 } 3803 } 3804 3805 if (mask & AT_SIZE) { 3806 /* 3807 * Verification setattr compound for non-deleg AT_SIZE: 3808 * { Putfh; Getattr; Verify; Setattr; Getattr } 3809 * Set ctime local here (outside the do_again label) 3810 * so that subsequent retries (after failed VERIFY) 3811 * will use ctime from GETATTR results (from failed 3812 * verify compound) as VERIFY arg. 3813 * If file has delegation, then VERIFY(time_metadata) 3814 * is of little added value, so don't bother. 3815 */ 3816 mutex_enter(&rp->r_statev4_lock); 3817 if (rp->r_deleg_type == OPEN_DELEGATE_NONE || 3818 rp->r_deleg_return_pending) { 3819 numops = 5; 3820 ctime = rp->r_attr.va_ctime; 3821 } 3822 mutex_exit(&rp->r_statev4_lock); 3823 } 3824 3825 recov_state.rs_flags = 0; 3826 recov_state.rs_num_retry_despite_err = 0; 3827 3828 args.ctag = TAG_SETATTR; 3829 do_again: 3830 recov_retry: 3831 setattr_argop = numops - 2; 3832 3833 args.array = argop; 3834 args.array_len = numops; 3835 3836 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 3837 if (e.error) 3838 return (e.error); 3839 3840 3841 /* putfh target fh */ 3842 argop[0].argop = OP_CPUTFH; 3843 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3844 3845 if (numops == 5) { 3846 /* 3847 * We only care about the ctime, but need to get mtime 3848 * and size for proper cache update. 
3849 */ 3850 /* getattr */ 3851 argop[1].argop = OP_GETATTR; 3852 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3853 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3854 3855 /* verify - set later in loop */ 3856 verify_argop = 2; 3857 } 3858 3859 /* setattr */ 3860 svp = rp->r_server; 3861 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3862 supp_attrs = svp->sv_supp_attrs; 3863 nfs_rw_exit(&svp->sv_lock); 3864 3865 nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr, 3866 supp_attrs, &e.error, &sid_types); 3867 stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid; 3868 if (e.error) { 3869 /* req time field(s) overflow - return immediately */ 3870 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 3871 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 3872 opsetattr.obj_attributes); 3873 return (e.error); 3874 } 3875 omode = rp->r_attr.va_mode; 3876 3877 /* getattr */ 3878 argop[numops-1].argop = OP_GETATTR; 3879 argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3880 /* 3881 * If we are setting the ACL (indicated only by vsap != NULL), request 3882 * the ACL in this getattr. The ACL returned from this getattr will be 3883 * used in updating the ACL cache. 3884 */ 3885 if (vsap != NULL) 3886 argop[numops-1].nfs_argop4_u.opgetattr.attr_request |= 3887 FATTR4_ACL_MASK; 3888 argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3889 3890 /* 3891 * setattr iterates if the object size is set and the cached ctime 3892 * does not match the file ctime. In that case, verify the ctime first. 3893 */ 3894 3895 do { 3896 if (verify_argop != -1) { 3897 /* 3898 * Verify that the ctime match before doing setattr. 3899 */ 3900 va.va_mask = AT_CTIME; 3901 va.va_ctime = ctime; 3902 svp = rp->r_server; 3903 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3904 supp_attrs = svp->sv_supp_attrs; 3905 nfs_rw_exit(&svp->sv_lock); 3906 e.error = nfs4args_verify(&argop[verify_argop], &va, 3907 OP_VERIFY, supp_attrs); 3908 if (e.error) { 3909 /* req time field(s) overflow - return */ 3910 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3911 needrecov); 3912 break; 3913 } 3914 } 3915 3916 doqueue = 1; 3917 3918 t = gethrtime(); 3919 3920 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 3921 3922 /* 3923 * Purge the access cache and ACL cache if changing either the 3924 * owner of the file, the group owner, or the mode. These may 3925 * change the access permissions of the file, so purge old 3926 * information and start over again. 3927 */ 3928 if (mask & (AT_UID | AT_GID | AT_MODE)) { 3929 (void) nfs4_access_purge_rp(rp); 3930 if (rp->r_secattr != NULL) { 3931 mutex_enter(&rp->r_statelock); 3932 vsp = rp->r_secattr; 3933 rp->r_secattr = NULL; 3934 mutex_exit(&rp->r_statelock); 3935 if (vsp != NULL) 3936 nfs4_acl_free_cache(vsp); 3937 } 3938 } 3939 3940 /* 3941 * If res.array_len == numops, then everything succeeded, 3942 * except for possibly the final getattr. If only the 3943 * last getattr failed, give up, and don't try recovery. 3944 */ 3945 if (res.array_len == numops) { 3946 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3947 needrecov); 3948 if (! 
e.error)
				resp = &res;
			break;
		}

		/*
		 * if either rpc call failed or completely succeeded - done
		 */
		needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
		if (e.error) {
			PURGE_ATTRCACHE4(vp);
			if (!needrecov) {
				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
				    needrecov);
				break;
			}
		}

		/*
		 * Do proper retry for OLD_STATEID outside of the normal
		 * recovery framework.
		 */
		if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
		    sid_types.cur_sid_type != SPEC_SID &&
		    sid_types.cur_sid_type != NO_SID) {
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			nfs4_save_stateid(&stateid, &sid_types);
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}

		if (needrecov) {
			bool_t abort;

			abort = nfs4_start_recovery(&e,
			    VTOMI4(vp), vp, NULL, NULL, NULL,
			    OP_SETATTR, NULL);
			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
			    needrecov);
			/*
			 * Do not retry if we failed with OLD_STATEID using
			 * a special stateid. This is done to avoid looping
			 * with a broken server.
			 */
			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
			    (sid_types.cur_sid_type == SPEC_SID ||
			    sid_types.cur_sid_type == NO_SID))
				abort = TRUE;
			if (!e.error) {
				if (res.status == NFS4ERR_BADOWNER)
					nfs4_log_badowner(VTOMI4(vp),
					    OP_SETATTR);

				e.error = geterrno4(res.status);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			}
			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
			    opsetattr.obj_attributes);
			if (verify_argop != -1) {
				nfs4args_verify_free(&argop[verify_argop]);
				verify_argop = -1;
			}
			if (abort == FALSE) {
				/*
				 * Need to retry all possible stateids in
				 * case the recovery error wasn't stateid
				 * related or the stateids have become
				 * stale (server reboot).
				 */
				nfs4_init_stateid_types(&sid_types);
				goto recov_retry;
			}
			return (e.error);
		}

		/*
		 * Need to call nfs4_end_op before nfs4getattr to
		 * avoid potential nfs4_start_op deadlock. See RFE
		 * 4777612. Calls to nfs4_invalidate_pages() and
		 * nfs4_purge_stale_fh() might also generate over the
		 * wire calls which may cause nfs4_start_op() deadlock.
		 */
		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);

		/*
		 * Check to update lease.
		 */
		resp = &res;
		if (res.status == NFS4_OK) {
			break;
		}

		/*
		 * Check whether the verify failed, to see if we should
		 * try again.
		 */
		if ((verify_argop == -1) || (res.array_len != 3)) {
			/*
			 * can't continue...
			 */
			if (res.status == NFS4ERR_BADOWNER)
				nfs4_log_badowner(VTOMI4(vp), OP_SETATTR);

			e.error = geterrno4(res.status);
		} else {
			/*
			 * When the verify request fails, the client ctime is
			 * not in sync with the server. This is the same as
			 * the version 3 "not synchronized" error, and we
			 * handle it in a similar manner (XXX do we need to???).
			 * Use the ctime returned in the first getattr for
			 * the input to the next verify.
			 * If we couldn't get the attributes, then we give up
			 * because we can't complete the operation as required.
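			 *
			 * Retry sketch (restating the code just below): the
			 * ctime from the GETATTR of the failed compound is
			 * copied out and the compound is rebuilt and re-sent:
			 *
			 *	ctime = garp->n4g_va.va_ctime;
			 *	goto do_again;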
4069 */ 4070 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res; 4071 } 4072 if (e.error) { 4073 PURGE_ATTRCACHE4(vp); 4074 nfs4_purge_stale_fh(e.error, vp, cr); 4075 } else { 4076 /* 4077 * retry with a new verify value 4078 */ 4079 ctime = garp->n4g_va.va_ctime; 4080 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4081 resp = NULL; 4082 } 4083 if (!e.error) { 4084 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4085 opsetattr.obj_attributes); 4086 if (verify_argop != -1) { 4087 nfs4args_verify_free(&argop[verify_argop]); 4088 verify_argop = -1; 4089 } 4090 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4091 goto do_again; 4092 } 4093 } while (!e.error); 4094 4095 if (e.error) { 4096 /* 4097 * If we are here, rfs4call has an irrecoverable error - return 4098 */ 4099 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4100 opsetattr.obj_attributes); 4101 if (verify_argop != -1) { 4102 nfs4args_verify_free(&argop[verify_argop]); 4103 verify_argop = -1; 4104 } 4105 if (resp) 4106 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4107 return (e.error); 4108 } 4109 4110 4111 4112 /* 4113 * If changing the size of the file, invalidate 4114 * any local cached data which is no longer part 4115 * of the file. We also possibly invalidate the 4116 * last page in the file. We could use 4117 * pvn_vpzero(), but this would mark the page as 4118 * modified and require it to be written back to 4119 * the server for no particularly good reason. 4120 * This way, if we access it, then we bring it 4121 * back in. A read should be cheaper than a 4122 * write. 4123 */ 4124 if (mask & AT_SIZE) { 4125 nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr); 4126 } 4127 4128 /* either no error or one of the postop getattr failed */ 4129 4130 /* 4131 * XXX Perform a simplified version of wcc checking. Instead of 4132 * have another getattr to get pre-op, just purge cache if 4133 * any of the ops prior to and including the getattr failed. 4134 * If the getattr succeeded then update the attrcache accordingly. 4135 */ 4136 4137 garp = NULL; 4138 if (res.status == NFS4_OK) { 4139 /* 4140 * Last getattr 4141 */ 4142 resop = &res.array[numops - 1]; 4143 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4144 } 4145 /* 4146 * In certain cases, nfs4_update_attrcache() will purge the attrcache, 4147 * rather than filling it. See the function itself for details. 4148 */ 4149 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4150 if (garp != NULL) { 4151 if (garp->n4g_resbmap & FATTR4_ACL_MASK) { 4152 nfs4_acl_fill_cache(rp, &garp->n4g_vsa); 4153 vs_ace4_destroy(&garp->n4g_vsa); 4154 } else { 4155 if (vsap != NULL) { 4156 /* 4157 * The ACL was supposed to be set and to be 4158 * returned in the last getattr of this 4159 * compound, but for some reason the getattr 4160 * result doesn't contain the ACL. In this 4161 * case, purge the ACL cache. 4162 */ 4163 if (rp->r_secattr != NULL) { 4164 mutex_enter(&rp->r_statelock); 4165 vsp = rp->r_secattr; 4166 rp->r_secattr = NULL; 4167 mutex_exit(&rp->r_statelock); 4168 if (vsp != NULL) 4169 nfs4_acl_free_cache(vsp); 4170 } 4171 } 4172 } 4173 } 4174 4175 if (res.status == NFS4_OK && (mask & AT_SIZE)) { 4176 /* 4177 * Set the size, rather than relying on getting it updated 4178 * via a GETATTR. With delegations the client tries to 4179 * suppress GETATTR calls. 
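		 *
		 * (Aside, stated as an aid rather than taken from the
		 * original comments: while holding a write delegation the
		 * client is authoritative for the file size between
		 * recalls, so updating r_size locally keeps cached reads
		 * consistent without another over-the-wire GETATTR.)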
 */
		mutex_enter(&rp->r_statelock);
		rp->r_size = vap->va_size;
		mutex_exit(&rp->r_statelock);
	}

	/*
	 * Can free up request args and res
	 */
	nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
	    opsetattr.obj_attributes);
	if (verify_argop != -1) {
		nfs4args_verify_free(&argop[verify_argop]);
		verify_argop = -1;
	}
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	/*
	 * Some servers will change the mode to clear the setuid
	 * and setgid bits when changing the uid or gid. The
	 * client needs to compensate appropriately.
	 */
	if (mask & (AT_UID | AT_GID)) {
		int terror, do_setattr;

		do_setattr = 0;
		va.va_mask = AT_MODE;
		terror = nfs4getattr(vp, &va, cr);
		if (!terror &&
		    (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
		    (!(mask & AT_MODE) && va.va_mode != omode))) {
			va.va_mask = AT_MODE;
			if (mask & AT_MODE) {
				/*
				 * We asked for the mode to be changed and
				 * what we just got from the server in getattr
				 * is not what we wanted it to be, so set it
				 * now.
				 */
				va.va_mode = vap->va_mode;
				do_setattr = 1;
			} else {
				/*
				 * We did not ask for the mode to be changed,
				 * so check whether the server just cleared
				 * S_ISUID and S_ISGID from it. If not, then
				 * set the mode to omode with UID/GID cleared.
				 */
				if (nfs4_compare_modes(va.va_mode, omode)) {
					omode &= ~(S_ISUID|S_ISGID);
					va.va_mode = omode;
					do_setattr = 1;
				}
			}

			if (do_setattr)
				(void) nfs4setattr(vp, &va, 0, cr, NULL);
		}
	}

	return (e.error);
}

/* ARGSUSED */
static int
nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	int doqueue;
	uint32_t acc, resacc, argacc;
	rnode4_t *rp;
	cred_t *cred, *ncr, *ncrfree = NULL;
	nfs4_access_type_t cacc;
	int num_ops;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	bool_t needrecov = FALSE, do_getattr;
	nfs4_recov_state_t recov_state;
	int rpc_error;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	mntinfo4_t *mi = VTOMI4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	acc = 0;
	if (mode & VREAD)
		acc |= ACCESS4_READ;
	if (mode & VWRITE) {
		if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type))
			return (EROFS);
		if (vp->v_type == VDIR)
			acc |= ACCESS4_DELETE;
		acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
	}
	if (mode & VEXEC) {
		if (vp->v_type == VDIR)
			acc |= ACCESS4_LOOKUP;
		else
			acc |= ACCESS4_EXECUTE;
	}

	if (VTOR4(vp)->r_acache != NULL) {
		e.error = nfs4_validate_caches(vp, cr);
		if (e.error)
			return (e.error);
	}

	rp = VTOR4(vp);
	if (vp->v_type == VDIR)
		argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY |
		    ACCESS4_EXTEND | ACCESS4_LOOKUP;
	else
		argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND |
		    ACCESS4_EXECUTE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	cred = cr;
	/*
	 * ncr and ncrfree both initially
	 * point to the memory area returned
	 * by crnetadjust();
	 * ncrfree not NULL when exiting means
	 * that we need to release it
	 */
	ncr = crnetadjust(cred);
	ncrfree = ncr;

tryagain:
	cacc = nfs4_access_check(rp, acc, cred);
	if (cacc == NFS4_ACCESS_ALLOWED) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (0);
	}
	if (cacc == NFS4_ACCESS_DENIED) {
		/*
		 * If the cred can be adjusted, try again
		 * with the new cred.
		 */
		if (ncr != NULL) {
			cred = ncr;
			ncr = NULL;
			goto tryagain;
		}
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (EACCES);
	}

recov_retry:
	/*
	 * Don't take the r_statev4_lock here. r_deleg_type could
	 * change as soon as the lock is released. Since it is an int,
	 * there is no atomicity issue.
	 */
	do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE);
	num_ops = do_getattr ? 3 : 2;

	args.ctag = TAG_ACCESS;

	args.array_len = num_ops;
	args.array = argop;

	if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS,
	    &recov_state, NULL)) {
		if (ncrfree != NULL)
			crfree(ncrfree);
		return (e.error);
	}

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;

	/* access */
	argop[1].argop = OP_ACCESS;
	argop[1].nfs_argop4_u.opaccess.access = argacc;

	/* getattr */
	if (do_getattr) {
		argop[2].argop = OP_GETATTR;
		argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[2].nfs_argop4_u.opgetattr.mi = mi;
	}

	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(VTOR4(vp))));

	doqueue = 1;
	t = gethrtime();
	rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e);
	rpc_error = e.error;

	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
	if (needrecov) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4_access: initiating recovery\n"));

		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_ACCESS, NULL) == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		}
	}
	nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov);

	if (e.error)
		goto out;

	if (res.status) {
		e.error = geterrno4(res.status);
		/*
		 * This might generate over the wire calls through
		 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
		 * here to avoid a deadlock.
		 */
		nfs4_purge_stale_fh(e.error, vp, cr);
		goto out;
	}
	resop = &res.array[1];	/* access res */

	resacc = resop->nfs_resop4_u.opaccess.access;

	if (do_getattr) {
		resop++;	/* getattr res */
		nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);
	}

	if (!e.error) {
		nfs4_access_cache(rp, argacc, resacc, cred);
		/*
		 * we just cached results with cred; if cred is the
		 * adjusted credentials from crnetadjust, we do not want
		 * to release them before exiting: hence setting ncrfree
		 * to NULL
		 */
		if (cred != cr)
			ncrfree = NULL;
		/* XXX check the supported bits too? */
		if ((acc & resacc) != acc) {
			/*
			 * The following code implements the semantic
			 * that a setuid root program has *at least* the
			 * permissions of the user that is running the
			 * program. See rfs3call() for more portions
			 * of the implementation of this functionality.
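			 *
			 * (Flow aid: crnetadjust() supplies an adjusted
			 * credential for the root-remapping case; when the
			 * returned access bits fall short under the original
			 * cred, the code below re-drives the ACCESS with the
			 * adjusted cred before giving up with EACCES.)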
4435 */ 4436 /* XXX-LP */ 4437 if (ncr != NULL) { 4438 (void) xdr_free(xdr_COMPOUND4res_clnt, 4439 (caddr_t)&res); 4440 cred = ncr; 4441 ncr = NULL; 4442 goto tryagain; 4443 } 4444 e.error = EACCES; 4445 } 4446 } 4447 4448 out: 4449 if (!rpc_error) 4450 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4451 4452 if (ncrfree != NULL) 4453 crfree(ncrfree); 4454 4455 return (e.error); 4456 } 4457 4458 /* ARGSUSED */ 4459 static int 4460 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct) 4461 { 4462 COMPOUND4args_clnt args; 4463 COMPOUND4res_clnt res; 4464 int doqueue; 4465 rnode4_t *rp; 4466 nfs_argop4 argop[3]; 4467 nfs_resop4 *resop; 4468 READLINK4res *lr_res; 4469 nfs4_ga_res_t *garp; 4470 uint_t len; 4471 char *linkdata; 4472 bool_t needrecov = FALSE; 4473 nfs4_recov_state_t recov_state; 4474 hrtime_t t; 4475 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4476 4477 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4478 return (EIO); 4479 /* 4480 * Can't readlink anything other than a symbolic link. 4481 */ 4482 if (vp->v_type != VLNK) 4483 return (EINVAL); 4484 4485 rp = VTOR4(vp); 4486 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) { 4487 e.error = nfs4_validate_caches(vp, cr); 4488 if (e.error) 4489 return (e.error); 4490 mutex_enter(&rp->r_statelock); 4491 if (rp->r_symlink.contents != NULL) { 4492 e.error = uiomove(rp->r_symlink.contents, 4493 rp->r_symlink.len, UIO_READ, uiop); 4494 mutex_exit(&rp->r_statelock); 4495 return (e.error); 4496 } 4497 mutex_exit(&rp->r_statelock); 4498 } 4499 recov_state.rs_flags = 0; 4500 recov_state.rs_num_retry_despite_err = 0; 4501 4502 recov_retry: 4503 args.array_len = 3; 4504 args.array = argop; 4505 args.ctag = TAG_READLINK; 4506 4507 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 4508 if (e.error) { 4509 return (e.error); 4510 } 4511 4512 /* 0. putfh symlink fh */ 4513 argop[0].argop = OP_CPUTFH; 4514 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4515 4516 /* 1. readlink */ 4517 argop[1].argop = OP_READLINK; 4518 4519 /* 2. getattr */ 4520 argop[2].argop = OP_GETATTR; 4521 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4522 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 4523 4524 doqueue = 1; 4525 4526 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4527 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first", 4528 rnode4info(VTOR4(vp)))); 4529 4530 t = gethrtime(); 4531 4532 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 4533 4534 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4535 if (needrecov) { 4536 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4537 "nfs4_readlink: initiating recovery\n")); 4538 4539 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4540 NULL, OP_READLINK, NULL) == FALSE) { 4541 if (!e.error) 4542 (void) xdr_free(xdr_COMPOUND4res_clnt, 4543 (caddr_t)&res); 4544 4545 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 4546 needrecov); 4547 goto recov_retry; 4548 } 4549 } 4550 4551 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 4552 4553 if (e.error) 4554 return (e.error); 4555 4556 /* 4557 * There is an path in the code below which calls 4558 * nfs4_purge_stale_fh(), which may generate otw calls through 4559 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4560 * here to avoid nfs4_start_op() deadlock. 
4561 */ 4562 4563 if (res.status && (res.array_len < args.array_len)) { 4564 /* 4565 * either Putfh or Link failed 4566 */ 4567 e.error = geterrno4(res.status); 4568 nfs4_purge_stale_fh(e.error, vp, cr); 4569 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4570 return (e.error); 4571 } 4572 4573 resop = &res.array[1]; /* readlink res */ 4574 lr_res = &resop->nfs_resop4_u.opreadlink; 4575 4576 /* 4577 * treat symlink names as data 4578 */ 4579 linkdata = utf8_to_str(&lr_res->link, &len, NULL); 4580 if (linkdata != NULL) { 4581 int uio_len = len - 1; 4582 /* len includes null byte, which we won't uiomove */ 4583 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop); 4584 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 4585 mutex_enter(&rp->r_statelock); 4586 if (rp->r_symlink.contents == NULL) { 4587 rp->r_symlink.contents = linkdata; 4588 rp->r_symlink.len = uio_len; 4589 rp->r_symlink.size = len; 4590 mutex_exit(&rp->r_statelock); 4591 } else { 4592 mutex_exit(&rp->r_statelock); 4593 kmem_free(linkdata, len); 4594 } 4595 } else { 4596 kmem_free(linkdata, len); 4597 } 4598 } 4599 if (res.status == NFS4_OK) { 4600 resop++; /* getattr res */ 4601 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4602 } 4603 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4604 4605 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4606 4607 /* 4608 * The over the wire error for attempting to readlink something 4609 * other than a symbolic link is ENXIO. However, we need to 4610 * return EINVAL instead of ENXIO, so we map it here. 4611 */ 4612 return (e.error == ENXIO ? EINVAL : e.error); 4613 } 4614 4615 /* 4616 * Flush local dirty pages to stable storage on the server. 4617 * 4618 * If FNODSYNC is specified, then there is nothing to do because 4619 * metadata changes are not cached on the client before being 4620 * sent to the server. 4621 */ 4622 /* ARGSUSED */ 4623 static int 4624 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 4625 { 4626 int error; 4627 4628 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp)) 4629 return (0); 4630 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4631 return (EIO); 4632 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr); 4633 if (!error) 4634 error = VTOR4(vp)->r_error; 4635 return (error); 4636 } 4637 4638 /* 4639 * Weirdness: if the file was removed or the target of a rename 4640 * operation while it was open, it got renamed instead. Here we 4641 * remove the renamed file. 4642 */ 4643 /* ARGSUSED */ 4644 void 4645 nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4646 { 4647 rnode4_t *rp; 4648 4649 ASSERT(vp != DNLC_NO_VNODE); 4650 4651 rp = VTOR4(vp); 4652 4653 if (IS_SHADOW(vp, rp)) { 4654 sv_inactive(vp); 4655 return; 4656 } 4657 4658 /* 4659 * If this is coming from the wrong zone, we let someone in the right 4660 * zone take care of it asynchronously. We can get here due to 4661 * VN_RELE() being called from pageout() or fsflush(). This call may 4662 * potentially turn into an expensive no-op if, for instance, v_count 4663 * gets incremented in the meantime, but it's still correct. 4664 */ 4665 if (nfs_zone() != VTOMI4(vp)->mi_zone) { 4666 nfs4_async_inactive(vp, cr); 4667 return; 4668 } 4669 4670 /* 4671 * Some of the cleanup steps might require over-the-wire 4672 * operations. 
Since VOP_INACTIVE can get called as a result of 4673 * other over-the-wire operations (e.g., an attribute cache update 4674 * can lead to a DNLC purge), doing those steps now would lead to a 4675 * nested call to the recovery framework, which can deadlock. So 4676 * do any over-the-wire cleanups asynchronously, in a separate 4677 * thread. 4678 */ 4679 4680 mutex_enter(&rp->r_os_lock); 4681 mutex_enter(&rp->r_statelock); 4682 mutex_enter(&rp->r_statev4_lock); 4683 4684 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) { 4685 mutex_exit(&rp->r_statev4_lock); 4686 mutex_exit(&rp->r_statelock); 4687 mutex_exit(&rp->r_os_lock); 4688 nfs4_async_inactive(vp, cr); 4689 return; 4690 } 4691 4692 if (rp->r_deleg_type == OPEN_DELEGATE_READ || 4693 rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 4694 mutex_exit(&rp->r_statev4_lock); 4695 mutex_exit(&rp->r_statelock); 4696 mutex_exit(&rp->r_os_lock); 4697 nfs4_async_inactive(vp, cr); 4698 return; 4699 } 4700 4701 if (rp->r_unldvp != NULL) { 4702 mutex_exit(&rp->r_statev4_lock); 4703 mutex_exit(&rp->r_statelock); 4704 mutex_exit(&rp->r_os_lock); 4705 nfs4_async_inactive(vp, cr); 4706 return; 4707 } 4708 mutex_exit(&rp->r_statev4_lock); 4709 mutex_exit(&rp->r_statelock); 4710 mutex_exit(&rp->r_os_lock); 4711 4712 rp4_addfree(rp, cr); 4713 } 4714 4715 /* 4716 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up 4717 * various bits of state. The caller must not refer to vp after this call. 4718 */ 4719 4720 void 4721 nfs4_inactive_otw(vnode_t *vp, cred_t *cr) 4722 { 4723 rnode4_t *rp = VTOR4(vp); 4724 nfs4_recov_state_t recov_state; 4725 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4726 vnode_t *unldvp; 4727 char *unlname; 4728 cred_t *unlcred; 4729 COMPOUND4args_clnt args; 4730 COMPOUND4res_clnt res, *resp; 4731 nfs_argop4 argop[2]; 4732 int doqueue; 4733 #ifdef DEBUG 4734 char *name; 4735 #endif 4736 4737 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 4738 ASSERT(!IS_SHADOW(vp, rp)); 4739 4740 #ifdef DEBUG 4741 name = fn_name(VTOSV(vp)->sv_name); 4742 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: " 4743 "release vnode %s", name)); 4744 kmem_free(name, MAXNAMELEN); 4745 #endif 4746 4747 if (vp->v_type == VREG) { 4748 bool_t recov_failed = FALSE; 4749 4750 e.error = nfs4close_all(vp, cr); 4751 if (e.error) { 4752 /* Check to see if recovery failed */ 4753 mutex_enter(&(VTOMI4(vp)->mi_lock)); 4754 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL) 4755 recov_failed = TRUE; 4756 mutex_exit(&(VTOMI4(vp)->mi_lock)); 4757 if (!recov_failed) { 4758 mutex_enter(&rp->r_statelock); 4759 if (rp->r_flags & R4RECOVERR) 4760 recov_failed = TRUE; 4761 mutex_exit(&rp->r_statelock); 4762 } 4763 if (recov_failed) { 4764 NFS4_DEBUG(nfs4_client_recov_debug, 4765 (CE_NOTE, "nfs4_inactive_otw: " 4766 "close failed (recovery failure)")); 4767 } 4768 } 4769 } 4770 4771 redo: 4772 if (rp->r_unldvp == NULL) { 4773 rp4_addfree(rp, cr); 4774 return; 4775 } 4776 4777 /* 4778 * Save the vnode pointer for the directory where the 4779 * unlinked-open file got renamed, then set it to NULL 4780 * to prevent another thread from getting here before 4781 * we're done with the remove. While we have the 4782 * statelock, make local copies of the pertinent rnode 4783 * fields. If we weren't to do this in an atomic way, the 4784 * the unl* fields could become inconsistent with respect 4785 * to each other due to a race condition between this 4786 * code and nfs_remove(). See bug report 1034328. 
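	 *
	 * Pattern sketch (restating the code below): snapshot and clear the
	 * unl* fields in a single critical section so that exactly one
	 * thread performs the deferred REMOVE:
	 *
	 *	mutex_enter(&rp->r_statelock);
	 *	unldvp = rp->r_unldvp;		rp->r_unldvp = NULL;
	 *	unlname = rp->r_unlname;	rp->r_unlname = NULL;
	 *	unlcred = rp->r_unlcred;	rp->r_unlcred = NULL;
	 *	mutex_exit(&rp->r_statelock);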
 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_unldvp == NULL) {
		mutex_exit(&rp->r_statelock);
		rp4_addfree(rp, cr);
		return;
	}

	unldvp = rp->r_unldvp;
	rp->r_unldvp = NULL;
	unlname = rp->r_unlname;
	rp->r_unlname = NULL;
	unlcred = rp->r_unlcred;
	rp->r_unlcred = NULL;
	mutex_exit(&rp->r_statelock);

	/*
	 * If there are any dirty pages left, then flush
	 * them. This is unfortunate because they just
	 * may get thrown away during the remove operation,
	 * but we have to do this for correctness.
	 */
	if (nfs4_has_pages(vp) &&
	    ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
		ASSERT(vp->v_type != VCHR);
		e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL);
		if (e.error) {
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
		}
	}

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
recov_retry_remove:
	/*
	 * Do the remove operation on the renamed file
	 */
	args.ctag = TAG_INACTIVE;

	/*
	 * Remove ops: putfh dir; remove
	 */
	args.array_len = 2;
	args.array = argop;

	e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state);
	if (e.error) {
		kmem_free(unlname, MAXNAMELEN);
		crfree(unlcred);
		VN_RELE(unldvp);
		/*
		 * Try again; this time around r_unldvp will be NULL, so we'll
		 * just call rp4_addfree() and return.
		 */
		goto redo;
	}

	/* putfh directory */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh;

	/* remove */
	argop[1].argop = OP_CREMOVE;
	argop[1].nfs_argop4_u.opcremove.ctarget = unlname;

	doqueue = 1;
	resp = &res;

#if 0 /* notyet */
	/*
	 * Can't do this yet. We may be being called from
	 * dnlc_purge_XXX while that routine is holding a
	 * mutex lock to the nc_rele list. The calls to
	 * nfs3_cache_wcc_data may result in calls to
	 * dnlc_purge_XXX. This will result in a deadlock.
	 */
	rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
	if (e.error) {
		PURGE_ATTRCACHE4(unldvp);
		resp = NULL;
	} else if (res.status) {
		e.error = geterrno4(res.status);
		PURGE_ATTRCACHE4(unldvp);
		/*
		 * This code is inactive right now
		 * but if made active there should
		 * be a nfs4_end_op() call before
		 * nfs4_purge_stale_fh to avoid start_op()
		 * deadlock. See BugId: 4948726
		 */
		nfs4_purge_stale_fh(e.error, unldvp, cr);
	} else {
		nfs_resop4 *resop;
		REMOVE4res *rm_res;

		resop = &res.array[1];
		rm_res = &resop->nfs_resop4_u.opremove;
		/*
		 * Update directory cache attribute,
		 * readdir and dnlc caches.
4890 */ 4891 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL); 4892 } 4893 #else 4894 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4895 4896 PURGE_ATTRCACHE4(unldvp); 4897 #endif 4898 4899 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) { 4900 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL, 4901 NULL, NULL, OP_REMOVE, NULL) == FALSE) { 4902 if (!e.error) 4903 (void) xdr_free(xdr_COMPOUND4res_clnt, 4904 (caddr_t)&res); 4905 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, 4906 &recov_state, TRUE); 4907 goto recov_retry_remove; 4908 } 4909 } 4910 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE); 4911 4912 /* 4913 * Release stuff held for the remove 4914 */ 4915 VN_RELE(unldvp); 4916 if (!e.error && resp) 4917 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4918 4919 kmem_free(unlname, MAXNAMELEN); 4920 crfree(unlcred); 4921 goto redo; 4922 } 4923 4924 /* 4925 * Remote file system operations having to do with directory manipulation. 4926 */ 4927 /* ARGSUSED3 */ 4928 int 4929 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 4930 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, 4931 int *direntflags, pathname_t *realpnp) 4932 { 4933 int error; 4934 vnode_t *vp, *avp = NULL; 4935 rnode4_t *drp; 4936 4937 *vpp = NULL; 4938 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 4939 return (EPERM); 4940 /* 4941 * if LOOKUP_XATTR, must replace dvp (object) with 4942 * object's attrdir before continuing with lookup 4943 */ 4944 if (flags & LOOKUP_XATTR) { 4945 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr); 4946 if (error) 4947 return (error); 4948 4949 dvp = avp; 4950 4951 /* 4952 * If lookup is for "", just return dvp now. The attrdir 4953 * has already been activated (from nfs4lookup_xattr), and 4954 * the caller will RELE the original dvp -- not 4955 * the attrdir. So, set vpp and return. 4956 * Currently, when the LOOKUP_XATTR flag is 4957 * passed to VOP_LOOKUP, the name is always empty, and 4958 * shortcircuiting here avoids 3 unneeded lock/unlock 4959 * pairs. 4960 * 4961 * If a non-empty name was provided, then it is the 4962 * attribute name, and it will be looked up below. 4963 */ 4964 if (*nm == '\0') { 4965 *vpp = dvp; 4966 return (0); 4967 } 4968 4969 /* 4970 * The vfs layer never sends a name when asking for the 4971 * attrdir, so we should never get here (unless of course 4972 * name is passed at some time in future -- at which time 4973 * we'll blow up here). 4974 */ 4975 ASSERT(0); 4976 } 4977 4978 drp = VTOR4(dvp); 4979 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 4980 return (EINTR); 4981 4982 error = nfs4lookup(dvp, nm, vpp, cr, 0); 4983 nfs_rw_exit(&drp->r_rwlock); 4984 4985 /* 4986 * If vnode is a device, create special vnode. 
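	 *
	 * (Aid, from general specfs behavior rather than this file: specvp()
	 * manufactures a special-device vnode keyed by v_rdev and v_type so
	 * the device goes through the local device framework; the NFS vnode
	 * is released once the special vnode has been substituted.)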
 */
	if (!error && ISVDEV((*vpp)->v_type)) {
		vp = *vpp;
		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
		VN_RELE(vp);
	}

	return (error);
}

/* ARGSUSED */
static int
nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr)
{
	int error;
	rnode4_t *drp;
	int cflag = ((flags & CREATE_XATTR_DIR) != 0);
	mntinfo4_t *mi;

	mi = VTOMI4(dvp);
	if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) &&
	    !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS))
		return (EINVAL);

	drp = VTOR4(dvp);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
		return (EINTR);

	mutex_enter(&drp->r_statelock);
	/*
	 * If the server doesn't support xattrs just return EINVAL
	 */
	if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) {
		mutex_exit(&drp->r_statelock);
		nfs_rw_exit(&drp->r_rwlock);
		return (EINVAL);
	}

	/*
	 * If there is a cached xattr directory entry,
	 * use it as long as the attributes are valid. If the
	 * attributes are not valid, take the simple approach and
	 * free the cached value and re-fetch a new value.
	 *
	 * We don't cache negative entries for now; if we did, we
	 * would need to check whether the file has changed on every
	 * lookup. But xattrs don't exist very often, and failing an
	 * openattr is not much more expensive than an NVERIFY or
	 * GETATTR, so do an openattr over the wire for now.
	 */
	if (drp->r_xattr_dir != NULL) {
		if (ATTRCACHE4_VALID(dvp)) {
			VN_HOLD(drp->r_xattr_dir);
			*vpp = drp->r_xattr_dir;
			mutex_exit(&drp->r_statelock);
			nfs_rw_exit(&drp->r_rwlock);
			return (0);
		}
		VN_RELE(drp->r_xattr_dir);
		drp->r_xattr_dir = NULL;
	}
	mutex_exit(&drp->r_statelock);

	error = nfs4openattr(dvp, vpp, cflag, cr);

	nfs_rw_exit(&drp->r_rwlock);

	return (error);
}

static int
nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc)
{
	int error;
	rnode4_t *drp;

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);

	/*
	 * If lookup is for "", just return dvp. Don't need
	 * to send it over the wire, look it up in the dnlc,
	 * or perform any access checks.
	 */
	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * Can't do lookups in non-directories.
	 */
	if (dvp->v_type != VDIR)
		return (ENOTDIR);

	/*
	 * If lookup is for ".", just return dvp. Don't need
	 * to send it over the wire or look it up in the dnlc,
	 * just need to check access.
	 */
	if (nm[0] == '.' && nm[1] == '\0') {
		error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (error)
			return (error);
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	drp = VTOR4(dvp);
	if (!(drp->r_flags & R4LOOKUP)) {
		mutex_enter(&drp->r_statelock);
		drp->r_flags |= R4LOOKUP;
		mutex_exit(&drp->r_statelock);
	}

	*vpp = NULL;
	/*
	 * Lookup this name in the DNLC. If there is no entry,
	 * look it up over the wire.
	 */
	if (!skipdnlc)
		*vpp = dnlc_lookup(dvp, nm);
	if (*vpp == NULL) {
		/*
		 * We need to go over the wire to lookup the name.
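		 * (Flow sketch, restating the surrounding logic: a DNLC miss
		 * goes over the wire via nfs4lookupnew_otw(); a DNLC hit is
		 * revalidated against the cached directory attributes and,
		 * if those have expired, via nfs4lookupvalidate_otw().)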
 */
		return (nfs4lookupnew_otw(dvp, nm, vpp, cr));
	}

	/*
	 * We hit on the dnlc
	 */
	if (*vpp != DNLC_NO_VNODE ||
	    (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
		/*
		 * But our attrs may not be valid.
		 */
		if (ATTRCACHE4_VALID(dvp)) {
			error = nfs4_waitfor_purge_complete(dvp);
			if (error) {
				VN_RELE(*vpp);
				*vpp = NULL;
				return (error);
			}

			/*
			 * After the purge completes, check to make sure
			 * our attrs are still valid.
			 */
			if (ATTRCACHE4_VALID(dvp)) {
				/*
				 * If we waited for a purge we may have
				 * lost our vnode so look it up again.
				 */
				VN_RELE(*vpp);
				*vpp = dnlc_lookup(dvp, nm);
				if (*vpp == NULL)
					return (nfs4lookupnew_otw(dvp,
					    nm, vpp, cr));

				/*
				 * The access cache should almost always hit
				 */
				error = nfs4_access(dvp, VEXEC, 0, cr, NULL);

				if (error) {
					VN_RELE(*vpp);
					*vpp = NULL;
					return (error);
				}
				if (*vpp == DNLC_NO_VNODE) {
					VN_RELE(*vpp);
					*vpp = NULL;
					return (ENOENT);
				}
				return (0);
			}
		}
	}

	ASSERT(*vpp != NULL);

	/*
	 * We may have gotten here because we have one of the
	 * following cases:
	 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we
	 *    need to validate them.
	 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always
	 *    must validate.
	 *
	 * Go to the server and check if the directory has changed; if
	 * it hasn't we are done and can use the dnlc entry.
	 */
	return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr));
}

/*
 * Go to the server and check if the directory has changed; if
 * it hasn't we are done and can use the dnlc entry. If it
 * has changed we get a new copy of its attributes and check
 * the access for VEXEC, then relookup the filename and
 * get its filehandle and attributes.
 *
 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR
 *	if the NVERIFY failed we must
 *		purge the caches
 *		cache new attributes (will set r_time_attr_inval)
 *		cache new access
 *		recheck VEXEC access
 *		add name to dnlc, possibly negative
 *		if LOOKUP succeeded
 *			cache new attributes
 *	else
 *		set a new r_time_attr_inval for dvp
 *		check to make sure we have access
 *
 * The vpp returned is the vnode passed in if the directory is valid,
 * a new vnode if successful lookup, or NULL on error.
 */
static int
nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	fattr4 *ver_fattr;
	fattr4_change dchange;
	int32_t *ptr;
	int argoplist_size = 7 * sizeof (nfs_argop4);
	nfs_argop4 *argop;
	int doqueue;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	int isdotdot;
	vnode_t *nvp;
	nfs_fh4 *fhp;
	nfs4_sharedfh_t *sfhp;
	nfs4_access_type_t cacc;
	rnode4_t *nrp;
	rnode4_t *drp = VTOR4(dvp);
	nfs4_ga_res_t *garp = NULL;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
	ASSERT(nm != NULL);
	ASSERT(nm[0] != '\0');
	ASSERT(dvp->v_type == VDIR);
	ASSERT(nm[0] != '.' || nm[1] != '\0');
	ASSERT(*vpp != NULL);

	if (nm[0] == '.' && nm[1] == '.'
&& nm[2] == '\0') { 5238 isdotdot = 1; 5239 args.ctag = TAG_LOOKUP_VPARENT; 5240 } else { 5241 /* 5242 * If dvp were a stub, it should have triggered and caused 5243 * a mount for us to get this far. 5244 */ 5245 ASSERT(!RP_ISSTUB(VTOR4(dvp))); 5246 5247 isdotdot = 0; 5248 args.ctag = TAG_LOOKUP_VALID; 5249 } 5250 5251 mi = VTOMI4(dvp); 5252 recov_state.rs_flags = 0; 5253 recov_state.rs_num_retry_despite_err = 0; 5254 5255 nvp = NULL; 5256 5257 /* Save the original mount point security information */ 5258 (void) save_mnt_secinfo(mi->mi_curr_serv); 5259 5260 recov_retry: 5261 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5262 &recov_state, NULL); 5263 if (e.error) { 5264 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5265 VN_RELE(*vpp); 5266 *vpp = NULL; 5267 return (e.error); 5268 } 5269 5270 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5271 5272 /* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */ 5273 args.array_len = 7; 5274 args.array = argop; 5275 5276 /* 0. putfh file */ 5277 argop[0].argop = OP_CPUTFH; 5278 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5279 5280 /* 1. nverify the change info */ 5281 argop[1].argop = OP_NVERIFY; 5282 ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes; 5283 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5284 ver_fattr->attrlist4 = (char *)&dchange; 5285 ptr = (int32_t *)&dchange; 5286 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5287 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5288 5289 /* 2. getattr directory */ 5290 argop[2].argop = OP_GETATTR; 5291 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5292 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5293 5294 /* 3. access directory */ 5295 argop[3].argop = OP_ACCESS; 5296 argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5297 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5298 5299 /* 4. lookup name */ 5300 if (isdotdot) { 5301 argop[4].argop = OP_LOOKUPP; 5302 } else { 5303 argop[4].argop = OP_CLOOKUP; 5304 argop[4].nfs_argop4_u.opclookup.cname = nm; 5305 } 5306 5307 /* 5. resulting file handle */ 5308 argop[5].argop = OP_GETFH; 5309 5310 /* 6. resulting file attributes */ 5311 argop[6].argop = OP_GETATTR; 5312 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5313 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5314 5315 doqueue = 1; 5316 t = gethrtime(); 5317 5318 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5319 5320 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5321 /* 5322 * For WRONGSEC of a non-dotdot case, send secinfo directly 5323 * from this thread, do not go thru the recovery thread since 5324 * we need the nm information. 5325 * 5326 * Not doing dotdot case because there is no specification 5327 * for (PUTFH, SECINFO "..") yet. 
5328 */ 5329 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5330 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) 5331 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5332 &recov_state, FALSE); 5333 else 5334 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5335 &recov_state, TRUE); 5336 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5337 kmem_free(argop, argoplist_size); 5338 if (!e.error) 5339 goto recov_retry; 5340 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5341 VN_RELE(*vpp); 5342 *vpp = NULL; 5343 return (e.error); 5344 } 5345 5346 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5347 OP_LOOKUP, NULL) == FALSE) { 5348 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5349 &recov_state, TRUE); 5350 5351 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5352 kmem_free(argop, argoplist_size); 5353 goto recov_retry; 5354 } 5355 } 5356 5357 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5358 5359 if (e.error || res.array_len == 0) { 5360 /* 5361 * If e.error isn't set, then reply has no ops (or we couldn't 5362 * be here). The only legal way to reply without an op array 5363 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5364 * be in the reply for all other status values. 5365 * 5366 * For valid replies without an ops array, return ENOTSUP 5367 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5368 * return EIO -- don't trust status. 5369 */ 5370 if (e.error == 0) 5371 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5372 ENOTSUP : EIO; 5373 VN_RELE(*vpp); 5374 *vpp = NULL; 5375 kmem_free(argop, argoplist_size); 5376 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5377 return (e.error); 5378 } 5379 5380 if (res.status != NFS4ERR_SAME) { 5381 e.error = geterrno4(res.status); 5382 5383 /* 5384 * The NVERIFY "failed" so the directory has changed 5385 * First make sure PUTFH succeeded and NVERIFY "failed" 5386 * cleanly. 
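		 *
		 * (Semantics aid, per RFC 7530: NVERIFY returns NFS4_OK when
		 * the attributes DIFFER, letting the rest of the compound
		 * run, and fails with NFS4ERR_SAME when they match. So
		 * res.status == NFS4ERR_SAME means the directory is
		 * unchanged; anything else means it changed or some op in
		 * the compound genuinely failed, which is sorted out below.)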
5387 */ 5388 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5389 (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) { 5390 nfs4_purge_stale_fh(e.error, dvp, cr); 5391 VN_RELE(*vpp); 5392 *vpp = NULL; 5393 goto exit; 5394 } 5395 5396 /* 5397 * We know the NVERIFY "failed" so we must: 5398 * purge the caches (access and indirectly dnlc if needed) 5399 */ 5400 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5401 5402 if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5403 nfs4_purge_stale_fh(e.error, dvp, cr); 5404 VN_RELE(*vpp); 5405 *vpp = NULL; 5406 goto exit; 5407 } 5408 5409 /* 5410 * Install new cached attributes for the directory 5411 */ 5412 nfs4_attr_cache(dvp, 5413 &res.array[2].nfs_resop4_u.opgetattr.ga_res, 5414 t, cr, FALSE, NULL); 5415 5416 if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) { 5417 nfs4_purge_stale_fh(e.error, dvp, cr); 5418 VN_RELE(*vpp); 5419 *vpp = NULL; 5420 e.error = geterrno4(res.status); 5421 goto exit; 5422 } 5423 5424 /* 5425 * Now we know the directory is valid, 5426 * cache new directory access 5427 */ 5428 nfs4_access_cache(drp, 5429 args.array[3].nfs_argop4_u.opaccess.access, 5430 res.array[3].nfs_resop4_u.opaccess.access, cr); 5431 5432 /* 5433 * recheck VEXEC access 5434 */ 5435 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr); 5436 if (cacc != NFS4_ACCESS_ALLOWED) { 5437 /* 5438 * Directory permissions might have been revoked 5439 */ 5440 if (cacc == NFS4_ACCESS_DENIED) { 5441 e.error = EACCES; 5442 VN_RELE(*vpp); 5443 *vpp = NULL; 5444 goto exit; 5445 } 5446 5447 /* 5448 * Somehow we must not have asked for enough 5449 * so try a singleton ACCESS, should never happen. 5450 */ 5451 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5452 if (e.error) { 5453 VN_RELE(*vpp); 5454 *vpp = NULL; 5455 goto exit; 5456 } 5457 } 5458 5459 e.error = geterrno4(res.status); 5460 if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) { 5461 /* 5462 * The lookup failed, probably no entry 5463 */ 5464 if (e.error == ENOENT && nfs4_lookup_neg_cache) { 5465 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5466 } else { 5467 /* 5468 * Might be some other error, so remove 5469 * the dnlc entry to make sure we start all 5470 * over again, next time. 5471 */ 5472 dnlc_remove(dvp, nm); 5473 } 5474 VN_RELE(*vpp); 5475 *vpp = NULL; 5476 goto exit; 5477 } 5478 5479 if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5480 /* 5481 * The file exists but we can't get its fh for 5482 * some unknown reason. Remove it from the dnlc 5483 * and error out to be safe. 5484 */ 5485 dnlc_remove(dvp, nm); 5486 VN_RELE(*vpp); 5487 *vpp = NULL; 5488 goto exit; 5489 } 5490 fhp = &res.array[5].nfs_resop4_u.opgetfh.object; 5491 if (fhp->nfs_fh4_len == 0) { 5492 /* 5493 * The file exists but a bogus fh 5494 * some unknown reason. Remove it from the dnlc 5495 * and error out to be safe. 5496 */ 5497 e.error = ENOENT; 5498 dnlc_remove(dvp, nm); 5499 VN_RELE(*vpp); 5500 *vpp = NULL; 5501 goto exit; 5502 } 5503 sfhp = sfh4_get(fhp, mi); 5504 5505 if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK) 5506 garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 5507 5508 /* 5509 * Make the new rnode 5510 */ 5511 if (isdotdot) { 5512 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 5513 if (e.error) { 5514 sfh4_rele(&sfhp); 5515 VN_RELE(*vpp); 5516 *vpp = NULL; 5517 goto exit; 5518 } 5519 /* 5520 * XXX if nfs4_make_dotdot uses an existing rnode 5521 * XXX it doesn't update the attributes. 
5522 * XXX for now just save them again to save an OTW 5523 */ 5524 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 5525 } else { 5526 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 5527 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 5528 /* 5529 * If v_type == VNON, then garp was NULL because 5530 * the last op in the compound failed and makenfs4node 5531 * could not find the vnode for sfhp. It created 5532 * a new vnode, so we have nothing to purge here. 5533 */ 5534 if (nvp->v_type == VNON) { 5535 vattr_t vattr; 5536 5537 vattr.va_mask = AT_TYPE; 5538 /* 5539 * N.B. We've already called nfs4_end_fop above. 5540 */ 5541 e.error = nfs4getattr(nvp, &vattr, cr); 5542 if (e.error) { 5543 sfh4_rele(&sfhp); 5544 VN_RELE(*vpp); 5545 *vpp = NULL; 5546 VN_RELE(nvp); 5547 goto exit; 5548 } 5549 nvp->v_type = vattr.va_type; 5550 } 5551 } 5552 sfh4_rele(&sfhp); 5553 5554 nrp = VTOR4(nvp); 5555 mutex_enter(&nrp->r_statev4_lock); 5556 if (!nrp->created_v4) { 5557 mutex_exit(&nrp->r_statev4_lock); 5558 dnlc_update(dvp, nm, nvp); 5559 } else 5560 mutex_exit(&nrp->r_statev4_lock); 5561 5562 VN_RELE(*vpp); 5563 *vpp = nvp; 5564 } else { 5565 hrtime_t now; 5566 hrtime_t delta = 0; 5567 5568 e.error = 0; 5569 5570 /* 5571 * Because the NVERIFY "succeeded" we know that the 5572 * directory attributes are still valid 5573 * so update r_time_attr_inval 5574 */ 5575 now = gethrtime(); 5576 mutex_enter(&drp->r_statelock); 5577 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5578 delta = now - drp->r_time_attr_saved; 5579 if (delta < mi->mi_acdirmin) 5580 delta = mi->mi_acdirmin; 5581 else if (delta > mi->mi_acdirmax) 5582 delta = mi->mi_acdirmax; 5583 } 5584 drp->r_time_attr_inval = now + delta; 5585 mutex_exit(&drp->r_statelock); 5586 dnlc_update(dvp, nm, *vpp); 5587 5588 /* 5589 * Even though we have a valid directory attr cache 5590 * and dnlc entry, we may not have access. 5591 * This should almost always hit the cache. 5592 */ 5593 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5594 if (e.error) { 5595 VN_RELE(*vpp); 5596 *vpp = NULL; 5597 } 5598 5599 if (*vpp == DNLC_NO_VNODE) { 5600 VN_RELE(*vpp); 5601 *vpp = NULL; 5602 e.error = ENOENT; 5603 } 5604 } 5605 5606 exit: 5607 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5608 kmem_free(argop, argoplist_size); 5609 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5610 return (e.error); 5611 } 5612 5613 /* 5614 * We need to go over the wire to lookup the name, but 5615 * while we are there verify the directory has not 5616 * changed but if it has, get new attributes and check access 5617 * 5618 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH 5619 * NVERIFY GETATTR ACCESS 5620 * 5621 * With the results: 5622 * if the NVERIFY failed we must purge the caches, add new attributes, 5623 * and cache new access. 
5624 * set a new r_time_attr_inval 5625 * add name to dnlc, possibly negative 5626 * if LOOKUP succeeded 5627 * cache new attributes 5628 */ 5629 static int 5630 nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5631 { 5632 COMPOUND4args_clnt args; 5633 COMPOUND4res_clnt res; 5634 fattr4 *ver_fattr; 5635 fattr4_change dchange; 5636 int32_t *ptr; 5637 nfs4_ga_res_t *garp = NULL; 5638 int argoplist_size = 9 * sizeof (nfs_argop4); 5639 nfs_argop4 *argop; 5640 int doqueue; 5641 mntinfo4_t *mi; 5642 nfs4_recov_state_t recov_state; 5643 hrtime_t t; 5644 int isdotdot; 5645 vnode_t *nvp; 5646 nfs_fh4 *fhp; 5647 nfs4_sharedfh_t *sfhp; 5648 nfs4_access_type_t cacc; 5649 rnode4_t *nrp; 5650 rnode4_t *drp = VTOR4(dvp); 5651 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5652 5653 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5654 ASSERT(nm != NULL); 5655 ASSERT(nm[0] != '\0'); 5656 ASSERT(dvp->v_type == VDIR); 5657 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5658 ASSERT(*vpp == NULL); 5659 5660 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') { 5661 isdotdot = 1; 5662 args.ctag = TAG_LOOKUP_PARENT; 5663 } else { 5664 /* 5665 * If dvp were a stub, it should have triggered and caused 5666 * a mount for us to get this far. 5667 */ 5668 ASSERT(!RP_ISSTUB(VTOR4(dvp))); 5669 5670 isdotdot = 0; 5671 args.ctag = TAG_LOOKUP; 5672 } 5673 5674 mi = VTOMI4(dvp); 5675 recov_state.rs_flags = 0; 5676 recov_state.rs_num_retry_despite_err = 0; 5677 5678 nvp = NULL; 5679 5680 /* Save the original mount point security information */ 5681 (void) save_mnt_secinfo(mi->mi_curr_serv); 5682 5683 recov_retry: 5684 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5685 &recov_state, NULL); 5686 if (e.error) { 5687 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5688 return (e.error); 5689 } 5690 5691 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5692 5693 /* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */ 5694 args.array_len = 9; 5695 args.array = argop; 5696 5697 /* 0. putfh file */ 5698 argop[0].argop = OP_CPUTFH; 5699 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5700 5701 /* 1. savefh for the nverify */ 5702 argop[1].argop = OP_SAVEFH; 5703 5704 /* 2. lookup name */ 5705 if (isdotdot) { 5706 argop[2].argop = OP_LOOKUPP; 5707 } else { 5708 argop[2].argop = OP_CLOOKUP; 5709 argop[2].nfs_argop4_u.opclookup.cname = nm; 5710 } 5711 5712 /* 3. resulting file handle */ 5713 argop[3].argop = OP_GETFH; 5714 5715 /* 4. resulting file attributes */ 5716 argop[4].argop = OP_GETATTR; 5717 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5718 argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5719 5720 /* 5. restorefh back the directory for the nverify */ 5721 argop[5].argop = OP_RESTOREFH; 5722 5723 /* 6. nverify the change info */ 5724 argop[6].argop = OP_NVERIFY; 5725 ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes; 5726 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5727 ver_fattr->attrlist4 = (char *)&dchange; 5728 ptr = (int32_t *)&dchange; 5729 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5730 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5731 5732 /* 7. getattr directory */ 5733 argop[7].argop = OP_GETATTR; 5734 argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5735 argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5736 5737 /* 8. 
access directory */ 5738 argop[8].argop = OP_ACCESS; 5739 argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5740 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5741 5742 doqueue = 1; 5743 t = gethrtime(); 5744 5745 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5746 5747 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5748 /* 5749 * For WRONGSEC of a non-dotdot case, send secinfo directly 5750 * from this thread, do not go thru the recovery thread since 5751 * we need the nm information. 5752 * 5753 * Not doing dotdot case because there is no specification 5754 * for (PUTFH, SECINFO "..") yet. 5755 */ 5756 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5757 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) 5758 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5759 &recov_state, FALSE); 5760 else 5761 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5762 &recov_state, TRUE); 5763 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5764 kmem_free(argop, argoplist_size); 5765 if (!e.error) 5766 goto recov_retry; 5767 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5768 return (e.error); 5769 } 5770 5771 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5772 OP_LOOKUP, NULL) == FALSE) { 5773 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5774 &recov_state, TRUE); 5775 5776 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5777 kmem_free(argop, argoplist_size); 5778 goto recov_retry; 5779 } 5780 } 5781 5782 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5783 5784 if (e.error || res.array_len == 0) { 5785 /* 5786 * If e.error isn't set, then reply has no ops (or we couldn't 5787 * be here). The only legal way to reply without an op array 5788 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5789 * be in the reply for all other status values. 5790 * 5791 * For valid replies without an ops array, return ENOTSUP 5792 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5793 * return EIO -- don't trust status. 5794 */ 5795 if (e.error == 0) 5796 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5797 ENOTSUP : EIO; 5798 5799 kmem_free(argop, argoplist_size); 5800 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5801 return (e.error); 5802 } 5803 5804 e.error = geterrno4(res.status); 5805 5806 /* 5807 * The PUTFH and SAVEFH may have failed. 5808 */ 5809 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5810 (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) { 5811 nfs4_purge_stale_fh(e.error, dvp, cr); 5812 goto exit; 5813 } 5814 5815 /* 5816 * Check if the file exists, if it does delay entering 5817 * into the dnlc until after we update the directory 5818 * attributes so we don't cause it to get purged immediately. 5819 */ 5820 if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) { 5821 /* 5822 * The lookup failed, probably no entry 5823 */ 5824 if (e.error == ENOENT && nfs4_lookup_neg_cache) 5825 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5826 goto exit; 5827 } 5828 5829 if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5830 /* 5831 * The file exists but we can't get its fh for 5832 * some unknown reason. Error out to be safe. 5833 */ 5834 goto exit; 5835 } 5836 5837 fhp = &res.array[3].nfs_resop4_u.opgetfh.object; 5838 if (fhp->nfs_fh4_len == 0) { 5839 /* 5840 * The file exists but a bogus fh 5841 * some unknown reason. Error out to be safe. 
5842 */ 5843 e.error = EIO; 5844 goto exit; 5845 } 5846 sfhp = sfh4_get(fhp, mi); 5847 5848 if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5849 sfh4_rele(&sfhp); 5850 goto exit; 5851 } 5852 garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 5853 5854 /* 5855 * The RESTOREFH may have failed 5856 */ 5857 if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) { 5858 sfh4_rele(&sfhp); 5859 e.error = EIO; 5860 goto exit; 5861 } 5862 5863 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) { 5864 /* 5865 * First make sure the NVERIFY failed as we expected; 5866 * if it didn't, be conservative and error out, 5867 * as we can't trust the directory. 5868 */ 5869 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) { 5870 sfh4_rele(&sfhp); 5871 e.error = EIO; 5872 goto exit; 5873 } 5874 5875 /* 5876 * We know the NVERIFY "failed", so the directory has changed, 5877 * and we must: 5878 * purge the caches (access and indirectly dnlc if needed) 5879 */ 5880 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5881 5882 if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5883 sfh4_rele(&sfhp); 5884 goto exit; 5885 } 5886 nfs4_attr_cache(dvp, 5887 &res.array[7].nfs_resop4_u.opgetattr.ga_res, 5888 t, cr, FALSE, NULL); 5889 5890 if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) { 5891 nfs4_purge_stale_fh(e.error, dvp, cr); 5892 sfh4_rele(&sfhp); 5893 e.error = geterrno4(res.status); 5894 goto exit; 5895 } 5896 5897 /* 5898 * Now we know the directory is valid, 5899 * cache new directory access 5900 */ 5901 nfs4_access_cache(drp, 5902 args.array[8].nfs_argop4_u.opaccess.access, 5903 res.array[8].nfs_resop4_u.opaccess.access, cr); 5904 5905 /* 5906 * recheck VEXEC access 5907 */ 5908 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr); 5909 if (cacc != NFS4_ACCESS_ALLOWED) { 5910 /* 5911 * Directory permissions might have been revoked 5912 */ 5913 if (cacc == NFS4_ACCESS_DENIED) { 5914 sfh4_rele(&sfhp); 5915 e.error = EACCES; 5916 goto exit; 5917 } 5918 5919 /* 5920 * Somehow we must not have asked for enough, 5921 * so try a singleton ACCESS; this should never happen. 5922 */ 5923 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5924 if (e.error) { 5925 sfh4_rele(&sfhp); 5926 goto exit; 5927 } 5928 } 5929 5930 e.error = geterrno4(res.status); 5931 } else { 5932 hrtime_t now; 5933 hrtime_t delta = 0; 5934 5935 e.error = 0; 5936 5937 /* 5938 * Because the NVERIFY "succeeded", we know that the 5939 * directory attributes are still valid, 5940 * so update r_time_attr_inval 5941 */ 5942 now = gethrtime(); 5943 mutex_enter(&drp->r_statelock); 5944 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5945 delta = now - drp->r_time_attr_saved; 5946 if (delta < mi->mi_acdirmin) 5947 delta = mi->mi_acdirmin; 5948 else if (delta > mi->mi_acdirmax) 5949 delta = mi->mi_acdirmax; 5950 } 5951 drp->r_time_attr_inval = now + delta; 5952 mutex_exit(&drp->r_statelock); 5953 5954 /* 5955 * Even though we have a valid directory attr cache, 5956 * we may not have access. 5957 * This should almost always hit the cache. 5958 */ 5959 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5960 if (e.error) { 5961 sfh4_rele(&sfhp); 5962 goto exit; 5963 } 5964 } 5965 5966 /* 5967 * Now we have successfully completed the lookup; if the 5968 * directory had changed, we now have the valid attributes. 5969 * We also know we have directory access. 5970 * Create the new rnode and insert it in the dnlc.
5971 */ 5972 if (isdotdot) { 5973 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 5974 if (e.error) { 5975 sfh4_rele(&sfhp); 5976 goto exit; 5977 } 5978 /* 5979 * XXX if nfs4_make_dotdot uses an existing rnode 5980 * XXX it doesn't update the attributes. 5981 * XXX for now just save them again to save an OTW 5982 */ 5983 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 5984 } else { 5985 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 5986 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 5987 } 5988 sfh4_rele(&sfhp); 5989 5990 nrp = VTOR4(nvp); 5991 mutex_enter(&nrp->r_statev4_lock); 5992 if (!nrp->created_v4) { 5993 mutex_exit(&nrp->r_statev4_lock); 5994 dnlc_update(dvp, nm, nvp); 5995 } else 5996 mutex_exit(&nrp->r_statev4_lock); 5997 5998 *vpp = nvp; 5999 6000 exit: 6001 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6002 kmem_free(argop, argoplist_size); 6003 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 6004 return (e.error); 6005 } 6006 6007 #ifdef DEBUG 6008 void 6009 nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt) 6010 { 6011 uint_t i, len; 6012 zoneid_t zoneid = getzoneid(); 6013 char *s; 6014 6015 zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where); 6016 for (i = 0; i < argcnt; i++) { 6017 nfs_argop4 *op = &argbase[i]; 6018 switch (op->argop) { 6019 case OP_CPUTFH: 6020 case OP_PUTFH: 6021 zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i); 6022 break; 6023 case OP_PUTROOTFH: 6024 zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i); 6025 break; 6026 case OP_CLOOKUP: 6027 s = op->nfs_argop4_u.opclookup.cname; 6028 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s); 6029 break; 6030 case OP_LOOKUP: 6031 s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname, 6032 &len, NULL); 6033 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s); 6034 kmem_free(s, len); 6035 break; 6036 case OP_LOOKUPP: 6037 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i); 6038 break; 6039 case OP_GETFH: 6040 zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i); 6041 break; 6042 case OP_GETATTR: 6043 zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i); 6044 break; 6045 case OP_OPENATTR: 6046 zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i); 6047 break; 6048 default: 6049 zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", i, 6050 op->argop); 6051 break; 6052 } 6053 } 6054 } 6055 #endif 6056 6057 /* 6058 * nfs4lookup_setup - constructs a multi-lookup compound request. 6059 * 6060 * Given the path "nm1/nm2/.../nmn", the following compound requests 6061 * may be created: 6062 * 6063 * Note: Getfh should not be needed because the filehandle attr is 6064 * mandatory, but it is faster, for now. 6065 * 6066 * l4_getattrs indicates the type of compound requested. 6067 * 6068 * LKP4_NO_ATTRIBUTE - no attributes (used by secinfo): 6069 * 6070 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} } 6071 * 6072 * total number of ops is n + 1. 6073 * 6074 * LKP4_LAST_NAMED_ATTR - multi-component path for a named 6075 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR 6076 * before the last component, and only get attributes 6077 * for the last component. Note that the second-to-last 6078 * pathname component is XATTR_RPATH, which does NOT go 6079 * over-the-wire as a lookup. 6080 * 6081 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2}; 6082 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr } 6083 * 6084 * and total number of ops is n + 5.
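*
* For example (a hypothetical path, for illustration only): for the
* path "a/b/@/attr", where "@" stands for the XATTR_RPATH component,
* n is 4 and the request becomes
*
* compound { Put*fh; Lookup {a}; Lookup {b};
* Openattr; Getfh; Getattr; Lookup {attr}; Getfh; Getattr }
*
* which is n + 5 = 9 ops; the returned lookup_idx identifies the
* final Lookup {attr} op.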
6085 * 6086 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named 6087 * attribute directory: create lookups plus an OPENATTR 6088 * replacing the last lookup. Note that the last pathname 6089 * component is XATTR_RPATH, which does NOT go over-the-wire 6090 * as a lookup. 6091 * 6092 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr; 6093 * Openattr; Getfh; Getattr } 6094 * 6095 * and total number of ops is n + 5. 6096 * 6097 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate 6098 * nodes too. 6099 * 6100 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr; 6101 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr } 6102 * 6103 * and total number of ops is 3*n + 1. 6104 * 6105 * All cases: returns the index in the arg array of the final LOOKUP op, or 6106 * -1 if no LOOKUPs were used. 6107 */ 6108 int 6109 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh) 6110 { 6111 enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs; 6112 nfs_argop4 *argbase, *argop; 6113 int arglen, argcnt; 6114 int n = 1; /* number of components */ 6115 int nga = 1; /* number of Getattr's in request */ 6116 char c = '\0', *s, *p; 6117 int lookup_idx = -1; 6118 int argoplist_size; 6119 6120 /* set lookuparg response result to 0 */ 6121 lookupargp->resp->status = NFS4_OK; 6122 6123 /* skip any leading "/" or "." components, e.g. ".//./", if present */ 6124 for (; ; nm++) { 6125 if (*nm != '/' && *nm != '.') 6126 break; 6127 6128 /* ".." is counted as 1 component */ 6129 if (*nm == '.' && *(nm + 1) != '/') 6130 break; 6131 } 6132 6133 /* 6134 * Find n = number of components - nm must be null terminated 6135 * Skip "." components. 6136 */ 6137 if (*nm != '\0') 6138 for (n = 1, s = nm; *s != '\0'; s++) { 6139 if ((*s == '/') && (*(s + 1) != '/') && 6140 (*(s + 1) != '\0') && 6141 !(*(s + 1) == '.' && (*(s + 2) == '/' || 6142 *(s + 2) == '\0'))) 6143 n++; 6144 } 6145 else 6146 n = 0; 6147 6148 /* 6149 * nga is number of components that need Getfh+Getattr 6150 */ 6151 switch (l4_getattrs) { 6152 case LKP4_NO_ATTRIBUTES: 6153 nga = 0; 6154 break; 6155 case LKP4_ALL_ATTRIBUTES: 6156 nga = n; 6157 /* 6158 * Always have at least 1 getfh, getattr pair 6159 */ 6160 if (nga == 0) 6161 nga++; 6162 break; 6163 case LKP4_LAST_ATTRDIR: 6164 case LKP4_LAST_NAMED_ATTR: 6165 nga = n+1; 6166 break; 6167 } 6168 6169 /* 6170 * If this is changed to use the filehandle attr instead of getfh, 6171 * the following line can be deleted. 6172 */ 6173 nga *= 2; 6174 6175 /* 6176 * calculate number of ops in request as 6177 * header + trailer + lookups + getattrs 6178 */ 6179 arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga; 6180 6181 argoplist_size = arglen * sizeof (nfs_argop4); 6182 argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP); 6183 lookupargp->argsp->array = argop; 6184 6185 argcnt = lookupargp->header_len; 6186 argop += argcnt; 6187 6188 /* 6189 * loop and create a lookup op and possibly getattr/getfh for 6190 * each component. Skip "." components. 6191 */ 6192 for (s = nm; *s != '\0'; s = p) { 6193 /* 6194 * Set up a pathname struct for each component if needed 6195 */ 6196 while (*s == '/') 6197 s++; 6198 if (*s == '\0') 6199 break; 6200 6201 for (p = s; (*p != '/') && (*p != '\0'); p++) 6202 ; 6203 c = *p; 6204 *p = '\0'; 6205 6206 if (s[0] == '.'
&& s[1] == '\0') { 6207 *p = c; 6208 continue; 6209 } 6210 if (l4_getattrs == LKP4_LAST_ATTRDIR && 6211 strcmp(s, XATTR_RPATH) == 0) { 6212 /* getfh XXX may not be needed in future */ 6213 argop->argop = OP_GETFH; 6214 argop++; 6215 argcnt++; 6216 6217 /* getattr */ 6218 argop->argop = OP_GETATTR; 6219 argop->nfs_argop4_u.opgetattr.attr_request = 6220 lookupargp->ga_bits; 6221 argop->nfs_argop4_u.opgetattr.mi = 6222 lookupargp->mi; 6223 argop++; 6224 argcnt++; 6225 6226 /* openattr */ 6227 argop->argop = OP_OPENATTR; 6228 } else if (l4_getattrs == LKP4_LAST_NAMED_ATTR && 6229 strcmp(s, XATTR_RPATH) == 0) { 6230 /* openattr */ 6231 argop->argop = OP_OPENATTR; 6232 argop++; 6233 argcnt++; 6234 6235 /* getfh XXX may not be needed in future */ 6236 argop->argop = OP_GETFH; 6237 argop++; 6238 argcnt++; 6239 6240 /* getattr */ 6241 argop->argop = OP_GETATTR; 6242 argop->nfs_argop4_u.opgetattr.attr_request = 6243 lookupargp->ga_bits; 6244 argop->nfs_argop4_u.opgetattr.mi = 6245 lookupargp->mi; 6246 argop++; 6247 argcnt++; 6248 *p = c; 6249 continue; 6250 } else if (s[0] == '.' && s[1] == '.' && s[2] == '\0') { 6251 /* lookupp */ 6252 argop->argop = OP_LOOKUPP; 6253 } else { 6254 /* lookup */ 6255 argop->argop = OP_LOOKUP; 6256 (void) str_to_utf8(s, 6257 &argop->nfs_argop4_u.oplookup.objname); 6258 } 6259 lookup_idx = argcnt; 6260 argop++; 6261 argcnt++; 6262 6263 *p = c; 6264 6265 if (l4_getattrs == LKP4_ALL_ATTRIBUTES) { 6266 /* getfh XXX may not be needed in future */ 6267 argop->argop = OP_GETFH; 6268 argop++; 6269 argcnt++; 6270 6271 /* getattr */ 6272 argop->argop = OP_GETATTR; 6273 argop->nfs_argop4_u.opgetattr.attr_request = 6274 lookupargp->ga_bits; 6275 argop->nfs_argop4_u.opgetattr.mi = 6276 lookupargp->mi; 6277 argop++; 6278 argcnt++; 6279 } 6280 } 6281 6282 if ((l4_getattrs != LKP4_NO_ATTRIBUTES) && 6283 ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) { 6284 if (needgetfh) { 6285 /* stick in a post-lookup getfh */ 6286 argop->argop = OP_GETFH; 6287 argcnt++; 6288 argop++; 6289 } 6290 /* post-lookup getattr */ 6291 argop->argop = OP_GETATTR; 6292 argop->nfs_argop4_u.opgetattr.attr_request = 6293 lookupargp->ga_bits; 6294 argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi; 6295 argcnt++; 6296 } 6297 argcnt += lookupargp->trailer_len; /* actual op count */ 6298 lookupargp->argsp->array_len = argcnt; 6299 lookupargp->arglen = arglen; 6300 6301 #ifdef DEBUG 6302 if (nfs4_client_lookup_debug) 6303 nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt); 6304 #endif 6305 6306 return (lookup_idx); 6307 } 6308 6309 static int 6310 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr) 6311 { 6312 COMPOUND4args_clnt args; 6313 COMPOUND4res_clnt res; 6314 GETFH4res *gf_res = NULL; 6315 nfs_argop4 argop[4]; 6316 nfs_resop4 *resop = NULL; 6317 nfs4_sharedfh_t *sfhp; 6318 hrtime_t t; 6319 nfs4_error_t e; 6320 6321 rnode4_t *drp; 6322 int doqueue = 1; 6323 vnode_t *vp; 6324 int needrecov = 0; 6325 nfs4_recov_state_t recov_state; 6326 6327 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 6328 6329 *avp = NULL; 6330 recov_state.rs_flags = 0; 6331 recov_state.rs_num_retry_despite_err = 0; 6332 6333 recov_retry: 6334 /* COMPOUND: putfh, openattr, getfh, getattr */ 6335 args.array_len = 4; 6336 args.array = argop; 6337 args.ctag = TAG_OPENATTR; 6338 6339 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 6340 if (e.error) 6341 return (e.error); 6342 6343 drp = VTOR4(dvp); 6344 6345 /* putfh */ 6346 argop[0].argop = OP_CPUTFH; 6347 argop[0].nfs_argop4_u.opcputfh.sfh = 
drp->r_fh; 6348 6349 /* openattr */ 6350 argop[1].argop = OP_OPENATTR; 6351 argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE); 6352 6353 /* getfh */ 6354 argop[2].argop = OP_GETFH; 6355 6356 /* getattr */ 6357 argop[3].argop = OP_GETATTR; 6358 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6359 argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 6360 6361 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 6362 "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first", 6363 rnode4info(drp))); 6364 6365 t = gethrtime(); 6366 6367 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 6368 6369 needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp); 6370 if (needrecov) { 6371 bool_t abort; 6372 6373 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 6374 "nfs4openattr: initiating recovery\n")); 6375 6376 abort = nfs4_start_recovery(&e, 6377 VTOMI4(dvp), dvp, NULL, NULL, NULL, 6378 OP_OPENATTR, NULL); 6379 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6380 if (!e.error) { 6381 e.error = geterrno4(res.status); 6382 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6383 } 6384 if (abort == FALSE) 6385 goto recov_retry; 6386 return (e.error); 6387 } 6388 6389 if (e.error) { 6390 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6391 return (e.error); 6392 } 6393 6394 if (res.status) { 6395 /* 6396 * If the OTW error is NOTSUPP, then it should be 6397 * translated to EINVAL. All Solaris file system 6398 * implementations return EINVAL to the syscall layer 6399 * when the attrdir cannot be created due to an 6400 * implementation restriction or the noxattr mount option. 6401 */ 6402 if (res.status == NFS4ERR_NOTSUPP) { 6403 mutex_enter(&drp->r_statelock); 6404 if (drp->r_xattr_dir) 6405 VN_RELE(drp->r_xattr_dir); 6406 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP); 6407 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP; 6408 mutex_exit(&drp->r_statelock); 6409 6410 e.error = EINVAL; 6411 } else { 6412 e.error = geterrno4(res.status); 6413 } 6414 6415 if (e.error) { 6416 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6417 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 6418 needrecov); 6419 return (e.error); 6420 } 6421 } 6422 6423 resop = &res.array[0]; /* putfh res */ 6424 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK); 6425 6426 resop = &res.array[1]; /* openattr res */ 6427 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK); 6428 6429 resop = &res.array[2]; /* getfh res */ 6430 gf_res = &resop->nfs_resop4_u.opgetfh; 6431 if (gf_res->object.nfs_fh4_len == 0) { 6432 *avp = NULL; 6433 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6434 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6435 return (ENOENT); 6436 } 6437 6438 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp)); 6439 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res, 6440 dvp->v_vfsp, t, cr, dvp, 6441 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp)); 6442 sfh4_rele(&sfhp); 6443 6444 if (e.error) 6445 PURGE_ATTRCACHE4(vp); 6446 6447 mutex_enter(&vp->v_lock); 6448 vp->v_flag |= V_XATTRDIR; 6449 mutex_exit(&vp->v_lock); 6450 6451 *avp = vp; 6452 6453 mutex_enter(&drp->r_statelock); 6454 if (drp->r_xattr_dir) 6455 VN_RELE(drp->r_xattr_dir); 6456 VN_HOLD(vp); 6457 drp->r_xattr_dir = vp; 6458 6459 /* 6460 * Invalidate pathconf4 cache because r_xattr_dir is no longer 6461 * NULL. xattrs could be created at any time, and we have no 6462 * way to update pc4_xattr_exists in the base object if/when 6463 * it happens.
6464 */ 6465 drp->r_pathconf.pc4_xattr_valid = 0; 6466 6467 mutex_exit(&drp->r_statelock); 6468 6469 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6470 6471 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6472 6473 return (0); 6474 } 6475 6476 /* ARGSUSED */ 6477 static int 6478 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 6479 int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct, 6480 vsecattr_t *vsecp) 6481 { 6482 int error; 6483 vnode_t *vp = NULL; 6484 rnode4_t *rp; 6485 struct vattr vattr; 6486 rnode4_t *drp; 6487 vnode_t *tempvp; 6488 enum createmode4 createmode; 6489 bool_t must_trunc = FALSE; 6490 int truncating = 0; 6491 6492 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 6493 return (EPERM); 6494 if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) { 6495 return (EINVAL); 6496 } 6497 6498 /* . and .. have special meaning in the protocol, reject them. */ 6499 6500 if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0'))) 6501 return (EISDIR); 6502 6503 drp = VTOR4(dvp); 6504 6505 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 6506 return (EINTR); 6507 6508 top: 6509 /* 6510 * We make a copy of the attributes because the caller does not 6511 * expect us to change what va points to. 6512 */ 6513 vattr = *va; 6514 6515 /* 6516 * If the pathname is "", then dvp is the root vnode of 6517 * a remote file mounted over a local directory. 6518 * All that needs to be done is access 6519 * checking and truncation. Note that we avoid doing 6520 * open w/ create because the parent directory might 6521 * be in pseudo-fs and the open would fail. 6522 */ 6523 if (*nm == '\0') { 6524 error = 0; 6525 VN_HOLD(dvp); 6526 vp = dvp; 6527 must_trunc = TRUE; 6528 } else { 6529 /* 6530 * We need to go over the wire, just to be sure whether the 6531 * file exists or not. Using the DNLC can be dangerous in 6532 * this case when making a decision regarding existence. 6533 */ 6534 error = nfs4lookup(dvp, nm, &vp, cr, 1); 6535 } 6536 6537 if (exclusive) 6538 createmode = EXCLUSIVE4; 6539 else 6540 createmode = GUARDED4; 6541 6542 /* 6543 * error would be set if the file does not exist on the 6544 * server, so let's go create it. 6545 */ 6546 if (error) { 6547 goto create_otw; 6548 } 6549 6550 /* 6551 * File does exist on the server 6552 */ 6553 if (exclusive == EXCL) 6554 error = EEXIST; 6555 else if (vp->v_type == VDIR && (mode & VWRITE)) 6556 error = EISDIR; 6557 else { 6558 /* 6559 * If vnode is a device, create special vnode.
6560 */ 6561 if (ISVDEV(vp->v_type)) { 6562 tempvp = vp; 6563 vp = specvp(vp, vp->v_rdev, vp->v_type, cr); 6564 VN_RELE(tempvp); 6565 } 6566 if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) { 6567 if ((vattr.va_mask & AT_SIZE) && 6568 vp->v_type == VREG) { 6569 rp = VTOR4(vp); 6570 /* 6571 * Check here for large file handled 6572 * by LF-unaware process (as 6573 * ufs_create() does) 6574 */ 6575 if (!(flags & FOFFMAX)) { 6576 mutex_enter(&rp->r_statelock); 6577 if (rp->r_size > MAXOFF32_T) 6578 error = EOVERFLOW; 6579 mutex_exit(&rp->r_statelock); 6580 } 6581 6582 /* if error is set then we need to return */ 6583 if (error) { 6584 nfs_rw_exit(&drp->r_rwlock); 6585 VN_RELE(vp); 6586 return (error); 6587 } 6588 6589 if (must_trunc) { 6590 vattr.va_mask = AT_SIZE; 6591 error = nfs4setattr(vp, &vattr, 0, cr, 6592 NULL); 6593 } else { 6594 /* 6595 * we know we have a regular file that already 6596 * exists and we may end up truncating the file 6597 * as a result of the open_otw, so flush out 6598 * any dirty pages for this file first. 6599 */ 6600 if (nfs4_has_pages(vp) && 6601 ((rp->r_flags & R4DIRTY) || 6602 rp->r_count > 0 || 6603 rp->r_mapcnt > 0)) { 6604 error = nfs4_putpage(vp, 6605 (offset_t)0, 0, 0, cr, ct); 6606 if (error && (error == ENOSPC || 6607 error == EDQUOT)) { 6608 mutex_enter( 6609 &rp->r_statelock); 6610 if (!rp->r_error) 6611 rp->r_error = 6612 error; 6613 mutex_exit( 6614 &rp->r_statelock); 6615 } 6616 } 6617 vattr.va_mask = (AT_SIZE | 6618 AT_TYPE | AT_MODE); 6619 vattr.va_type = VREG; 6620 createmode = UNCHECKED4; 6621 truncating = 1; 6622 goto create_otw; 6623 } 6624 } 6625 } 6626 } 6627 nfs_rw_exit(&drp->r_rwlock); 6628 if (error) { 6629 VN_RELE(vp); 6630 } else { 6631 vnode_t *tvp; 6632 rnode4_t *trp; 6633 /* 6634 * existing file got truncated, notify. 6635 */ 6636 tvp = vp; 6637 if (vp->v_type == VREG) { 6638 trp = VTOR4(vp); 6639 if (IS_SHADOW(vp, trp)) 6640 tvp = RTOV4(trp); 6641 } 6642 vnevent_create(tvp, ct); 6643 *vpp = vp; 6644 } 6645 return (error); 6646 6647 create_otw: 6648 dnlc_remove(dvp, nm); 6649 6650 ASSERT(vattr.va_mask & AT_TYPE); 6651 6652 /* 6653 * If not a regular file, let nfs4mknod() handle it. 6654 */ 6655 if (vattr.va_type != VREG) { 6656 error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr); 6657 nfs_rw_exit(&drp->r_rwlock); 6658 return (error); 6659 } 6660 6661 /* 6662 * It _is_ a regular file. 6663 */ 6664 ASSERT(vattr.va_mask & AT_MODE); 6665 if (MANDMODE(vattr.va_mode)) { 6666 nfs_rw_exit(&drp->r_rwlock); 6667 return (EACCES); 6668 } 6669 6670 /* 6671 * If this happens to be a mknod of a regular file, then flags will 6672 * have neither FREAD nor FWRITE. However, we must set at least one 6673 * for the call to nfs4open_otw. If it's open(O_CREAT) driving 6674 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been 6675 * set (based on openmode specified by app). 6676 */ 6677 if ((flags & (FREAD|FWRITE)) == 0) 6678 flags |= (FREAD|FWRITE); 6679 6680 error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0); 6681 6682 if (vp != NULL) { 6683 /* if create was successful, throw away the file's pages */ 6684 if (!error && (vattr.va_mask & AT_SIZE)) 6685 nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK), 6686 cr); 6687 /* release the lookup hold */ 6688 VN_RELE(vp); 6689 vp = NULL; 6690 } 6691 6692 /* 6693 * validate that we opened a regular file. This handles a misbehaving 6694 * server that returns an incorrect FH.
6695 */ 6696 if ((error == 0) && *vpp && (*vpp)->v_type != VREG) { 6697 error = EISDIR; 6698 VN_RELE(*vpp); 6699 } 6700 6701 /* 6702 * If this is not an exclusive create, then the CREATE 6703 * request will be made with the GUARDED mode set. This 6704 * means that the server will return EEXIST if the file 6705 * exists. The file could exist because of a retransmitted 6706 * request. In this case, we recover by starting over and 6707 * checking to see whether the file exists. This second 6708 * time through it should, and a CREATE request will not be 6709 * sent. 6710 * 6711 * This handles the problem of a dangling CREATE request 6712 * which contains attributes which indicate that the file 6713 * should be truncated. This retransmitted request could 6714 * possibly truncate valid data in the file if not caught 6715 * by the duplicate request mechanism on the server or if 6716 * not caught by other means. The scenario is: 6717 * 6718 * Client transmits CREATE request with size = 0 6719 * Client times out, retransmits request. 6720 * Response to the first request arrives from the server 6721 * and the client proceeds on. 6722 * Client writes data to the file. 6723 * The server now processes retransmitted CREATE request 6724 * and truncates file. 6725 * 6726 * The use of the GUARDED CREATE request prevents this from 6727 * happening because the retransmitted CREATE would fail 6728 * with EEXIST and would not truncate the file. 6729 */ 6730 if (error == EEXIST && exclusive == NONEXCL) { 6731 #ifdef DEBUG 6732 nfs4_create_misses++; 6733 #endif 6734 goto top; 6735 } 6736 nfs_rw_exit(&drp->r_rwlock); 6737 if (truncating && !error && *vpp) { 6738 vnode_t *tvp; 6739 rnode4_t *trp; 6740 /* 6741 * existing file got truncated, notify. 6742 */ 6743 tvp = *vpp; 6744 trp = VTOR4(tvp); 6745 if (IS_SHADOW(tvp, trp)) 6746 tvp = RTOV4(trp); 6747 vnevent_create(tvp, ct); 6748 } 6749 return (error); 6750 } 6751 6752 /* 6753 * Create compound (for mkdir, mknod, symlink): 6754 * { Putfh <dfh>; Create; Getfh; Getattr } 6755 * It's okay if setattr failed to set gid - this is not considered 6756 * an error, but purge attrs in that case.
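*
* A minimal usage sketch (hypothetical caller, for illustration only;
* it mirrors what nfs4mknod() below does for a FIFO, where no
* specdata4 is needed, so data is NULL):
*
* vnode_t *vp;
* int error;
*
* error = call_nfs4_create_req(dvp, "myfifo", NULL, va, &vp,
* cr, NF4FIFO);
*
* On success, *vp refers to the new object and dvp's directory
* caches have been updated.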
6757 */ 6758 static int 6759 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va, 6760 vnode_t **vpp, cred_t *cr, nfs_ftype4 type) 6761 { 6762 int need_end_op = FALSE; 6763 COMPOUND4args_clnt args; 6764 COMPOUND4res_clnt res, *resp = NULL; 6765 nfs_argop4 *argop; 6766 nfs_resop4 *resop; 6767 int doqueue; 6768 mntinfo4_t *mi; 6769 rnode4_t *drp = VTOR4(dvp); 6770 change_info4 *cinfo; 6771 GETFH4res *gf_res; 6772 struct vattr vattr; 6773 vnode_t *vp; 6774 fattr4 *crattr; 6775 bool_t needrecov = FALSE; 6776 nfs4_recov_state_t recov_state; 6777 nfs4_sharedfh_t *sfhp = NULL; 6778 hrtime_t t; 6779 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 6780 int numops, argoplist_size, setgid_flag, idx_create, idx_fattr; 6781 dirattr_info_t dinfo, *dinfop; 6782 servinfo4_t *svp; 6783 bitmap4 supp_attrs; 6784 6785 ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK || 6786 type == NF4CHR || type == NF4SOCK || type == NF4FIFO); 6787 6788 mi = VTOMI4(dvp); 6789 6790 /* 6791 * Make sure we properly deal with setting the right gid 6792 * on a new directory to reflect the parent's setgid bit 6793 */ 6794 setgid_flag = 0; 6795 if (type == NF4DIR) { 6796 struct vattr dva; 6797 6798 va->va_mode &= ~VSGID; 6799 dva.va_mask = AT_MODE | AT_GID; 6800 if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) { 6801 6802 /* 6803 * If the parent directory has the setgid bit set 6804 * _and_ the client was able to get a valid mapping 6805 * for the parent dir's owner_group, we want to 6806 * append NVERIFY(owner_group == dva.va_gid) and 6807 * SETATTR to the CREATE compound. 6808 */ 6809 if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) { 6810 setgid_flag = 1; 6811 va->va_mode |= VSGID; 6812 if (dva.va_gid != GID_NOBODY) { 6813 va->va_mask |= AT_GID; 6814 va->va_gid = dva.va_gid; 6815 } 6816 } 6817 } 6818 } 6819 6820 /* 6821 * Create ops: 6822 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new) 6823 * 5:restorefh(dir) 6:getattr(dir) 6824 * 6825 * if (setgid) 6826 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new) 6827 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 6828 * 8:nverify 9:setattr 6829 */ 6830 if (setgid_flag) { 6831 numops = 10; 6832 idx_create = 1; 6833 idx_fattr = 3; 6834 } else { 6835 numops = 7; 6836 idx_create = 2; 6837 idx_fattr = 4; 6838 } 6839 6840 ASSERT(nfs_zone() == mi->mi_zone); 6841 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) { 6842 return (EINTR); 6843 } 6844 recov_state.rs_flags = 0; 6845 recov_state.rs_num_retry_despite_err = 0; 6846 6847 argoplist_size = numops * sizeof (nfs_argop4); 6848 argop = kmem_alloc(argoplist_size, KM_SLEEP); 6849 6850 recov_retry: 6851 if (type == NF4LNK) 6852 args.ctag = TAG_SYMLINK; 6853 else if (type == NF4DIR) 6854 args.ctag = TAG_MKDIR; 6855 else 6856 args.ctag = TAG_MKNOD; 6857 6858 args.array_len = numops; 6859 args.array = argop; 6860 6861 if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) { 6862 nfs_rw_exit(&drp->r_rwlock); 6863 kmem_free(argop, argoplist_size); 6864 return (e.error); 6865 } 6866 need_end_op = TRUE; 6867 6868 6869 /* 0: putfh directory */ 6870 argop[0].argop = OP_CPUTFH; 6871 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6872 6873 /* 1/2: Create object */ 6874 argop[idx_create].argop = OP_CCREATE; 6875 argop[idx_create].nfs_argop4_u.opccreate.cname = nm; 6876 argop[idx_create].nfs_argop4_u.opccreate.type = type; 6877 if (type == NF4LNK) { 6878 /* 6879 * symlink, treat name as data 6880 */ 6881 ASSERT(data != NULL); 6882
argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata = 6883 (char *)data; 6884 } 6885 if (type == NF4BLK || type == NF4CHR) { 6886 ASSERT(data != NULL); 6887 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata = 6888 *((specdata4 *)data); 6889 } 6890 6891 crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs; 6892 6893 svp = drp->r_server; 6894 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 6895 supp_attrs = svp->sv_supp_attrs; 6896 nfs_rw_exit(&svp->sv_lock); 6897 6898 if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) { 6899 nfs_rw_exit(&drp->r_rwlock); 6900 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 6901 e.error = EINVAL; 6902 kmem_free(argop, argoplist_size); 6903 return (e.error); 6904 } 6905 6906 /* 2/3: getfh fh of created object */ 6907 ASSERT(idx_create + 1 == idx_fattr - 1); 6908 argop[idx_create + 1].argop = OP_GETFH; 6909 6910 /* 3/4: getattr of new object */ 6911 argop[idx_fattr].argop = OP_GETATTR; 6912 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6913 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi; 6914 6915 if (setgid_flag) { 6916 vattr_t _v; 6917 6918 argop[4].argop = OP_SAVEFH; 6919 6920 argop[5].argop = OP_CPUTFH; 6921 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6922 6923 argop[6].argop = OP_GETATTR; 6924 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6925 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6926 6927 argop[7].argop = OP_RESTOREFH; 6928 6929 /* 6930 * nverify 6931 * 6932 * XXX - Revisit the last argument to nfs4_end_op() 6933 * once 5020486 is fixed. 6934 */ 6935 _v.va_mask = AT_GID; 6936 _v.va_gid = va->va_gid; 6937 if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 6938 supp_attrs)) { 6939 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6940 nfs_rw_exit(&drp->r_rwlock); 6941 nfs4_fattr4_free(crattr); 6942 kmem_free(argop, argoplist_size); 6943 return (e.error); 6944 } 6945 6946 /* 6947 * setattr 6948 * 6949 * We _know_ we're not messing with AT_SIZE or AT_XTIME, 6950 * so no need for stateid or flags. Also we specify NULL 6951 * rp since we're only interested in setting owner_group 6952 * attributes. 
6953 */ 6954 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs, 6955 &e.error, 0); 6956 6957 if (e.error) { 6958 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6959 nfs_rw_exit(&drp->r_rwlock); 6960 nfs4_fattr4_free(crattr); 6961 nfs4args_verify_free(&argop[8]); 6962 kmem_free(argop, argoplist_size); 6963 return (e.error); 6964 } 6965 } else { 6966 argop[1].argop = OP_SAVEFH; 6967 6968 argop[5].argop = OP_RESTOREFH; 6969 6970 argop[6].argop = OP_GETATTR; 6971 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6972 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6973 } 6974 6975 dnlc_remove(dvp, nm); 6976 6977 doqueue = 1; 6978 t = gethrtime(); 6979 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 6980 6981 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 6982 if (e.error) { 6983 PURGE_ATTRCACHE4(dvp); 6984 if (!needrecov) 6985 goto out; 6986 } 6987 6988 if (needrecov) { 6989 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 6990 OP_CREATE, NULL) == FALSE) { 6991 nfs4_end_op(mi, dvp, NULL, &recov_state, 6992 needrecov); 6993 need_end_op = FALSE; 6994 nfs4_fattr4_free(crattr); 6995 if (setgid_flag) { 6996 nfs4args_verify_free(&argop[8]); 6997 nfs4args_setattr_free(&argop[9]); 6998 } 6999 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 7000 goto recov_retry; 7001 } 7002 } 7003 7004 resp = &res; 7005 7006 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 7007 7008 if (res.status == NFS4ERR_BADOWNER) 7009 nfs4_log_badowner(mi, OP_CREATE); 7010 7011 e.error = geterrno4(res.status); 7012 7013 /* 7014 * This check is left over from when create was implemented 7015 * using a setattr op (instead of createattrs). If the 7016 * putfh/create/getfh failed, the error was returned. If 7017 * setattr/getattr failed, we keep going. 7018 * 7019 * It might be better to get rid of the GETFH also, and just 7020 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory. 7021 * Then if any of the operations failed, we could return the 7022 * error now, and remove much of the error code below. 7023 */ 7024 if (res.array_len <= idx_fattr) { 7025 /* 7026 * Either Putfh, Create or Getfh failed. 7027 */ 7028 PURGE_ATTRCACHE4(dvp); 7029 /* 7030 * nfs4_purge_stale_fh() may generate otw calls through 7031 * nfs4_invalidate_pages. Hence the need to call 7032 * nfs4_end_op() here to avoid nfs4_start_op() deadlock. 7033 */ 7034 nfs4_end_op(mi, dvp, NULL, &recov_state, 7035 needrecov); 7036 need_end_op = FALSE; 7037 nfs4_purge_stale_fh(e.error, dvp, cr); 7038 goto out; 7039 } 7040 } 7041 7042 resop = &res.array[idx_create]; /* create res */ 7043 cinfo = &resop->nfs_resop4_u.opcreate.cinfo; 7044 7045 resop = &res.array[idx_create + 1]; /* getfh res */ 7046 gf_res = &resop->nfs_resop4_u.opgetfh; 7047 7048 sfhp = sfh4_get(&gf_res->object, mi); 7049 if (e.error) { 7050 *vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp, 7051 fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 7052 if (vp->v_type == VNON) { 7053 vattr.va_mask = AT_TYPE; 7054 /* 7055 * Need to call nfs4_end_op before nfs4getattr to avoid 7056 * potential nfs4_start_op deadlock. See RFE 4777612. 
7057 */ 7058 nfs4_end_op(mi, dvp, NULL, &recov_state, 7059 needrecov); 7060 need_end_op = FALSE; 7061 e.error = nfs4getattr(vp, &vattr, cr); 7062 if (e.error) { 7063 VN_RELE(vp); 7064 *vpp = NULL; 7065 goto out; 7066 } 7067 vp->v_type = vattr.va_type; 7068 } 7069 e.error = 0; 7070 } else { 7071 *vpp = vp = makenfs4node(sfhp, 7072 &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res, 7073 dvp->v_vfsp, t, cr, 7074 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 7075 } 7076 7077 /* 7078 * If compound succeeded, then update dir attrs 7079 */ 7080 if (res.status == NFS4_OK) { 7081 dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 7082 dinfo.di_cred = cr; 7083 dinfo.di_time_call = t; 7084 dinfop = &dinfo; 7085 } else 7086 dinfop = NULL; 7087 7088 /* Update directory cache attribute, readdir and dnlc caches */ 7089 nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop); 7090 7091 out: 7092 if (sfhp != NULL) 7093 sfh4_rele(&sfhp); 7094 nfs_rw_exit(&drp->r_rwlock); 7095 nfs4_fattr4_free(crattr); 7096 if (setgid_flag) { 7097 nfs4args_verify_free(&argop[8]); 7098 nfs4args_setattr_free(&argop[9]); 7099 } 7100 if (resp) 7101 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7102 if (need_end_op) 7103 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 7104 7105 kmem_free(argop, argoplist_size); 7106 return (e.error); 7107 } 7108 7109 /* ARGSUSED */ 7110 static int 7111 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 7112 int mode, vnode_t **vpp, cred_t *cr) 7113 { 7114 int error; 7115 vnode_t *vp; 7116 nfs_ftype4 type; 7117 specdata4 spec, *specp = NULL; 7118 7119 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 7120 7121 switch (va->va_type) { 7122 case VCHR: 7123 case VBLK: 7124 type = (va->va_type == VCHR) ? NF4CHR : NF4BLK; 7125 spec.specdata1 = getmajor(va->va_rdev); 7126 spec.specdata2 = getminor(va->va_rdev); 7127 specp = &spec; 7128 break; 7129 7130 case VFIFO: 7131 type = NF4FIFO; 7132 break; 7133 case VSOCK: 7134 type = NF4SOCK; 7135 break; 7136 7137 default: 7138 return (EINVAL); 7139 } 7140 7141 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type); 7142 if (error) { 7143 return (error); 7144 } 7145 7146 /* 7147 * This might not be needed any more; special case to deal 7148 * with problematic v2/v3 servers. Since create was unable 7149 * to set group correctly, not sure what hope setattr has. 7150 */ 7151 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) { 7152 va->va_mask = AT_GID; 7153 (void) nfs4setattr(vp, va, 0, cr, NULL); 7154 } 7155 7156 /* 7157 * If vnode is a device, create a special vnode 7158 */ 7159 if (ISVDEV(vp->v_type)) { 7160 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 7161 VN_RELE(vp); 7162 } else { 7163 *vpp = vp; 7164 } 7165 return (error); 7166 } 7167 7168 /* 7169 * Remove requires that the current fh be the target directory. 7170 * After the operation, the current fh is unchanged. 7171 * The compound op structure is: 7172 * PUTFH(targetdir), REMOVE, GETATTR(targetdir) 7173 * 7174 * Weirdness: if the vnode to be removed is open, 7175 * we rename it instead of removing it, and nfs_inactive 7176 * will remove the new name.
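*
* For illustration (hypothetical name): with "f" still open,
* unlink("f") turns into RENAME "f" -> ".nfsXXXX" here, with
* r_unldvp/r_unlname remembering the temporary name; the actual
* REMOVE of ".nfsXXXX" is sent later, from nfs_inactive, once the
* last reference to the vnode goes away.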
7177 */ 7178 /* ARGSUSED */ 7179 static int 7180 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags) 7181 { 7182 COMPOUND4args_clnt args; 7183 COMPOUND4res_clnt res, *resp = NULL; 7184 REMOVE4res *rm_res; 7185 nfs_argop4 argop[3]; 7186 nfs_resop4 *resop; 7187 vnode_t *vp; 7188 char *tmpname; 7189 int doqueue; 7190 mntinfo4_t *mi; 7191 rnode4_t *rp; 7192 rnode4_t *drp; 7193 int needrecov = 0; 7194 nfs4_recov_state_t recov_state; 7195 int isopen; 7196 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7197 dirattr_info_t dinfo; 7198 7199 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 7200 return (EPERM); 7201 drp = VTOR4(dvp); 7202 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 7203 return (EINTR); 7204 7205 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 7206 if (e.error) { 7207 nfs_rw_exit(&drp->r_rwlock); 7208 return (e.error); 7209 } 7210 7211 if (vp->v_type == VDIR) { 7212 VN_RELE(vp); 7213 nfs_rw_exit(&drp->r_rwlock); 7214 return (EISDIR); 7215 } 7216 7217 /* 7218 * First just remove the entry from the name cache, as it 7219 * is most likely the only entry for this vp. 7220 */ 7221 dnlc_remove(dvp, nm); 7222 7223 rp = VTOR4(vp); 7224 7225 /* 7226 * For regular file types, check to see if the file is open by looking 7227 * at the open streams. 7228 * For all other types, check the reference count on the vnode. Since 7229 * they are not opened OTW they never have an open stream. 7230 * 7231 * If the file is open, rename it to .nfsXXXX. 7232 */ 7233 if (vp->v_type != VREG) { 7234 /* 7235 * If the file has a v_count > 1 then there may be more than one 7236 * entry in the name cache due to multiple links or an open file, 7237 * but we don't have the real reference count so flush all 7238 * possible entries. 7239 */ 7240 if (vp->v_count > 1) 7241 dnlc_purge_vp(vp); 7242 7243 /* 7244 * Now we have the real reference count. 7245 */ 7246 isopen = vp->v_count > 1; 7247 } else { 7248 mutex_enter(&rp->r_os_lock); 7249 isopen = list_head(&rp->r_open_streams) != NULL; 7250 mutex_exit(&rp->r_os_lock); 7251 } 7252 7253 mutex_enter(&rp->r_statelock); 7254 if (isopen && 7255 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) { 7256 mutex_exit(&rp->r_statelock); 7257 tmpname = newname(); 7258 e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct); 7259 if (e.error) 7260 kmem_free(tmpname, MAXNAMELEN); 7261 else { 7262 mutex_enter(&rp->r_statelock); 7263 if (rp->r_unldvp == NULL) { 7264 VN_HOLD(dvp); 7265 rp->r_unldvp = dvp; 7266 if (rp->r_unlcred != NULL) 7267 crfree(rp->r_unlcred); 7268 crhold(cr); 7269 rp->r_unlcred = cr; 7270 rp->r_unlname = tmpname; 7271 } else { 7272 kmem_free(rp->r_unlname, MAXNAMELEN); 7273 rp->r_unlname = tmpname; 7274 } 7275 mutex_exit(&rp->r_statelock); 7276 } 7277 VN_RELE(vp); 7278 nfs_rw_exit(&drp->r_rwlock); 7279 return (e.error); 7280 } 7281 /* 7282 * Actually remove the file/dir 7283 */ 7284 mutex_exit(&rp->r_statelock); 7285 7286 /* 7287 * We need to flush any dirty pages which happen to 7288 * be hanging around before removing the file. 7289 * This shouldn't happen very often since in NFSv4 7290 * we should be close to open consistent.
7291 */ 7292 if (nfs4_has_pages(vp) && 7293 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 7294 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct); 7295 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 7296 mutex_enter(&rp->r_statelock); 7297 if (!rp->r_error) 7298 rp->r_error = e.error; 7299 mutex_exit(&rp->r_statelock); 7300 } 7301 } 7302 7303 mi = VTOMI4(dvp); 7304 7305 (void) nfs4delegreturn(rp, NFS4_DR_REOPEN); 7306 recov_state.rs_flags = 0; 7307 recov_state.rs_num_retry_despite_err = 0; 7308 7309 recov_retry: 7310 /* 7311 * Remove ops: putfh dir; remove; getattr dir 7312 */ 7313 args.ctag = TAG_REMOVE; 7314 args.array_len = 3; 7315 args.array = argop; 7316 7317 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 7318 if (e.error) { 7319 nfs_rw_exit(&drp->r_rwlock); 7320 VN_RELE(vp); 7321 return (e.error); 7322 } 7323 7324 /* putfh directory */ 7325 argop[0].argop = OP_CPUTFH; 7326 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 7327 7328 /* remove */ 7329 argop[1].argop = OP_CREMOVE; 7330 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 7331 7332 /* getattr dir */ 7333 argop[2].argop = OP_GETATTR; 7334 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7335 argop[2].nfs_argop4_u.opgetattr.mi = mi; 7336 7337 doqueue = 1; 7338 dinfo.di_time_call = gethrtime(); 7339 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7340 7341 PURGE_ATTRCACHE4(vp); 7342 7343 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7344 if (e.error) 7345 PURGE_ATTRCACHE4(dvp); 7346 7347 if (needrecov) { 7348 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, 7349 NULL, NULL, NULL, OP_REMOVE, NULL) == FALSE) { 7350 if (!e.error) 7351 (void) xdr_free(xdr_COMPOUND4res_clnt, 7352 (caddr_t)&res); 7353 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 7354 needrecov); 7355 goto recov_retry; 7356 } 7357 } 7358 7359 /* 7360 * Matching nfs4_end_op() for start_op() above. 7361 * There is a path in the code below which calls 7362 * nfs4_purge_stale_fh(), which may generate otw calls through 7363 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 7364 * here to avoid nfs4_start_op() deadlock. 7365 */ 7366 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 7367 7368 if (!e.error) { 7369 resp = &res; 7370 7371 if (res.status) { 7372 e.error = geterrno4(res.status); 7373 PURGE_ATTRCACHE4(dvp); 7374 nfs4_purge_stale_fh(e.error, dvp, cr); 7375 } else { 7376 resop = &res.array[1]; /* remove res */ 7377 rm_res = &resop->nfs_resop4_u.opremove; 7378 7379 dinfo.di_garp = 7380 &res.array[2].nfs_resop4_u.opgetattr.ga_res; 7381 dinfo.di_cred = cr; 7382 7383 /* Update directory attr, readdir and dnlc caches */ 7384 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 7385 &dinfo); 7386 } 7387 } 7388 nfs_rw_exit(&drp->r_rwlock); 7389 if (resp) 7390 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7391 7392 if (e.error == 0) { 7393 vnode_t *tvp; 7394 rnode4_t *trp; 7395 trp = VTOR4(vp); 7396 tvp = vp; 7397 if (IS_SHADOW(vp, trp)) 7398 tvp = RTOV4(trp); 7399 vnevent_remove(tvp, dvp, nm, ct); 7400 } 7401 VN_RELE(vp); 7402 return (e.error); 7403 } 7404 7405 /* 7406 * Link requires that the current fh be the target directory and the 7407 * saved fh be the source fh. After the operation, the current fh is unchanged.
Thus the compound op structure is: 7409 * PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, GETATTR(targetdir), 7410 * RESTOREFH, GETATTR(file) 7411 */ 7412 /* ARGSUSED */ 7413 static int 7414 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr, 7415 caller_context_t *ct, int flags) 7416 { 7417 COMPOUND4args_clnt args; 7418 COMPOUND4res_clnt res, *resp = NULL; 7419 LINK4res *ln_res; 7420 int argoplist_size = 7 * sizeof (nfs_argop4); 7421 nfs_argop4 *argop; 7422 nfs_resop4 *resop; 7423 vnode_t *realvp, *nvp; 7424 int doqueue; 7425 mntinfo4_t *mi; 7426 rnode4_t *tdrp; 7427 bool_t needrecov = FALSE; 7428 nfs4_recov_state_t recov_state; 7429 hrtime_t t; 7430 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7431 dirattr_info_t dinfo; 7432 7433 ASSERT(*tnm != '\0'); 7434 ASSERT(tdvp->v_type == VDIR); 7435 ASSERT(nfs4_consistent_type(tdvp)); 7436 ASSERT(nfs4_consistent_type(svp)); 7437 7438 if (nfs_zone() != VTOMI4(tdvp)->mi_zone) 7439 return (EPERM); 7440 if (VOP_REALVP(svp, &realvp, ct) == 0) { 7441 svp = realvp; 7442 ASSERT(nfs4_consistent_type(svp)); 7443 } 7444 7445 tdrp = VTOR4(tdvp); 7446 mi = VTOMI4(svp); 7447 7448 if (!(mi->mi_flags & MI4_LINK)) { 7449 return (EOPNOTSUPP); 7450 } 7451 recov_state.rs_flags = 0; 7452 recov_state.rs_num_retry_despite_err = 0; 7453 7454 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp))) 7455 return (EINTR); 7456 7457 recov_retry: 7458 argop = kmem_alloc(argoplist_size, KM_SLEEP); 7459 7460 args.ctag = TAG_LINK; 7461 7462 /* 7463 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir); 7464 * restorefh; getattr(fl) 7465 */ 7466 args.array_len = 7; 7467 args.array = argop; 7468 7469 e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state); 7470 if (e.error) { 7471 kmem_free(argop, argoplist_size); 7472 nfs_rw_exit(&tdrp->r_rwlock); 7473 return (e.error); 7474 } 7475 7476 /* 0. putfh file */ 7477 argop[0].argop = OP_CPUTFH; 7478 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh; 7479 7480 /* 1. save current fh to free up the space for the dir */ 7481 argop[1].argop = OP_SAVEFH; 7482 7483 /* 2. putfh targetdir */ 7484 argop[2].argop = OP_CPUTFH; 7485 argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh; 7486 7487 /* 3. link: current_fh is targetdir, saved_fh is source */ 7488 argop[3].argop = OP_CLINK; 7489 argop[3].nfs_argop4_u.opclink.cnewname = tnm; 7490 7491 /* 4. Get attributes of dir */ 7492 argop[4].argop = OP_GETATTR; 7493 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7494 argop[4].nfs_argop4_u.opgetattr.mi = mi; 7495 7496 /* 5. If link was successful, restore current vp to file */ 7497 argop[5].argop = OP_RESTOREFH; 7498 7499 /* 6.
Get attributes of linked object */ 7500 argop[6].argop = OP_GETATTR; 7501 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7502 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7503 7504 dnlc_remove(tdvp, tnm); 7505 7506 doqueue = 1; 7507 t = gethrtime(); 7508 7509 rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e); 7510 7511 needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp); 7512 if (e.error != 0 && !needrecov) { 7513 PURGE_ATTRCACHE4(tdvp); 7514 PURGE_ATTRCACHE4(svp); 7515 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7516 goto out; 7517 } 7518 7519 if (needrecov) { 7520 bool_t abort; 7521 7522 abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp, 7523 NULL, NULL, OP_LINK, NULL); 7524 if (abort == FALSE) { 7525 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, 7526 needrecov); 7527 kmem_free(argop, argoplist_size); 7528 if (!e.error) 7529 (void) xdr_free(xdr_COMPOUND4res_clnt, 7530 (caddr_t)&res); 7531 goto recov_retry; 7532 } else { 7533 if (e.error != 0) { 7534 PURGE_ATTRCACHE4(tdvp); 7535 PURGE_ATTRCACHE4(svp); 7536 nfs4_end_op(VTOMI4(svp), svp, tdvp, 7537 &recov_state, needrecov); 7538 goto out; 7539 } 7540 /* fall through for res.status case */ 7541 } 7542 } 7543 7544 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7545 7546 resp = &res; 7547 if (res.status) { 7548 /* If link succeeded, then don't return error */ 7549 e.error = geterrno4(res.status); 7550 if (res.array_len <= 4) { 7551 /* 7552 * Either Putfh, Savefh, Putfh dir, or Link failed 7553 */ 7554 PURGE_ATTRCACHE4(svp); 7555 PURGE_ATTRCACHE4(tdvp); 7556 if (e.error == EOPNOTSUPP) { 7557 mutex_enter(&mi->mi_lock); 7558 mi->mi_flags &= ~MI4_LINK; 7559 mutex_exit(&mi->mi_lock); 7560 } 7561 /* Remap EISDIR to EPERM for non-root user for SVVS */ 7562 /* XXX-LP */ 7563 if (e.error == EISDIR && crgetuid(cr) != 0) 7564 e.error = EPERM; 7565 goto out; 7566 } 7567 } 7568 7569 /* either no error or one of the postop getattrs failed */ 7570 7571 /* 7572 * XXX - if LINK succeeded, but no attrs were returned for link 7573 * file, purge its cache. 7574 * 7575 * XXX Perform a simplified version of wcc checking. Instead of 7576 * having another getattr to get pre-op, just purge cache if 7577 * any of the ops prior to and including the getattr failed. 7578 * If the getattr succeeded then update the attrcache accordingly. 7579 */ 7580 7581 /* 7582 * update cache with link file postattrs. 7583 * Note: at this point resop points to link res. 7584 */ 7585 resop = &res.array[3]; /* link res */ 7586 ln_res = &resop->nfs_resop4_u.oplink; 7587 if (res.status == NFS4_OK) 7588 e.error = nfs4_update_attrcache(res.status, 7589 &res.array[6].nfs_resop4_u.opgetattr.ga_res, 7590 t, svp, cr); 7591 7592 /* 7593 * Call makenfs4node to create the new shadow vp for tnm. 7594 * We pass NULL attrs because we just cached attrs for 7595 * the src object. All we're trying to accomplish is 7596 * to create the new shadow vnode.
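* (In NFSv4 each hard link is represented by its own shadow vnode
* layered over the shared rnode -- note the IS_SHADOW()/RTOV4()
* pattern used for the vnevent calls in this file -- so a new link
* needs a new shadow vnode even though its attributes are already
* cached.)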
7597 */ 7598 nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr, 7599 tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm, VTOR4(svp)->r_fh)); 7600 7601 /* Update target cache attribute, readdir and dnlc caches */ 7602 dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 7603 dinfo.di_time_call = t; 7604 dinfo.di_cred = cr; 7605 7606 nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo); 7607 ASSERT(nfs4_consistent_type(tdvp)); 7608 ASSERT(nfs4_consistent_type(svp)); 7609 ASSERT(nfs4_consistent_type(nvp)); 7610 VN_RELE(nvp); 7611 7612 if (!e.error) { 7613 vnode_t *tvp; 7614 rnode4_t *trp; 7615 /* 7616 * Notify the source file of this link operation. 7617 */ 7618 trp = VTOR4(svp); 7619 tvp = svp; 7620 if (IS_SHADOW(svp, trp)) 7621 tvp = RTOV4(trp); 7622 vnevent_link(tvp, ct); 7623 } 7624 out: 7625 kmem_free(argop, argoplist_size); 7626 if (resp) 7627 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7628 7629 nfs_rw_exit(&tdrp->r_rwlock); 7630 7631 return (e.error); 7632 } 7633 7634 /* ARGSUSED */ 7635 static int 7636 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 7637 caller_context_t *ct, int flags) 7638 { 7639 vnode_t *realvp; 7640 7641 if (nfs_zone() != VTOMI4(odvp)->mi_zone) 7642 return (EPERM); 7643 if (VOP_REALVP(ndvp, &realvp, ct) == 0) 7644 ndvp = realvp; 7645 7646 return (nfs4rename(odvp, onm, ndvp, nnm, cr, ct)); 7647 } 7648 7649 /* 7650 * nfs4rename does the real work of renaming in NFS Version 4. 7651 * 7652 * A file handle is considered volatile for renaming purposes if either 7653 * of the volatile bits are turned on. However, the compound may differ 7654 * based on the likelihood of the filehandle to change during rename. 7655 */ 7656 static int 7657 nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 7658 caller_context_t *ct) 7659 { 7660 int error; 7661 mntinfo4_t *mi; 7662 vnode_t *nvp = NULL; 7663 vnode_t *ovp = NULL; 7664 char *tmpname = NULL; 7665 rnode4_t *rp; 7666 rnode4_t *odrp; 7667 rnode4_t *ndrp; 7668 int did_link = 0; 7669 int do_link = 1; 7670 nfsstat4 stat = NFS4_OK; 7671 7672 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 7673 ASSERT(nfs4_consistent_type(odvp)); 7674 ASSERT(nfs4_consistent_type(ndvp)); 7675 7676 if (onm[0] == '.' && (onm[1] == '\0' || 7677 (onm[1] == '.' && onm[2] == '\0'))) 7678 return (EINVAL); 7679 7680 if (nnm[0] == '.' && (nnm[1] == '\0' || 7681 (nnm[1] == '.' && nnm[2] == '\0'))) 7682 return (EINVAL); 7683 7684 odrp = VTOR4(odvp); 7685 ndrp = VTOR4(ndvp); 7686 if ((intptr_t)odrp < (intptr_t)ndrp) { 7687 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) 7688 return (EINTR); 7689 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) { 7690 nfs_rw_exit(&odrp->r_rwlock); 7691 return (EINTR); 7692 } 7693 } else { 7694 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) 7695 return (EINTR); 7696 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) { 7697 nfs_rw_exit(&ndrp->r_rwlock); 7698 return (EINTR); 7699 } 7700 } 7701 7702 /* 7703 * Lookup the target file. If it exists, it needs to be 7704 * checked to see whether it is a mount point and whether 7705 * it is active (open). 7706 */ 7707 error = nfs4lookup(ndvp, nnm, &nvp, cr, 0); 7708 if (!error) { 7709 int isactive; 7710 7711 ASSERT(nfs4_consistent_type(nvp)); 7712 /* 7713 * If this file has been mounted on, then just 7714 * return busy because renaming to it would remove 7715 * the mounted file system from the name space. 
7716 */ 7717 if (vn_ismntpt(nvp)) { 7718 VN_RELE(nvp); 7719 nfs_rw_exit(&odrp->r_rwlock); 7720 nfs_rw_exit(&ndrp->r_rwlock); 7721 return (EBUSY); 7722 } 7723 7724 /* 7725 * First just remove the entry from the name cache, as it 7726 * is most likely the only entry for this vp. 7727 */ 7728 dnlc_remove(ndvp, nnm); 7729 7730 rp = VTOR4(nvp); 7731 7732 if (nvp->v_type != VREG) { 7733 /* 7734 * Purge the name cache of all references to this vnode 7735 * so that we can check the reference count to infer 7736 * whether it is active or not. 7737 */ 7738 if (nvp->v_count > 1) 7739 dnlc_purge_vp(nvp); 7740 7741 isactive = nvp->v_count > 1; 7742 } else { 7743 mutex_enter(&rp->r_os_lock); 7744 isactive = list_head(&rp->r_open_streams) != NULL; 7745 mutex_exit(&rp->r_os_lock); 7746 } 7747 7748 /* 7749 * If the vnode is active and is not a directory, 7750 * arrange to rename it to a 7751 * temporary file so that it will continue to be 7752 * accessible. This implements the "unlink-open-file" 7753 * semantics for the target of a rename operation. 7754 * Before doing this though, make sure that the 7755 * source and target files are not already the same. 7756 */ 7757 if (isactive && nvp->v_type != VDIR) { 7758 /* 7759 * Lookup the source name. 7760 */ 7761 error = nfs4lookup(odvp, onm, &ovp, cr, 0); 7762 7763 /* 7764 * The source name *should* already exist. 7765 */ 7766 if (error) { 7767 VN_RELE(nvp); 7768 nfs_rw_exit(&odrp->r_rwlock); 7769 nfs_rw_exit(&ndrp->r_rwlock); 7770 return (error); 7771 } 7772 7773 ASSERT(nfs4_consistent_type(ovp)); 7774 7775 /* 7776 * Compare the two vnodes. If they are the same, 7777 * just release all held vnodes and return success. 7778 */ 7779 if (VN_CMP(ovp, nvp)) { 7780 VN_RELE(ovp); 7781 VN_RELE(nvp); 7782 nfs_rw_exit(&odrp->r_rwlock); 7783 nfs_rw_exit(&ndrp->r_rwlock); 7784 return (0); 7785 } 7786 7787 /* 7788 * Can't mix and match directories and non- 7789 * directories in rename operations. We already 7790 * know that the target is not a directory. If 7791 * the source is a directory, return an error. 7792 */ 7793 if (ovp->v_type == VDIR) { 7794 VN_RELE(ovp); 7795 VN_RELE(nvp); 7796 nfs_rw_exit(&odrp->r_rwlock); 7797 nfs_rw_exit(&ndrp->r_rwlock); 7798 return (ENOTDIR); 7799 } 7800 link_call: 7801 /* 7802 * The target file exists, is not the same as 7803 * the source file, and is active. We first 7804 * try to Link it to a temporary filename to 7805 * avoid having the server removing the file 7806 * completely (which could cause data loss to 7807 * the user's POV in the event the Rename fails 7808 * -- see bug 1165874). 7809 */ 7810 /* 7811 * The do_link and did_link booleans are 7812 * introduced in the event we get NFS4ERR_FILE_OPEN 7813 * returned for the Rename. Some servers can 7814 * not Rename over an Open file, so they return 7815 * this error. The client needs to Remove the 7816 * newly created Link and do two Renames, just 7817 * as if the server didn't support LINK. 
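*
* In outline (illustrative only):
*
* do_link = 1;
* link_call:
* if (do_link)
* LINK target -> tmpname (did_link = 1)
* else
* RENAME target -> tmpname (did_link = 0)
* RENAME source -> target
* if (that rename failed with NFS4ERR_FILE_OPEN && did_link) {
* REMOVE tmpname; do_link = 0; goto link_call;
* }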
7818 */ 7819 tmpname = newname(); 7820 error = 0; 7821 7822 if (do_link) { 7823 error = nfs4_link(ndvp, nvp, tmpname, cr, 7824 NULL, 0); 7825 } 7826 if (error == EOPNOTSUPP || !do_link) { 7827 error = nfs4_rename(ndvp, nnm, ndvp, tmpname, 7828 cr, NULL, 0); 7829 did_link = 0; 7830 } else { 7831 did_link = 1; 7832 } 7833 if (error) { 7834 kmem_free(tmpname, MAXNAMELEN); 7835 VN_RELE(ovp); 7836 VN_RELE(nvp); 7837 nfs_rw_exit(&odrp->r_rwlock); 7838 nfs_rw_exit(&ndrp->r_rwlock); 7839 return (error); 7840 } 7841 7842 mutex_enter(&rp->r_statelock); 7843 if (rp->r_unldvp == NULL) { 7844 VN_HOLD(ndvp); 7845 rp->r_unldvp = ndvp; 7846 if (rp->r_unlcred != NULL) 7847 crfree(rp->r_unlcred); 7848 crhold(cr); 7849 rp->r_unlcred = cr; 7850 rp->r_unlname = tmpname; 7851 } else { 7852 if (rp->r_unlname) 7853 kmem_free(rp->r_unlname, MAXNAMELEN); 7854 rp->r_unlname = tmpname; 7855 } 7856 mutex_exit(&rp->r_statelock); 7857 } 7858 7859 (void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN); 7860 7861 ASSERT(nfs4_consistent_type(nvp)); 7862 } 7863 7864 if (ovp == NULL) { 7865 /* 7866 * When renaming directories to be a subdirectory of a 7867 * different parent, the dnlc entry for ".." will no 7868 * longer be valid, so it must be removed. 7869 * 7870 * We do a lookup here to determine whether we are renaming 7871 * a directory and we need to check if we are renaming 7872 * an unlinked file. This might have already been done 7873 * in previous code, so we check ovp == NULL to avoid 7874 * doing it twice. 7875 */ 7876 error = nfs4lookup(odvp, onm, &ovp, cr, 0); 7877 /* 7878 * The source name *should* already exist. 7879 */ 7880 if (error) { 7881 nfs_rw_exit(&odrp->r_rwlock); 7882 nfs_rw_exit(&ndrp->r_rwlock); 7883 if (nvp) { 7884 VN_RELE(nvp); 7885 } 7886 return (error); 7887 } 7888 ASSERT(ovp != NULL); 7889 ASSERT(nfs4_consistent_type(ovp)); 7890 } 7891 7892 /* 7893 * Is the object being renamed a dir, and if so, is 7894 * it being renamed to a child of itself? The underlying 7895 * fs should ultimately return EINVAL for this case; 7896 * however, buggy beta non-Solaris NFSv4 servers at 7897 * interop testing events have allowed this behavior, 7898 * and it caused our client to panic due to a recursive 7899 * mutex_enter in fn_move. 7900 * 7901 * The tedious locking in fn_move could be changed to 7902 * deal with this case, and the client could avoid the 7903 * panic; however, the client would just confuse itself 7904 * later and misbehave. A better way to handle the broken 7905 * server is to detect this condition and return EINVAL 7906 * without ever sending the bogus rename to the server. 7907 * We know the rename is invalid -- just fail it now. 7908 */ 7909 if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) { 7910 VN_RELE(ovp); 7911 nfs_rw_exit(&odrp->r_rwlock); 7912 nfs_rw_exit(&ndrp->r_rwlock); 7913 if (nvp) { 7914 VN_RELE(nvp); 7915 } 7916 return (EINVAL); 7917 } 7918 7919 (void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN); 7920 7921 /* 7922 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is 7923 * possible for the filehandle to change due to the rename. 7924 * If neither of these bits is set, but FH4_VOL_MIGRATION is set, 7925 * the fh will not change because of the rename, but we still need 7926 * to update its rnode entry with the new name for 7927 * an eventual fh change due to migration.
The FH4_NOEXPIRE_ON_OPEN 7928 * has no effect on these for now, but for future improvements, 7929 * we might want to use it too to simplify handling of files 7930 * that are open with that flag on. (XXX) 7931 */ 7932 mi = VTOMI4(odvp); 7933 if (NFS4_VOLATILE_FH(mi)) 7934 error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr, 7935 &stat); 7936 else 7937 error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr, 7938 &stat); 7939 7940 ASSERT(nfs4_consistent_type(odvp)); 7941 ASSERT(nfs4_consistent_type(ndvp)); 7942 ASSERT(nfs4_consistent_type(ovp)); 7943 7944 if (stat == NFS4ERR_FILE_OPEN && did_link) { 7945 do_link = 0; 7946 /* 7947 * Before the 'link_call' code, we did a nfs4_lookup 7948 * that puts a VN_HOLD on nvp. After the nfs4_link 7949 * call we call VN_RELE to match that hold. We need 7950 * to place an additional VN_HOLD here since we will 7951 * be hitting that VN_RELE again. 7952 */ 7953 VN_HOLD(nvp); 7954 7955 (void) nfs4_remove(ndvp, tmpname, cr, NULL, 0); 7956 7957 /* Undo the unlinked file naming stuff we just did */ 7958 mutex_enter(&rp->r_statelock); 7959 if (rp->r_unldvp) { 7960 VN_RELE(ndvp); 7961 rp->r_unldvp = NULL; 7962 if (rp->r_unlcred != NULL) 7963 crfree(rp->r_unlcred); 7964 rp->r_unlcred = NULL; 7965 /* rp->r_unlanme points to tmpname */ 7966 if (rp->r_unlname) 7967 kmem_free(rp->r_unlname, MAXNAMELEN); 7968 rp->r_unlname = NULL; 7969 } 7970 mutex_exit(&rp->r_statelock); 7971 7972 if (nvp) { 7973 VN_RELE(nvp); 7974 } 7975 goto link_call; 7976 } 7977 7978 if (error) { 7979 VN_RELE(ovp); 7980 nfs_rw_exit(&odrp->r_rwlock); 7981 nfs_rw_exit(&ndrp->r_rwlock); 7982 if (nvp) { 7983 VN_RELE(nvp); 7984 } 7985 return (error); 7986 } 7987 7988 /* 7989 * when renaming directories to be a subdirectory of a 7990 * different parent, the dnlc entry for ".." will no 7991 * longer be valid, so it must be removed 7992 */ 7993 rp = VTOR4(ovp); 7994 if (ndvp != odvp) { 7995 if (ovp->v_type == VDIR) { 7996 dnlc_remove(ovp, ".."); 7997 if (rp->r_dir != NULL) 7998 nfs4_purge_rddir_cache(ovp); 7999 } 8000 } 8001 8002 /* 8003 * If we are renaming the unlinked file, update the 8004 * r_unldvp and r_unlname as needed. 8005 */ 8006 mutex_enter(&rp->r_statelock); 8007 if (rp->r_unldvp != NULL) { 8008 if (strcmp(rp->r_unlname, onm) == 0) { 8009 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN); 8010 rp->r_unlname[MAXNAMELEN - 1] = '\0'; 8011 if (ndvp != rp->r_unldvp) { 8012 VN_RELE(rp->r_unldvp); 8013 rp->r_unldvp = ndvp; 8014 VN_HOLD(ndvp); 8015 } 8016 } 8017 } 8018 mutex_exit(&rp->r_statelock); 8019 8020 /* 8021 * Notify the rename vnevents to source vnode, and to the target 8022 * vnode if it already existed. 8023 */ 8024 if (error == 0) { 8025 vnode_t *tvp; 8026 rnode4_t *trp; 8027 /* 8028 * Notify the vnode. Each links is represented by 8029 * a different vnode, in nfsv4. 8030 */ 8031 if (nvp) { 8032 trp = VTOR4(nvp); 8033 tvp = nvp; 8034 if (IS_SHADOW(nvp, trp)) 8035 tvp = RTOV4(trp); 8036 vnevent_rename_dest(tvp, ndvp, nnm, ct); 8037 } 8038 8039 /* 8040 * if the source and destination directory are not the 8041 * same notify the destination directory. 
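
/*
 * Note on the "unlink-open-file" bookkeeping above, in outline: when
 * the rename target is still active, it is parked under a temporary
 * name, and (r_unldvp, r_unlname, r_unlcred) record where, under what
 * name, and as whom the parked file should eventually be removed.
 * The removal itself is driven from the inactive path once the last
 * reference to the rnode goes away; until then the parked file keeps
 * the data reachable for any process that still has it open.
 */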
/*
 * When the parent directory has changed, sv_dfh must be updated
 */
static void
update_parentdir_sfh(vnode_t *vp, vnode_t *ndvp)
{
	svnode_t *sv = VTOSV(vp);
	nfs4_sharedfh_t *old_dfh = sv->sv_dfh;
	nfs4_sharedfh_t *new_dfh = VTOR4(ndvp)->r_fh;

	sfh4_hold(new_dfh);
	sv->sv_dfh = new_dfh;
	sfh4_rele(&old_dfh);
}

/*
 * nfs4rename_persistent_fh does the otw portion of renaming in NFS
 * Version 4, when it is known that the filehandle is persistent through
 * rename.
 *
 * Rename requires that the current fh be the target directory and the
 * saved fh be the source directory.  After the operation, the current fh
 * is unchanged.
 * The compound op structure for persistent fh rename is:
 *	PUTFH(sourcedir), SAVEFH, PUTFH(targetdir), RENAME,
 *	GETATTR(targetdir), and, if the directories differ,
 *	PUTFH(sourcedir), GETATTR(sourcedir)
 * The trailing GETATTRs refresh the post-op attributes of the affected
 * directories so their caches can be updated without extra round trips.
 */
static int
nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp,
    vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	int doqueue, argoplist_size;
	mntinfo4_t *mi;
	rnode4_t *odrp = VTOR4(odvp);
	rnode4_t *ndrp = VTOR4(ndvp);
	RENAME4res *rn_res;
	bool_t needrecov;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	dirattr_info_t dinfo, *dinfop;

	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/*
	 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir
	 *
	 * If source/target are different dirs, then append putfh(src);
	 * getattr
	 */
	args.array_len = (odvp == ndvp) ? 5 : 7;
	argoplist_size = args.array_len * sizeof (nfs_argop4);
	args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP);

recov_retry:
	*statp = NFS4_OK;

	/* No need to Lookup the file, persistent fh */
	args.ctag = TAG_RENAME;

	mi = VTOMI4(odvp);
	e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state);
	if (e.error) {
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	/* 0: putfh source directory */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;

	/* 1: Save source fh to free up current for target */
	argop[1].argop = OP_SAVEFH;

	/* 2: putfh targetdir */
	argop[2].argop = OP_CPUTFH;
	argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;

	/* 3: current_fh is targetdir, saved_fh is sourcedir */
	argop[3].argop = OP_CRENAME;
	argop[3].nfs_argop4_u.opcrename.coldname = onm;
	argop[3].nfs_argop4_u.opcrename.cnewname = nnm;

	/* 4: getattr (targetdir) */
	argop[4].argop = OP_GETATTR;
	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[4].nfs_argop4_u.opgetattr.mi = mi;

	if (ndvp != odvp) {

		/* 5: putfh (sourcedir) */
		argop[5].argop = OP_CPUTFH;
		argop[5].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;

		/* 6: getattr (sourcedir) */
		argop[6].argop = OP_GETATTR;
		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[6].nfs_argop4_u.opgetattr.mi = mi;
	}

	dnlc_remove(odvp, onm);
	dnlc_remove(ndvp, nnm);

	doqueue = 1;
	dinfo.di_time_call = gethrtime();
	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (e.error) {
		PURGE_ATTRCACHE4(odvp);
		PURGE_ATTRCACHE4(ndvp);
	} else {
		*statp = res.status;
	}

	if (needrecov) {
		if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
		    OP_RENAME, NULL) == FALSE) {
			nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		}
	}

	if (!e.error) {
		resp = &res;
		/*
		 * If OP_RENAME (or any earlier op) failed, return the
		 * error.  OP_RENAME is at index 3, so array_len <= 4
		 * means the failure was at or before the rename; a
		 * failure in the trailing GETATTRs is ignored.
		 */
		if (res.status != NFS4_OK && res.array_len <= 4) {
			e.error = geterrno4(res.status);
			PURGE_ATTRCACHE4(odvp);
			PURGE_ATTRCACHE4(ndvp);
			/*
			 * System V defines rename to return EEXIST, not
			 * ENOTEMPTY, if the target directory is not empty.
			 * Over the wire, the error is NFSERR_ENOTEMPTY
			 * which geterrno4 maps to ENOTEMPTY.
			 */
			if (e.error == ENOTEMPTY)
				e.error = EEXIST;
		} else {

			resop = &res.array[3];	/* rename res */
			rn_res = &resop->nfs_resop4_u.oprename;

			if (res.status == NFS4_OK) {
				/*
				 * Update target attribute, readdir and dnlc
				 * caches.
				 */
				dinfo.di_garp =
				    &res.array[4].nfs_resop4_u.opgetattr.ga_res;
				dinfo.di_cred = cr;
				dinfop = &dinfo;
			} else
				dinfop = NULL;

			nfs4_update_dircaches(&rn_res->target_cinfo,
			    ndvp, NULL, NULL, dinfop);

			/*
			 * Update source attribute, readdir and dnlc caches.
			 */
			if (ndvp != odvp) {
				update_parentdir_sfh(renvp, ndvp);

				if (dinfop)
					dinfo.di_garp =
					    &(res.array[6].nfs_resop4_u.
					    opgetattr.ga_res);

				nfs4_update_dircaches(&rn_res->source_cinfo,
				    odvp, NULL, NULL, dinfop);
			}

			fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name,
			    nnm);
		}
	}

	if (resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
	nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
	kmem_free(argop, argoplist_size);

	return (e.error);
}
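
/*
 * Note on result indexing for both rename variants: the server stops
 * processing a compound at the first failing op, and res.array_len
 * reflects only the ops that were actually processed.  So a non-OK
 * res.status with a short result array pins the failure at or before
 * the RENAME: index 3 (array_len <= 4) in the persistent-fh compound
 * above, index 5 (array_len <= 6) in the volatile-fh compound below.
 * Anything longer means the rename itself succeeded and only a
 * trailing op (LOOKUP/GETFH/GETATTR) failed.
 */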
/*
 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4,
 * when it is possible for the filehandle to change due to the rename.
 *
 * The compound req in this case includes a post-rename lookup and getattr
 * to ensure that we have the correct fh and attributes for the object.
 *
 * Rename requires that the current fh be the target directory and the
 * saved fh be the source directory.  After the operation, the current fh
 * is unchanged.
 *
 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can
 * update the filehandle for the renamed object.  We also get the old
 * filehandle for historical reasons; this should be taken out sometime.
 * This results in a rather cumbersome compound...
 *
 *	PUTFH(sourcedir), SAVEFH, LOOKUP(src), GETFH(old),
 *	PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR
 *
 */
static int
nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp,
    vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	int argoplist_size;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *odrp = VTOR4(odvp);	/* old directory */
	rnode4_t *ndrp = VTOR4(ndvp);	/* new directory */
	rnode4_t *orp = VTOR4(ovp);	/* object being renamed */
	RENAME4res *rn_res;
	GETFH4res *ngf_res;
	bool_t needrecov;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	dirattr_info_t dinfo, *dinfop = &dinfo;

	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	*statp = NFS4_OK;

	/*
	 * There is a window between the RPC and updating the path and
	 * filehandle stored in the rnode.  Lock out the FHEXPIRED recovery
	 * code, so that it doesn't try to use the old path during that
	 * window.
	 */
	mutex_enter(&orp->r_statelock);
	while (orp->r_flags & R4RECEXPFH) {
		klwp_t *lwp = ttolwp(curthread);

		if (lwp != NULL)
			lwp->lwp_nostop++;
		if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) {
			mutex_exit(&orp->r_statelock);
			if (lwp != NULL)
				lwp->lwp_nostop--;
			return (EINTR);
		}
		if (lwp != NULL)
			lwp->lwp_nostop--;
	}
	orp->r_flags |= R4RECEXPFH;
	mutex_exit(&orp->r_statelock);

	mi = VTOMI4(odvp);

	args.ctag = TAG_RENAME_VFH;
	args.array_len = (odvp == ndvp) ? 10 : 12;
	argoplist_size = args.array_len * sizeof (nfs_argop4);
	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	/*
	 * Rename ops:
	 *	PUTFH(sourcedir), SAVEFH, LOOKUP(src), GETFH(old),
	 *	PUTFH(targetdir), RENAME, GETATTR(targetdir),
	 *	LOOKUP(trgt), GETFH(new), GETATTR,
	 *
	 *	if (odvp != ndvp)
	 *		add putfh(sourcedir), getattr(sourcedir)
	 */
	args.array = argop;

	e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME,
	    &recov_state, NULL);
	if (e.error) {
		kmem_free(argop, argoplist_size);
		mutex_enter(&orp->r_statelock);
		orp->r_flags &= ~R4RECEXPFH;
		cv_broadcast(&orp->r_cv);
		mutex_exit(&orp->r_statelock);
		return (e.error);
	}

	/* 0: putfh source directory */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;

	/* 1: Save source fh to free up current for target */
	argop[1].argop = OP_SAVEFH;

	/* 2: Lookup pre-rename fh of renamed object */
	argop[2].argop = OP_CLOOKUP;
	argop[2].nfs_argop4_u.opclookup.cname = onm;

	/* 3: getfh fh of renamed object (before rename) */
	argop[3].argop = OP_GETFH;

	/* 4: putfh targetdir */
	argop[4].argop = OP_CPUTFH;
	argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;

	/* 5: current_fh is targetdir, saved_fh is sourcedir */
	argop[5].argop = OP_CRENAME;
	argop[5].nfs_argop4_u.opcrename.coldname = onm;
	argop[5].nfs_argop4_u.opcrename.cnewname = nnm;

	/* 6: getattr of target dir (post op attrs) */
	argop[6].argop = OP_GETATTR;
	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[6].nfs_argop4_u.opgetattr.mi = mi;

	/* 7: Lookup post-rename fh of renamed object */
	argop[7].argop = OP_CLOOKUP;
	argop[7].nfs_argop4_u.opclookup.cname = nnm;

	/* 8: getfh fh of renamed object (after rename) */
	argop[8].argop = OP_GETFH;

	/* 9: getattr of renamed object */
	argop[9].argop = OP_GETATTR;
	argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[9].nfs_argop4_u.opgetattr.mi = mi;

	/*
	 * If source/target dirs are different, then get new post-op
	 * attrs for source dir also.
	 */
	if (ndvp != odvp) {
		/* 10: putfh (sourcedir) */
		argop[10].argop = OP_CPUTFH;
		argop[10].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;

		/* 11: getattr (sourcedir) */
		argop[11].argop = OP_GETATTR;
		argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[11].nfs_argop4_u.opgetattr.mi = mi;
	}

	dnlc_remove(odvp, onm);
	dnlc_remove(ndvp, nnm);

	doqueue = 1;
	t = gethrtime();
	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (e.error) {
		PURGE_ATTRCACHE4(odvp);
		PURGE_ATTRCACHE4(ndvp);
		if (!needrecov) {
			nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
			    &recov_state, needrecov);
			goto out;
		}
	} else {
		*statp = res.status;
	}

	if (needrecov) {
		bool_t abort;

		abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
		    OP_RENAME, NULL);
		if (abort == FALSE) {
			nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
			    &recov_state, needrecov);
			kmem_free(argop, argoplist_size);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			mutex_enter(&orp->r_statelock);
			orp->r_flags &= ~R4RECEXPFH;
			cv_broadcast(&orp->r_cv);
			mutex_exit(&orp->r_statelock);
			goto recov_retry;
		} else {
			if (e.error != 0) {
				nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
				    &recov_state, needrecov);
				goto out;
			}
			/* fall through for res.status case */
		}
	}

	resp = &res;
	/*
	 * If OP_RENAME (or any prev op) failed, then return an error.
	 * OP_RENAME is index 5, so if array len <= 6 we return an error.
	 */
	if ((res.status != NFS4_OK) && (res.array_len <= 6)) {
		/*
		 * Error in an op other than last Getattr
		 */
		e.error = geterrno4(res.status);
		PURGE_ATTRCACHE4(odvp);
		PURGE_ATTRCACHE4(ndvp);
		/*
		 * System V defines rename to return EEXIST, not
		 * ENOTEMPTY, if the target directory is not empty.
		 * Over the wire, the error is NFSERR_ENOTEMPTY
		 * which geterrno4 maps to ENOTEMPTY.
		 */
		if (e.error == ENOTEMPTY)
			e.error = EEXIST;
		nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state,
		    needrecov);
		goto out;
	}

	/* rename results */
	rn_res = &res.array[5].nfs_resop4_u.oprename;

	if (res.status == NFS4_OK) {
		/* Update target attribute, readdir and dnlc caches */
		dinfo.di_garp =
		    &res.array[6].nfs_resop4_u.opgetattr.ga_res;
		dinfo.di_cred = cr;
		dinfo.di_time_call = t;
	} else
		dinfop = NULL;

	/* Update target cache attribute, readdir and dnlc caches */
	nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop);

	/* Update source cache attribute, readdir and dnlc caches */
	if (ndvp != odvp) {
		update_parentdir_sfh(ovp, ndvp);

		/*
		 * If dinfop is non-NULL, then the compound succeeded, so
		 * set di_garp to attrs for source dir.  dinfop is only
		 * set to NULL when the compound fails.
		 */
		if (dinfop)
			dinfo.di_garp =
			    &res.array[11].nfs_resop4_u.opgetattr.ga_res;
		nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL,
		    dinfop);
	}

	/*
	 * Update the rnode with the new component name and args,
	 * and if the file handle changed, also update it with the new fh.
	 * This is only necessary if the target object has an rnode
	 * entry and there is no need to create one for it.
	 */
	resop = &res.array[8];	/* getfh new res */
	ngf_res = &resop->nfs_resop4_u.opgetfh;

	/*
	 * Update the path and filehandle for the renamed object.
	 */
	nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm);

	nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov);

	if (res.status == NFS4_OK) {
		resop++;	/* getattr res */
		e.error = nfs4_update_attrcache(res.status,
		    &resop->nfs_resop4_u.opgetattr.ga_res,
		    t, ovp, cr);
	}

out:
	kmem_free(argop, argoplist_size);
	if (resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
	mutex_enter(&orp->r_statelock);
	orp->r_flags &= ~R4RECEXPFH;
	cv_broadcast(&orp->r_cv);
	mutex_exit(&orp->r_statelock);

	return (e.error);
}
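
/*
 * Note on R4RECEXPFH above: the flag closes the window between the
 * RENAME going over the wire and nfs4rename_update() installing the
 * object's new name and filehandle.  If FHEXPIRED recovery ran inside
 * that window it would try to re-establish the filehandle using the
 * stale pre-rename path.  Concurrent renamers of the same object
 * serialize on the flag and are woken via r_cv when it clears.
 */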
/* ARGSUSED */
static int
nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
    caller_context_t *ct, int flags, vsecattr_t *vsecp)
{
	int error;
	vnode_t *vp;

	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
		return (EPERM);
	/*
	 * Since ".." has special meaning, rather than sending a mkdir
	 * over the wire just to let the server reject it, we short
	 * circuit it here and return EEXIST.
	 */
	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
		return (EEXIST);

	/*
	 * Decision to get the right gid and setgid bit of the
	 * new directory is now made in call_nfs4_create_req.
	 */
	va->va_mask |= AT_MODE;
	error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR);
	if (error)
		return (error);

	*vpp = vp;
	return (0);
}
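
/*
 * Both nfs4_mkdir above and nfs4_rmdir below refuse ".." locally
 * rather than sending a request the server is guaranteed to reject.
 * The name always exists, so EEXIST is the natural answer, and it
 * matches the System V convention used for non-empty directories
 * elsewhere in this file (see the ENOTEMPTY -> EEXIST mappings).
 */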
/*
 * rmdir uses the same remove v4 op as remove does.
 * Remove requires that the current fh be the target directory.
 * After the operation, the current fh is unchanged.
 * The compound op structure is:
 *	PUTFH(targetdir), REMOVE
 */
/*ARGSUSED4*/
static int
nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int need_end_op = FALSE;
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res, *resp = NULL;
	REMOVE4res *rm_res;
	nfs_argop4 argop[3];
	nfs_resop4 *resop;
	vnode_t *vp;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *drp;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
	dirattr_info_t dinfo, *dinfop;

	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
		return (EPERM);
	/*
	 * Since ".." has special meaning, rather than sending a rmdir
	 * over the wire just to let the server reject it, we short
	 * circuit it here and return EEXIST.
	 */
	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
		return (EEXIST);

	drp = VTOR4(dvp);
	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
		return (EINTR);

	/*
	 * Attempt to prevent a rmdir(".") from succeeding.
	 */
	e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
	if (e.error) {
		nfs_rw_exit(&drp->r_rwlock);
		return (e.error);
	}
	if (vp == cdir) {
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (EINVAL);
	}

	/*
	 * Since the nfsv4 remove op works on both files and directories,
	 * check that the removed object is indeed a directory.
	 */
	if (vp->v_type != VDIR) {
		VN_RELE(vp);
		nfs_rw_exit(&drp->r_rwlock);
		return (ENOTDIR);
	}

	/*
	 * First just remove the entry from the name cache, as it
	 * is most likely an entry for this vp.
	 */
	dnlc_remove(dvp, nm);

	/*
	 * If the vnode reference count is greater than one, then
	 * there may be additional references in the DNLC which will
	 * need to be purged.  First, try removing the entry for
	 * the parent directory and see if that removes the additional
	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
	 * to completely remove any references to the directory which
	 * might still exist in the DNLC.
	 */
	if (vp->v_count > 1) {
		dnlc_remove(vp, "..");
		if (vp->v_count > 1)
			dnlc_purge_vp(vp);
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

recov_retry:
	args.ctag = TAG_RMDIR;

	/*
	 * Rmdir ops: putfh dir; remove; getattr
	 */
	args.array_len = 3;
	args.array = argop;

	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
	if (e.error) {
		nfs_rw_exit(&drp->r_rwlock);
		return (e.error);
	}
	need_end_op = TRUE;

	/* putfh directory */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;

	/* remove */
	argop[1].argop = OP_CREMOVE;
	argop[1].nfs_argop4_u.opcremove.ctarget = nm;

	/* getattr (postop attrs for dir that contained removed dir) */
	argop[2].argop = OP_GETATTR;
	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[2].nfs_argop4_u.opgetattr.mi = mi;

	dinfo.di_time_call = gethrtime();
	doqueue = 1;
	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	PURGE_ATTRCACHE4(vp);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (e.error) {
		PURGE_ATTRCACHE4(dvp);
	}

	if (needrecov) {
		if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL,
		    NULL, OP_REMOVE, NULL) == FALSE) {
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);

			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
			    needrecov);
			need_end_op = FALSE;
			goto recov_retry;
		}
	}

	if (!e.error) {
		resp = &res;

		/*
		 * Only return an error if the first 2 ops (OP_REMOVE
		 * or earlier) failed.
		 */
		if (res.status != NFS4_OK && res.array_len <= 2) {
			e.error = geterrno4(res.status);
			PURGE_ATTRCACHE4(dvp);
			nfs4_end_op(VTOMI4(dvp), dvp, NULL,
			    &recov_state, needrecov);
			need_end_op = FALSE;
			nfs4_purge_stale_fh(e.error, dvp, cr);
			/*
			 * System V defines rmdir to return EEXIST, not
			 * ENOTEMPTY, if the directory is not empty.  Over
			 * the wire, the error is NFSERR_ENOTEMPTY which
			 * geterrno4 maps to ENOTEMPTY.
			 */
			if (e.error == ENOTEMPTY)
				e.error = EEXIST;
		} else {
			resop = &res.array[1];	/* remove res */
			rm_res = &resop->nfs_resop4_u.opremove;

			if (res.status == NFS4_OK) {
				resop = &res.array[2];	/* dir attrs */
				dinfo.di_garp =
				    &resop->nfs_resop4_u.opgetattr.ga_res;
				dinfo.di_cred = cr;
				dinfop = &dinfo;
			} else
				dinfop = NULL;

			/* Update dir attribute, readdir and dnlc caches */
			nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
			    dinfop);

			/* destroy rddir cache for dir that was removed */
			if (VTOR4(vp)->r_dir != NULL)
				nfs4_purge_rddir_cache(vp);
		}
	}

	if (need_end_op)
		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);

	nfs_rw_exit(&drp->r_rwlock);

	if (resp)
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);

	if (e.error == 0) {
		vnode_t *tvp;
		rnode4_t *trp;
		trp = VTOR4(vp);
		tvp = vp;
		if (IS_SHADOW(vp, trp))
			tvp = RTOV4(trp);
		vnevent_rmdir(tvp, dvp, nm, ct);
	}

	VN_RELE(vp);

	return (e.error);
}

/* ARGSUSED */
static int
nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
	int error;
	vnode_t *vp;
	rnode4_t *rp;
	char *contents;
	mntinfo4_t *mi = VTOMI4(dvp);

	if (nfs_zone() != mi->mi_zone)
		return (EPERM);
	if (!(mi->mi_flags & MI4_SYMLINK))
		return (EOPNOTSUPP);

	error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK);
	if (error)
		return (error);

	ASSERT(nfs4_consistent_type(vp));
	rp = VTOR4(vp);
	if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {

		contents = kmem_alloc(MAXPATHLEN, KM_SLEEP);

		if (contents != NULL) {
			mutex_enter(&rp->r_statelock);
			if (rp->r_symlink.contents == NULL) {
				rp->r_symlink.len = strlen(tnm);
				bcopy(tnm, contents, rp->r_symlink.len);
				rp->r_symlink.contents = contents;
				rp->r_symlink.size = MAXPATHLEN;
				mutex_exit(&rp->r_statelock);
			} else {
				mutex_exit(&rp->r_statelock);
				kmem_free((void *)contents, MAXPATHLEN);
			}
		}
	}
	VN_RELE(vp);

	return (error);
}
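
/*
 * Note: the r_symlink cache filled in above lets a later readlink of
 * this symlink be satisfied locally instead of issuing a READLINK
 * over the wire; nfs4_do_symlink_cache is the global switch for this
 * behavior.  Since we just created the link, the target string tnm is
 * authoritative and is safe to cache.
 */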
/*
 * Read directory entries.
 * There are some weird things to look out for here.  The uio_loffset
 * field is either 0 or it is the offset returned from a previous
 * readdir.  It is an opaque value used by the server to find the
 * correct directory block to read.  The count field is the number
 * of blocks to read on the server.  This is advisory only, the server
 * may return only one block's worth of entries.  Entries may be compressed
 * on the server.
 */
/* ARGSUSED */
static int
nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	int error;
	uint_t count;
	rnode4_t *rp;
	rddir4_cache *rdc;
	rddir4_cache *rrdc;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	rp = VTOR4(vp);

	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	/*
	 * Make sure that the directory cache is valid.
	 */
	if (rp->r_dir != NULL) {
		if (nfs_disable_rddir_cache != 0) {
			/*
			 * Setting nfs_disable_rddir_cache in /etc/system
			 * allows interoperability with servers that do not
			 * properly update the attributes of directories.
			 * Any cached information gets purged before an
			 * access is made to it.
			 */
			nfs4_purge_rddir_cache(vp);
		}

		error = nfs4_validate_caches(vp, cr);
		if (error)
			return (error);
	}

	count = MIN(uiop->uio_iov->iov_len, MAXBSIZE);

	/*
	 * Short circuit last readdir which always returns 0 bytes.
	 * This can be done after the directory has been read through
	 * completely at least once.  This will set r_direof which
	 * can be used to find the value of the last cookie.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_direof != NULL &&
	    uiop->uio_loffset == rp->r_direof->nfs4_ncookie) {
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs4_readdir_cache_shorts++;
#endif
		if (eofp)
			*eofp = 1;
		return (0);
	}

	/*
	 * Look for a cache entry.  Cache entries are identified
	 * by the NFS cookie value and the byte count requested.
	 */
	rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count);

	/*
	 * If rdc is NULL then the lookup resulted in an unrecoverable error.
	 */
	if (rdc == NULL) {
		mutex_exit(&rp->r_statelock);
		return (EINTR);
	}

	/*
	 * Check to see if we need to fill this entry in.
	 */
	if (rdc->flags & RDDIRREQ) {
		rdc->flags &= ~RDDIRREQ;
		rdc->flags |= RDDIR;
		mutex_exit(&rp->r_statelock);

		/*
		 * Do the readdir.
		 */
		nfs4readdir(vp, rdc, cr);

		/*
		 * Reacquire the lock, so that we can continue
		 */
		mutex_enter(&rp->r_statelock);
		/*
		 * The entry is now complete
		 */
		rdc->flags &= ~RDDIR;
	}

	ASSERT(!(rdc->flags & RDDIR));

	/*
	 * If an error occurred while attempting
	 * to fill the cache entry, mark the entry invalid and
	 * just return the error.
	 */
	if (rdc->error) {
		error = rdc->error;
		rdc->flags |= RDDIRREQ;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * The cache entry is complete and good,
	 * copyout the dirent structs to the calling
	 * thread.
	 */
	error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop);

	/*
	 * If no error occurred during the copyout,
	 * update the offset in the uio struct to
	 * contain the value of the next NFS 4 cookie
	 * and set the eof value appropriately.
	 */
	if (!error) {
		uiop->uio_loffset = rdc->nfs4_ncookie;
		if (eofp)
			*eofp = rdc->eof;
	}

	/*
	 * Decide whether to do readahead.  Don't if we
	 * have already read to the end of directory.
	 */
	if (rdc->eof) {
		/*
		 * Make the entry the direof only if it is cached
		 */
		if (rdc->flags & RDDIRCACHED)
			rp->r_direof = rdc;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/* Determine if a readdir readahead should be done */
	if (!(rp->r_flags & R4LOOKUP)) {
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * Now look for a readahead entry.
	 *
	 * Check to see whether we found an entry for the readahead.
	 * If so, we don't need to do anything further, so free the new
	 * entry if one was allocated.  Otherwise, allocate a new entry, add
	 * it to the cache, and then initiate an asynchronous readdir
	 * operation to fill it.
	 */
	rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count);

	/*
	 * A readdir cache entry could not be obtained for the readahead.  In
	 * this case we skip the readahead and return.
	 */
	if (rrdc == NULL) {
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * Check to see if we need to fill this entry in.
	 */
	if (rrdc->flags & RDDIRREQ) {
		rrdc->flags &= ~RDDIRREQ;
		rrdc->flags |= RDDIR;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs4_readdir_readahead++;
#endif
		/*
		 * Do the readdir.
		 */
		nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir);
		return (error);
	}

	rddir4_cache_rele(rp, rrdc);
	rddir4_cache_rele(rp, rdc);
	mutex_exit(&rp->r_statelock);
	return (error);
}

static int
do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
{
	int error;
	rnode4_t *rp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	/*
	 * Obtain the readdir results for the caller.
	 */
	nfs4readdir(vp, rdc, cr);

	mutex_enter(&rp->r_statelock);
	/*
	 * The entry is now complete
	 */
	rdc->flags &= ~RDDIR;

	error = rdc->error;
	if (error)
		rdc->flags |= RDDIRREQ;
	rddir4_cache_rele(rp, rdc);
	mutex_exit(&rp->r_statelock);

	return (error);
}
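
/*
 * Lifecycle of an rddir4_cache entry, as used by nfs4_readdir and
 * do_nfs4readdir above: RDDIRREQ set means the entry needs to be
 * (re)filled; RDDIR set means an over-the-wire readdir is in flight;
 * with neither set, entries/actlen/eof/error are valid.  On error the
 * entry is flipped back to RDDIRREQ so the next reader retries the
 * over-the-wire request.
 */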
/*
 * Read directory entries.
 * There are some weird things to look out for here.  The uio_loffset
 * field is either 0 or it is the offset returned from a previous
 * readdir.  It is an opaque value used by the server to find the
 * correct directory block to read.  The count field is the number
 * of blocks to read on the server.  This is advisory only, the server
 * may return only one block's worth of entries.  Entries may be compressed
 * on the server.
 *
 * Generates the following compound request:
 * 1. If readdir offset is zero and no dnlc entry for parent exists,
 *    must include a Lookupp as well.  In this case, send:
 *	{ Putfh <fh>; Readdir; Lookupp; Getfh; Getattr }
 * 2. Otherwise just do: { Putfh <fh>; Readdir }
 *
 * Get complete attributes and filehandles for entries if this is the
 * first read of the directory.  Otherwise, just get fileids.
 */
static void
nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	READDIR4args *rargs;
	READDIR4res_clnt *rd_res;
	bitmap4 rd_bitsval;
	nfs_argop4 argop[5];
	nfs_resop4 *resop;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);
	int doqueue;
	u_longlong_t nodeid, pnodeid;	/* id's of dir and its parents */
	vnode_t *dvp;
	nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie;
	int num_ops, res_opcnt;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == mi->mi_zone);
	ASSERT(rdc->flags & RDDIR);
	ASSERT(rdc->entries == NULL);

	/*
	 * If rp were a stub, it should have triggered and caused
	 * a mount for us to get this far.
	 */
	ASSERT(!RP_ISSTUB(rp));

	num_ops = 2;
	if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) {
		/*
		 * Since nfsv4 readdir may not return entries for "." and
		 * "..", the client must recreate them:
		 * To find the correct nodeid, do the following:
		 * For the current node, get nodeid from dnlc.
		 * - if current node is rootvp, set pnodeid to nodeid.
		 * - else if parent is in the dnlc, get its nodeid from there.
		 * - else add LOOKUPP+GETATTR to compound.
		 */
		nodeid = rp->r_attr.va_nodeid;
		if (vp->v_flag & VROOT) {
			pnodeid = nodeid;	/* root of mount point */
		} else {
			dvp = dnlc_lookup(vp, "..");
			if (dvp != NULL && dvp != DNLC_NO_VNODE) {
				/* parent in dnlc cache - no need for otw */
				pnodeid = VTOR4(dvp)->r_attr.va_nodeid;
			} else {
				/*
				 * parent not in dnlc cache,
				 * do lookupp to get its id
				 */
				num_ops = 5;
				pnodeid = 0; /* set later by getattr parent */
			}
			if (dvp)
				VN_RELE(dvp);
		}
	}
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/* Save the original mount point security flavor */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

recov_retry:
	args.ctag = TAG_READDIR;

	args.array = argop;
	args.array_len = num_ops;

	if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
	    &recov_state, NULL)) {
		/*
		 * If readdir a node that is a stub for a crossed mount point,
		 * keep the original secinfo flavor for the current file
		 * system, not the crossed one.
		 */
		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
		rdc->error = e.error;
		return;
	}

	/*
	 * Determine which attrs to request for dirents.  This code
	 * must be protected by nfs4_start/end_fop because of r_server
	 * (which will change during failover recovery).
	 */
	if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) {
		/*
		 * Get all vattr attrs plus filehandle and rdattr_error
		 */
		rd_bitsval = NFS4_VATTR_MASK |
		    FATTR4_RDATTR_ERROR_MASK |
		    FATTR4_FILEHANDLE_MASK;

		if (rp->r_flags & R4READDIRWATTR) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags &= ~R4READDIRWATTR;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		servinfo4_t *svp = rp->r_server;

		/*
		 * Already read directory.  Use readdir with
		 * no attrs (except for mounted_on_fileid) for updates.
		 */
		rd_bitsval = FATTR4_RDATTR_ERROR_MASK;

		/*
		 * Request mounted_on_fileid if supported, else request
		 * fileid.  Maybe we should verify that fileid is supported
		 * and request something else if it is not.
		 */
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK)
			rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK;
		nfs_rw_exit(&svp->sv_lock);
	}

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	argop[1].argop = OP_READDIR;
	rargs = &argop[1].nfs_argop4_u.opreaddir;
	/*
	 * 1 and 2 are reserved for client "." and ".." entry offset.
	 * cookie 0 should be used over-the-wire to start reading at
	 * the beginning of the directory excluding "." and "..".
	 */
	if (rdc->nfs4_cookie == 0 ||
	    rdc->nfs4_cookie == 1 ||
	    rdc->nfs4_cookie == 2) {
		rargs->cookie = (nfs_cookie4)0;
		rargs->cookieverf = 0;
	} else {
		rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie;
		mutex_enter(&rp->r_statelock);
		rargs->cookieverf = rp->r_cookieverf4;
		mutex_exit(&rp->r_statelock);
	}
	rargs->dircount = MIN(rdc->buflen, mi->mi_tsize);
	rargs->maxcount = mi->mi_tsize;
	rargs->attr_request = rd_bitsval;
	rargs->rdc = rdc;
	rargs->dvp = vp;
	rargs->mi = mi;
	rargs->cr = cr;
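
	/*
	 * dircount and maxcount bound different things (RFC 3530):
	 * dircount is a hint limiting just the XDR-encoded names and
	 * cookies, which is why it is sized from the caller's buffer,
	 * while maxcount caps the size of the entire READDIR reply and
	 * is therefore tied to the mount's transfer size.
	 */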

	/*
	 * If the count is less than the minimum required, we return no
	 * entries and fail with EINVAL.
	 */
	if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) {
		rdc->error = EINVAL;
		goto out;
	}

	if (args.array_len == 5) {
		/*
		 * Add lookupp and getattr for parent nodeid.
		 */
		argop[2].argop = OP_LOOKUPP;

		argop[3].argop = OP_GETFH;

		/* getattr parent */
		argop[4].argop = OP_GETATTR;
		argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[4].nfs_argop4_u.opgetattr.mi = mi;
	}

	doqueue = 1;

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	/* capture the time of this call */
	rargs->t = t = gethrtime();

	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);

	/*
	 * If an RPC error occurred and it isn't an error that
	 * triggers recovery, then go ahead and fail now.
	 */
	if (e.error != 0 && !needrecov) {
		rdc->error = e.error;
		goto out;
	}

	if (needrecov) {
		bool_t abort;

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4readdir: initiating recovery.\n"));

		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_READDIR, NULL);
		if (abort == FALSE) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			if (rdc->entries != NULL) {
				kmem_free(rdc->entries, rdc->entlen);
				rdc->entries = NULL;
			}
			goto recov_retry;
		}

		if (e.error != 0) {
			rdc->error = e.error;
			goto out;
		}

		/* fall through for res.status case */
	}

	res_opcnt = res.array_len;

	/*
	 * If the compound failed in the first 2 ops (PUTFH+READDIR), then
	 * return failure here.  Subsequent ops are for filling out the
	 * dot-dot dirent, and if they fail, we still want to give the caller
	 * the dirents returned by (the successful) READDIR op, so we need
	 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR).
	 *
	 * One example where PUTFH+READDIR ops would succeed but
	 * LOOKUPP+GETATTR would fail would be a dir that has r perm
	 * but lacks x.  In this case, a POSIX server's VOP_READDIR
	 * would succeed; however, VOP_LOOKUP(..) would fail since no
	 * x perm.  We need to come up with a non-vendor-specific way
	 * for a POSIX server to return d_ino from dotdot's dirent if
	 * the client only requests mounted_on_fileid, and just say the
	 * LOOKUPP succeeded and fill out the GETATTR.  However, if the
	 * client requested any mandatory attrs, the server would be required
	 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR
	 * for dotdot.
	 */

	if (res.status) {
		if (res_opcnt <= 2) {
			e.error = geterrno4(res.status);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
			    &recov_state, needrecov);
			nfs4_purge_stale_fh(e.error, vp, cr);
			rdc->error = e.error;
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			if (rdc->entries != NULL) {
				kmem_free(rdc->entries, rdc->entlen);
				rdc->entries = NULL;
			}
			/*
			 * If readdir a node that is a stub for a
			 * crossed mount point, keep the original
			 * secinfo flavor for the current file system,
			 * not the crossed one.
			 */
			(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
			return;
		}
	}

	resop = &res.array[1];	/* readdir res */
	rd_res = &resop->nfs_resop4_u.opreaddirclnt;

	mutex_enter(&rp->r_statelock);
	rp->r_cookieverf4 = rd_res->cookieverf;
	mutex_exit(&rp->r_statelock);

	/*
	 * For "." and ".." entries
	 * e.g.
	 * seek(cookie=0) -> "." entry with d_off = 1
	 * seek(cookie=1) -> ".." entry with d_off = 2
	 */
	if (cookie == (nfs_cookie4) 0) {
		if (rd_res->dotp)
			rd_res->dotp->d_ino = nodeid;
		if (rd_res->dotdotp)
			rd_res->dotdotp->d_ino = pnodeid;
	}
	if (cookie == (nfs_cookie4) 1) {
		if (rd_res->dotdotp)
			rd_res->dotdotp->d_ino = pnodeid;
	}


	/* LOOKUPP+GETATTR attempted */
	if (args.array_len == 5 && rd_res->dotdotp) {
		if (res.status == NFS4_OK && res_opcnt == 5) {
			nfs_fh4 *fhp;
			nfs4_sharedfh_t *sfhp;
			vnode_t *pvp;
			nfs4_ga_res_t *garp;

			resop++;	/* lookupp */
			resop++;	/* getfh */
			fhp = &resop->nfs_resop4_u.opgetfh.object;

			resop++;	/* getattr of parent */

			/*
			 * First, take care of finishing the
			 * readdir results.
			 */
			garp = &resop->nfs_resop4_u.opgetattr.ga_res;
			/*
			 * The d_ino of .. must be the inode number
			 * of the mounted filesystem.
			 */
			if (garp->n4g_va.va_mask & AT_NODEID)
				rd_res->dotdotp->d_ino =
				    garp->n4g_va.va_nodeid;


			/*
			 * Next, create the ".." dnlc entry
			 */
			sfhp = sfh4_get(fhp, mi);
			if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) {
				dnlc_update(vp, "..", pvp);
				VN_RELE(pvp);
			}
			sfh4_rele(&sfhp);
		}
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
		KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen;
		mutex_exit(&mi->mi_lock);
	}

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

out:
	/*
	 * If readdir a node that is a stub for a crossed mount point,
	 * keep the original secinfo flavor for the current file system,
	 * not the crossed one.
	 */
	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);

	nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov);
}
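
/*
 * Note on credential selection in nfs4_bio below, in outline: buffered
 * I/O can be issued by a thread whose credential differs from the one
 * that opened the file, so nfs4_get_otw_cred_by_osp() prefers the
 * credential recorded in the open stream state.  On EACCES the loop
 * retries with an alternate credential until the helper reports that
 * it has handed out the last one (last_time).
 */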

static int
nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead)
{
	rnode4_t *rp = VTOR4(bp->b_vp);
	int count;
	int error;
	cred_t *cred_otw = NULL;
	offset_t offset;
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting otw cred */
	bool_t last_time = FALSE;	/* last time getting otw cred */

	ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone);

	DTRACE_IO1(start, struct buf *, bp);
	offset = ldbtob(bp->b_lblkno);

	if (bp->b_flags & B_READ) {
	read_again:
		/*
		 * Releases the osp, if it is provided.
		 * Puts a hold on the cred_otw and the new osp (if found).
		 */
		cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
		    &first_time, &last_time);
		error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr,
		    offset, bp->b_bcount, &bp->b_resid, cred_otw,
		    readahead, NULL);
		crfree(cred_otw);
		if (!error) {
			if (bp->b_resid) {
				/*
				 * Didn't get it all because we hit EOF,
				 * zero all the memory beyond the EOF.
				 */
				/* bzero(rdaddr + */
				bzero(bp->b_un.b_addr +
				    bp->b_bcount - bp->b_resid, bp->b_resid);
			}
			mutex_enter(&rp->r_statelock);
			if (bp->b_resid == bp->b_bcount &&
			    offset >= rp->r_size) {
				/*
				 * We didn't read anything at all as we are
				 * past EOF.  Return an error indicator back
				 * but don't destroy the pages (yet).
				 */
				error = NFS_EOF;
			}
			mutex_exit(&rp->r_statelock);
		} else if (error == EACCES && last_time == FALSE) {
			goto read_again;
		}
	} else {
		if (!(rp->r_flags & R4STALE)) {
		write_again:
			/*
			 * Releases the osp, if it is provided.
			 * Puts a hold on the cred_otw and the new
			 * osp (if found).
			 */
			cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
			    &first_time, &last_time);
			mutex_enter(&rp->r_statelock);
			count = MIN(bp->b_bcount, rp->r_size - offset);
			mutex_exit(&rp->r_statelock);
			if (count < 0)
				cmn_err(CE_PANIC, "nfs4_bio: write count < 0");
#ifdef DEBUG
			if (count == 0) {
				zoneid_t zoneid = getzoneid();

				zcmn_err(zoneid, CE_WARN,
				    "nfs4_bio: zero length write at %lld",
				    offset);
				zcmn_err(zoneid, CE_CONT, "flags=0x%x, "
				    "b_bcount=%ld, file size=%lld",
				    rp->r_flags, (long)bp->b_bcount,
				    rp->r_size);
				sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh);
				if (nfs4_bio_do_stop)
					debug_enter("nfs4_bio");
			}
#endif
			error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset,
			    count, cred_otw, stab_comm);
			if (error == EACCES && last_time == FALSE) {
				crfree(cred_otw);
				goto write_again;
			}
			bp->b_error = error;
			if (error && error != EINTR &&
			    !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
				/*
				 * Don't print EDQUOT errors on the console.
				 * Don't print asynchronous EACCES errors.
				 * Don't print EFBIG errors.
				 * Print all other write errors.
				 */
				if (error != EDQUOT && error != EFBIG &&
				    (error != EACCES ||
				    !(bp->b_flags & B_ASYNC)))
					nfs4_write_error(bp->b_vp,
					    error, cred_otw);
				/*
				 * Update r_error and r_flags as appropriate.
				 * If the error was ESTALE, then mark the
				 * rnode as not being writeable and save
				 * the error status.  Otherwise, save any
				 * errors which occur from asynchronous
				 * page invalidations.  Any errors occurring
				 * from other operations should be saved
				 * by the caller.
				 */
				mutex_enter(&rp->r_statelock);
				if (error == ESTALE) {
					rp->r_flags |= R4STALE;
					if (!rp->r_error)
						rp->r_error = error;
				} else if (!rp->r_error &&
				    (bp->b_flags &
				    (B_INVAL|B_FORCE|B_ASYNC)) ==
				    (B_INVAL|B_FORCE|B_ASYNC)) {
					rp->r_error = error;
				}
				mutex_exit(&rp->r_statelock);
			}
			crfree(cred_otw);
		} else {
			error = rp->r_error;
			/*
			 * A close may have cleared r_error, if so,
			 * propagate the ESTALE error return properly.
			 */
			if (error == 0)
				error = ESTALE;
		}
	}

	if (error != 0 && error != NFS_EOF)
		bp->b_flags |= B_ERROR;

	if (osp)
		open_stream_rele(osp, rp);

	DTRACE_IO1(done, struct buf *, bp);

	return (error);
}

/* ARGSUSED */
int
nfs4_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	return (EREMOTE);
}

/* ARGSUSED2 */
int
nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
{
	rnode4_t *rp = VTOR4(vp);

	if (!write_lock) {
		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
		return (V_WRITELOCK_FALSE);
	}

	if ((rp->r_flags & R4DIRECTIO) ||
	    (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) {
		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
		if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp))
			return (V_WRITELOCK_FALSE);
		nfs_rw_exit(&rp->r_rwlock);
	}

	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
	return (V_WRITELOCK_TRUE);
}

/* ARGSUSED */
void
nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
{
	rnode4_t *rp = VTOR4(vp);

	nfs_rw_exit(&rp->r_rwlock);
}

/* ARGSUSED */
static int
nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
{
	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	/*
	 * Because we stuff the readdir cookie into the offset field,
	 * someone may attempt to do an lseek with the cookie, which
	 * we want to succeed.
	 */
	if (vp->v_type == VDIR)
		return (0);
	if (*noffp < 0)
		return (EINVAL);
	return (0);
}


/*
 * Return all the pages from [off..off+len) in file
 */
/* ARGSUSED */
static int
nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
	rnode4_t *rp;
	int error;
	mntinfo4_t *mi;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	rp = VTOR4(vp);
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (protp != NULL)
		*protp = PROT_ALL;

	/*
	 * Now validate that the caches are up to date.
	 */
	if (error = nfs4_validate_caches(vp, cr))
		return (error);

	mi = VTOMI4(vp);
retry:
	mutex_enter(&rp->r_statelock);

	/*
	 * Don't create dirty pages faster than they
	 * can be cleaned so that the system doesn't
	 * get imbalanced.  If the async queue is
	 * maxed out, then wait for it to drain before
	 * creating more dirty pages.  Also, wait for
	 * any threads doing pagewalks in the vop_getattr
	 * entry points so that they don't block for
	 * long periods.
	 */
	if (rw == S_CREATE) {
		while ((mi->mi_max_threads != 0 &&
		    rp->r_awcount > 2 * mi->mi_max_threads) ||
		    rp->r_gcount > 0)
			cv_wait(&rp->r_cv, &rp->r_statelock);
	}

	/*
	 * If we are getting called as a side effect of an nfs_write()
	 * operation the local file size might not be extended yet.
	 * In this case we want to be able to return pages of zeroes.
	 */
	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
		NFS4_DEBUG(nfs4_pageio_debug,
		    (CE_NOTE, "getpage beyond EOF: off=%lld, "
		    "len=%llu, size=%llu, attrsize=%llu", off,
		    (u_longlong_t)len, rp->r_size, rp->r_attr.va_size));
		mutex_exit(&rp->r_statelock);
		return (EFAULT);	/* beyond EOF */
	}

	mutex_exit(&rp->r_statelock);

	if (len <= PAGESIZE) {
		error = nfs4_getapage(vp, off, len, protp, pl, plsz,
		    seg, addr, rw, cr);
		NFS4_DEBUG(nfs4_pageio_debug && error,
		    (CE_NOTE, "getpage error %d; off=%lld, "
		    "len=%lld", error, off, (u_longlong_t)len));
	} else {
		error = pvn_getpages(nfs4_getapage, vp, off, len, protp,
		    pl, plsz, seg, addr, rw, cr);
		NFS4_DEBUG(nfs4_pageio_debug && error,
		    (CE_NOTE, "getpages error %d; off=%lld, "
		    "len=%lld", error, off, (u_longlong_t)len));
	}

	switch (error) {
	case NFS_EOF:
		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE);
		goto retry;
	case ESTALE:
		nfs4_purge_stale_fh(error, vp, cr);
	}

	return (error);
}
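
/*
 * Note on the NFS_EOF case above: it means the cached file size
 * claimed the data should exist but the read came back entirely past
 * the server's EOF.  Purging the attribute cache and retrying
 * re-fetches the size, so the retried request either succeeds or
 * fails cleanly with EFAULT.
 */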
/*
 * Called from pvn_getpages or nfs4_getpage to get a particular page.
 */
/* ARGSUSED */
static int
nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr)
{
	rnode4_t *rp;
	uint_t bsize;
	struct buf *bp;
	page_t *pp;
	u_offset_t lbn;
	u_offset_t io_off;
	u_offset_t blkoff;
	u_offset_t rablkoff;
	size_t io_len;
	uint_t blksize;
	int error;
	int readahead;
	int readahead_issued = 0;
	int ra_window;		/* readahead window */
	page_t *pagefound;
	page_t *savepp;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	rp = VTOR4(vp);
	ASSERT(!IS_SHADOW(vp, rp));
	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

reread:
	bp = NULL;
	pp = NULL;
	pagefound = NULL;

	if (pl != NULL)
		pl[0] = NULL;

	error = 0;
	lbn = off / bsize;
	blkoff = lbn * bsize;

	/*
	 * Queueing up the readahead before doing the synchronous read
	 * results in a significant increase in read throughput because
	 * of the increased parallelism between the async threads and
	 * the process context.
	 */
	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
	    rw != S_CREATE &&
	    !(vp->v_flag & VNOCACHE)) {
		mutex_enter(&rp->r_statelock);

		/*
		 * Calculate the number of readaheads to do.
		 * a) No readaheads at offset = 0.
		 * b) Do maximum(nfs4_nra) readaheads when the readahead
		 *    window is closed.
		 * c) Do readaheads between 1 and (nfs4_nra - 1) depending
		 *    upon how far the readahead window is open or closed.
		 * d) No readaheads if rp->r_nextr is not within the scope
		 *    of the readahead window (random i/o).
		 */

		if (off == 0)
			readahead = 0;
		else if (blkoff == rp->r_nextr)
			readahead = nfs4_nra;
		else if (rp->r_nextr > blkoff &&
		    ((ra_window = (rp->r_nextr - blkoff) / bsize)
		    <= (nfs4_nra - 1)))
			readahead = nfs4_nra - ra_window;
		else
			readahead = 0;

		rablkoff = rp->r_nextr;
		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
			mutex_exit(&rp->r_statelock);
			if (nfs4_async_readahead(vp, rablkoff + bsize,
			    addr + (rablkoff + bsize - off),
			    seg, cr, nfs4_readahead) < 0) {
				mutex_enter(&rp->r_statelock);
				break;
			}
			readahead--;
			rablkoff += bsize;
			/*
			 * Indicate that we did a readahead so
			 * readahead offset is not updated
			 * by the synchronous read below.
			 */
			readahead_issued = 1;
			mutex_enter(&rp->r_statelock);
			/*
			 * set readahead offset to
			 * offset of last async readahead
			 * request.
			 */
			rp->r_nextr = rablkoff;
		}
		mutex_exit(&rp->r_statelock);
	}

again:
	if ((pagefound = page_exists(vp, off)) == NULL) {
		if (pl == NULL) {
			(void) nfs4_async_readahead(vp, blkoff, addr, seg, cr,
			    nfs4_readahead);
		} else if (rw == S_CREATE) {
			/*
			 * Block for this page is not allocated, or the offset
			 * is beyond the current allocation size, or we're
			 * allocating a swap slot and the page was not found,
			 * so allocate it and return a zero page.
			 */
			if ((pp = page_create_va(vp, off,
			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
				cmn_err(CE_PANIC, "nfs4_getapage: page_create");
			io_len = PAGESIZE;
			mutex_enter(&rp->r_statelock);
			rp->r_nextr = off + PAGESIZE;
			mutex_exit(&rp->r_statelock);
		} else {
			/*
			 * Need to go to server to get a block
			 */
			mutex_enter(&rp->r_statelock);
			if (blkoff < rp->r_size &&
			    blkoff + bsize > rp->r_size) {
				/*
				 * If less than a block left in
				 * file read less than a block.
				 */
				if (rp->r_size <= off) {
					/*
					 * Trying to access beyond EOF,
					 * set up to get at least one page.
					 */
					blksize = off + PAGESIZE - blkoff;
				} else
					blksize = rp->r_size - blkoff;
			} else if ((off == 0) ||
			    (off != rp->r_nextr && !readahead_issued)) {
				blksize = PAGESIZE;
				blkoff = off;	/* block = page here */
			} else
				blksize = bsize;
			mutex_exit(&rp->r_statelock);

			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
			    &io_len, blkoff, blksize, 0);

			/*
			 * Some other thread has entered the page,
			 * so just use it.
			 */
			if (pp == NULL)
				goto again;

			/*
			 * Now round the request size up to page boundaries.
			 * This ensures that the entire page will be
			 * initialized to zeroes if EOF is encountered.
			 */
			io_len = ptob(btopr(io_len));

			bp = pageio_setup(pp, io_len, vp, B_READ);
			ASSERT(bp != NULL);

			/*
			 * pageio_setup should have set b_addr to 0.  This
			 * is correct since we want to do I/O on a page
			 * boundary.  bp_mapin will use this addr to calculate
			 * an offset, and then set b_addr to the kernel virtual
			 * address it allocated for us.
			 */
			ASSERT(bp->b_un.b_addr == 0);

			bp->b_edev = 0;
			bp->b_dev = 0;
			bp->b_lblkno = lbtodb(io_off);
			bp->b_file = vp;
			bp->b_offset = (offset_t)off;
			bp_mapin(bp);

			/*
			 * If doing a write beyond what we believe is EOF,
			 * don't bother trying to read the pages from the
			 * server, we'll just zero the pages here.  We
			 * don't check that the rw flag is S_WRITE here
			 * because some implementations may attempt a
			 * read access to the buffer before copying data.
			 */
			mutex_enter(&rp->r_statelock);
			if (io_off >= rp->r_size && seg == segkmap) {
				mutex_exit(&rp->r_statelock);
				bzero(bp->b_un.b_addr, io_len);
			} else {
				mutex_exit(&rp->r_statelock);
				error = nfs4_bio(bp, NULL, cr, FALSE);
			}

			/*
			 * Unmap the buffer before freeing it.
			 */
			bp_mapout(bp);
			pageio_done(bp);

			savepp = pp;
			do {
				pp->p_fsdata = C_NOCOMMIT;
			} while ((pp = pp->p_next) != savepp);

			if (error == NFS_EOF) {
				/*
				 * If doing a write system call just return
				 * zeroed pages, else user tried to get pages
				 * beyond EOF, return error.  We don't check
				 * that the rw flag is S_WRITE here because
				 * some implementations may attempt a read
				 * access to the buffer before copying data.
				 */
				if (seg == segkmap)
					error = 0;
				else
					error = EFAULT;
			}

			if (!readahead_issued && !error) {
				mutex_enter(&rp->r_statelock);
				rp->r_nextr = io_off + io_len;
				mutex_exit(&rp->r_statelock);
			}
		}
	}

out:
	if (pl == NULL)
		return (error);

	if (error) {
		if (pp != NULL)
			pvn_read_done(pp, B_ERROR);
		return (error);
	}

	if (pagefound) {
		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);

		/*
		 * Page exists in the cache, acquire the appropriate lock.
		 * If this fails, start all over again.
		 */
		if ((pp = page_lookup(vp, off, se)) == NULL) {
#ifdef DEBUG
			nfs4_lostpage++;
#endif
			goto reread;
		}
		pl[0] = pp;
		pl[1] = NULL;
		return (0);
	}

	if (pp != NULL)
		pvn_plist_init(pp, pl, plsz, off, io_len, rw);

	return (error);
}
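
/*
 * A worked example of the readahead window logic above, with
 * illustrative numbers (e.g. nfs4_nra == 4, bsize == 32K): after a
 * sequential read leaves r_nextr at 96K, a request at blkoff 64K
 * gives ra_window = (96K - 64K) / 32K = 1, so readahead = 3 and
 * asynchronous reads are queued at 128K, 160K, and 192K, starting one
 * block beyond r_nextr.  A request at blkoff 96K (window closed)
 * would queue the full nfs4_nra readaheads instead.
 */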
10002 */ 10003 if (seg == segkmap) 10004 error = 0; 10005 else 10006 error = EFAULT; 10007 } 10008 10009 if (!readahead_issued && !error) { 10010 mutex_enter(&rp->r_statelock); 10011 rp->r_nextr = io_off + io_len; 10012 mutex_exit(&rp->r_statelock); 10013 } 10014 } 10015 } 10016 10017 out: 10018 if (pl == NULL) 10019 return (error); 10020 10021 if (error) { 10022 if (pp != NULL) 10023 pvn_read_done(pp, B_ERROR); 10024 return (error); 10025 } 10026 10027 if (pagefound) { 10028 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED); 10029 10030 /* 10031 * Page exists in the cache, acquire the appropriate lock. 10032 * If this fails, start all over again. 10033 */ 10034 if ((pp = page_lookup(vp, off, se)) == NULL) { 10035 #ifdef DEBUG 10036 nfs4_lostpage++; 10037 #endif 10038 goto reread; 10039 } 10040 pl[0] = pp; 10041 pl[1] = NULL; 10042 return (0); 10043 } 10044 10045 if (pp != NULL) 10046 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 10047 10048 return (error); 10049 } 10050 10051 static void 10052 nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg, 10053 cred_t *cr) 10054 { 10055 int error; 10056 page_t *pp; 10057 u_offset_t io_off; 10058 size_t io_len; 10059 struct buf *bp; 10060 uint_t bsize, blksize; 10061 rnode4_t *rp = VTOR4(vp); 10062 page_t *savepp; 10063 10064 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10065 10066 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 10067 10068 mutex_enter(&rp->r_statelock); 10069 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) { 10070 /* 10071 * If less than a block left in file read less 10072 * than a block. 10073 */ 10074 blksize = rp->r_size - blkoff; 10075 } else 10076 blksize = bsize; 10077 mutex_exit(&rp->r_statelock); 10078 10079 pp = pvn_read_kluster(vp, blkoff, segkmap, addr, 10080 &io_off, &io_len, blkoff, blksize, 1); 10081 /* 10082 * The isra flag passed to the kluster function is 1, we may have 10083 * gotten a return value of NULL for a variety of reasons (# of free 10084 * pages < minfree, someone entered the page on the vnode etc). In all 10085 * cases, we want to punt on the readahead. 10086 */ 10087 if (pp == NULL) 10088 return; 10089 10090 /* 10091 * Now round the request size up to page boundaries. 10092 * This ensures that the entire page will be 10093 * initialized to zeroes if EOF is encountered. 10094 */ 10095 io_len = ptob(btopr(io_len)); 10096 10097 bp = pageio_setup(pp, io_len, vp, B_READ); 10098 ASSERT(bp != NULL); 10099 10100 /* 10101 * pageio_setup should have set b_addr to 0. This is correct since 10102 * we want to do I/O on a page boundary. bp_mapin() will use this addr 10103 * to calculate an offset, and then set b_addr to the kernel virtual 10104 * address it allocated for us. 10105 */ 10106 ASSERT(bp->b_un.b_addr == 0); 10107 10108 bp->b_edev = 0; 10109 bp->b_dev = 0; 10110 bp->b_lblkno = lbtodb(io_off); 10111 bp->b_file = vp; 10112 bp->b_offset = (offset_t)blkoff; 10113 bp_mapin(bp); 10114 10115 /* 10116 * If doing a write beyond what we believe is EOF, don't bother trying 10117 * to read the pages from the server, we'll just zero the pages here. 10118 * We don't check that the rw flag is S_WRITE here because some 10119 * implementations may attempt a read access to the buffer before 10120 * copying data. 
10121 */ 10122 mutex_enter(&rp->r_statelock); 10123 if (io_off >= rp->r_size && seg == segkmap) { 10124 mutex_exit(&rp->r_statelock); 10125 bzero(bp->b_un.b_addr, io_len); 10126 error = 0; 10127 } else { 10128 mutex_exit(&rp->r_statelock); 10129 error = nfs4_bio(bp, NULL, cr, TRUE); 10130 if (error == NFS_EOF) 10131 error = 0; 10132 } 10133 10134 /* 10135 * Unmap the buffer before freeing it. 10136 */ 10137 bp_mapout(bp); 10138 pageio_done(bp); 10139 10140 savepp = pp; 10141 do { 10142 pp->p_fsdata = C_NOCOMMIT; 10143 } while ((pp = pp->p_next) != savepp); 10144 10145 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 10146 10147 /* 10148 * In case of error set readahead offset 10149 * to the lowest offset. 10150 * pvn_read_done() calls VN_DISPOSE to destroy the pages 10151 */ 10152 if (error && rp->r_nextr > io_off) { 10153 mutex_enter(&rp->r_statelock); 10154 if (rp->r_nextr > io_off) 10155 rp->r_nextr = io_off; 10156 mutex_exit(&rp->r_statelock); 10157 } 10158 } 10159 10160 /* 10161 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 10162 * If len == 0, do from off to EOF. 10163 * 10164 * The normal cases should be len == 0 && off == 0 (entire vp list) or 10165 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 10166 * (from pageout). 10167 */ 10168 /* ARGSUSED */ 10169 static int 10170 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, 10171 caller_context_t *ct) 10172 { 10173 int error; 10174 rnode4_t *rp; 10175 10176 ASSERT(cr != NULL); 10177 10178 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 10179 return (EIO); 10180 10181 rp = VTOR4(vp); 10182 if (IS_SHADOW(vp, rp)) 10183 vp = RTOV4(rp); 10184 10185 /* 10186 * XXX - Why should this check be made here? 10187 */ 10188 if (vp->v_flag & VNOMAP) 10189 return (ENOSYS); 10190 10191 if (len == 0 && !(flags & B_INVAL) && 10192 (vp->v_vfsp->vfs_flag & VFS_RDONLY)) 10193 return (0); 10194 10195 mutex_enter(&rp->r_statelock); 10196 rp->r_count++; 10197 mutex_exit(&rp->r_statelock); 10198 error = nfs4_putpages(vp, off, len, flags, cr); 10199 mutex_enter(&rp->r_statelock); 10200 rp->r_count--; 10201 cv_broadcast(&rp->r_cv); 10202 mutex_exit(&rp->r_statelock); 10203 10204 return (error); 10205 } 10206 10207 /* 10208 * Write out a single page, possibly klustering adjacent dirty pages. 10209 */ 10210 int 10211 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, 10212 int flags, cred_t *cr) 10213 { 10214 u_offset_t io_off; 10215 u_offset_t lbn_off; 10216 u_offset_t lbn; 10217 size_t io_len; 10218 uint_t bsize; 10219 int error; 10220 rnode4_t *rp; 10221 10222 ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY)); 10223 ASSERT(pp != NULL); 10224 ASSERT(cr != NULL); 10225 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone); 10226 10227 rp = VTOR4(vp); 10228 ASSERT(rp->r_count > 0); 10229 ASSERT(!IS_SHADOW(vp, rp)); 10230 10231 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 10232 lbn = pp->p_offset / bsize; 10233 lbn_off = lbn * bsize; 10234 10235 /* 10236 * Find a kluster that fits in one block, or in 10237 * one page if pages are bigger than blocks. If 10238 * there is less file space allocated than a whole 10239 * page, we'll shorten the i/o request below. 10240 */ 10241 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off, 10242 roundup(bsize, PAGESIZE), flags); 10243 10244 /* 10245 * pvn_write_kluster shouldn't have returned a page with offset 10246 * behind the original page we were given. Verify that. 
10247 */ 10248 ASSERT((pp->p_offset / bsize) >= lbn); 10249 10250 /* 10251 * Now pp will have the list of kept dirty pages marked for 10252 * write back. It will also handle invalidation and freeing 10253 * of pages that are not dirty. Check for page length rounding 10254 * problems. 10255 */ 10256 if (io_off + io_len > lbn_off + bsize) { 10257 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE); 10258 io_len = lbn_off + bsize - io_off; 10259 } 10260 /* 10261 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10262 * consistent value of r_size. R4MODINPROGRESS is set in writerp4(). 10263 * When R4MODINPROGRESS is set it indicates that a uiomove() is in 10264 * progress and the r_size has not been made consistent with the 10265 * new size of the file. When the uiomove() completes the r_size is 10266 * updated and the R4MODINPROGRESS flag is cleared. 10267 * 10268 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10269 * consistent value of r_size. Without this handshaking, it is 10270 * possible that nfs4_bio() picks up the old value of r_size 10271 * before the uiomove() in writerp4() completes. This will result 10272 * in the write through nfs4_bio() being dropped. 10273 * 10274 * More precisely, there is a window between the time the uiomove() 10275 * completes and the time the r_size is updated. If a VOP_PUTPAGE() 10276 * operation intervenes in this window, the page will be picked up, 10277 * because it is dirty (it will be unlocked, unless it was 10278 * pagecreate'd). When the page is picked up as dirty, the dirty 10279 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is 10280 * checked. This will still be the old size. Therefore the page will 10281 * not be written out. When segmap_release() calls VOP_PUTPAGE(), 10282 * the page will be found to be clean and the write will be dropped. 10283 */ 10284 if (rp->r_flags & R4MODINPROGRESS) { 10285 mutex_enter(&rp->r_statelock); 10286 if ((rp->r_flags & R4MODINPROGRESS) && 10287 rp->r_modaddr + MAXBSIZE > io_off && 10288 rp->r_modaddr < io_off + io_len) { 10289 page_t *plist; 10290 /* 10291 * A write is in progress for this region of the file. 10292 * If we did not detect R4MODINPROGRESS here then this 10293 * path through nfs_putapage() would eventually go to 10294 * nfs4_bio() and may not write out all of the data 10295 * in the pages. We end up losing data. So we decide 10296 * to set the modified bit on each page in the page 10297 * list and mark the rnode with R4DIRTY. This write 10298 * will be restarted at some later time. 
10299 */ 10300 plist = pp; 10301 while (plist != NULL) { 10302 pp = plist; 10303 page_sub(&plist, pp); 10304 hat_setmod(pp); 10305 page_io_unlock(pp); 10306 page_unlock(pp); 10307 } 10308 rp->r_flags |= R4DIRTY; 10309 mutex_exit(&rp->r_statelock); 10310 if (offp) 10311 *offp = io_off; 10312 if (lenp) 10313 *lenp = io_len; 10314 return (0); 10315 } 10316 mutex_exit(&rp->r_statelock); 10317 } 10318 10319 if (flags & B_ASYNC) { 10320 error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr, 10321 nfs4_sync_putapage); 10322 } else 10323 error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr); 10324 10325 if (offp) 10326 *offp = io_off; 10327 if (lenp) 10328 *lenp = io_len; 10329 return (error); 10330 } 10331 10332 static int 10333 nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 10334 int flags, cred_t *cr) 10335 { 10336 int error; 10337 rnode4_t *rp; 10338 10339 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10340 10341 flags |= B_WRITE; 10342 10343 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 10344 10345 rp = VTOR4(vp); 10346 10347 if ((error == ENOSPC || error == EDQUOT || error == EFBIG || 10348 error == EACCES) && 10349 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) { 10350 if (!(rp->r_flags & R4OUTOFSPACE)) { 10351 mutex_enter(&rp->r_statelock); 10352 rp->r_flags |= R4OUTOFSPACE; 10353 mutex_exit(&rp->r_statelock); 10354 } 10355 flags |= B_ERROR; 10356 pvn_write_done(pp, flags); 10357 /* 10358 * If this was not an async thread, then try again to 10359 * write out the pages, but this time, also destroy 10360 * them whether or not the write is successful. This 10361 * will prevent memory from filling up with these 10362 * pages and destroying them is the only alternative 10363 * if they can't be written out. 10364 * 10365 * Don't do this if this is an async thread because 10366 * when the pages are unlocked in pvn_write_done, 10367 * some other thread could have come along, locked 10368 * them, and queued for an async thread. It would be 10369 * possible for all of the async threads to be tied 10370 * up waiting to lock the pages again and they would 10371 * all already be locked and waiting for an async 10372 * thread to handle them. Deadlock. 10373 */ 10374 if (!(flags & B_ASYNC)) { 10375 error = nfs4_putpage(vp, io_off, io_len, 10376 B_INVAL | B_FORCE, cr, NULL); 10377 } 10378 } else { 10379 if (error) 10380 flags |= B_ERROR; 10381 else if (rp->r_flags & R4OUTOFSPACE) { 10382 mutex_enter(&rp->r_statelock); 10383 rp->r_flags &= ~R4OUTOFSPACE; 10384 mutex_exit(&rp->r_statelock); 10385 } 10386 pvn_write_done(pp, flags); 10387 if (freemem < desfree) 10388 (void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr, 10389 NFS4_WRITE_NOWAIT); 10390 } 10391 10392 return (error); 10393 } 10394 10395 #ifdef DEBUG 10396 int nfs4_force_open_before_mmap = 0; 10397 #endif 10398 10399 /* ARGSUSED */ 10400 static int 10401 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 10402 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 10403 caller_context_t *ct) 10404 { 10405 struct segvn_crargs vn_a; 10406 int error = 0; 10407 rnode4_t *rp = VTOR4(vp); 10408 mntinfo4_t *mi = VTOMI4(vp); 10409 10410 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10411 return (EIO); 10412 10413 if (vp->v_flag & VNOMAP) 10414 return (ENOSYS); 10415 10416 if (off < 0 || (off + len) < 0) 10417 return (ENXIO); 10418 10419 if (vp->v_type != VREG) 10420 return (ENODEV); 10421 10422 /* 10423 * If the file is delegated to the client don't do anything. 
10424 * If the file is not delegated, then validate the data cache. 10425 */ 10426 mutex_enter(&rp->r_statev4_lock); 10427 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) { 10428 mutex_exit(&rp->r_statev4_lock); 10429 error = nfs4_validate_caches(vp, cr); 10430 if (error) 10431 return (error); 10432 } else { 10433 mutex_exit(&rp->r_statev4_lock); 10434 } 10435 10436 /* 10437 * Check to see if the vnode is currently marked as not cachable. 10438 * This means portions of the file are locked (through VOP_FRLOCK). 10439 * In this case the map request must be refused. We use 10440 * rp->r_lkserlock to avoid a race with concurrent lock requests. 10441 * 10442 * Atomically increment r_inmap after acquiring r_rwlock. The 10443 * idea here is to acquire r_rwlock to block read/write and 10444 * not to protect r_inmap. r_inmap will inform nfs4_read/write() 10445 * that we are in nfs4_map(). Now, r_rwlock is acquired in order 10446 * and we can prevent the deadlock that would have occurred 10447 * when nfs4_addmap() would have acquired it out of order. 10448 * 10449 * Since we are not protecting r_inmap by any lock, we do not 10450 * hold any lock when we decrement it. We atomically decrement 10451 * r_inmap after we release r_lkserlock. 10452 */ 10453 10454 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR4(vp))) 10455 return (EINTR); 10456 atomic_add_int(&rp->r_inmap, 1); 10457 nfs_rw_exit(&rp->r_rwlock); 10458 10459 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) { 10460 atomic_add_int(&rp->r_inmap, -1); 10461 return (EINTR); 10462 } 10463 10464 10465 if (vp->v_flag & VNOCACHE) { 10466 error = EAGAIN; 10467 goto done; 10468 } 10469 10470 /* 10471 * Don't allow concurrent locks and mapping if mandatory locking is 10472 * enabled. 10473 */ 10474 if (flk_has_remote_locks(vp)) { 10475 struct vattr va; 10476 va.va_mask = AT_MODE; 10477 error = nfs4getattr(vp, &va, cr); 10478 if (error != 0) 10479 goto done; 10480 if (MANDLOCK(vp, va.va_mode)) { 10481 error = EAGAIN; 10482 goto done; 10483 } 10484 } 10485 10486 /* 10487 * It is possible that the rnode has a lost lock request that we 10488 * are still trying to recover, and that the request conflicts with 10489 * this map request. 10490 * 10491 * An alternative approach would be for nfs4_safemap() to consider 10492 * queued lock requests when deciding whether to set or clear 10493 * VNOCACHE. This would require the frlock code path to call 10494 * nfs4_safemap() after enqueuing a lost request.
10495 */ 10496 if (nfs4_map_lost_lock_conflict(vp)) { 10497 error = EAGAIN; 10498 goto done; 10499 } 10500 10501 as_rangelock(as); 10502 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 10503 if (error != 0) { 10504 as_rangeunlock(as); 10505 goto done; 10506 } 10507 10508 if (vp->v_type == VREG) { 10509 /* 10510 * We need to retrieve the open stream 10511 */ 10512 nfs4_open_stream_t *osp = NULL; 10513 nfs4_open_owner_t *oop = NULL; 10514 10515 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10516 if (oop != NULL) { 10517 /* returns with 'os_sync_lock' held */ 10518 osp = find_open_stream(oop, rp); 10519 open_owner_rele(oop); 10520 } 10521 if (osp == NULL) { 10522 #ifdef DEBUG 10523 if (nfs4_force_open_before_mmap) { 10524 error = EIO; 10525 goto done; 10526 } 10527 #endif 10528 /* returns with 'os_sync_lock' held */ 10529 error = open_and_get_osp(vp, cr, &osp); 10530 if (osp == NULL) { 10531 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10532 "nfs4_map: we tried to OPEN the file " 10533 "but again no osp, so fail with EIO")); 10534 goto done; 10535 } 10536 } 10537 10538 if (osp->os_failed_reopen) { 10539 mutex_exit(&osp->os_sync_lock); 10540 open_stream_rele(osp, rp); 10541 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 10542 "nfs4_map: os_failed_reopen set on " 10543 "osp %p, cr %p, rp %s", (void *)osp, 10544 (void *)cr, rnode4info(rp))); 10545 error = EIO; 10546 goto done; 10547 } 10548 mutex_exit(&osp->os_sync_lock); 10549 open_stream_rele(osp, rp); 10550 } 10551 10552 vn_a.vp = vp; 10553 vn_a.offset = off; 10554 vn_a.type = (flags & MAP_TYPE); 10555 vn_a.prot = (uchar_t)prot; 10556 vn_a.maxprot = (uchar_t)maxprot; 10557 vn_a.flags = (flags & ~MAP_TYPE); 10558 vn_a.cred = cr; 10559 vn_a.amp = NULL; 10560 vn_a.szc = 0; 10561 vn_a.lgrp_mem_policy_flags = 0; 10562 10563 error = as_map(as, *addrp, len, segvn_create, &vn_a); 10564 as_rangeunlock(as); 10565 10566 done: 10567 nfs_rw_exit(&rp->r_lkserlock); 10568 atomic_add_int(&rp->r_inmap, -1); 10569 return (error); 10570 } 10571 10572 /* 10573 * We're most likely dealing with a kernel module that likes to READ 10574 * and mmap without OPENing the file (i.e., lookup/read/mmap), so let's 10575 * officially OPEN the file to create the necessary client state 10576 * for bookkeeping of os_mmap_read/write counts. 10577 * 10578 * Since VOP_MAP only passes in a pointer to the vnode rather than 10579 * a double pointer, we can't handle the case where nfs4open_otw() 10580 * returns a different vnode than the one passed into VOP_MAP (since 10581 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case, 10582 * we return NULL and let nfs4_map() fail. Note: the only case where 10583 * this should happen is if the file got removed and replaced with the 10584 * same name on the server (in addition to the fact that we're trying 10585 * to VOP_MAP without VOP_OPENing the file in the first place).
10586 */ 10587 static int 10588 open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp) 10589 { 10590 rnode4_t *rp, *drp; 10591 vnode_t *dvp, *open_vp; 10592 char file_name[MAXNAMELEN]; 10593 int just_created; 10594 nfs4_open_stream_t *osp; 10595 nfs4_open_owner_t *oop; 10596 int error; 10597 10598 *ospp = NULL; 10599 open_vp = map_vp; 10600 10601 rp = VTOR4(open_vp); 10602 if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0) 10603 return (error); 10604 drp = VTOR4(dvp); 10605 10606 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) { 10607 VN_RELE(dvp); 10608 return (EINTR); 10609 } 10610 10611 if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) { 10612 nfs_rw_exit(&drp->r_rwlock); 10613 VN_RELE(dvp); 10614 return (error); 10615 } 10616 10617 mutex_enter(&rp->r_statev4_lock); 10618 if (rp->created_v4) { 10619 rp->created_v4 = 0; 10620 mutex_exit(&rp->r_statev4_lock); 10621 10622 dnlc_update(dvp, file_name, open_vp); 10623 /* This is needed so we don't bump the open ref count */ 10624 just_created = 1; 10625 } else { 10626 mutex_exit(&rp->r_statev4_lock); 10627 just_created = 0; 10628 } 10629 10630 VN_HOLD(map_vp); 10631 10632 error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0, 10633 just_created); 10634 if (error) { 10635 nfs_rw_exit(&drp->r_rwlock); 10636 VN_RELE(dvp); 10637 VN_RELE(map_vp); 10638 return (error); 10639 } 10640 10641 nfs_rw_exit(&drp->r_rwlock); 10642 VN_RELE(dvp); 10643 10644 /* 10645 * If nfs4open_otw() returned a different vnode then "undo" 10646 * the open and return failure to the caller. 10647 */ 10648 if (!VN_CMP(open_vp, map_vp)) { 10649 nfs4_error_t e; 10650 10651 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10652 "open returned a different vnode")); 10653 /* 10654 * If there's an error, ignore it, 10655 * and let VOP_INACTIVE handle it. 10656 */ 10657 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10658 CLOSE_NORM, 0, 0, 0); 10659 VN_RELE(map_vp); 10660 return (EIO); 10661 } 10662 10663 VN_RELE(map_vp); 10664 10665 oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp)); 10666 if (!oop) { 10667 nfs4_error_t e; 10668 10669 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10670 "no open owner")); 10671 /* 10672 * If there's an error, ignore it, 10673 * and let VOP_INACTIVE handle it. 10674 */ 10675 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10676 CLOSE_NORM, 0, 0, 0); 10677 return (EIO); 10678 } 10679 osp = find_open_stream(oop, rp); 10680 open_owner_rele(oop); 10681 *ospp = osp; 10682 return (0); 10683 } 10684 10685 /* 10686 * Please be aware that when this function is called, the address space write 10687 * a_lock is held. Do not put over the wire calls in this function. 10688 */ 10689 /* ARGSUSED */ 10690 static int 10691 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10692 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 10693 caller_context_t *ct) 10694 { 10695 rnode4_t *rp; 10696 int error = 0; 10697 mntinfo4_t *mi; 10698 10699 mi = VTOMI4(vp); 10700 rp = VTOR4(vp); 10701 10702 if (nfs_zone() != mi->mi_zone) 10703 return (EIO); 10704 if (vp->v_flag & VNOMAP) 10705 return (ENOSYS); 10706 10707 /* 10708 * Don't need to update the open stream first, since this 10709 * mmap can't add any additional share access that isn't 10710 * already contained in the open stream (for the case where we 10711 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't 10712 * take into account os_mmap_read[write] counts). 
10713 */ 10714 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len)); 10715 10716 if (vp->v_type == VREG) { 10717 /* 10718 * We need to retrieve the open stream and update the counts. 10719 * If there is no open stream here, something is wrong. 10720 */ 10721 nfs4_open_stream_t *osp = NULL; 10722 nfs4_open_owner_t *oop = NULL; 10723 10724 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10725 if (oop != NULL) { 10726 /* returns with 'os_sync_lock' held */ 10727 osp = find_open_stream(oop, rp); 10728 open_owner_rele(oop); 10729 } 10730 if (osp == NULL) { 10731 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10732 "nfs4_addmap: we should have an osp " 10733 "but we don't, so fail with EIO")); 10734 error = EIO; 10735 goto out; 10736 } 10737 10738 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p," 10739 " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot)); 10740 10741 /* 10742 * Update the map count in the open stream. 10743 * This is necessary in the case where we 10744 * open/mmap/close/, then the server reboots, and we 10745 * attempt to reopen. If the mmap doesn't add share 10746 * access then we send an invalid reopen with 10747 * access = NONE. 10748 * 10749 * We need to specifically check each PROT_* so a mmap 10750 * call of (PROT_WRITE | PROT_EXEC) will ensure us both 10751 * read and write access. A simple comparison of prot 10752 * to ~PROT_WRITE to determine read access is insufficient 10753 * since prot can be |= with PROT_USER, etc. 10754 */ 10755 10756 /* 10757 * Unless we're MAP_SHARED, no sense in adding os_mmap_write 10758 */ 10759 if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 10760 osp->os_mmap_write += btopr(len); 10761 if (maxprot & PROT_READ) 10762 osp->os_mmap_read += btopr(len); 10763 if (maxprot & PROT_EXEC) 10764 osp->os_mmap_read += btopr(len); 10765 /* 10766 * Ensure that os_mmap_read gets incremented, even if 10767 * maxprot were to look like PROT_NONE. 10768 */ 10769 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 10770 !(maxprot & PROT_EXEC)) 10771 osp->os_mmap_read += btopr(len); 10772 osp->os_mapcnt += btopr(len); 10773 mutex_exit(&osp->os_sync_lock); 10774 open_stream_rele(osp, rp); 10775 } 10776 10777 out: 10778 /* 10779 * If we got an error, then undo our 10780 * incrementing of 'r_mapcnt'. 10781 */ 10782 10783 if (error) { 10784 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len)); 10785 ASSERT(rp->r_mapcnt >= 0); 10786 } 10787 return (error); 10788 } 10789 10790 /* ARGSUSED */ 10791 static int 10792 nfs4_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) 10793 { 10794 10795 return (VTOR4(vp1) == VTOR4(vp2)); 10796 } 10797 10798 /* ARGSUSED */ 10799 static int 10800 nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10801 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr, 10802 caller_context_t *ct) 10803 { 10804 int rc; 10805 u_offset_t start, end; 10806 rnode4_t *rp; 10807 int error = 0, intr = INTR4(vp); 10808 nfs4_error_t e; 10809 10810 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10811 return (EIO); 10812 10813 /* check for valid cmd parameter */ 10814 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW) 10815 return (EINVAL); 10816 10817 /* Verify l_type.
*/ 10818 switch (bfp->l_type) { 10819 case F_RDLCK: 10820 if (cmd != F_GETLK && !(flag & FREAD)) 10821 return (EBADF); 10822 break; 10823 case F_WRLCK: 10824 if (cmd != F_GETLK && !(flag & FWRITE)) 10825 return (EBADF); 10826 break; 10827 case F_UNLCK: 10828 intr = 0; 10829 break; 10830 10831 default: 10832 return (EINVAL); 10833 } 10834 10835 /* check the validity of the lock range */ 10836 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset)) 10837 return (rc); 10838 if (rc = flk_check_lock_data(start, end, MAXEND)) 10839 return (rc); 10840 10841 /* 10842 * If the filesystem is mounted using local locking, pass the 10843 * request off to the local locking code. 10844 */ 10845 if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) { 10846 if (cmd == F_SETLK || cmd == F_SETLKW) { 10847 /* 10848 * For complete safety, we should be holding 10849 * r_lkserlock. However, we can't call 10850 * nfs4_safelock and then fs_frlock while 10851 * holding r_lkserlock, so just invoke 10852 * nfs4_safelock and expect that this will 10853 * catch enough of the cases. 10854 */ 10855 if (!nfs4_safelock(vp, bfp, cr)) 10856 return (EAGAIN); 10857 } 10858 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 10859 } 10860 10861 rp = VTOR4(vp); 10862 10863 /* 10864 * Check whether the given lock request can proceed, given the 10865 * current file mappings. 10866 */ 10867 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr)) 10868 return (EINTR); 10869 if (cmd == F_SETLK || cmd == F_SETLKW) { 10870 if (!nfs4_safelock(vp, bfp, cr)) { 10871 rc = EAGAIN; 10872 goto done; 10873 } 10874 } 10875 10876 /* 10877 * Flush the cache after waiting for async I/O to finish. For new 10878 * locks, this is so that the process gets the latest bits from the 10879 * server. For unlocks, this is so that other clients see the 10880 * latest bits once the file has been unlocked. If currently dirty 10881 * pages can't be flushed, then don't allow a lock to be set. But 10882 * allow unlocks to succeed, to avoid having orphan locks on the 10883 * server. 10884 */ 10885 if (cmd != F_GETLK) { 10886 mutex_enter(&rp->r_statelock); 10887 while (rp->r_count > 0) { 10888 if (intr) { 10889 klwp_t *lwp = ttolwp(curthread); 10890 10891 if (lwp != NULL) 10892 lwp->lwp_nostop++; 10893 if (cv_wait_sig(&rp->r_cv, 10894 &rp->r_statelock) == 0) { 10895 if (lwp != NULL) 10896 lwp->lwp_nostop--; 10897 rc = EINTR; 10898 break; 10899 } 10900 if (lwp != NULL) 10901 lwp->lwp_nostop--; 10902 } else 10903 cv_wait(&rp->r_cv, &rp->r_statelock); 10904 } 10905 mutex_exit(&rp->r_statelock); 10906 if (rc != 0) 10907 goto done; 10908 error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct); 10909 if (error) { 10910 if (error == ENOSPC || error == EDQUOT) { 10911 mutex_enter(&rp->r_statelock); 10912 if (!rp->r_error) 10913 rp->r_error = error; 10914 mutex_exit(&rp->r_statelock); 10915 } 10916 if (bfp->l_type != F_UNLCK) { 10917 rc = ENOLCK; 10918 goto done; 10919 } 10920 } 10921 } 10922 10923 /* 10924 * Call the lock manager to do the real work of contacting 10925 * the server and obtaining the lock. 10926 */ 10927 nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset, 10928 cr, &e, NULL, NULL); 10929 rc = e.error; 10930 10931 if (rc == 0) 10932 nfs4_lockcompletion(vp, cmd); 10933 10934 done: 10935 nfs_rw_exit(&rp->r_lkserlock); 10936 10937 return (rc); 10938 } 10939 10940 /* 10941 * Free storage space associated with the specified vnode. 
The portion 10942 * to be freed is specified by bfp->l_start and bfp->l_len (already 10943 * normalized to a "whence" of 0). 10944 * 10945 * This is an experimental facility whose continued existence is not 10946 * guaranteed. Currently, we only support the special case 10947 * of l_len == 0, meaning free to end of file. 10948 */ 10949 /* ARGSUSED */ 10950 static int 10951 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10952 offset_t offset, cred_t *cr, caller_context_t *ct) 10953 { 10954 int error; 10955 10956 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10957 return (EIO); 10958 ASSERT(vp->v_type == VREG); 10959 if (cmd != F_FREESP) 10960 return (EINVAL); 10961 10962 error = convoff(vp, bfp, 0, offset); 10963 if (!error) { 10964 ASSERT(bfp->l_start >= 0); 10965 if (bfp->l_len == 0) { 10966 struct vattr va; 10967 10968 va.va_mask = AT_SIZE; 10969 va.va_size = bfp->l_start; 10970 error = nfs4setattr(vp, &va, 0, cr, NULL); 10971 } else 10972 error = EINVAL; 10973 } 10974 10975 return (error); 10976 } 10977 10978 /* ARGSUSED */ 10979 int 10980 nfs4_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) 10981 { 10982 rnode4_t *rp; 10983 rp = VTOR4(vp); 10984 10985 if (vp->v_type == VREG && IS_SHADOW(vp, rp)) { 10986 vp = RTOV4(rp); 10987 } 10988 *vpp = vp; 10989 return (0); 10990 } 10991 10992 /* 10993 * Setup and add an address space callback to do the work of the delmap call. 10994 * The callback will (and must be) deleted in the actual callback function. 10995 * 10996 * This is done in order to take care of the problem that we have with holding 10997 * the address space's a_lock for a long period of time (e.g. if the NFS server 10998 * is down). Callbacks will be executed in the address space code while the 10999 * a_lock is not held. Holding the address space's a_lock causes things such 11000 * as ps and fork to hang because they are trying to acquire this lock as well. 11001 */ 11002 /* ARGSUSED */ 11003 static int 11004 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 11005 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 11006 caller_context_t *ct) 11007 { 11008 int caller_found; 11009 int error; 11010 rnode4_t *rp; 11011 nfs4_delmap_args_t *dmapp; 11012 nfs4_delmapcall_t *delmap_call; 11013 11014 if (vp->v_flag & VNOMAP) 11015 return (ENOSYS); 11016 11017 /* 11018 * A process may not change zones if it has NFS pages mmap'ed 11019 * in, so we can't legitimately get here from the wrong zone. 11020 */ 11021 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11022 11023 rp = VTOR4(vp); 11024 11025 /* 11026 * The way that the address space of this process deletes its mapping 11027 * of this file is via the following call chains: 11028 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 11029 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 11030 * 11031 * With the use of address space callbacks we are allowed to drop the 11032 * address space lock, a_lock, while executing the NFS operations that 11033 * need to go over the wire. Returning EAGAIN to the caller of this 11034 * function is what drives the execution of the callback that we add 11035 * below. The callback will be executed by the address space code 11036 * after dropping the a_lock. When the callback is finished, since 11037 * we dropped the a_lock, it must be re-acquired and segvn_unmap() 11038 * is called again on the same segment to finish the rest of the work 11039 * that needs to happen during unmapping. 
11040 * 11041 * This action of calling back into the segment driver causes 11042 * nfs4_delmap() to get called again, but since the callback was 11043 * already executed at this point, it already did the work and there 11044 * is nothing left for us to do. 11045 * 11046 * To Summarize: 11047 * - The first time nfs4_delmap is called by the current thread is when 11048 * we add the caller associated with this delmap to the delmap caller 11049 * list, add the callback, and return EAGAIN. 11050 * - The second time in this call chain when nfs4_delmap is called we 11051 * will find this caller in the delmap caller list and realize there 11052 * is no more work to do thus removing this caller from the list and 11053 * returning the error that was set in the callback execution. 11054 */ 11055 caller_found = nfs4_find_and_delete_delmapcall(rp, &error); 11056 if (caller_found) { 11057 /* 11058 * 'error' is from the actual delmap operations. To avoid 11059 * hangs, we need to handle the return of EAGAIN differently 11060 * since this is what drives the callback execution. 11061 * In this case, we don't want to return EAGAIN and do the 11062 * callback execution because there are none to execute. 11063 */ 11064 if (error == EAGAIN) 11065 return (0); 11066 else 11067 return (error); 11068 } 11069 11070 /* current caller was not in the list */ 11071 delmap_call = nfs4_init_delmapcall(); 11072 11073 mutex_enter(&rp->r_statelock); 11074 list_insert_tail(&rp->r_indelmap, delmap_call); 11075 mutex_exit(&rp->r_statelock); 11076 11077 dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP); 11078 11079 dmapp->vp = vp; 11080 dmapp->off = off; 11081 dmapp->addr = addr; 11082 dmapp->len = len; 11083 dmapp->prot = prot; 11084 dmapp->maxprot = maxprot; 11085 dmapp->flags = flags; 11086 dmapp->cr = cr; 11087 dmapp->caller = delmap_call; 11088 11089 error = as_add_callback(as, nfs4_delmap_callback, dmapp, 11090 AS_UNMAP_EVENT, addr, len, KM_SLEEP); 11091 11092 return (error ? error : EAGAIN); 11093 } 11094 11095 static nfs4_delmapcall_t * 11096 nfs4_init_delmapcall() 11097 { 11098 nfs4_delmapcall_t *delmap_call; 11099 11100 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP); 11101 delmap_call->call_id = curthread; 11102 delmap_call->error = 0; 11103 11104 return (delmap_call); 11105 } 11106 11107 static void 11108 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call) 11109 { 11110 kmem_free(delmap_call, sizeof (nfs4_delmapcall_t)); 11111 } 11112 11113 /* 11114 * Searches for the current delmap caller (based on curthread) in the list of 11115 * callers. If it is found, we remove it and free the delmap caller. 11116 * Returns: 11117 * 0 if the caller wasn't found 11118 * 1 if the caller was found, removed and freed. *errp will be set 11119 * to what the result of the delmap was. 11120 */ 11121 static int 11122 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp) 11123 { 11124 nfs4_delmapcall_t *delmap_call; 11125 11126 /* 11127 * If the list doesn't exist yet, we create it and return 11128 * that the caller wasn't found. No list = no callers. 
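 */

/*
 * Editor's illustration (not part of the original source): a minimal
 * userland model of the delmap caller-list protocol summarized above.
 * The dm_* names are invented and the sketch assumes <stdlib.h> and
 * <errno.h>; it is compiled out. The first call registers the caller
 * and returns EAGAIN to drive the callback; the second call finds the
 * entry and consumes the error the callback stored.
 */
#if 0
struct dm_call {
	struct dm_call	*next;
	void		*id;		/* stands in for curthread */
	int		error;		/* filled in by the callback */
};

static struct dm_call *dm_list;

static int
dm_first_call(void *id)
{
	struct dm_call *c = malloc(sizeof (*c));

	c->id = id;
	c->error = 0;
	c->next = dm_list;
	dm_list = c;
	return (EAGAIN);		/* caller must run the callback */
}

static int
dm_second_call(void *id, int *errp)
{
	struct dm_call **pp, *c;

	for (pp = &dm_list; (c = *pp) != NULL; pp = &c->next) {
		if (c->id == id) {
			*errp = c->error;	/* result of the delmap */
			*pp = c->next;
			free(c);
			return (1);	/* found: nothing left to do */
		}
	}
	return (0);			/* not found: first call */
}
#endif

/*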
11129 */ 11130 mutex_enter(&rp->r_statelock); 11131 if (!(rp->r_flags & R4DELMAPLIST)) { 11132 /* The list does not exist */ 11133 list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t), 11134 offsetof(nfs4_delmapcall_t, call_node)); 11135 rp->r_flags |= R4DELMAPLIST; 11136 mutex_exit(&rp->r_statelock); 11137 return (0); 11138 } else { 11139 /* The list exists so search it */ 11140 for (delmap_call = list_head(&rp->r_indelmap); 11141 delmap_call != NULL; 11142 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 11143 if (delmap_call->call_id == curthread) { 11144 /* current caller is in the list */ 11145 *errp = delmap_call->error; 11146 list_remove(&rp->r_indelmap, delmap_call); 11147 mutex_exit(&rp->r_statelock); 11148 nfs4_free_delmapcall(delmap_call); 11149 return (1); 11150 } 11151 } 11152 } 11153 mutex_exit(&rp->r_statelock); 11154 return (0); 11155 } 11156 11157 /* 11158 * Remove some pages from an mmap'd vnode. Just update the 11159 * count of pages. If doing close-to-open, then flush and 11160 * commit all of the pages associated with this file. 11161 * Otherwise, start an asynchronous page flush to write out 11162 * any dirty pages. This will also associate a credential 11163 * with the rnode which can be used to write the pages. 11164 */ 11165 /* ARGSUSED */ 11166 static void 11167 nfs4_delmap_callback(struct as *as, void *arg, uint_t event) 11168 { 11169 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 11170 rnode4_t *rp; 11171 mntinfo4_t *mi; 11172 nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg; 11173 11174 rp = VTOR4(dmapp->vp); 11175 mi = VTOMI4(dmapp->vp); 11176 11177 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len)); 11178 ASSERT(rp->r_mapcnt >= 0); 11179 11180 /* 11181 * Initiate a page flush and potential commit if there are 11182 * pages, the file system was not mounted readonly, the segment 11183 * was mapped shared, and the pages themselves were writeable. 11184 */ 11185 if (nfs4_has_pages(dmapp->vp) && 11186 !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) && 11187 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) { 11188 mutex_enter(&rp->r_statelock); 11189 rp->r_flags |= R4DIRTY; 11190 mutex_exit(&rp->r_statelock); 11191 e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off, 11192 dmapp->len, dmapp->cr); 11193 if (!e.error) { 11194 mutex_enter(&rp->r_statelock); 11195 e.error = rp->r_error; 11196 rp->r_error = 0; 11197 mutex_exit(&rp->r_statelock); 11198 } 11199 } else 11200 e.error = 0; 11201 11202 if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) 11203 (void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len, 11204 B_INVAL, dmapp->cr, NULL); 11205 11206 if (e.error) { 11207 e.stat = puterrno4(e.error); 11208 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 11209 OP_COMMIT, FALSE, NULL, 0, dmapp->vp); 11210 dmapp->caller->error = e.error; 11211 } 11212 11213 /* Check to see if we need to close the file */ 11214 11215 if (dmapp->vp->v_type == VREG) { 11216 nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e, 11217 CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags); 11218 11219 if (e.error != 0 || e.stat != NFS4_OK) { 11220 /* 11221 * Since it is possible that e.error == 0 and 11222 * e.stat != NFS4_OK (and vice versa), 11223 * we do the proper checking in order to get both 11224 * e.error and e.stat reporting the correct info. 
11225 */ 11226 if (e.stat == NFS4_OK) 11227 e.stat = puterrno4(e.error); 11228 if (e.error == 0) 11229 e.error = geterrno4(e.stat); 11230 11231 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 11232 OP_CLOSE, FALSE, NULL, 0, dmapp->vp); 11233 dmapp->caller->error = e.error; 11234 } 11235 } 11236 11237 (void) as_delete_callback(as, arg); 11238 kmem_free(dmapp, sizeof (nfs4_delmap_args_t)); 11239 } 11240 11241 11242 static uint_t 11243 fattr4_maxfilesize_to_bits(uint64_t ll) 11244 { 11245 uint_t l = 1; 11246 11247 if (ll == 0) { 11248 return (0); 11249 } 11250 11251 if (ll & 0xffffffff00000000) { 11252 l += 32; ll >>= 32; 11253 } 11254 if (ll & 0xffff0000) { 11255 l += 16; ll >>= 16; 11256 } 11257 if (ll & 0xff00) { 11258 l += 8; ll >>= 8; 11259 } 11260 if (ll & 0xf0) { 11261 l += 4; ll >>= 4; 11262 } 11263 if (ll & 0xc) { 11264 l += 2; ll >>= 2; 11265 } 11266 if (ll & 0x2) { 11267 l += 1; 11268 } 11269 return (l); 11270 } 11271 11272 static int 11273 nfs4_have_xattrs(vnode_t *vp, ulong_t *valp, cred_t *cr) 11274 { 11275 vnode_t *avp = NULL; 11276 int error; 11277 11278 if ((error = nfs4lookup_xattr(vp, "", &avp, 11279 LOOKUP_XATTR, cr)) == 0) 11280 error = do_xattr_exists_check(avp, valp, cr); 11281 if (avp) 11282 VN_RELE(avp); 11283 11284 return (error); 11285 } 11286 11287 /* ARGSUSED */ 11288 int 11289 nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 11290 caller_context_t *ct) 11291 { 11292 int error; 11293 hrtime_t t; 11294 rnode4_t *rp; 11295 nfs4_ga_res_t gar; 11296 nfs4_ga_ext_res_t ger; 11297 11298 gar.n4g_ext_res = &ger; 11299 11300 if (nfs_zone() != VTOMI4(vp)->mi_zone) 11301 return (EIO); 11302 if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) { 11303 *valp = MAXPATHLEN; 11304 return (0); 11305 } 11306 if (cmd == _PC_ACL_ENABLED) { 11307 *valp = _ACL_ACE_ENABLED; 11308 return (0); 11309 } 11310 11311 rp = VTOR4(vp); 11312 if (cmd == _PC_XATTR_EXISTS) { 11313 /* 11314 * The existence of the xattr directory is not sufficient 11315 * for determining whether generic user attributes exist. 11316 * The attribute directory could only be a transient directory 11317 * used for Solaris sysattr support. Do a small readdir 11318 * to verify if the only entries are sysattrs or not. 11319 * 11320 * pc4_xattr_valid can only be trusted when r_xattr_dir 11321 * is NULL. Once the xadir vp exists, we can create xattrs, 11322 * and we don't have any way to update the "base" object's 11323 * pc4_xattr_exists from the xattr or xadir. Maybe FEM 11324 * could help out.
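 */

/*
 * Editor's illustration (not part of the original source, compiled
 * out): fattr4_maxfilesize_to_bits() above returns the 1-based index
 * of the highest set bit, i.e. the number of bits needed to represent
 * the server's maximum file size. A server maxfilesize of 0xffffffff
 * (2^32 - 1) yields 32, and 0x7fffffffffffffff (2^63 - 1) yields 63,
 * which is the value _PC_FILESIZEBITS reports below.
 */
#if 0
static void
maxfilesize_examples(void)
{
	ASSERT(fattr4_maxfilesize_to_bits(0) == 0);
	ASSERT(fattr4_maxfilesize_to_bits(1) == 1);
	ASSERT(fattr4_maxfilesize_to_bits(0xffffffffULL) == 32);
	ASSERT(fattr4_maxfilesize_to_bits(0x7fffffffffffffffULL) == 63);
}
#endif

/*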
11325 */ 11326 if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid && 11327 rp->r_xattr_dir == NULL) { 11328 return (nfs4_have_xattrs(vp, valp, cr)); 11329 } 11330 } else { /* OLD CODE */ 11331 if (ATTRCACHE4_VALID(vp)) { 11332 mutex_enter(&rp->r_statelock); 11333 if (rp->r_pathconf.pc4_cache_valid) { 11334 error = 0; 11335 switch (cmd) { 11336 case _PC_FILESIZEBITS: 11337 *valp = 11338 rp->r_pathconf.pc4_filesizebits; 11339 break; 11340 case _PC_LINK_MAX: 11341 *valp = 11342 rp->r_pathconf.pc4_link_max; 11343 break; 11344 case _PC_NAME_MAX: 11345 *valp = 11346 rp->r_pathconf.pc4_name_max; 11347 break; 11348 case _PC_CHOWN_RESTRICTED: 11349 *valp = 11350 rp->r_pathconf.pc4_chown_restricted; 11351 break; 11352 case _PC_NO_TRUNC: 11353 *valp = 11354 rp->r_pathconf.pc4_no_trunc; 11355 break; 11356 default: 11357 error = EINVAL; 11358 break; 11359 } 11360 mutex_exit(&rp->r_statelock); 11361 #ifdef DEBUG 11362 nfs4_pathconf_cache_hits++; 11363 #endif 11364 return (error); 11365 } 11366 mutex_exit(&rp->r_statelock); 11367 } 11368 } 11369 #ifdef DEBUG 11370 nfs4_pathconf_cache_misses++; 11371 #endif 11372 11373 t = gethrtime(); 11374 11375 error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr); 11376 11377 if (error) { 11378 mutex_enter(&rp->r_statelock); 11379 rp->r_pathconf.pc4_cache_valid = FALSE; 11380 rp->r_pathconf.pc4_xattr_valid = FALSE; 11381 mutex_exit(&rp->r_statelock); 11382 return (error); 11383 } 11384 11385 /* interpret the max filesize */ 11386 gar.n4g_ext_res->n4g_pc4.pc4_filesizebits = 11387 fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize); 11388 11389 /* Store the attributes we just received */ 11390 nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL); 11391 11392 switch (cmd) { 11393 case _PC_FILESIZEBITS: 11394 *valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits; 11395 break; 11396 case _PC_LINK_MAX: 11397 *valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max; 11398 break; 11399 case _PC_NAME_MAX: 11400 *valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max; 11401 break; 11402 case _PC_CHOWN_RESTRICTED: 11403 *valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted; 11404 break; 11405 case _PC_NO_TRUNC: 11406 *valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc; 11407 break; 11408 case _PC_XATTR_EXISTS: 11409 if (gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists) { 11410 if (error = nfs4_have_xattrs(vp, valp, cr)) 11411 return (error); 11412 } 11413 break; 11414 default: 11415 return (EINVAL); 11416 } 11417 11418 return (0); 11419 } 11420 11421 /* 11422 * Called by async thread to do synchronous pageio. Do the i/o, wait 11423 * for it to complete, and cleanup the page list when done. 11424 */ 11425 static int 11426 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11427 int flags, cred_t *cr) 11428 { 11429 int error; 11430 11431 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11432 11433 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11434 if (flags & B_READ) 11435 pvn_read_done(pp, (error ? B_ERROR : 0) | flags); 11436 else 11437 pvn_write_done(pp, (error ? 
B_ERROR : 0) | flags); 11438 return (error); 11439 } 11440 11441 /* ARGSUSED */ 11442 static int 11443 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11444 int flags, cred_t *cr, caller_context_t *ct) 11445 { 11446 int error; 11447 rnode4_t *rp; 11448 11449 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 11450 return (EIO); 11451 11452 if (pp == NULL) 11453 return (EINVAL); 11454 11455 rp = VTOR4(vp); 11456 mutex_enter(&rp->r_statelock); 11457 rp->r_count++; 11458 mutex_exit(&rp->r_statelock); 11459 11460 if (flags & B_ASYNC) { 11461 error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr, 11462 nfs4_sync_pageio); 11463 } else 11464 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11465 mutex_enter(&rp->r_statelock); 11466 rp->r_count--; 11467 cv_broadcast(&rp->r_cv); 11468 mutex_exit(&rp->r_statelock); 11469 return (error); 11470 } 11471 11472 /* ARGSUSED */ 11473 static void 11474 nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr, 11475 caller_context_t *ct) 11476 { 11477 int error; 11478 rnode4_t *rp; 11479 page_t *plist; 11480 page_t *pptr; 11481 offset3 offset; 11482 count3 len; 11483 k_sigset_t smask; 11484 11485 /* 11486 * We should get called with fl equal to either B_FREE or 11487 * B_INVAL. Any other value is illegal. 11488 * 11489 * The page that we are either supposed to free or destroy 11490 * should be exclusive locked and its io lock should not 11491 * be held. 11492 */ 11493 ASSERT(fl == B_FREE || fl == B_INVAL); 11494 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 11495 11496 rp = VTOR4(vp); 11497 11498 /* 11499 * If the page doesn't need to be committed or we shouldn't 11500 * even bother attempting to commit it, then just make sure 11501 * that the p_fsdata byte is clear and then either free or 11502 * destroy the page as appropriate. 11503 */ 11504 if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) { 11505 pp->p_fsdata = C_NOCOMMIT; 11506 if (fl == B_FREE) 11507 page_free(pp, dn); 11508 else 11509 page_destroy(pp, dn); 11510 return; 11511 } 11512 11513 /* 11514 * If there is a page invalidation operation going on, then 11515 * if this is one of the pages being destroyed, then just 11516 * clear the p_fsdata byte and then either free or destroy 11517 * the page as appropriate. 11518 */ 11519 mutex_enter(&rp->r_statelock); 11520 if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) { 11521 mutex_exit(&rp->r_statelock); 11522 pp->p_fsdata = C_NOCOMMIT; 11523 if (fl == B_FREE) 11524 page_free(pp, dn); 11525 else 11526 page_destroy(pp, dn); 11527 return; 11528 } 11529 11530 /* 11531 * If we are freeing this page and someone else is already 11532 * waiting to do a commit, then just unlock the page and 11533 * return. That other thread will take care of committing 11534 * this page. The page can be freed sometime after the 11535 * commit has finished. Otherwise, if the page is marked 11536 * as delay commit, then we may be getting called from 11537 * pvn_write_done, one page at a time. This could result 11538 * in one commit per page, so we end up doing lots of small 11539 * commits instead of fewer larger commits. This is bad, 11540 * we want to do as few commits as possible.
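 */

/*
 * Editor's illustration (not part of the original source, compiled
 * out): the decision order nfs4_dispose() applies to a page,
 * restated as a table-style helper. The enum and the function name
 * are invented, and the truncation test is collapsed into a single
 * flag; each arm corresponds to a test in the surrounding code.
 */
#if 0
typedef enum {
	DISP_RELEASE,	/* clear p_fsdata, then free/destroy the page */
	DISP_UNLOCK,	/* leave the page for another committing thread */
	DISP_COMMIT	/* gather all committable pages and COMMIT now */
} disp_action_t;

static disp_action_t
dispose_action(int fl, uchar_t fsdata, int stale, int truncating,
    int commitwait)
{
	if (fsdata == C_NOCOMMIT || stale)
		return (DISP_RELEASE);	/* nothing worth committing */
	if (truncating)
		return (DISP_RELEASE);	/* page is being invalidated */
	if (fl == B_FREE && commitwait)
		return (DISP_UNLOCK);	/* another thread will commit */
	if (fl == B_FREE && fsdata == C_DELAYCOMMIT)
		return (DISP_UNLOCK);	/* batched: marked C_COMMIT */
	return (DISP_COMMIT);
}
#endif

/*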
11541 */ 11542 if (fl == B_FREE) { 11543 if (rp->r_flags & R4COMMITWAIT) { 11544 page_unlock(pp); 11545 mutex_exit(&rp->r_statelock); 11546 return; 11547 } 11548 if (pp->p_fsdata == C_DELAYCOMMIT) { 11549 pp->p_fsdata = C_COMMIT; 11550 page_unlock(pp); 11551 mutex_exit(&rp->r_statelock); 11552 return; 11553 } 11554 } 11555 11556 /* 11557 * Check to see if there is a signal which would prevent an 11558 * attempt to commit the pages from being successful. If so, 11559 * then don't bother with all of the work to gather pages and 11560 * generate the unsuccessful RPC. Just return from here and 11561 * let the page be committed at some later time. 11562 */ 11563 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT); 11564 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) { 11565 sigunintr(&smask); 11566 page_unlock(pp); 11567 mutex_exit(&rp->r_statelock); 11568 return; 11569 } 11570 sigunintr(&smask); 11571 11572 /* 11573 * We are starting to need to commit pages, so let's try 11574 * to commit as many as possible at once to reduce the 11575 * overhead. 11576 * 11577 * Set the `commit inprogress' state bit. We must 11578 * first wait until any current one finishes. Then 11579 * we initialize the c_pages list with this page. 11580 */ 11581 while (rp->r_flags & R4COMMIT) { 11582 rp->r_flags |= R4COMMITWAIT; 11583 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 11584 rp->r_flags &= ~R4COMMITWAIT; 11585 } 11586 rp->r_flags |= R4COMMIT; 11587 mutex_exit(&rp->r_statelock); 11588 ASSERT(rp->r_commit.c_pages == NULL); 11589 rp->r_commit.c_pages = pp; 11590 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11591 rp->r_commit.c_commlen = PAGESIZE; 11592 11593 /* 11594 * Gather together all other pages which can be committed. 11595 * They will all be chained off r_commit.c_pages. 11596 */ 11597 nfs4_get_commit(vp); 11598 11599 /* 11600 * Clear the `commit inprogress' status and disconnect 11601 * the list of pages to be committed from the rnode. 11602 * At this same time, we also save the starting offset 11603 * and length of data to be committed on the server. 11604 */ 11605 plist = rp->r_commit.c_pages; 11606 rp->r_commit.c_pages = NULL; 11607 offset = rp->r_commit.c_commbase; 11608 len = rp->r_commit.c_commlen; 11609 mutex_enter(&rp->r_statelock); 11610 rp->r_flags &= ~R4COMMIT; 11611 cv_broadcast(&rp->r_commit.c_cv); 11612 mutex_exit(&rp->r_statelock); 11613 11614 if (curproc == proc_pageout || curproc == proc_fsflush || 11615 nfs_zone() != VTOMI4(vp)->mi_zone) { 11616 nfs4_async_commit(vp, plist, offset, len, 11617 cr, do_nfs4_async_commit); 11618 return; 11619 } 11620 11621 /* 11622 * Actually generate the over-the-wire COMMIT operation. 11623 */ 11624 error = nfs4_commit(vp, (offset4)offset, (count4)len, cr); 11625 11626 /* 11627 * If we got an error during the commit, just unlock all 11628 * of the pages. The pages will get retransmitted to the 11629 * server during a putpage operation. 11630 */ 11631 if (error) { 11632 while (plist != NULL) { 11633 pptr = plist; 11634 page_sub(&plist, pptr); 11635 page_unlock(pptr); 11636 } 11637 return; 11638 } 11639 11640 /* 11641 * We've tried as hard as we can to commit the data to stable 11642 * storage on the server. We just unlock the rest of the pages 11643 * and clear the commit required state. They will be put 11644 * onto the tail of the cachelist if they are no longer 11645 * mapped.
11646 */ 11647 while (plist != pp) { 11648 pptr = plist; 11649 page_sub(&plist, pptr); 11650 pptr->p_fsdata = C_NOCOMMIT; 11651 page_unlock(pptr); 11652 } 11653 11654 /* 11655 * It is possible that nfs4_commit didn't return an error, but 11656 * some other thread has modified the page we are going 11657 * to free/destroy. 11658 * In this case we need to rewrite the page. Do an explicit check 11659 * before attempting to free/destroy the page. If it is modified, 11660 * the page needs to be rewritten, so unlock it and return. 11661 */ 11662 if (hat_ismod(pp)) { 11663 pp->p_fsdata = C_NOCOMMIT; 11664 page_unlock(pp); 11665 return; 11666 } 11667 11668 /* 11669 * Now, as appropriate, either free or destroy the page 11670 * that we were called with. 11671 */ 11672 pp->p_fsdata = C_NOCOMMIT; 11673 if (fl == B_FREE) 11674 page_free(pp, dn); 11675 else 11676 page_destroy(pp, dn); 11677 } 11678 11679 /* 11680 * Commit requires that the current fh be the file written to. 11681 * The compound op structure is: 11682 * PUTFH(file), COMMIT 11683 */ 11684 static int 11685 nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr) 11686 { 11687 COMPOUND4args_clnt args; 11688 COMPOUND4res_clnt res; 11689 COMMIT4res *cm_res; 11690 nfs_argop4 argop[2]; 11691 nfs_resop4 *resop; 11692 int doqueue; 11693 mntinfo4_t *mi; 11694 rnode4_t *rp; 11695 cred_t *cred_otw = NULL; 11696 bool_t needrecov = FALSE; 11697 nfs4_recov_state_t recov_state; 11698 nfs4_open_stream_t *osp = NULL; 11699 bool_t first_time = TRUE; /* first time getting OTW cred */ 11700 bool_t last_time = FALSE; /* last time getting OTW cred */ 11701 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 11702 11703 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11704 11705 rp = VTOR4(vp); 11706 11707 mi = VTOMI4(vp); 11708 recov_state.rs_flags = 0; 11709 recov_state.rs_num_retry_despite_err = 0; 11710 get_commit_cred: 11711 /* 11712 * Releases the osp, if a valid open stream is provided. 11713 * Puts a hold on the cred_otw and the new osp (if found).
11714 */ 11715 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 11716 &first_time, &last_time); 11717 args.ctag = TAG_COMMIT; 11718 recov_retry: 11719 /* 11720 * Commit ops: putfh file; commit 11721 */ 11722 args.array_len = 2; 11723 args.array = argop; 11724 11725 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11726 &recov_state, NULL); 11727 if (e.error) { 11728 crfree(cred_otw); 11729 if (osp != NULL) 11730 open_stream_rele(osp, rp); 11731 return (e.error); 11732 } 11733 11734 /* putfh file */ 11735 argop[0].argop = OP_CPUTFH; 11736 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 11737 11738 /* commit */ 11739 argop[1].argop = OP_COMMIT; 11740 argop[1].nfs_argop4_u.opcommit.offset = offset; 11741 argop[1].nfs_argop4_u.opcommit.count = count; 11742 11743 doqueue = 1; 11744 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e); 11745 11746 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 11747 if (!needrecov && e.error) { 11748 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, 11749 needrecov); 11750 crfree(cred_otw); 11751 if (e.error == EACCES && last_time == FALSE) 11752 goto get_commit_cred; 11753 if (osp != NULL) 11754 open_stream_rele(osp, rp); 11755 return (e.error); 11756 } 11757 11758 if (needrecov) { 11759 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 11760 NULL, OP_COMMIT, NULL) == FALSE) { 11761 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11762 &recov_state, needrecov); 11763 if (!e.error) 11764 (void) xdr_free(xdr_COMPOUND4res_clnt, 11765 (caddr_t)&res); 11766 goto recov_retry; 11767 } 11768 if (e.error) { 11769 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11770 &recov_state, needrecov); 11771 crfree(cred_otw); 11772 if (osp != NULL) 11773 open_stream_rele(osp, rp); 11774 return (e.error); 11775 } 11776 /* fall through for res.status case */ 11777 } 11778 11779 if (res.status) { 11780 e.error = geterrno4(res.status); 11781 if (e.error == EACCES && last_time == FALSE) { 11782 crfree(cred_otw); 11783 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11784 &recov_state, needrecov); 11785 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11786 goto get_commit_cred; 11787 } 11788 /* 11789 * Can't do a nfs4_purge_stale_fh here because this 11790 * can cause a deadlock. nfs4_commit can 11791 * be called from nfs4_dispose which can be called 11792 * indirectly via pvn_vplist_dirty. nfs4_purge_stale_fh 11793 * can call back to pvn_vplist_dirty.
11794 */ 11795 if (e.error == ESTALE) { 11796 mutex_enter(&rp->r_statelock); 11797 rp->r_flags |= R4STALE; 11798 if (!rp->r_error) 11799 rp->r_error = e.error; 11800 mutex_exit(&rp->r_statelock); 11801 PURGE_ATTRCACHE4(vp); 11802 } else { 11803 mutex_enter(&rp->r_statelock); 11804 if (!rp->r_error) 11805 rp->r_error = e.error; 11806 mutex_exit(&rp->r_statelock); 11807 } 11808 } else { 11809 ASSERT(rp->r_flags & R4HAVEVERF); 11810 resop = &res.array[1]; /* commit res */ 11811 cm_res = &resop->nfs_resop4_u.opcommit; 11812 mutex_enter(&rp->r_statelock); 11813 if (cm_res->writeverf == rp->r_writeverf) { 11814 mutex_exit(&rp->r_statelock); 11815 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11816 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11817 &recov_state, needrecov); 11818 crfree(cred_otw); 11819 if (osp != NULL) 11820 open_stream_rele(osp, rp); 11821 return (0); 11822 } 11823 nfs4_set_mod(vp); 11824 rp->r_writeverf = cm_res->writeverf; 11825 mutex_exit(&rp->r_statelock); 11826 e.error = NFS_VERF_MISMATCH; 11827 } 11828 11829 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11830 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov); 11831 crfree(cred_otw); 11832 if (osp != NULL) 11833 open_stream_rele(osp, rp); 11834 11835 return (e.error); 11836 } 11837 11838 static void 11839 nfs4_set_mod(vnode_t *vp) 11840 { 11841 page_t *pp; 11842 kmutex_t *vphm; 11843 rnode4_t *rp; 11844 11845 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11846 11847 /* make sure we're looking at the master vnode, not a shadow */ 11848 11849 rp = VTOR4(vp); 11850 if (IS_SHADOW(vp, rp)) 11851 vp = RTOV4(rp); 11852 11853 vphm = page_vnode_mutex(vp); 11854 mutex_enter(vphm); 11855 /* 11856 * If there are no pages associated with this vnode, then 11857 * just return. 11858 */ 11859 if ((pp = vp->v_pages) == NULL) { 11860 mutex_exit(vphm); 11861 return; 11862 } 11863 11864 do { 11865 if (pp->p_fsdata != C_NOCOMMIT) { 11866 hat_setmod(pp); 11867 pp->p_fsdata = C_NOCOMMIT; 11868 } 11869 } while ((pp = pp->p_vpnext) != vp->v_pages); 11870 mutex_exit(vphm); 11871 } 11872 11873 /* 11874 * This function is used to gather a page list of the pages which 11875 * can be committed on the server. 11876 * 11877 * The calling thread must have set R4COMMIT. This bit is used to 11878 * serialize access to the commit structure in the rnode. As long 11879 * as the thread has set R4COMMIT, then it can manipulate the commit 11880 * structure without requiring any other locks. 11881 * 11882 * When this function is called from nfs4_dispose() the page passed 11883 * into nfs4_dispose() will be SE_EXCL locked, and so this function 11884 * will skip it. This is not a problem since we initially add the 11885 * page to the r_commit page list. 11886 * 11887 */ 11888 static void 11889 nfs4_get_commit(vnode_t *vp) 11890 { 11891 rnode4_t *rp; 11892 page_t *pp; 11893 kmutex_t *vphm; 11894 11895 rp = VTOR4(vp); 11896 11897 ASSERT(rp->r_flags & R4COMMIT); 11898 11899 /* make sure we're looking at the master vnode, not a shadow */ 11900 11901 if (IS_SHADOW(vp, rp)) 11902 vp = RTOV4(rp); 11903 11904 vphm = page_vnode_mutex(vp); 11905 mutex_enter(vphm); 11906 11907 /* 11908 * If there are no pages associated with this vnode, then 11909 * just return. 11910 */ 11911 if ((pp = vp->v_pages) == NULL) { 11912 mutex_exit(vphm); 11913 return; 11914 } 11915 11916 /* 11917 * Step through all of the pages associated with this vnode 11918 * looking for pages which need to be committed. 
11919 */ 11920 do { 11921 /* 11922 * First short-cut everything (without the page_lock) 11923 * and see if this page does not need to be committed 11924 * or is modified; if so, we'll just skip it. 11925 */ 11926 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) 11927 continue; 11928 11929 /* 11930 * Attempt to lock the page. If we can't, then 11931 * someone else is messing with it or we have been 11932 * called from nfs4_dispose and this is the page that 11933 * nfs4_dispose was called with; either way, just skip it. 11934 */ 11935 if (!page_trylock(pp, SE_EXCL)) 11936 continue; 11937 11938 /* 11939 * Let's check again now that we have the page lock. 11940 */ 11941 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 11942 page_unlock(pp); 11943 continue; 11944 } 11945 11946 /* this had better not be a free page */ 11947 ASSERT(PP_ISFREE(pp) == 0); 11948 11949 /* 11950 * The page needs to be committed and we locked it. 11951 * Update the base and length parameters and add it 11952 * to r_pages. 11953 */ 11954 if (rp->r_commit.c_pages == NULL) { 11955 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11956 rp->r_commit.c_commlen = PAGESIZE; 11957 } else if (pp->p_offset < rp->r_commit.c_commbase) { 11958 rp->r_commit.c_commlen = rp->r_commit.c_commbase - 11959 (offset3)pp->p_offset + rp->r_commit.c_commlen; 11960 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11961 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen) 11962 <= pp->p_offset) { 11963 rp->r_commit.c_commlen = (offset3)pp->p_offset - 11964 rp->r_commit.c_commbase + PAGESIZE; 11965 } 11966 page_add(&rp->r_commit.c_pages, pp); 11967 } while ((pp = pp->p_vpnext) != vp->v_pages); 11968 11969 mutex_exit(vphm); 11970 } 11971 11972 /* 11973 * This routine is used to gather together a page list of the pages 11974 * which are to be committed on the server. This routine must not 11975 * be called if the calling thread holds any locked pages. 11976 * 11977 * The calling thread must have set R4COMMIT. This bit is used to 11978 * serialize access to the commit structure in the rnode. As long 11979 * as the thread has set R4COMMIT, then it can manipulate the commit 11980 * structure without requiring any other locks. 11981 */ 11982 static void 11983 nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len) 11984 { 11985 11986 rnode4_t *rp; 11987 page_t *pp; 11988 u_offset_t end; 11989 u_offset_t off; 11990 ASSERT(len != 0); 11991 rp = VTOR4(vp); 11992 ASSERT(rp->r_flags & R4COMMIT); 11993 11994 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11995 11996 /* make sure we're looking at the master vnode, not a shadow */ 11997 11998 if (IS_SHADOW(vp, rp)) 11999 vp = RTOV4(rp); 12000 12001 /* 12002 * If there are no pages associated with this vnode, then 12003 * just return. 12004 */ 12005 if ((pp = vp->v_pages) == NULL) 12006 return; 12007 /* 12008 * Calculate the ending offset. 12009 */ 12010 end = soff + len; 12011 for (off = soff; off < end; off += PAGESIZE) { 12012 /* 12013 * Lookup each page by vp, offset. 12014 */ 12015 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL) 12016 continue; 12017 /* 12018 * If this page does not need to be committed or is 12019 * modified, then just skip it. 12020 */ 12021 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 12022 page_unlock(pp); 12023 continue; 12024 } 12025 12026 ASSERT(PP_ISFREE(pp) == 0); 12027 /* 12028 * The page needs to be committed and we locked it. 12029 * Update the base and length parameters and add it 12030 * to r_pages.
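 *
 * Unlike nfs4_get_commit(), the surrounding loop walks offsets in
 * ascending order, so the extent only ever grows upward: the first
 * page found sets c_commbase, and each later page just extends
 * c_commlen (the else arm below).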
12031 */ 12032 if (rp->r_commit.c_pages == NULL) { 12033 rp->r_commit.c_commbase = (offset3)pp->p_offset; 12034 rp->r_commit.c_commlen = PAGESIZE; 12035 } else { 12036 rp->r_commit.c_commlen = (offset3)pp->p_offset - 12037 rp->r_commit.c_commbase + PAGESIZE; 12038 } 12039 page_add(&rp->r_commit.c_pages, pp); 12040 } 12041 } 12042 12043 /* 12044 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap(). 12045 * Flushes and commits data to the server. 12046 */ 12047 static int 12048 nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr) 12049 { 12050 int error; 12051 verifier4 write_verf; 12052 rnode4_t *rp = VTOR4(vp); 12053 12054 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12055 12056 /* 12057 * Flush the data portion of the file and then commit any 12058 * portions which need to be committed. This may need to 12059 * be done twice if the server has changed state since 12060 * data was last written. The data will need to be 12061 * rewritten to the server and then a new commit done. 12062 * 12063 * In fact, this may need to be done several times if the 12064 * server is having problems and crashing while we are 12065 * attempting to do this. 12066 */ 12067 12068 top: 12069 /* 12070 * Do a flush based on the poff and plen arguments. This 12071 * will asynchronously write out any modified pages in the 12072 * range specified by (poff, plen). This starts all of the 12073 * i/o operations which will be waited for in the next 12074 * call to nfs4_putpage. 12075 */ 12076 12077 mutex_enter(&rp->r_statelock); 12078 write_verf = rp->r_writeverf; 12079 mutex_exit(&rp->r_statelock); 12080 12081 error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr, NULL); 12082 if (error == EAGAIN) 12083 error = 0; 12084 12085 /* 12086 * Do a flush based on the poff and plen arguments. This 12087 * will synchronously write out any modified pages in the 12088 * range specified by (poff, plen) and wait until all of 12089 * the asynchronous i/o's in that range are done as well. 12090 */ 12091 if (!error) 12092 error = nfs4_putpage(vp, poff, plen, 0, cr, NULL); 12093 12094 if (error) 12095 return (error); 12096 12097 mutex_enter(&rp->r_statelock); 12098 if (rp->r_writeverf != write_verf) { 12099 mutex_exit(&rp->r_statelock); 12100 goto top; 12101 } 12102 mutex_exit(&rp->r_statelock); 12103 12104 /* 12105 * Now commit any pages which might need to be committed. 12106 * If the error, NFS_VERF_MISMATCH, is returned, then 12107 * start over with the flush operation. 12108 */ 12109 error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT); 12110 12111 if (error == NFS_VERF_MISMATCH) 12112 goto top; 12113 12114 return (error); 12115 } 12116 12117 /* 12118 * nfs4_commit_vp() will wait for other pending commits and 12119 * will either commit the whole file or a range; plen dictates 12120 * whether we commit the whole file. A value of zero indicates the whole 12121 * file.
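 *
 * Illustrative usage (hypothetical callers; the real ones are named
 * below). A zero wait_on_writes here stands for any value other
 * than NFS4_WRITE_WAIT:
 *
 *	error = nfs4_commit_vp(vp, (u_offset_t)0, 0, cr,
 *	    NFS4_WRITE_WAIT);		whole file, wait for async writes
 *	error = nfs4_commit_vp(vp, poff, plen, cr, 0);
 *					just [poff, poff + plen)
 *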
Called from nfs4_putpage_commit() or nfs4_sync_putapage(). 12122 */ 12123 static int 12124 nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, 12125 cred_t *cr, int wait_on_writes) 12126 { 12127 rnode4_t *rp; 12128 page_t *plist; 12129 offset3 offset; 12130 count3 len; 12131 12132 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12133 12134 rp = VTOR4(vp); 12135 12136 /* 12137 * before we gather committable pages make 12138 * sure there are no outstanding async writes 12139 */ 12140 if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) { 12141 mutex_enter(&rp->r_statelock); 12142 while (rp->r_count > 0) { 12143 cv_wait(&rp->r_cv, &rp->r_statelock); 12144 } 12145 mutex_exit(&rp->r_statelock); 12146 } 12147 12148 /* 12149 * Set the `commit inprogress' state bit. We must 12150 * first wait until any current one finishes. 12151 */ 12152 mutex_enter(&rp->r_statelock); 12153 while (rp->r_flags & R4COMMIT) { 12154 rp->r_flags |= R4COMMITWAIT; 12155 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 12156 rp->r_flags &= ~R4COMMITWAIT; 12157 } 12158 rp->r_flags |= R4COMMIT; 12159 mutex_exit(&rp->r_statelock); 12160 12161 /* 12162 * Gather all of the pages which need to be 12163 * committed. 12164 */ 12165 if (plen == 0) 12166 nfs4_get_commit(vp); 12167 else 12168 nfs4_get_commit_range(vp, poff, plen); 12169 12170 /* 12171 * Clear the `commit inprogress' bit and disconnect the 12172 * page list which was gathered by nfs4_get_commit. 12173 */ 12174 plist = rp->r_commit.c_pages; 12175 rp->r_commit.c_pages = NULL; 12176 offset = rp->r_commit.c_commbase; 12177 len = rp->r_commit.c_commlen; 12178 mutex_enter(&rp->r_statelock); 12179 rp->r_flags &= ~R4COMMIT; 12180 cv_broadcast(&rp->r_commit.c_cv); 12181 mutex_exit(&rp->r_statelock); 12182 12183 /* 12184 * If any pages need to be committed, commit them and 12185 * then unlock them so that they can be freed some 12186 * time later. 12187 */ 12188 if (plist == NULL) 12189 return (0); 12190 12191 /* 12192 * No error occurred during the flush portion 12193 * of this operation, so now attempt to commit 12194 * the data to stable storage on the server. 12195 * 12196 * This will unlock all of the pages on the list. 12197 */ 12198 return (nfs4_sync_commit(vp, plist, offset, len, cr)); 12199 } 12200 12201 static int 12202 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 12203 cred_t *cr) 12204 { 12205 int error; 12206 page_t *pp; 12207 12208 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12209 12210 error = nfs4_commit(vp, (offset4)offset, (count3)count, cr); 12211 12212 /* 12213 * If we got an error, then just unlock all of the pages 12214 * on the list. 12215 */ 12216 if (error) { 12217 while (plist != NULL) { 12218 pp = plist; 12219 page_sub(&plist, pp); 12220 page_unlock(pp); 12221 } 12222 return (error); 12223 } 12224 /* 12225 * We've tried as hard as we can to commit the data to stable 12226 * storage on the server. We just unlock the pages and clear 12227 * the commit required state. They will get freed later.
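 * (Resetting p_fsdata to C_NOCOMMIT is what keeps a later
 * nfs4_get_commit() pass from collecting these pages again.)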
12228 */ 12229 while (plist != NULL) { 12230 pp = plist; 12231 page_sub(&plist, pp); 12232 pp->p_fsdata = C_NOCOMMIT; 12233 page_unlock(pp); 12234 } 12235 12236 return (error); 12237 } 12238 12239 static void 12240 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 12241 cred_t *cr) 12242 { 12243 12244 (void) nfs4_sync_commit(vp, plist, offset, count, cr); 12245 } 12246 12247 /*ARGSUSED*/ 12248 static int 12249 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 12250 caller_context_t *ct) 12251 { 12252 int error = 0; 12253 mntinfo4_t *mi; 12254 vattr_t va; 12255 vsecattr_t nfsace4_vsap; 12256 12257 mi = VTOMI4(vp); 12258 if (nfs_zone() != mi->mi_zone) 12259 return (EIO); 12260 if (mi->mi_flags & MI4_ACL) { 12261 /* if we have a delegation, return it */ 12262 if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE) 12263 (void) nfs4delegreturn(VTOR4(vp), 12264 NFS4_DR_REOPEN|NFS4_DR_PUSH); 12265 12266 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, 12267 NFS4_ACL_SET); 12268 if (error) /* EINVAL */ 12269 return (error); 12270 12271 if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) { 12272 /* 12273 * These are aclent_t type entries. 12274 */ 12275 error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap, 12276 vp->v_type == VDIR, FALSE); 12277 if (error) 12278 return (error); 12279 } else { 12280 /* 12281 * These are ace_t type entries. 12282 */ 12283 error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap, 12284 FALSE); 12285 if (error) 12286 return (error); 12287 } 12288 bzero(&va, sizeof (va)); 12289 error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap); 12290 vs_ace4_destroy(&nfsace4_vsap); 12291 return (error); 12292 } 12293 return (ENOSYS); 12294 } 12295 12296 /* ARGSUSED */ 12297 int 12298 nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 12299 caller_context_t *ct) 12300 { 12301 int error; 12302 mntinfo4_t *mi; 12303 nfs4_ga_res_t gar; 12304 rnode4_t *rp = VTOR4(vp); 12305 12306 mi = VTOMI4(vp); 12307 if (nfs_zone() != mi->mi_zone) 12308 return (EIO); 12309 12310 bzero(&gar, sizeof (gar)); 12311 gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask; 12312 12313 /* 12314 * vsecattr->vsa_mask holds the original acl request mask. 12315 * This is needed when determining what to return. 12316 * (See: nfs4_create_getsecattr_return()) 12317 */ 12318 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET); 12319 if (error) /* EINVAL */ 12320 return (error); 12321 12322 if (mi->mi_flags & MI4_ACL) { 12323 /* 12324 * Check if the data is cached and the cache is valid. If it 12325 * is we don't go over the wire. 12326 */ 12327 if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) { 12328 mutex_enter(&rp->r_statelock); 12329 if (rp->r_secattr != NULL) { 12330 error = nfs4_create_getsecattr_return( 12331 rp->r_secattr, vsecattr, rp->r_attr.va_uid, 12332 rp->r_attr.va_gid, 12333 vp->v_type == VDIR); 12334 if (!error) { /* error == 0 - Success! */ 12335 mutex_exit(&rp->r_statelock); 12336 return (error); 12337 } 12338 } 12339 mutex_exit(&rp->r_statelock); 12340 } 12341 12342 /* 12343 * The getattr otw call will always get both the acl, in 12344 * the form of a list of nfsace4's, and the number of acl 12345 * entries; independent of the value of gar.n4g_vsa.vsa_mask. 
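 *
 * For example (hypothetical caller): a request whose vsa_mask is just
 * VSA_ACECNT still comes back from the server with the complete ace4
 * list; nfs4_create_getsecattr_return() below then frees the list and
 * passes back only the count, so the caller never has to free
 * anything it did not ask for.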
12346 */ 12347 gar.n4g_va.va_mask = AT_ALL; 12348 error = nfs4_getattr_otw(vp, &gar, cr, 1); 12349 if (error) { 12350 vs_ace4_destroy(&gar.n4g_vsa); 12351 if (error == ENOTSUP || error == EOPNOTSUPP) 12352 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12353 return (error); 12354 } 12355 12356 if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) { 12357 /* 12358 * No error was returned, but according to the response 12359 * bitmap, neither was an acl. 12360 */ 12361 vs_ace4_destroy(&gar.n4g_vsa); 12362 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12363 return (error); 12364 } 12365 12366 /* 12367 * Update the cache with the ACL. 12368 */ 12369 nfs4_acl_fill_cache(rp, &gar.n4g_vsa); 12370 12371 error = nfs4_create_getsecattr_return(&gar.n4g_vsa, 12372 vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid, 12373 vp->v_type == VDIR); 12374 vs_ace4_destroy(&gar.n4g_vsa); 12375 if ((error) && (vsecattr->vsa_mask & 12376 (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) && 12377 (error != EACCES)) { 12378 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12379 } 12380 return (error); 12381 } 12382 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12383 return (error); 12384 } 12385 12386 /* 12387 * The function returns: 12388 * - 0 (zero) if the passed in "acl_mask" is a valid request. 12389 * - EINVAL if the passed in "acl_mask" is an invalid request. 12390 * 12391 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if: 12392 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12393 * 12394 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if: 12395 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12396 * - We have a count field set without the corresponding acl field set. (e.g. - 12397 * VSA_ACECNT is set, but VSA_ACE is not) 12398 */ 12399 static int 12400 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op) 12401 { 12402 /* Shortcut the masks that are always valid. */ 12403 if (acl_mask == (VSA_ACE | VSA_ACECNT)) 12404 return (0); 12405 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) 12406 return (0); 12407 12408 if (acl_mask & (VSA_ACE | VSA_ACECNT)) { 12409 /* 12410 * We can't have any VSA_ACL type stuff in the mask now. 12411 */ 12412 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12413 VSA_DFACLCNT)) 12414 return (EINVAL); 12415 12416 if (op == NFS4_ACL_SET) { 12417 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE)) 12418 return (EINVAL); 12419 } 12420 } 12421 12422 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) { 12423 /* 12424 * We can't have any VSA_ACE type stuff in the mask now. 12425 */ 12426 if (acl_mask & (VSA_ACE | VSA_ACECNT)) 12427 return (EINVAL); 12428 12429 if (op == NFS4_ACL_SET) { 12430 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL)) 12431 return (EINVAL); 12432 12433 if ((acl_mask & VSA_DFACLCNT) && 12434 !(acl_mask & VSA_DFACL)) 12435 return (EINVAL); 12436 } 12437 } 12438 return (0); 12439 } 12440 12441 /* 12442 * The theory behind creating the correct getsecattr return is simply this: 12443 * "Don't return anything that the caller is not expecting to have to free." 12444 */ 12445 static int 12446 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap, 12447 uid_t uid, gid_t gid, int isdir) 12448 { 12449 int error = 0; 12450 /* Save the mask since the translators modify it. 
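 *
 * (The vs_ace4_to_acet()/vs_ace4_to_aent() translators overwrite
 * vsa_mask to reflect what they actually filled in, which is why the
 * original request mask is saved here and restored before returning.)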
*/ 12451 uint_t orig_mask = vsap->vsa_mask; 12452 12453 if (orig_mask & (VSA_ACE | VSA_ACECNT)) { 12454 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, 12455 FALSE, ((orig_mask & VSA_ACE) ? FALSE : TRUE)); 12456 12457 if (error) 12458 return (error); 12459 12460 /* 12461 * If the caller only asked for the ace count (VSA_ACECNT), 12462 * don't give them the full acl (VSA_ACE); free it. 12463 */ 12464 if (!(orig_mask & VSA_ACE)) { 12465 if (vsap->vsa_aclentp != NULL) { 12466 kmem_free(vsap->vsa_aclentp, 12467 vsap->vsa_aclcnt * sizeof (ace_t)); 12468 vsap->vsa_aclentp = NULL; 12469 } 12470 } 12471 vsap->vsa_mask = orig_mask; 12472 12473 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12474 VSA_DFACLCNT)) { 12475 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid, 12476 isdir, FALSE, 12477 ((orig_mask & (VSA_ACL | VSA_DFACL)) ? FALSE : TRUE)); 12478 12479 if (error) 12480 return (error); 12481 12482 /* 12483 * If the caller only asked for the acl count (VSA_ACLCNT) 12484 * and/or the default acl count (VSA_DFACLCNT), don't give them 12485 * the acl (VSA_ACL) or default acl (VSA_DFACL); free it. 12486 */ 12487 if (!(orig_mask & VSA_ACL)) { 12488 if (vsap->vsa_aclentp != NULL) { 12489 kmem_free(vsap->vsa_aclentp, 12490 vsap->vsa_aclcnt * sizeof (aclent_t)); 12491 vsap->vsa_aclentp = NULL; 12492 } 12493 } 12494 12495 if (!(orig_mask & VSA_DFACL)) { 12496 if (vsap->vsa_dfaclentp != NULL) { 12497 kmem_free(vsap->vsa_dfaclentp, 12498 vsap->vsa_dfaclcnt * sizeof (aclent_t)); 12499 vsap->vsa_dfaclentp = NULL; 12500 } 12501 } 12502 vsap->vsa_mask = orig_mask; 12503 } 12504 return (0); 12505 } 12506 12507 /* ARGSUSED */ 12508 int 12509 nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr, 12510 caller_context_t *ct) 12511 { 12512 int error; 12513 12514 if (nfs_zone() != VTOMI4(vp)->mi_zone) 12515 return (EIO); 12516 /* 12517 * check for valid cmd parameter 12518 */ 12519 if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS) 12520 return (EINVAL); 12521 12522 /* 12523 * Check access permissions 12524 */ 12525 if ((cmd & F_SHARE) && 12526 (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) || 12527 (shr->s_access == F_WRACC && (flag & FWRITE) == 0))) 12528 return (EBADF); 12529 12530 /* 12531 * If the filesystem is mounted using local locking, pass the 12532 * request off to the local share code. 12533 */ 12534 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) 12535 return (fs_shrlock(vp, cmd, shr, flag, cr, ct)); 12536 12537 switch (cmd) { 12538 case F_SHARE: 12539 case F_UNSHARE: 12540 /* 12541 * This will be properly implemented later, 12542 * see RFE: 4823948.
12543 */ 12544 error = EAGAIN; 12545 break; 12546 12547 case F_HASREMOTELOCKS: 12548 /* 12549 * NFS client can't store remote locks itself 12550 */ 12551 shr->s_access = 0; 12552 error = 0; 12553 break; 12554 12555 default: 12556 error = EINVAL; 12557 break; 12558 } 12559 12560 return (error); 12561 } 12562 12563 /* 12564 * Common code called by directory ops to update the attrcache 12565 */ 12566 static int 12567 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp, 12568 hrtime_t t, vnode_t *vp, cred_t *cr) 12569 { 12570 int error = 0; 12571 12572 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12573 12574 if (status != NFS4_OK) { 12575 /* getattr not done or failed */ 12576 PURGE_ATTRCACHE4(vp); 12577 return (error); 12578 } 12579 12580 if (garp) { 12581 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL); 12582 } else { 12583 PURGE_ATTRCACHE4(vp); 12584 } 12585 return (error); 12586 } 12587 12588 /* 12589 * Update directory caches for directory modification ops (link, rename, etc.) 12590 * When dinfo is NULL, manage dircaches in the old way. 12591 */ 12592 static void 12593 nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm, 12594 dirattr_info_t *dinfo) 12595 { 12596 rnode4_t *drp = VTOR4(dvp); 12597 12598 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 12599 12600 /* Purge rddir cache for dir since it changed */ 12601 if (drp->r_dir != NULL) 12602 nfs4_purge_rddir_cache(dvp); 12603 12604 /* 12605 * If caller provided dinfo, then use it to manage dir caches. 12606 */ 12607 if (dinfo != NULL) { 12608 if (vp != NULL) { 12609 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12610 if (!VTOR4(vp)->created_v4) { 12611 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12612 dnlc_update(dvp, nm, vp); 12613 } else { 12614 /* 12615 * XXX don't update if the created_v4 flag is 12616 * set 12617 */ 12618 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12619 NFS4_DEBUG(nfs4_client_state_debug, 12620 (CE_NOTE, "nfs4_update_dircaches: " 12621 "don't update dnlc: created_v4 flag")); 12622 } 12623 } 12624 12625 nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call, 12626 dinfo->di_cred, FALSE, cinfo); 12627 12628 return; 12629 } 12630 12631 /* 12632 * Caller didn't provide dinfo, so check change_info4 to update the DNLC. 12633 * Since caller modified dir but didn't receive post-dirmod-op dir 12634 * attrs, the dir's attrs must be purged. 12635 * 12636 * XXX this check and dnlc update/purge should really be atomic, 12637 * XXX but can't use rnode statelock because it'll deadlock in 12638 * XXX dnlc_purge_vp, however, the risk is minimal even if a race 12639 * XXX does occur. 12640 * 12641 * XXX We also may want to check that atomic is true in the 12642 * XXX change_info struct. If it is not, the change_info may 12643 * XXX reflect changes by more than one client, which means that 12644 * XXX our cache may not be valid.
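 *
 * Sketch of the check below (values hypothetical): the server's
 * change_info4 carries { atomic, before, after } for the directory.
 * If our cached r_change equals cinfo->before (say both are 42), no
 * other client changed the directory out from under us and the DNLC
 * entry for 'nm' can be updated; otherwise the DNLC for dvp is
 * purged.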
12645 */ 12646 PURGE_ATTRCACHE4(dvp); 12647 if (drp->r_change == cinfo->before) { 12648 /* no changes took place in the directory prior to our link */ 12649 if (vp != NULL) { 12650 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12651 if (!VTOR4(vp)->created_v4) { 12652 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12653 dnlc_update(dvp, nm, vp); 12654 } else { 12655 /* 12656 * XXX don't update if the created_v4 flag 12657 * is set 12658 */ 12659 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12660 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 12661 "nfs4_update_dircaches: don't" 12662 " update dnlc: created_v4 flag")); 12663 } 12664 } 12665 } else { 12666 /* Another client modified directory - purge its dnlc cache */ 12667 dnlc_purge_vp(dvp); 12668 } 12669 } 12670 12671 /* 12672 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a 12673 * file. 12674 * 12675 * The 'reopening_file' boolean should be set to TRUE if we are reopening this 12676 * file (ie: client recovery) and otherwise set to FALSE. 12677 * 12678 * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery 12679 * initiated) calling functions. 12680 * 12681 * 'resend' is set to TRUE if this is an OPEN_CONFIRM issued as a result 12682 * of resending a 'lost' open request. 12683 * 12684 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken 12685 * server that hands out BAD_SEQID on open confirm. 12686 * 12687 * Errors are returned via the nfs4_error_t parameter. 12688 */ 12689 void 12690 nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr, 12691 bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop, 12692 bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp) 12693 { 12694 COMPOUND4args_clnt args; 12695 COMPOUND4res_clnt res; 12696 nfs_argop4 argop[2]; 12697 nfs_resop4 *resop; 12698 int doqueue = 1; 12699 mntinfo4_t *mi; 12700 OPEN_CONFIRM4args *open_confirm_args; 12701 int needrecov; 12702 12703 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12704 #if DEBUG 12705 mutex_enter(&oop->oo_lock); 12706 ASSERT(oop->oo_seqid_inuse); 12707 mutex_exit(&oop->oo_lock); 12708 #endif 12709 12710 recov_retry_confirm: 12711 nfs4_error_zinit(ep); 12712 *retry_open = FALSE; 12713 12714 if (resend) 12715 args.ctag = TAG_OPEN_CONFIRM_LOST; 12716 else 12717 args.ctag = TAG_OPEN_CONFIRM; 12718 12719 args.array_len = 2; 12720 args.array = argop; 12721 12722 /* putfh target fh */ 12723 argop[0].argop = OP_CPUTFH; 12724 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 12725 12726 argop[1].argop = OP_OPEN_CONFIRM; 12727 open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm; 12728 12729 (*seqid) += 1; 12730 open_confirm_args->seqid = *seqid; 12731 open_confirm_args->open_stateid = *stateid; 12732 12733 mi = VTOMI4(vp); 12734 12735 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep); 12736 12737 if (!ep->error && nfs4_need_to_bump_seqid(&res)) { 12738 nfs4_set_open_seqid((*seqid), oop, args.ctag); 12739 } 12740 12741 needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp); 12742 if (!needrecov && ep->error) 12743 return; 12744 12745 if (needrecov) { 12746 bool_t abort = FALSE; 12747 12748 if (reopening_file == FALSE) { 12749 nfs4_bseqid_entry_t *bsep = NULL; 12750 12751 if (!ep->error && res.status == NFS4ERR_BAD_SEQID) 12752 bsep = nfs4_create_bseqid_entry(oop, NULL, 12753 vp, 0, args.ctag, 12754 open_confirm_args->seqid); 12755 12756 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, 12757 NULL, NULL, NULL, OP_OPEN_CONFIRM, bsep); 12758 if (bsep) { 12759 kmem_free(bsep, sizeof (*bsep));
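				/*
				 * Countdown guard against a broken server
				 * that keeps handing out BAD_SEQID on
				 * OPEN_CONFIRM: once the caller-supplied
				 * retry budget hits zero, abort recovery
				 * rather than looping forever.
				 */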
12760 if (num_bseqid_retryp && 12761 --(*num_bseqid_retryp) == 0) 12762 abort = TRUE; 12763 } 12764 } 12765 if ((ep->error == ETIMEDOUT || 12766 res.status == NFS4ERR_RESOURCE) && 12767 abort == FALSE && resend == FALSE) { 12768 if (!ep->error) 12769 (void) xdr_free(xdr_COMPOUND4res_clnt, 12770 (caddr_t)&res); 12771 12772 delay(SEC_TO_TICK(confirm_retry_sec)); 12773 goto recov_retry_confirm; 12774 } 12775 /* State may have changed so retry the entire OPEN op */ 12776 if (abort == FALSE) 12777 *retry_open = TRUE; 12778 else 12779 *retry_open = FALSE; 12780 if (!ep->error) 12781 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12782 return; 12783 } 12784 12785 if (res.status) { 12786 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12787 return; 12788 } 12789 12790 resop = &res.array[1]; /* open confirm res */ 12791 bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid, 12792 stateid, sizeof (*stateid)); 12793 12794 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12795 } 12796 12797 /* 12798 * Return the credentials associated with a client state object. The 12799 * caller is responsible for freeing the credentials. 12800 */ 12801 12802 static cred_t * 12803 state_to_cred(nfs4_open_stream_t *osp) 12804 { 12805 cred_t *cr; 12806 12807 /* 12808 * It's ok to not lock the open stream and open owner to get 12809 * the oo_cred since this is only written once (upon creation) 12810 * and will not change. 12811 */ 12812 cr = osp->os_open_owner->oo_cred; 12813 crhold(cr); 12814 12815 return (cr); 12816 } 12817 12818 /* 12819 * nfs4_find_sysid 12820 * 12821 * Find the sysid for the knetconfig associated with the given mi. 12822 */ 12823 static struct lm_sysid * 12824 nfs4_find_sysid(mntinfo4_t *mi) 12825 { 12826 ASSERT(nfs_zone() == mi->mi_zone); 12827 12828 /* 12829 * Switch from RDMA knconf to original mount knconf 12830 */ 12831 return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr, 12832 mi->mi_curr_serv->sv_hostname, NULL)); 12833 } 12834 12835 #ifdef DEBUG 12836 /* 12837 * Return a string version of the call type for easy reading. 12838 */ 12839 static char * 12840 nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype) 12841 { 12842 switch (ctype) { 12843 case NFS4_LCK_CTYPE_NORM: 12844 return ("NORMAL"); 12845 case NFS4_LCK_CTYPE_RECLAIM: 12846 return ("RECLAIM"); 12847 case NFS4_LCK_CTYPE_RESEND: 12848 return ("RESEND"); 12849 case NFS4_LCK_CTYPE_REINSTATE: 12850 return ("REINSTATE"); 12851 default: 12852 cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal " 12853 "type %d", ctype); 12854 return (""); 12855 } 12856 } 12857 #endif 12858 12859 /* 12860 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type 12861 * Unlock requests don't have an over-the-wire locktype, so we just return 12862 * something non-threatening. 12863 */ 12864 12865 static nfs_lock_type4 12866 flk_to_locktype(int cmd, int l_type) 12867 { 12868 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK); 12869 12870 switch (l_type) { 12871 case F_UNLCK: 12872 return (READ_LT); 12873 case F_RDLCK: 12874 if (cmd == F_SETLK) 12875 return (READ_LT); 12876 else 12877 return (READW_LT); 12878 case F_WRLCK: 12879 if (cmd == F_SETLK) 12880 return (WRITE_LT); 12881 else 12882 return (WRITEW_LT); 12883 } 12884 panic("flk_to_locktype"); 12885 /*NOTREACHED*/ 12886 } 12887 12888 /* 12889 * Do some preliminary checks for nfs4frlock. 
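 *
 * For example (hypothetical caller): an F_SETLK request for an
 * F_WRLCK on a descriptor opened O_RDONLY arrives here with FWRITE
 * clear in 'flag', so the check below fails it with EBADF before
 * anything is sent over the wire.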
12890 */ 12891 static int 12892 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp, 12893 u_offset_t offset) 12894 { 12895 int error = 0; 12896 12897 /* 12898 * If we are setting a lock, check that the file is opened 12899 * with the correct mode. 12900 */ 12901 if (cmd == F_SETLK || cmd == F_SETLKW) { 12902 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) || 12903 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) { 12904 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12905 "nfs4frlock_validate_args: file was opened with " 12906 "incorrect mode")); 12907 return (EBADF); 12908 } 12909 } 12910 12911 /* Convert the offset. It may need to be restored before returning. */ 12912 if (error = convoff(vp, flk, 0, offset)) { 12913 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12914 "nfs4frlock_validate_args: convoff => error= %d\n", 12915 error)); 12916 return (error); 12917 } 12918 12919 return (error); 12920 } 12921 12922 /* 12923 * Set the flock64's lm_sysid for nfs4frlock. 12924 */ 12925 static int 12926 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk) 12927 { 12928 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12929 12930 /* Find the lm_sysid */ 12931 *lspp = nfs4_find_sysid(VTOMI4(vp)); 12932 12933 if (*lspp == NULL) { 12934 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12935 "nfs4frlock_get_sysid: no sysid, return ENOLCK")); 12936 return (ENOLCK); 12937 } 12938 12939 flk->l_sysid = lm_sysidt(*lspp); 12940 12941 return (0); 12942 } 12943 12944 /* 12945 * Do the remaining preliminary setup for nfs4frlock. 12946 */ 12947 static void 12948 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep, 12949 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr, 12950 cred_t **cred_otw) 12951 { 12952 /* 12953 * set tick_delay to the base delay time. 12954 * (NFS4_BASE_WAIT_TIME is in secs) 12955 */ 12956 12957 *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000); 12958 12959 /* 12960 * If lock is relative to EOF, we need the newest length of the 12961 * file. Therefore invalidate the ATTR_CACHE. 12962 */ 12963 12964 *whencep = flk->l_whence; 12965 12966 if (*whencep == 2) /* SEEK_END */ 12967 PURGE_ATTRCACHE4(vp); 12968 12969 recov_statep->rs_flags = 0; 12970 recov_statep->rs_num_retry_despite_err = 0; 12971 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL); 12972 } 12973 12974 /* 12975 * Initialize and allocate the data structures necessary for 12976 * the nfs4frlock call. 12977 * Allocates argsp's op array, frees up the saved_rqstpp if there is one. 
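 *
 * The compound built here always has the two-op shape
 *
 *	argop[0]: OP_CPUTFH (current filehandle of the file)
 *	argop[1]: OP_LOCK, OP_LOCKU or OP_LOCKT
 *
 * which is why num_ops is fixed at 2 below and why the teardown
 * paths later free only argop[1] by type.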
12978 */ 12979 static void 12980 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp, 12981 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd, 12982 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp, 12983 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp) 12984 { 12985 int argoplist_size; 12986 int num_ops = 2; 12987 12988 *retry = FALSE; 12989 *did_start_fop = FALSE; 12990 *skip_get_err = FALSE; 12991 lost_rqstp->lr_op = 0; 12992 argoplist_size = num_ops * sizeof (nfs_argop4); 12993 /* fill array with zero */ 12994 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP); 12995 12996 *argspp = argsp; 12997 *respp = NULL; 12998 12999 argsp->array_len = num_ops; 13000 argsp->array = *argopp; 13001 13002 /* initialize in case of error; will get real value down below */ 13003 argsp->ctag = TAG_NONE; 13004 13005 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) 13006 *op_hintp = OH_LOCKU; 13007 else 13008 *op_hintp = OH_OTHER; 13009 } 13010 13011 /* 13012 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign 13013 * the proper nfs4_server_t for this instance of nfs4frlock. 13014 * Returns 0 (success) or an errno value. 13015 */ 13016 static int 13017 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp, 13018 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep, 13019 bool_t *did_start_fop, bool_t *startrecovp) 13020 { 13021 int error = 0; 13022 rnode4_t *rp; 13023 13024 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13025 13026 if (ctype == NFS4_LCK_CTYPE_NORM) { 13027 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint, 13028 recov_statep, startrecovp); 13029 if (error) 13030 return (error); 13031 *did_start_fop = TRUE; 13032 } else { 13033 *did_start_fop = FALSE; 13034 *startrecovp = FALSE; 13035 } 13036 13037 if (!error) { 13038 rp = VTOR4(vp); 13039 13040 /* If the file failed recovery, just quit. */ 13041 mutex_enter(&rp->r_statelock); 13042 if (rp->r_flags & R4RECOVERR) { 13043 error = EIO; 13044 } 13045 mutex_exit(&rp->r_statelock); 13046 } 13047 13048 return (error); 13049 } 13050 13051 /* 13052 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A 13053 * resend nfs4frlock call is initiated by the recovery framework. 13054 * Acquires the lop and oop seqid synchronization. 13055 */ 13056 static void 13057 nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp, 13058 COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp, 13059 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13060 LOCK4args **lock_argsp, LOCKU4args **locku_argsp) 13061 { 13062 mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp); 13063 int error; 13064 13065 NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug), 13066 (CE_NOTE, 13067 "nfs4frlock_setup_resend_lock_args: have lost lock to resend")); 13068 ASSERT(resend_rqstp != NULL); 13069 ASSERT(resend_rqstp->lr_op == OP_LOCK || 13070 resend_rqstp->lr_op == OP_LOCKU); 13071 13072 *oopp = resend_rqstp->lr_oop; 13073 if (resend_rqstp->lr_oop) { 13074 open_owner_hold(resend_rqstp->lr_oop); 13075 error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi); 13076 ASSERT(error == 0); /* recov thread always succeeds */ 13077 } 13078 13079 /* Must resend this lost lock/locku request. 
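 *
 * Note the length encoding used below: a POSIX l_len of zero means
 * 'to end of file', which NFSv4 (RFC 3530) expresses as a length of
 * all ones; hence the 'if (length == 0) length = ~length'
 * conversions.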
*/ 13080 ASSERT(resend_rqstp->lr_lop != NULL); 13081 *lopp = resend_rqstp->lr_lop; 13082 lock_owner_hold(resend_rqstp->lr_lop); 13083 error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi); 13084 ASSERT(error == 0); /* recov thread always succeeds */ 13085 13086 *ospp = resend_rqstp->lr_osp; 13087 if (*ospp) 13088 open_stream_hold(resend_rqstp->lr_osp); 13089 13090 if (resend_rqstp->lr_op == OP_LOCK) { 13091 LOCK4args *lock_args; 13092 13093 argop->argop = OP_LOCK; 13094 *lock_argsp = lock_args = &argop->nfs_argop4_u.oplock; 13095 lock_args->locktype = resend_rqstp->lr_locktype; 13096 lock_args->reclaim = 13097 (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM); 13098 lock_args->offset = resend_rqstp->lr_flk->l_start; 13099 lock_args->length = resend_rqstp->lr_flk->l_len; 13100 if (lock_args->length == 0) 13101 lock_args->length = ~lock_args->length; 13102 nfs4_setup_lock_args(*lopp, *oopp, *ospp, 13103 mi2clientid(mi), &lock_args->locker); 13104 13105 switch (resend_rqstp->lr_ctype) { 13106 case NFS4_LCK_CTYPE_RESEND: 13107 argsp->ctag = TAG_LOCK_RESEND; 13108 break; 13109 case NFS4_LCK_CTYPE_REINSTATE: 13110 argsp->ctag = TAG_LOCK_REINSTATE; 13111 break; 13112 case NFS4_LCK_CTYPE_RECLAIM: 13113 argsp->ctag = TAG_LOCK_RECLAIM; 13114 break; 13115 default: 13116 argsp->ctag = TAG_LOCK_UNKNOWN; 13117 break; 13118 } 13119 } else { 13120 LOCKU4args *locku_args; 13121 nfs4_lock_owner_t *lop = resend_rqstp->lr_lop; 13122 13123 argop->argop = OP_LOCKU; 13124 *locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku; 13125 locku_args->locktype = READ_LT; 13126 locku_args->seqid = lop->lock_seqid + 1; 13127 mutex_enter(&lop->lo_lock); 13128 locku_args->lock_stateid = lop->lock_stateid; 13129 mutex_exit(&lop->lo_lock); 13130 locku_args->offset = resend_rqstp->lr_flk->l_start; 13131 locku_args->length = resend_rqstp->lr_flk->l_len; 13132 if (locku_args->length == 0) 13133 locku_args->length = ~locku_args->length; 13134 13135 switch (resend_rqstp->lr_ctype) { 13136 case NFS4_LCK_CTYPE_RESEND: 13137 argsp->ctag = TAG_LOCKU_RESEND; 13138 break; 13139 case NFS4_LCK_CTYPE_REINSTATE: 13140 argsp->ctag = TAG_LOCKU_REINSTATE; 13141 break; 13142 default: 13143 argsp->ctag = TAG_LOCK_UNKNOWN; 13144 break; 13145 } 13146 } 13147 } 13148 13149 /* 13150 * Setup the LOCKT4 arguments. 13151 */ 13152 static void 13153 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 13154 LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk, 13155 rnode4_t *rp) 13156 { 13157 LOCKT4args *lockt_args; 13158 13159 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone); 13160 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13161 argop->argop = OP_LOCKT; 13162 argsp->ctag = TAG_LOCKT; 13163 lockt_args = &argop->nfs_argop4_u.oplockt; 13164 13165 /* 13166 * The locktype will be READ_LT unless it's 13167 * a write lock. We do this because the Solaris 13168 * system call allows the combination of 13169 * F_UNLCK and F_GETLK* and so in that case the 13170 * unlock is mapped to a read. 13171 */ 13172 if (flk->l_type == F_WRLCK) 13173 lockt_args->locktype = WRITE_LT; 13174 else 13175 lockt_args->locktype = READ_LT; 13176 13177 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp))); 13178 /* set the lock owner4 args */ 13179 nfs4_setlockowner_args(&lockt_args->owner, rp, 13180 ctype == NFS4_LCK_CTYPE_NORM ? 
curproc->p_pidp->pid_id : 13181 flk->l_pid); 13182 lockt_args->offset = flk->l_start; 13183 lockt_args->length = flk->l_len; 13184 if (flk->l_len == 0) 13185 lockt_args->length = ~lockt_args->length; 13186 13187 *lockt_argsp = lockt_args; 13188 } 13189 13190 /* 13191 * If the client is holding a delegation, and the open stream to be used 13192 * with this lock request is a delegation open stream, then re-open the stream. 13193 * Sets the nfs4_error_t to all zeros unless the open stream has already 13194 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY 13195 * means the caller should retry (like a recovery retry). 13196 */ 13197 static void 13198 nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt) 13199 { 13200 open_delegation_type4 dt; 13201 bool_t reopen_needed, force; 13202 nfs4_open_stream_t *osp; 13203 open_claim_type4 oclaim; 13204 rnode4_t *rp = VTOR4(vp); 13205 mntinfo4_t *mi = VTOMI4(vp); 13206 13207 ASSERT(nfs_zone() == mi->mi_zone); 13208 13209 nfs4_error_zinit(ep); 13210 13211 mutex_enter(&rp->r_statev4_lock); 13212 dt = rp->r_deleg_type; 13213 mutex_exit(&rp->r_statev4_lock); 13214 13215 if (dt != OPEN_DELEGATE_NONE) { 13216 nfs4_open_owner_t *oop; 13217 13218 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 13219 if (!oop) { 13220 ep->stat = NFS4ERR_IO; 13221 return; 13222 } 13223 /* returns with 'os_sync_lock' held */ 13224 osp = find_open_stream(oop, rp); 13225 if (!osp) { 13226 open_owner_rele(oop); 13227 ep->stat = NFS4ERR_IO; 13228 return; 13229 } 13230 13231 if (osp->os_failed_reopen) { 13232 NFS4_DEBUG((nfs4_open_stream_debug || 13233 nfs4_client_lock_debug), (CE_NOTE, 13234 "nfs4frlock_check_deleg: os_failed_reopen set " 13235 "for osp %p, cr %p, rp %s", (void *)osp, 13236 (void *)cr, rnode4info(rp))); 13237 mutex_exit(&osp->os_sync_lock); 13238 open_stream_rele(osp, rp); 13239 open_owner_rele(oop); 13240 ep->stat = NFS4ERR_IO; 13241 return; 13242 } 13243 13244 /* 13245 * Determine whether a reopen is needed. If this 13246 * is a delegation open stream, then send the open 13247 * to the server to give visibility to the open owner. 13248 * Even if it isn't a delegation open stream, we need 13249 * to check if the previous open CLAIM_DELEGATE_CUR 13250 * was sufficient. 13251 */ 13252 13253 reopen_needed = osp->os_delegation || 13254 ((lt == F_RDLCK && 13255 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) || 13256 (lt == F_WRLCK && 13257 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE))); 13258 13259 mutex_exit(&osp->os_sync_lock); 13260 open_owner_rele(oop); 13261 13262 if (reopen_needed) { 13263 /* 13264 * Always use CLAIM_PREVIOUS after server reboot. 13265 * The server will reject CLAIM_DELEGATE_CUR if 13266 * it is used during the grace period. 13267 */ 13268 mutex_enter(&mi->mi_lock); 13269 if (mi->mi_recovflags & MI4R_SRV_REBOOT) { 13270 oclaim = CLAIM_PREVIOUS; 13271 force = TRUE; 13272 } else { 13273 oclaim = CLAIM_DELEGATE_CUR; 13274 force = FALSE; 13275 } 13276 mutex_exit(&mi->mi_lock); 13277 13278 nfs4_reopen(vp, osp, ep, oclaim, force, FALSE); 13279 if (ep->error == EAGAIN) { 13280 nfs4_error_zinit(ep); 13281 ep->stat = NFS4ERR_DELAY; 13282 } 13283 } 13284 open_stream_rele(osp, rp); 13285 osp = NULL; 13286 } 13287 } 13288 13289 /* 13290 * Setup the LOCKU4 arguments. 13291 * Returns errors via the nfs4_error_t. 13292 * NFS4_OK no problems. *go_otwp is TRUE if call should go 13293 * over-the-wire. The caller must release the 13294 * reference on *lopp. 
13295 * NFS4ERR_DELAY caller should retry (like recovery retry) 13296 * (other) unrecoverable error. 13297 */ 13298 static void 13299 nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 13300 LOCKU4args **locku_argsp, flock64_t *flk, 13301 nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp, 13302 vnode_t *vp, int flag, u_offset_t offset, cred_t *cr, 13303 bool_t *skip_get_err, bool_t *go_otwp) 13304 { 13305 nfs4_lock_owner_t *lop = NULL; 13306 LOCKU4args *locku_args; 13307 pid_t pid; 13308 bool_t is_spec = FALSE; 13309 rnode4_t *rp = VTOR4(vp); 13310 13311 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13312 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13313 13314 nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK); 13315 if (ep->error || ep->stat) 13316 return; 13317 13318 argop->argop = OP_LOCKU; 13319 if (ctype == NFS4_LCK_CTYPE_REINSTATE) 13320 argsp->ctag = TAG_LOCKU_REINSTATE; 13321 else 13322 argsp->ctag = TAG_LOCKU; 13323 locku_args = &argop->nfs_argop4_u.oplocku; 13324 *locku_argsp = locku_args; 13325 13326 /* 13327 * XXX what should locku_args->locktype be? 13328 * setting to ALWAYS be READ_LT so at least 13329 * it is a valid locktype. 13330 */ 13331 13332 locku_args->locktype = READ_LT; 13333 13334 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id : 13335 flk->l_pid; 13336 13337 /* 13338 * Get the lock owner stateid. If no lock owner 13339 * exists, return success. 13340 */ 13341 lop = find_lock_owner(rp, pid, LOWN_ANY); 13342 *lopp = lop; 13343 if (lop && CLNT_ISSPECIAL(&lop->lock_stateid)) 13344 is_spec = TRUE; 13345 if (!lop || is_spec) { 13346 /* 13347 * No lock owner so no locks to unlock. 13348 * Return success. If there was a failed 13349 * reclaim earlier, the lock might still be 13350 * registered with the local locking code, 13351 * so notify it of the unlock. 13352 * 13353 * If the lockowner is using a special stateid, 13354 * then the original lock request (that created 13355 * this lockowner) was never successful, so we 13356 * have no lock to undo OTW. 13357 */ 13358 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13359 "nfs4frlock_setup_locku_args: LOCKU: no lock owner " 13360 "(%ld) so return success", (long)pid)); 13361 13362 if (ctype == NFS4_LCK_CTYPE_NORM) 13363 flk->l_pid = curproc->p_pid; 13364 nfs4_register_lock_locally(vp, flk, flag, offset); 13365 /* 13366 * Release our hold and NULL out so final_cleanup 13367 * doesn't try to end a lock seqid sync we 13368 * never started. 13369 */ 13370 if (is_spec) { 13371 lock_owner_rele(lop); 13372 *lopp = NULL; 13373 } 13374 *skip_get_err = TRUE; 13375 *go_otwp = FALSE; 13376 return; 13377 } 13378 13379 ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp)); 13380 if (ep->error == EAGAIN) { 13381 lock_owner_rele(lop); 13382 *lopp = NULL; 13383 return; 13384 } 13385 13386 mutex_enter(&lop->lo_lock); 13387 locku_args->lock_stateid = lop->lock_stateid; 13388 mutex_exit(&lop->lo_lock); 13389 locku_args->seqid = lop->lock_seqid + 1; 13390 13391 /* leave the ref count on lop, rele after RPC call */ 13392 13393 locku_args->offset = flk->l_start; 13394 locku_args->length = flk->l_len; 13395 if (flk->l_len == 0) 13396 locku_args->length = ~locku_args->length; 13397 13398 *go_otwp = TRUE; 13399 } 13400 13401 /* 13402 * Setup the LOCK4 arguments. 13403 * 13404 * Returns errors via the nfs4_error_t. 
13405 * NFS4_OK no problems 13406 * NFS4ERR_DELAY caller should retry (like recovery retry) 13407 * (other) unrecoverable error 13408 */ 13409 static void 13410 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp, 13411 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13412 nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp, 13413 flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep) 13414 { 13415 LOCK4args *lock_args; 13416 nfs4_open_owner_t *oop = NULL; 13417 nfs4_open_stream_t *osp = NULL; 13418 nfs4_lock_owner_t *lop = NULL; 13419 pid_t pid; 13420 rnode4_t *rp = VTOR4(vp); 13421 13422 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13423 13424 nfs4frlock_check_deleg(vp, ep, cr, flk->l_type); 13425 if (ep->error || ep->stat != NFS4_OK) 13426 return; 13427 13428 argop->argop = OP_LOCK; 13429 if (ctype == NFS4_LCK_CTYPE_NORM) 13430 argsp->ctag = TAG_LOCK; 13431 else if (ctype == NFS4_LCK_CTYPE_RECLAIM) 13432 argsp->ctag = TAG_RELOCK; 13433 else 13434 argsp->ctag = TAG_LOCK_REINSTATE; 13435 lock_args = &argop->nfs_argop4_u.oplock; 13436 lock_args->locktype = flk_to_locktype(cmd, flk->l_type); 13437 lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0; 13438 /* 13439 * Get the lock owner. If no lock owner exists, 13440 * create a 'temporary' one and grab the open seqid 13441 * synchronization (which puts a hold on the open 13442 * owner and open stream). 13443 * This also grabs the lock seqid synchronization. 13444 */ 13445 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : flk->l_pid; 13446 ep->stat = 13447 nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop); 13448 13449 if (ep->stat != NFS4_OK) 13450 goto out; 13451 13452 nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)), 13453 &lock_args->locker); 13454 13455 lock_args->offset = flk->l_start; 13456 lock_args->length = flk->l_len; 13457 if (flk->l_len == 0) 13458 lock_args->length = ~lock_args->length; 13459 *lock_argsp = lock_args; 13460 out: 13461 *oopp = oop; 13462 *ospp = osp; 13463 *lopp = lop; 13464 } 13465 13466 /* 13467 * After we get the reply from the server, record the proper information 13468 * for possible resend lock requests. 13469 * 13470 * Allocates memory for the saved_rqstp if we have a lost lock to save. 13471 */ 13472 static void 13473 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error, 13474 nfs_lock_type4 locktype, nfs4_open_owner_t *oop, 13475 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13476 nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp) 13477 { 13478 bool_t unlock = (flk->l_type == F_UNLCK); 13479 13480 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13481 ASSERT(ctype == NFS4_LCK_CTYPE_NORM || 13482 ctype == NFS4_LCK_CTYPE_REINSTATE); 13483 13484 if (error != 0 && !unlock) { 13485 NFS4_DEBUG((nfs4_lost_rqst_debug || 13486 nfs4_client_lock_debug), (CE_NOTE, 13487 "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 " 13488 " for lop %p", (void *)lop)); 13489 ASSERT(lop != NULL); 13490 mutex_enter(&lop->lo_lock); 13491 lop->lo_pending_rqsts = 1; 13492 mutex_exit(&lop->lo_lock); 13493 } 13494 13495 lost_rqstp->lr_putfirst = FALSE; 13496 lost_rqstp->lr_op = 0; 13497 13498 /* 13499 * For lock/locku requests, we treat EINTR as ETIMEDOUT for 13500 * recovery purposes so that the lock request that was sent 13501 * can be saved and re-issued later. Ditto for EIO from a forced 13502 * unmount. 
This is done to have the client's local locking state 13503 * match the v4 server's state; that is, the request was 13504 * potentially received and accepted by the server but the client 13505 * thinks it was not. 13506 */ 13507 if (error == ETIMEDOUT || error == EINTR || 13508 NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 13509 NFS4_DEBUG((nfs4_lost_rqst_debug || 13510 nfs4_client_lock_debug), (CE_NOTE, 13511 "nfs4frlock_save_lost_rqst: got a lost %s lock for " 13512 "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK", 13513 (void *)lop, (void *)oop, (void *)osp)); 13514 if (unlock) 13515 lost_rqstp->lr_op = OP_LOCKU; 13516 else { 13517 lost_rqstp->lr_op = OP_LOCK; 13518 lost_rqstp->lr_locktype = locktype; 13519 } 13520 /* 13521 * Objects are held and rele'd via the recovery code. 13522 * See nfs4_save_lost_rqst. 13523 */ 13524 lost_rqstp->lr_vp = vp; 13525 lost_rqstp->lr_dvp = NULL; 13526 lost_rqstp->lr_oop = oop; 13527 lost_rqstp->lr_osp = osp; 13528 lost_rqstp->lr_lop = lop; 13529 lost_rqstp->lr_cr = cr; 13530 switch (ctype) { 13531 case NFS4_LCK_CTYPE_NORM: 13532 flk->l_pid = ttoproc(curthread)->p_pid; 13533 lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND; 13534 break; 13535 case NFS4_LCK_CTYPE_REINSTATE: 13536 lost_rqstp->lr_putfirst = TRUE; 13537 lost_rqstp->lr_ctype = ctype; 13538 break; 13539 default: 13540 break; 13541 } 13542 lost_rqstp->lr_flk = flk; 13543 } 13544 } 13545 13546 /* 13547 * Update lop's seqid. Also update the seqid stored in a resend request, 13548 * if any. (Some recovery errors increment the seqid, and we may have to 13549 * send the resend request again.) 13550 */ 13551 13552 static void 13553 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args, 13554 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type) 13555 { 13556 if (lock_args) { 13557 if (lock_args->locker.new_lock_owner == TRUE) 13558 nfs4_get_and_set_next_open_seqid(oop, tag_type); 13559 else { 13560 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13561 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop); 13562 } 13563 } else if (locku_args) { 13564 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13565 nfs4_set_lock_seqid(lop->lock_seqid +1, lop); 13566 } 13567 } 13568 13569 /* 13570 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13571 * COMPOUND4 args/res for calls that need to retry. 13572 * Switches the *cred_otwp to base_cr. 
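 *
 * This is the access-retry path: the over-the-wire credential tried
 * so far is dropped in favor of 'base_cr' (the caller's original
 * credential), and every per-call reference (seqid syncs, open
 * owner/stream, lock owner, args/res) is released so the request can
 * be rebuilt and reissued.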
13573 */ 13574 static void 13575 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint, 13576 nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop, 13577 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error, 13578 nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp, 13579 nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp) 13580 { 13581 nfs4_open_owner_t *oop = *oopp; 13582 nfs4_open_stream_t *osp = *ospp; 13583 nfs4_lock_owner_t *lop = *lopp; 13584 nfs_argop4 *argop = (*argspp)->array; 13585 13586 if (*did_start_fop) { 13587 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13588 needrecov); 13589 *did_start_fop = FALSE; 13590 } 13591 ASSERT((*argspp)->array_len == 2); 13592 if (argop[1].argop == OP_LOCK) 13593 nfs4args_lock_free(&argop[1]); 13594 else if (argop[1].argop == OP_LOCKT) 13595 nfs4args_lockt_free(&argop[1]); 13596 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13597 if (!error) 13598 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13599 *argspp = NULL; 13600 *respp = NULL; 13601 13602 if (lop) { 13603 nfs4_end_lock_seqid_sync(lop); 13604 lock_owner_rele(lop); 13605 *lopp = NULL; 13606 } 13607 13608 /* need to free up the reference on osp for lock args */ 13609 if (osp != NULL) { 13610 open_stream_rele(osp, VTOR4(vp)); 13611 *ospp = NULL; 13612 } 13613 13614 /* need to free up the reference on oop for lock args */ 13615 if (oop != NULL) { 13616 nfs4_end_open_seqid_sync(oop); 13617 open_owner_rele(oop); 13618 *oopp = NULL; 13619 } 13620 13621 crfree(*cred_otwp); 13622 *cred_otwp = base_cr; 13623 crhold(*cred_otwp); 13624 } 13625 13626 /* 13627 * Function to process the client's recovery for nfs4frlock. 13628 * Returns TRUE if we should retry the lock request; FALSE otherwise. 13629 * 13630 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13631 * COMPOUND4 args/res for calls that need to retry. 13632 * 13633 * Note: the rp's r_lkserlock is *not* dropped during this path. 13634 */ 13635 static bool_t 13636 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep, 13637 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13638 LOCK4args *lock_args, LOCKU4args *locku_args, 13639 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13640 nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp, 13641 nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint, 13642 bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk) 13643 { 13644 nfs4_open_owner_t *oop = *oopp; 13645 nfs4_open_stream_t *osp = *ospp; 13646 nfs4_lock_owner_t *lop = *lopp; 13647 13648 bool_t abort, retry; 13649 13650 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13651 ASSERT((*argspp) != NULL); 13652 ASSERT((*respp) != NULL); 13653 if (lock_args || locku_args) 13654 ASSERT(lop != NULL); 13655 13656 NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug), 13657 (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n")); 13658 13659 retry = TRUE; 13660 abort = FALSE; 13661 if (needrecov) { 13662 nfs4_bseqid_entry_t *bsep = NULL; 13663 nfs_opnum4 op; 13664 13665 op = lock_args ? OP_LOCK : locku_args ? OP_LOCKU : OP_LOCKT; 13666 13667 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) { 13668 seqid4 seqid; 13669 13670 if (lock_args) { 13671 if (lock_args->locker.new_lock_owner == TRUE) 13672 seqid = lock_args->locker.locker4_u. 13673 open_owner.open_seqid; 13674 else 13675 seqid = lock_args->locker.locker4_u. 
13676 lock_owner.lock_seqid; 13677 } else if (locku_args) { 13678 seqid = locku_args->seqid; 13679 } else { 13680 seqid = 0; 13681 } 13682 13683 bsep = nfs4_create_bseqid_entry(oop, lop, vp, 13684 flk->l_pid, (*argspp)->ctag, seqid); 13685 } 13686 13687 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 13688 (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK || 13689 lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp : 13690 NULL, op, bsep); 13691 13692 if (bsep) 13693 kmem_free(bsep, sizeof (*bsep)); 13694 } 13695 13696 /* 13697 * Return that we do not want to retry the request for 3 cases: 13698 * 1. If we received EINTR or are bailing out because of a forced 13699 * unmount, we came into this code path just for the sake of 13700 * initiating recovery, we now need to return the error. 13701 * 2. If we have aborted recovery. 13702 * 3. We received NFS4ERR_BAD_SEQID. 13703 */ 13704 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) || 13705 abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID)) 13706 retry = FALSE; 13707 13708 if (*did_start_fop == TRUE) { 13709 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13710 needrecov); 13711 *did_start_fop = FALSE; 13712 } 13713 13714 if (retry == TRUE) { 13715 nfs_argop4 *argop; 13716 13717 argop = (*argspp)->array; 13718 ASSERT((*argspp)->array_len == 2); 13719 13720 if (argop[1].argop == OP_LOCK) 13721 nfs4args_lock_free(&argop[1]); 13722 else if (argop[1].argop == OP_LOCKT) 13723 nfs4args_lockt_free(&argop[1]); 13724 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13725 if (!ep->error) 13726 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13727 *respp = NULL; 13728 *argspp = NULL; 13729 } 13730 13731 if (lop != NULL) { 13732 nfs4_end_lock_seqid_sync(lop); 13733 lock_owner_rele(lop); 13734 } 13735 13736 *lopp = NULL; 13737 13738 /* need to free up the reference on osp for lock args */ 13739 if (osp != NULL) { 13740 open_stream_rele(osp, rp); 13741 *ospp = NULL; 13742 } 13743 13744 /* need to free up the reference on oop for lock args */ 13745 if (oop != NULL) { 13746 nfs4_end_open_seqid_sync(oop); 13747 open_owner_rele(oop); 13748 *oopp = NULL; 13749 } 13750 13751 return (retry); 13752 } 13753 13754 /* 13755 * Handles the successful reply from the server for nfs4frlock. 13756 */ 13757 static void 13758 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk, 13759 vnode_t *vp, int flag, u_offset_t offset, 13760 nfs4_lost_rqst_t *resend_rqstp) 13761 { 13762 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13763 if ((cmd == F_SETLK || cmd == F_SETLKW) && 13764 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) { 13765 if (ctype == NFS4_LCK_CTYPE_NORM) { 13766 flk->l_pid = ttoproc(curthread)->p_pid; 13767 /* 13768 * We do not register lost locks locally in 13769 * the 'resend' case since the user/application 13770 * doesn't think we have the lock. 13771 */ 13772 ASSERT(!resend_rqstp); 13773 nfs4_register_lock_locally(vp, flk, flag, offset); 13774 } 13775 } 13776 } 13777 13778 /* 13779 * Handle the DENIED reply from the server for nfs4frlock. 13780 * Returns TRUE if we should retry the request; FALSE otherwise. 13781 * 13782 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13783 * COMPOUND4 args/res for calls that need to retry. Can also 13784 * drop and regrab the r_lkserlock. 
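 *
 * For a blocking request (F_SETLKW) a DENIED reply is not final: the
 * code below tears down the current call, drops r_lkserlock, sleeps
 * in nfs4_block_and_wait(), regrabs the lock, revalidates against
 * mmaps with nfs4_safelock(), and returns TRUE so the whole request
 * is rebuilt and resent.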
13785 */ 13786 static bool_t 13787 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args, 13788 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp, 13789 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd, 13790 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint, 13791 nfs4_recov_state_t *recov_statep, int needrecov, 13792 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13793 clock_t *tick_delayp, short *whencep, int *errorp, 13794 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop, 13795 bool_t *skip_get_err) 13796 { 13797 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13798 13799 if (lock_args) { 13800 nfs4_open_owner_t *oop = *oopp; 13801 nfs4_open_stream_t *osp = *ospp; 13802 nfs4_lock_owner_t *lop = *lopp; 13803 int intr; 13804 13805 /* 13806 * Blocking lock needs to sleep and retry from the request. 13807 * 13808 * Do not block and wait for 'resend' or 'reinstate' 13809 * lock requests, just return the error. 13810 * 13811 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW. 13812 */ 13813 if (cmd == F_SETLKW) { 13814 rnode4_t *rp = VTOR4(vp); 13815 nfs_argop4 *argop = (*argspp)->array; 13816 13817 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13818 13819 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 13820 recov_statep, needrecov); 13821 *did_start_fop = FALSE; 13822 ASSERT((*argspp)->array_len == 2); 13823 if (argop[1].argop == OP_LOCK) 13824 nfs4args_lock_free(&argop[1]); 13825 else if (argop[1].argop == OP_LOCKT) 13826 nfs4args_lockt_free(&argop[1]); 13827 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13828 if (*respp) 13829 (void) xdr_free(xdr_COMPOUND4res_clnt, 13830 (caddr_t)*respp); 13831 *argspp = NULL; 13832 *respp = NULL; 13833 nfs4_end_lock_seqid_sync(lop); 13834 lock_owner_rele(lop); 13835 *lopp = NULL; 13836 if (osp != NULL) { 13837 open_stream_rele(osp, rp); 13838 *ospp = NULL; 13839 } 13840 if (oop != NULL) { 13841 nfs4_end_open_seqid_sync(oop); 13842 open_owner_rele(oop); 13843 *oopp = NULL; 13844 } 13845 13846 nfs_rw_exit(&rp->r_lkserlock); 13847 13848 intr = nfs4_block_and_wait(tick_delayp, rp); 13849 13850 if (intr) { 13851 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13852 RW_WRITER, FALSE); 13853 *errorp = EINTR; 13854 return (FALSE); 13855 } 13856 13857 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13858 RW_WRITER, FALSE); 13859 13860 /* 13861 * Make sure we are still safe to lock with 13862 * regards to mmapping. 13863 */ 13864 if (!nfs4_safelock(vp, flk, cr)) { 13865 *errorp = EAGAIN; 13866 return (FALSE); 13867 } 13868 13869 return (TRUE); 13870 } 13871 if (ctype == NFS4_LCK_CTYPE_NORM) 13872 *errorp = EAGAIN; 13873 *skip_get_err = TRUE; 13874 flk->l_whence = 0; 13875 *whencep = 0; 13876 return (FALSE); 13877 } else if (lockt_args) { 13878 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13879 "nfs4frlock_results_denied: OP_LOCKT DENIED")); 13880 13881 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied, 13882 flk, lockt_args); 13883 13884 /* according to NLM code */ 13885 *errorp = 0; 13886 *whencep = 0; 13887 *skip_get_err = TRUE; 13888 return (FALSE); 13889 } 13890 return (FALSE); 13891 } 13892 13893 /* 13894 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock. 
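 * The statuses listed below are left untouched; nfs4frlock_final_cleanup()
 * later maps them to an errno via geterrno4(). An unrecognized status is
 * reported by setting *errorp to NFS4ERR_INVAL.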
13895 */ 13896 static void 13897 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp) 13898 { 13899 switch (resp->status) { 13900 case NFS4ERR_ACCESS: 13901 case NFS4ERR_ADMIN_REVOKED: 13902 case NFS4ERR_BADHANDLE: 13903 case NFS4ERR_BAD_RANGE: 13904 case NFS4ERR_BAD_SEQID: 13905 case NFS4ERR_BAD_STATEID: 13906 case NFS4ERR_BADXDR: 13907 case NFS4ERR_DEADLOCK: 13908 case NFS4ERR_DELAY: 13909 case NFS4ERR_EXPIRED: 13910 case NFS4ERR_FHEXPIRED: 13911 case NFS4ERR_GRACE: 13912 case NFS4ERR_INVAL: 13913 case NFS4ERR_ISDIR: 13914 case NFS4ERR_LEASE_MOVED: 13915 case NFS4ERR_LOCK_NOTSUPP: 13916 case NFS4ERR_LOCK_RANGE: 13917 case NFS4ERR_MOVED: 13918 case NFS4ERR_NOFILEHANDLE: 13919 case NFS4ERR_NO_GRACE: 13920 case NFS4ERR_OLD_STATEID: 13921 case NFS4ERR_OPENMODE: 13922 case NFS4ERR_RECLAIM_BAD: 13923 case NFS4ERR_RECLAIM_CONFLICT: 13924 case NFS4ERR_RESOURCE: 13925 case NFS4ERR_SERVERFAULT: 13926 case NFS4ERR_STALE: 13927 case NFS4ERR_STALE_CLIENTID: 13928 case NFS4ERR_STALE_STATEID: 13929 return; 13930 default: 13931 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13932 "nfs4frlock_results_default: got unrecognizable " 13933 "res.status %d", resp->status)); 13934 *errorp = NFS4ERR_INVAL; 13935 } 13936 } 13937 13938 /* 13939 * The lock request was successful, so update the client's state. 13940 */ 13941 static void 13942 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args, 13943 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop, 13944 vnode_t *vp, flock64_t *flk, cred_t *cr, 13945 nfs4_lost_rqst_t *resend_rqstp) 13946 { 13947 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13948 13949 if (lock_args) { 13950 LOCK4res *lock_res; 13951 13952 lock_res = &resop->nfs_resop4_u.oplock; 13953 /* update the stateid with server's response */ 13954 13955 if (lock_args->locker.new_lock_owner == TRUE) { 13956 mutex_enter(&lop->lo_lock); 13957 lop->lo_just_created = NFS4_PERM_CREATED; 13958 mutex_exit(&lop->lo_lock); 13959 } 13960 13961 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid); 13962 13963 /* 13964 * If the lock was the result of resending a lost 13965 * request, we've synched up the stateid and seqid 13966 * with the server, but now the server might be out of sync 13967 * with what the application thinks it has for locks. 13968 * Clean that up here. It's unclear whether we should do 13969 * this even if the filesystem has been forcibly unmounted. 13970 * For most servers, it's probably wasted effort, but 13971 * RFC3530 lets servers require that unlocks exactly match 13972 * the locks that are held. 13973 */ 13974 if (resend_rqstp != NULL && 13975 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) { 13976 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop); 13977 } else { 13978 flk->l_whence = 0; 13979 } 13980 } else if (locku_args) { 13981 LOCKU4res *locku_res; 13982 13983 locku_res = &resop->nfs_resop4_u.oplocku; 13984 13985 /* Update the stateid with the server's response */ 13986 nfs4_set_lock_stateid(lop, locku_res->lock_stateid); 13987 } else if (lockt_args) { 13988 /* Switch the lock type to express success, see fcntl */ 13989 flk->l_type = F_UNLCK; 13990 flk->l_whence = 0; 13991 } 13992 } 13993 13994 /* 13995 * Do final cleanup before exiting nfs4frlock. 13996 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13997 * COMPOUND4 args/res for calls that haven't already.
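 * Also flushes cached pages after a successful lock, converts the flock
 * offsets back to the caller's original whence via convoff(), and
 * releases the lm_sysid reference.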
13998 */ 13999 static void 14000 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp, 14001 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint, 14002 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop, 14003 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 14004 short whence, u_offset_t offset, struct lm_sysid *ls, 14005 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args, 14006 bool_t did_start_fop, bool_t skip_get_err, 14007 cred_t *cred_otw, cred_t *cred) 14008 { 14009 mntinfo4_t *mi = VTOMI4(vp); 14010 rnode4_t *rp = VTOR4(vp); 14011 int error = *errorp; 14012 nfs_argop4 *argop; 14013 int do_flush_pages = 0; 14014 14015 ASSERT(nfs_zone() == mi->mi_zone); 14016 /* 14017 * The client recovery code wants the raw status information, 14018 * so don't map the NFS status code to an errno value for 14019 * non-normal call types. 14020 */ 14021 if (ctype == NFS4_LCK_CTYPE_NORM) { 14022 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE) 14023 *errorp = geterrno4(resp->status); 14024 if (did_start_fop == TRUE) 14025 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep, 14026 needrecov); 14027 14028 /* 14029 * We've established a new lock on the server, so invalidate 14030 * the pages associated with the vnode to get the most up to 14031 * date pages from the server after acquiring the lock. We 14032 * want to be sure that the read operation gets the newest data. 14033 * N.B. 14034 * We used to do this in nfs4frlock_results_ok but that doesn't 14035 * work since VOP_PUTPAGE can call nfs4_commit which calls 14036 * nfs4_start_fop. We flush the pages below after calling 14037 * nfs4_end_fop above. 14038 * The flush of the page cache must be done after 14039 * nfs4_end_open_seqid_sync() to avoid a 4-way hang. 14040 */ 14041 if (!error && resp && resp->status == NFS4_OK) 14042 do_flush_pages = 1; 14043 } 14044 if (argsp) { 14045 ASSERT(argsp->array_len == 2); 14046 argop = argsp->array; 14047 if (argop[1].argop == OP_LOCK) 14048 nfs4args_lock_free(&argop[1]); 14049 else if (argop[1].argop == OP_LOCKT) 14050 nfs4args_lockt_free(&argop[1]); 14051 kmem_free(argop, 2 * sizeof (nfs_argop4)); 14052 if (resp) 14053 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 14054 } 14055 14056 /* free the reference on the lock owner */ 14057 if (lop != NULL) { 14058 nfs4_end_lock_seqid_sync(lop); 14059 lock_owner_rele(lop); 14060 } 14061 14062 /* need to free up the reference on osp for lock args */ 14063 if (osp != NULL) 14064 open_stream_rele(osp, rp); 14065 14066 /* need to free up the reference on oop for lock args */ 14067 if (oop != NULL) { 14068 nfs4_end_open_seqid_sync(oop); 14069 open_owner_rele(oop); 14070 } 14071 14072 if (do_flush_pages) 14073 nfs4_flush_pages(vp, cred); 14074 14075 (void) convoff(vp, flk, whence, offset); 14076 14077 lm_rel_sysid(ls); 14078 14079 /* 14080 * Record debug information in the event we get EINVAL. 14081 */ 14082 mutex_enter(&mi->mi_lock); 14083 if (*errorp == EINVAL && (lock_args || locku_args) && 14084 (!(mi->mi_flags & MI4_POSIX_LOCK))) { 14085 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) { 14086 zcmn_err(getzoneid(), CE_NOTE, 14087 "%s operation failed with " 14088 "EINVAL probably since the server, %s," 14089 " doesn't support POSIX style locking", 14090 lock_args ? "LOCK" : "LOCKU", 14091 mi->mi_curr_serv->sv_hostname); 14092 mi->mi_flags |= MI4_LOCK_DEBUG; 14093 } 14094 } 14095 mutex_exit(&mi->mi_lock); 14096 14097 if (cred_otw) 14098 crfree(cred_otw); 14099 } 14100 14101 /* 14102 * This calls the server and the local locking code. 14103 * 14104 * Client locks are registered locally by OR-ing the sysid with 14105 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid. 14106 * We need to distinguish between the two to avoid collision in case one 14107 * machine is used as both client and server. 14108 * 14109 * Blocking lock requests will retry forever, sleeping between attempts, 14110 * until the lock is acquired. 14111 * 14112 * The ctype is defined as follows: 14113 * NFS4_LCK_CTYPE_NORM: normal lock request. 14114 * 14115 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client 14116 * recovery, get the pid from flk instead of curproc, and don't reregister 14117 * the lock locally. 14118 * 14119 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition 14120 * that we will use the information passed in via resend_rqstp to set up the 14121 * lock/locku request. This resend is the exact same request as the 'lost 14122 * lock', and is initiated by the recovery framework. A successful resend 14123 * request can initiate one or more reinstate requests. 14124 * 14125 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it 14126 * does not trigger additional reinstate requests. This lock call type is 14127 * used to set the v4 server's locking state back to match the 14128 * client's local locking state in the event of a received 'lost lock'. 14129 * 14130 * Errors are returned via the nfs4_error_t parameter. 14131 */ 14132 void 14133 nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk, 14134 int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep, 14135 nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp) 14136 { 14137 COMPOUND4args_clnt args, *argsp = NULL; 14138 COMPOUND4res_clnt res, *resp = NULL; 14139 nfs_argop4 *argop; 14140 nfs_resop4 *resop; 14141 rnode4_t *rp; 14142 int doqueue = 1; 14143 clock_t tick_delay; /* delay in clock ticks */ 14144 struct lm_sysid *ls; 14145 LOCK4args *lock_args = NULL; 14146 LOCKU4args *locku_args = NULL; 14147 LOCKT4args *lockt_args = NULL; 14148 nfs4_open_owner_t *oop = NULL; 14149 nfs4_open_stream_t *osp = NULL; 14150 nfs4_lock_owner_t *lop = NULL; 14151 bool_t needrecov = FALSE; 14152 nfs4_recov_state_t recov_state; 14153 short whence; 14154 nfs4_op_hint_t op_hint; 14155 nfs4_lost_rqst_t lost_rqst; 14156 bool_t retry = FALSE; 14157 bool_t did_start_fop = FALSE; 14158 bool_t skip_get_err = FALSE; 14159 cred_t *cred_otw = NULL; 14160 bool_t recovonly; /* just queue request */ 14161 int frc_no_reclaim = 0; 14162 #ifdef DEBUG 14163 char *name; 14164 #endif 14165 14166 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14167 14168 #ifdef DEBUG 14169 name = fn_name(VTOSV(vp)->sv_name); 14170 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: " 14171 "%s: cmd %d, type %d, offset %llu, start %"PRIx64", " 14172 "length %"PRIu64", pid %d, sysid %d, call type %s, " 14173 "resend request %s", name, cmd, flk->l_type, offset, flk->l_start, 14174 flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : 14175 flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype), 14176 resend_rqstp ?
"TRUE" : "FALSE")); 14177 kmem_free(name, MAXNAMELEN); 14178 #endif 14179 14180 nfs4_error_zinit(ep); 14181 ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset); 14182 if (ep->error) 14183 return; 14184 ep->error = nfs4frlock_get_sysid(&ls, vp, flk); 14185 if (ep->error) 14186 return; 14187 nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence, 14188 vp, cr, &cred_otw); 14189 14190 recov_retry: 14191 nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd, 14192 &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst); 14193 rp = VTOR4(vp); 14194 14195 ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state, 14196 &did_start_fop, &recovonly); 14197 14198 if (ep->error) 14199 goto out; 14200 14201 if (recovonly) { 14202 /* 14203 * Leave the request for the recovery system to deal with. 14204 */ 14205 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 14206 ASSERT(cmd != F_GETLK); 14207 ASSERT(flk->l_type == F_UNLCK); 14208 14209 nfs4_error_init(ep, EINTR); 14210 needrecov = TRUE; 14211 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14212 if (lop != NULL) { 14213 nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT, 14214 NULL, NULL, lop, flk, &lost_rqst, cr, vp); 14215 (void) nfs4_start_recovery(ep, 14216 VTOMI4(vp), vp, NULL, NULL, 14217 (lost_rqst.lr_op == OP_LOCK || 14218 lost_rqst.lr_op == OP_LOCKU) ? 14219 &lost_rqst : NULL, OP_LOCKU, NULL); 14220 lock_owner_rele(lop); 14221 lop = NULL; 14222 } 14223 flk->l_pid = curproc->p_pid; 14224 nfs4_register_lock_locally(vp, flk, flag, offset); 14225 goto out; 14226 } 14227 14228 /* putfh directory fh */ 14229 argop[0].argop = OP_CPUTFH; 14230 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 14231 14232 /* 14233 * Set up the over-the-wire arguments and get references to the 14234 * open owner, etc. 
14235 */ 14236 14237 if (ctype == NFS4_LCK_CTYPE_RESEND || 14238 ctype == NFS4_LCK_CTYPE_REINSTATE) { 14239 nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp, 14240 &argop[1], &lop, &oop, &osp, &lock_args, &locku_args); 14241 } else { 14242 bool_t go_otw = TRUE; 14243 14244 ASSERT(resend_rqstp == NULL); 14245 14246 switch (cmd) { 14247 case F_GETLK: 14248 case F_O_GETLK: 14249 nfs4frlock_setup_lockt_args(ctype, &argop[1], 14250 &lockt_args, argsp, flk, rp); 14251 break; 14252 case F_SETLKW: 14253 case F_SETLK: 14254 if (flk->l_type == F_UNLCK) 14255 nfs4frlock_setup_locku_args(ctype, 14256 &argop[1], &locku_args, flk, 14257 &lop, ep, argsp, 14258 vp, flag, offset, cr, 14259 &skip_get_err, &go_otw); 14260 else 14261 nfs4frlock_setup_lock_args(ctype, 14262 &lock_args, &oop, &osp, &lop, &argop[1], 14263 argsp, flk, cmd, vp, cr, ep); 14264 14265 if (ep->error) 14266 goto out; 14267 14268 switch (ep->stat) { 14269 case NFS4_OK: 14270 break; 14271 case NFS4ERR_DELAY: 14272 /* recov thread never gets this error */ 14273 ASSERT(resend_rqstp == NULL); 14274 ASSERT(did_start_fop); 14275 14276 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 14277 &recov_state, TRUE); 14278 did_start_fop = FALSE; 14279 if (argop[1].argop == OP_LOCK) 14280 nfs4args_lock_free(&argop[1]); 14281 else if (argop[1].argop == OP_LOCKT) 14282 nfs4args_lockt_free(&argop[1]); 14283 kmem_free(argop, 2 * sizeof (nfs_argop4)); 14284 argsp = NULL; 14285 goto recov_retry; 14286 default: 14287 ep->error = EIO; 14288 goto out; 14289 } 14290 break; 14291 default: 14292 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14293 "nfs4_frlock: invalid cmd %d", cmd)); 14294 ep->error = EINVAL; 14295 goto out; 14296 } 14297 14298 if (!go_otw) 14299 goto out; 14300 } 14301 14302 /* XXX should we use the local reclock as a cache? */ 14303 /* 14304 * Unregister the lock with the local locking code before 14305 * contacting the server. This avoids a potential race where 14306 * another process gets notified that it has been granted a lock 14307 * before we can unregister ourselves locally. 14308 */ 14309 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) { 14310 if (ctype == NFS4_LCK_CTYPE_NORM) 14311 flk->l_pid = ttoproc(curthread)->p_pid; 14312 nfs4_register_lock_locally(vp, flk, flag, offset); 14313 } 14314 14315 /* 14316 * Send the server the lock request. Continually loop with a delay 14317 * if we get NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE. 14318 */ 14319 resp = &res; 14320 14321 NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug), 14322 (CE_NOTE, 14323 "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first", 14324 rnode4info(rp))); 14325 14326 if (lock_args && frc_no_reclaim) { 14327 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM); 14328 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14329 "nfs4frlock: frc_no_reclaim: clearing reclaim")); 14330 lock_args->reclaim = FALSE; 14331 if (did_reclaimp) 14332 *did_reclaimp = 0; 14333 } 14334 14335 /* 14336 * Do the OTW call.
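 * rfs4call() transmits the two-op compound (OP_CPUTFH plus OP_LOCK,
 * OP_LOCKU, or OP_LOCKT, depending on whether this is a lock set, an
 * unlock, or a lock test) and fills in *resp; errors are returned via ep.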
14337 */ 14338 rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep); 14339 14340 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14341 "nfs4frlock: error %d, status %d", ep->error, resp->status)); 14342 14343 needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp); 14344 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14345 "nfs4frlock: needrecov %d", needrecov)); 14346 14347 if (ep->error == 0 && nfs4_need_to_bump_seqid(resp)) 14348 nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop, 14349 args.ctag); 14350 14351 /* 14352 * Check if one of these mutually exclusive error cases has 14353 * happened: 14354 * need to swap credentials due to access error 14355 * recovery is needed 14356 * different error (only known case is missing Kerberos ticket) 14357 */ 14358 14359 if ((ep->error == EACCES || 14360 (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) && 14361 cred_otw != cr) { 14362 nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov, 14363 &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp, 14364 cr, &cred_otw); 14365 goto recov_retry; 14366 } 14367 14368 if (needrecov) { 14369 /* 14370 * LOCKT requests don't need to recover from lost 14371 * requests since they don't create/modify state. 14372 */ 14373 if ((ep->error == EINTR || 14374 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) && 14375 lockt_args) 14376 goto out; 14377 /* 14378 * Do not attempt recovery for requests initiated by 14379 * the recovery framework. Let the framework redrive them. 14380 */ 14381 if (ctype != NFS4_LCK_CTYPE_NORM) 14382 goto out; 14383 else { 14384 ASSERT(resend_rqstp == NULL); 14385 } 14386 14387 nfs4frlock_save_lost_rqst(ctype, ep->error, 14388 flk_to_locktype(cmd, flk->l_type), 14389 oop, osp, lop, flk, &lost_rqst, cred_otw, vp); 14390 14391 retry = nfs4frlock_recovery(needrecov, ep, &argsp, 14392 &resp, lock_args, locku_args, &oop, &osp, &lop, 14393 rp, vp, &recov_state, op_hint, &did_start_fop, 14394 cmd != F_GETLK ? &lost_rqst : NULL, flk); 14395 14396 if (retry) { 14397 ASSERT(oop == NULL); 14398 ASSERT(osp == NULL); 14399 ASSERT(lop == NULL); 14400 goto recov_retry; 14401 } 14402 goto out; 14403 } 14404 14405 /* 14406 * Bail out if we have reached this point with ep->error set. This 14407 * can happen if (ep->error == EACCES && !needrecov && cred_otw == cr), 14408 * e.g. when the Kerberos ticket has expired or has been 14409 * destroyed. 14410 */ 14411 if (ep->error != 0) 14412 goto out; 14413 14414 /* 14415 * Process the reply. 14416 */ 14417 switch (resp->status) { 14418 case NFS4_OK: 14419 resop = &resp->array[1]; 14420 nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset, 14421 resend_rqstp); 14422 /* 14423 * We have a successful lock operation; now update state. 14424 */ 14425 nfs4frlock_update_state(lock_args, locku_args, lockt_args, 14426 resop, lop, vp, flk, cr, resend_rqstp); 14427 break; 14428 14429 case NFS4ERR_DENIED: 14430 resop = &resp->array[1]; 14431 retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args, 14432 &oop, &osp, &lop, cmd, vp, flk, op_hint, 14433 &recov_state, needrecov, &argsp, &resp, 14434 &tick_delay, &whence, &ep->error, resop, cr, 14435 &did_start_fop, &skip_get_err); 14436 14437 if (retry) { 14438 ASSERT(oop == NULL); 14439 ASSERT(osp == NULL); 14440 ASSERT(lop == NULL); 14441 goto recov_retry; 14442 } 14443 break; 14444 /* 14445 * If the server won't let us reclaim, fall back to trying to lock 14446 * the file from scratch. Code elsewhere will check the changeinfo 14447 * to ensure the file hasn't been changed.
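 * (Setting frc_no_reclaim causes the retried request to clear
 * lock_args->reclaim before it goes back over the wire.)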
14448 */ 14449 case NFS4ERR_NO_GRACE: 14450 if (lock_args && lock_args->reclaim == TRUE) { 14451 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM); 14452 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14453 "nfs4frlock: reclaim: NFS4ERR_NO_GRACE")); 14454 frc_no_reclaim = 1; 14455 /* clean up before retrying */ 14456 needrecov = 0; 14457 (void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp, 14458 lock_args, locku_args, &oop, &osp, &lop, rp, vp, 14459 &recov_state, op_hint, &did_start_fop, NULL, flk); 14460 goto recov_retry; 14461 } 14462 /* FALLTHROUGH */ 14463 14464 default: 14465 nfs4frlock_results_default(resp, &ep->error); 14466 break; 14467 } 14468 out: 14469 /* 14470 * Process and cleanup from error. Make interrupted unlock 14471 * requests look successful, since they will be handled by the 14472 * client recovery code. 14473 */ 14474 nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state, 14475 needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error, 14476 lock_args, locku_args, did_start_fop, 14477 skip_get_err, cred_otw, cr); 14478 14479 if (ep->error == EINTR && flk->l_type == F_UNLCK && 14480 (cmd == F_SETLK || cmd == F_SETLKW)) 14481 ep->error = 0; 14482 } 14483 14484 /* 14485 * nfs4_safelock: 14486 * 14487 * Return non-zero if the given lock request can be handled without 14488 * violating the constraints on concurrent mapping and locking. 14489 */ 14490 14491 static int 14492 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr) 14493 { 14494 rnode4_t *rp = VTOR4(vp); 14495 struct vattr va; 14496 int error; 14497 14498 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14499 ASSERT(rp->r_mapcnt >= 0); 14500 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: " 14501 "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ? 14502 "write" : bfp->l_type == F_RDLCK ? "read" : "unlock", 14503 bfp->l_start, bfp->l_len, rp->r_mapcnt)); 14504 14505 if (rp->r_mapcnt == 0) 14506 return (1); /* always safe if not mapped */ 14507 14508 /* 14509 * If the file is already mapped and there are locks, then they 14510 * should be all safe locks. So adding or removing a lock is safe 14511 * as long as the new request is safe (i.e., whole-file, meaning 14512 * length and starting offset are both zero). 14513 */ 14514 14515 if (bfp->l_start != 0 || bfp->l_len != 0) { 14516 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14517 "cannot lock a memory mapped file unless locking the " 14518 "entire file: start %"PRIx64", len %"PRIx64, 14519 bfp->l_start, bfp->l_len)); 14520 return (0); 14521 } 14522 14523 /* mandatory locking and mapping don't mix */ 14524 va.va_mask = AT_MODE; 14525 error = VOP_GETATTR(vp, &va, 0, cr, NULL); 14526 if (error != 0) { 14527 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14528 "getattr error %d", error)); 14529 return (0); /* treat errors conservatively */ 14530 } 14531 if (MANDLOCK(vp, va.va_mode)) { 14532 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14533 "cannot mandatory lock and mmap a file")); 14534 return (0); 14535 } 14536 14537 return (1); 14538 } 14539 14540 14541 /* 14542 * Register the lock locally within Solaris. 14543 * As the client, we "or" the sysid with LM_SYSID_CLIENT when 14544 * recording locks locally. 14545 * 14546 * This should handle conflicts/cooperation with NFS v2/v3 since all locks 14547 * are registered locally. 
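 *
 * A minimal sketch of the client-side half (this is what the function
 * below does; the names are the real ones used in this file):
 *
 *	oldsysid = flk->l_sysid;
 *	flk->l_sysid |= LM_SYSID_CLIENT;
 *	(void) reclock(vp, flk, SETFLCK, flag, offset, NULL);
 *	flk->l_sysid = oldsysid;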
14548 */ 14549 void 14550 nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag, 14551 u_offset_t offset) 14552 { 14553 int oldsysid; 14554 int error; 14555 #ifdef DEBUG 14556 char *name; 14557 #endif 14558 14559 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14560 14561 #ifdef DEBUG 14562 name = fn_name(VTOSV(vp)->sv_name); 14563 NFS4_DEBUG(nfs4_client_lock_debug, 14564 (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, " 14565 "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d", 14566 name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid, 14567 flk->l_sysid)); 14568 kmem_free(name, MAXNAMELEN); 14569 #endif 14570 14571 /* register the lock with local locking */ 14572 oldsysid = flk->l_sysid; 14573 flk->l_sysid |= LM_SYSID_CLIENT; 14574 error = reclock(vp, flk, SETFLCK, flag, offset, NULL); 14575 #ifdef DEBUG 14576 if (error != 0) { 14577 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14578 "nfs4_register_lock_locally: could not register with" 14579 " local locking")); 14580 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14581 "error %d, vp 0x%p, pid %d, sysid 0x%x", 14582 error, (void *)vp, flk->l_pid, flk->l_sysid)); 14583 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14584 "type %d off 0x%" PRIx64 " len 0x%" PRIx64, 14585 flk->l_type, flk->l_start, flk->l_len)); 14586 (void) reclock(vp, flk, 0, flag, offset, NULL); 14587 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14588 "blocked by pid %d sysid 0x%x type %d " 14589 "off 0x%" PRIx64 " len 0x%" PRIx64, 14590 flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start, 14591 flk->l_len)); 14592 } 14593 #endif 14594 flk->l_sysid = oldsysid; 14595 } 14596 14597 /* 14598 * nfs4_lockrelease: 14599 * 14600 * Release any locks on the given vnode that are held by the current 14601 * process. Also removes the lock owner (if one exists) from the rnode's 14602 * list. 14603 */ 14604 static int 14605 nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 14606 { 14607 flock64_t ld; 14608 int ret, error; 14609 rnode4_t *rp; 14610 nfs4_lock_owner_t *lop; 14611 nfs4_recov_state_t recov_state; 14612 mntinfo4_t *mi; 14613 bool_t possible_orphan = FALSE; 14614 bool_t recovonly; 14615 14616 ASSERT((uintptr_t)vp > KERNELBASE); 14617 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14618 14619 rp = VTOR4(vp); 14620 mi = VTOMI4(vp); 14621 14622 /* 14623 * If we have not locked anything then we can 14624 * just return since we have no work to do. 14625 */ 14626 if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) { 14627 return (0); 14628 } 14629 14630 /* 14631 * We need to comprehend that another thread may 14632 * kick off recovery and the lock_owner we have stashed 14633 * in lop might be invalid so we should NOT cache it 14634 * locally! 14635 */ 14636 recov_state.rs_flags = 0; 14637 recov_state.rs_num_retry_despite_err = 0; 14638 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14639 &recovonly); 14640 if (error) { 14641 mutex_enter(&rp->r_statelock); 14642 rp->r_flags |= R4LODANGLERS; 14643 mutex_exit(&rp->r_statelock); 14644 return (error); 14645 } 14646 14647 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14648 14649 /* 14650 * Check if the lock owner might have a lock (request was sent but 14651 * no response was received). Also check if there are any remote 14652 * locks on the file. (In theory we shouldn't have to make this 14653 * second check if there's no lock owner, but for now we'll be 14654 * conservative and do it anyway.) 
If either condition is true, 14655 * send an unlock for the entire file to the server. 14656 * 14657 * Note that no explicit synchronization is needed here. At worst, 14658 * flk_has_remote_locks() will return a false positive, in which case 14659 * the unlock call wastes time but doesn't harm correctness. 14660 */ 14661 14662 if (lop) { 14663 mutex_enter(&lop->lo_lock); 14664 possible_orphan = lop->lo_pending_rqsts; 14665 mutex_exit(&lop->lo_lock); 14666 lock_owner_rele(lop); 14667 } 14668 14669 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14670 14671 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14672 "nfs4_lockrelease: possible orphan %d, remote locks %d, for " 14673 "lop %p.", possible_orphan, flk_has_remote_locks(vp), 14674 (void *)lop)); 14675 14676 if (possible_orphan || flk_has_remote_locks(vp)) { 14677 ld.l_type = F_UNLCK; /* set to unlock entire file */ 14678 ld.l_whence = 0; /* unlock from start of file */ 14679 ld.l_start = 0; 14680 ld.l_len = 0; /* do entire file */ 14681 14682 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, 14683 cr, NULL); 14684 14685 if (ret != 0) { 14686 /* 14687 * If VOP_FRLOCK fails, make sure we unregister 14688 * local locks before we continue. 14689 */ 14690 ld.l_pid = ttoproc(curthread)->p_pid; 14691 nfs4_register_lock_locally(vp, &ld, flag, offset); 14692 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14693 "nfs4_lockrelease: lock release error on vp" 14694 " %p: error %d.\n", (void *)vp, ret)); 14695 } 14696 } 14697 14698 recov_state.rs_flags = 0; 14699 recov_state.rs_num_retry_despite_err = 0; 14700 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14701 &recovonly); 14702 if (error) { 14703 mutex_enter(&rp->r_statelock); 14704 rp->r_flags |= R4LODANGLERS; 14705 mutex_exit(&rp->r_statelock); 14706 return (error); 14707 } 14708 14709 /* 14710 * So here we need to retrieve the lock-owner 14711 * again (in case recovery has done a switch-a-roo) and 14712 * remove it because we can. 14713 */ 14714 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14715 14716 if (lop) { 14717 nfs4_rnode_remove_lock_owner(rp, lop); 14718 lock_owner_rele(lop); 14719 } 14720 14721 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14722 return (0); 14723 } 14724 14725 /* 14726 * Wait for 'tick_delay' clock ticks. 14727 * Implement exponential backoff until we hit the lease_time of this nfs4_server. 14728 * NOTE: lock_lease_time is in seconds. 14729 * 14730 * XXX For future improvements, should implement a waiting queue scheme.
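 *
 * For example, if the initial delay were one second on a system with a
 * 90 second lease, the successive waits would be roughly 1s, 2s, 4s, ...
 * with the doubling capped once the delay reaches the lease time.
 * (Hypothetical numbers; the initial delay is chosen by the caller.)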
14731 */ 14732 static int 14733 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp) 14734 { 14735 long milliseconds_delay; 14736 time_t lock_lease_time; 14737 14738 /* wait tick_delay clock ticks or siginteruptus */ 14739 if (delay_sig(*tick_delay)) { 14740 return (EINTR); 14741 } 14742 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: " 14743 "reissue the lock request: blocked for %ld clock ticks: %ld " 14744 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000)); 14745 14746 /* get the lease time */ 14747 lock_lease_time = r2lease_time(rp); 14748 14749 /* drv_hztousec converts ticks to microseconds */ 14750 milliseconds_delay = drv_hztousec(*tick_delay) / 1000; 14751 if (milliseconds_delay < lock_lease_time * 1000) { 14752 *tick_delay = 2 * *tick_delay; 14753 if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000) 14754 *tick_delay = drv_usectohz(lock_lease_time*1000*1000); 14755 } 14756 return (0); 14757 } 14758 14759 14760 void 14761 nfs4_vnops_init(void) 14762 { 14763 } 14764 14765 void 14766 nfs4_vnops_fini(void) 14767 { 14768 } 14769 14770 /* 14771 * Return a reference to the directory (parent) vnode for a given vnode, 14772 * using the saved pathname information and the directory file handle. The 14773 * caller is responsible for disposing of the reference. 14774 * Returns zero or an errno value. 14775 * 14776 * Caller should set need_start_op to FALSE if it is the recovery 14777 * thread, or if a start_fop has already been done. Otherwise, TRUE. 14778 */ 14779 int 14780 vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op) 14781 { 14782 svnode_t *svnp; 14783 vnode_t *dvp = NULL; 14784 servinfo4_t *svp; 14785 nfs4_fname_t *mfname; 14786 int error; 14787 14788 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14789 14790 if (vp->v_flag & VROOT) { 14791 nfs4_sharedfh_t *sfh; 14792 nfs_fh4 fh; 14793 mntinfo4_t *mi; 14794 14795 ASSERT(vp->v_type == VREG); 14796 14797 mi = VTOMI4(vp); 14798 svp = mi->mi_curr_serv; 14799 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14800 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len; 14801 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf; 14802 sfh = sfh4_get(&fh, VTOMI4(vp)); 14803 nfs_rw_exit(&svp->sv_lock); 14804 mfname = mi->mi_fname; 14805 fn_hold(mfname); 14806 dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0); 14807 sfh4_rele(&sfh); 14808 14809 if (dvp->v_type == VNON) 14810 dvp->v_type = VDIR; 14811 *dvpp = dvp; 14812 return (0); 14813 } 14814 14815 svnp = VTOSV(vp); 14816 14817 if (svnp == NULL) { 14818 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14819 "shadow node is NULL")); 14820 return (EINVAL); 14821 } 14822 14823 if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) { 14824 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14825 "shadow node name or dfh val == NULL")); 14826 return (EINVAL); 14827 } 14828 14829 error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp, 14830 (int)need_start_op); 14831 if (error != 0) { 14832 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14833 "nfs4_make_dotdot returned %d", error)); 14834 return (error); 14835 } 14836 if (!dvp) { 14837 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14838 "nfs4_make_dotdot returned a NULL dvp")); 14839 return (EIO); 14840 } 14841 if (dvp->v_type == VNON) 14842 dvp->v_type = VDIR; 14843 ASSERT(dvp->v_type == VDIR); 14844 if (VTOR4(vp)->r_flags & R4ISXATTR) { 14845 mutex_enter(&dvp->v_lock); 14846 dvp->v_flag |= V_XATTRDIR; 14847 mutex_exit(&dvp->v_lock); 14848 } 14849 *dvpp = dvp; 14850 
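	/* success: the caller is responsible for releasing this hold on dvp */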
return (0); 14851 } 14852 14853 /* 14854 * Copy the (final) component name of vp to fnamep. maxlen is the maximum 14855 * length that fnamep can accept, including the trailing null. 14856 * Returns 0 if okay, returns an errno value if there was a problem. 14857 */ 14858 14859 int 14860 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen) 14861 { 14862 char *fn; 14863 int err = 0; 14864 servinfo4_t *svp; 14865 svnode_t *shvp; 14866 14867 /* 14868 * If the file being opened has VROOT set, then this is 14869 * a "file" mount. sv_name will not be interesting, so 14870 * go back to the servinfo4 to get the original mount 14871 * path and strip off all but the final edge. Otherwise 14872 * just return the name from the shadow vnode. 14873 */ 14874 14875 if (vp->v_flag & VROOT) { 14876 14877 svp = VTOMI4(vp)->mi_curr_serv; 14878 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14879 14880 fn = strrchr(svp->sv_path, '/'); 14881 if (fn == NULL) 14882 err = EINVAL; 14883 else 14884 fn++; 14885 } else { 14886 shvp = VTOSV(vp); 14887 fn = fn_name(shvp->sv_name); 14888 } 14889 14890 if (err == 0) 14891 if (strlen(fn) < maxlen) 14892 (void) strcpy(fnamep, fn); 14893 else 14894 err = ENAMETOOLONG; 14895 14896 if (vp->v_flag & VROOT) 14897 nfs_rw_exit(&svp->sv_lock); 14898 else 14899 kmem_free(fn, MAXNAMELEN); 14900 14901 return (err); 14902 } 14903 14904 /* 14905 * Bookkeeping for a close that doesn't need to go over the wire. 14906 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise 14907 * it is left at 1. 14908 */ 14909 void 14910 nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp) 14911 { 14912 rnode4_t *rp; 14913 mntinfo4_t *mi; 14914 14915 mi = VTOMI4(vp); 14916 rp = VTOR4(vp); 14917 14918 NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: " 14919 "rp=%p osp=%p", (void *)rp, (void *)osp)); 14920 ASSERT(nfs_zone() == mi->mi_zone); 14921 ASSERT(mutex_owned(&osp->os_sync_lock)); 14922 ASSERT(*have_lockp); 14923 14924 if (!osp->os_valid || 14925 osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) { 14926 return; 14927 } 14928 14929 /* 14930 * This removes the reference obtained at OPEN; ie, 14931 * when the open stream structure was created. 14932 * 14933 * We don't have to worry about calling 'open_stream_rele' 14934 * since we are currently holding a reference to this 14935 * open stream, which means the count cannot go to 0 with 14936 * this decrement. 14937 */ 14938 ASSERT(osp->os_ref_count >= 2); 14939 osp->os_ref_count--; 14940 osp->os_valid = 0; 14941 mutex_exit(&osp->os_sync_lock); 14942 *have_lockp = 0; 14943 14944 nfs4_dec_state_ref_count(mi); 14945 } 14946 14947 /* 14948 * Close all remaining open streams on the rnode. These open streams 14949 * could be here because: 14950 * - The close attempted at either close or delmap failed 14951 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE 14952 * - Someone did mknod on a regular file but never opened it 14953 */ 14954 int 14955 nfs4close_all(vnode_t *vp, cred_t *cr) 14956 { 14957 nfs4_open_stream_t *osp; 14958 int error; 14959 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 14960 rnode4_t *rp; 14961 14962 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14963 14964 error = 0; 14965 rp = VTOR4(vp); 14966 14967 /* 14968 * At this point, all we know is that the last time 14969 * someone called vn_rele, the count was 1. Since then, 14970 * the vnode could have been re-activated.
We want to 14971 * loop through the open streams and close each one, but 14972 * we have to be careful since once we release the rnode 14973 * hash bucket lock, someone else is free to come in and 14974 * re-activate the rnode and add new open streams. The 14975 * strategy is take the rnode hash bucket lock, verify that 14976 * the count is still 1, grab the open stream off the 14977 * head of the list and mark it invalid, then release the 14978 * rnode hash bucket lock and proceed with that open stream. 14979 * This is ok because nfs4close_one() will acquire the proper 14980 * open/create to close/destroy synchronization for open 14981 * streams, and will ensure that if someone has reopened 14982 * the open stream after we've dropped the hash bucket lock 14983 * then we'll just simply return without destroying the 14984 * open stream. 14985 * Repeat until the list is empty. 14986 */ 14987 14988 for (;;) { 14989 14990 /* make sure vnode hasn't been reactivated */ 14991 rw_enter(&rp->r_hashq->r_lock, RW_READER); 14992 mutex_enter(&vp->v_lock); 14993 if (vp->v_count > 1) { 14994 mutex_exit(&vp->v_lock); 14995 rw_exit(&rp->r_hashq->r_lock); 14996 break; 14997 } 14998 /* 14999 * Grabbing r_os_lock before releasing v_lock prevents 15000 * a window where the rnode/open stream could get 15001 * reactivated (and os_force_close set to 0) before we 15002 * had a chance to set os_force_close to 1. 15003 */ 15004 mutex_enter(&rp->r_os_lock); 15005 mutex_exit(&vp->v_lock); 15006 15007 osp = list_head(&rp->r_open_streams); 15008 if (!osp) { 15009 /* nothing left to CLOSE OTW, so return */ 15010 mutex_exit(&rp->r_os_lock); 15011 rw_exit(&rp->r_hashq->r_lock); 15012 break; 15013 } 15014 15015 mutex_enter(&rp->r_statev4_lock); 15016 /* the file can't still be mem mapped */ 15017 ASSERT(rp->r_mapcnt == 0); 15018 if (rp->created_v4) 15019 rp->created_v4 = 0; 15020 mutex_exit(&rp->r_statev4_lock); 15021 15022 /* 15023 * Grab a ref on this open stream; nfs4close_one 15024 * will mark it as invalid 15025 */ 15026 mutex_enter(&osp->os_sync_lock); 15027 osp->os_ref_count++; 15028 osp->os_force_close = 1; 15029 mutex_exit(&osp->os_sync_lock); 15030 mutex_exit(&rp->r_os_lock); 15031 rw_exit(&rp->r_hashq->r_lock); 15032 15033 nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0); 15034 15035 /* Update error if it isn't already non-zero */ 15036 if (error == 0) { 15037 if (e.error) 15038 error = e.error; 15039 else if (e.stat) 15040 error = geterrno4(e.stat); 15041 } 15042 15043 #ifdef DEBUG 15044 nfs4close_all_cnt++; 15045 #endif 15046 /* Release the ref on osp acquired above. */ 15047 open_stream_rele(osp, rp); 15048 15049 /* Proceed to the next open stream, if any */ 15050 } 15051 return (error); 15052 } 15053 15054 /* 15055 * nfs4close_one - close one open stream for a file if needed. 15056 * 15057 * "close_type" indicates which close path this is: 15058 * CLOSE_NORM: close initiated via VOP_CLOSE. 15059 * CLOSE_DELMAP: close initiated via VOP_DELMAP. 15060 * CLOSE_FORCE: close initiated via VOP_INACTIVE. This path forces 15061 * the close and release of client state for this open stream 15062 * (unless someone else has the open stream open). 15063 * CLOSE_RESEND: indicates the request is a replay of an earlier request 15064 * (e.g., due to abort because of a signal). 15065 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN. 15066 * 15067 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client 15068 * recovery. 
Instead, the caller is expected to deal with retries. 15069 * 15070 * The caller can either pass in the osp ('provided_osp') or not. 15071 * 15072 * 'access_bits' represents the access we are closing/downgrading. 15073 * 15074 * 'len', 'maxprot', and 'mmap_flags' are used for CLOSE_DELMAP. 'len' is the 15075 * number of bytes we are unmapping, 'maxprot' is the mmap protection, and 15076 * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED). 15077 * 15078 * Errors are returned via the nfs4_error_t. 15079 */ 15080 void 15081 nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr, 15082 int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep, 15083 nfs4_close_type_t close_type, size_t len, uint_t maxprot, 15084 uint_t mmap_flags) 15085 { 15086 nfs4_open_owner_t *oop; 15087 nfs4_open_stream_t *osp = NULL; 15088 int retry = 0; 15089 int num_retries = NFS4_NUM_RECOV_RETRIES; 15090 rnode4_t *rp; 15091 mntinfo4_t *mi; 15092 nfs4_recov_state_t recov_state; 15093 cred_t *cred_otw = NULL; 15094 bool_t recovonly = FALSE; 15095 int isrecov; 15096 int force_close; 15097 int close_failed = 0; 15098 int did_dec_count = 0; 15099 int did_start_op = 0; 15100 int did_force_recovlock = 0; 15101 int did_start_seqid_sync = 0; 15102 int have_sync_lock = 0; 15103 15104 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 15105 15106 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, " 15107 "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x", 15108 (void *)vp, (void *)provided_osp, (void *)lrp, close_type, 15109 len, maxprot, mmap_flags, access_bits)); 15110 15111 nfs4_error_zinit(ep); 15112 rp = VTOR4(vp); 15113 mi = VTOMI4(vp); 15114 isrecov = (close_type == CLOSE_RESEND || 15115 close_type == CLOSE_AFTER_RESEND); 15116 15117 /* 15118 * First, get the open owner. 15119 */ 15120 if (!provided_osp) { 15121 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 15122 } else { 15123 oop = provided_osp->os_open_owner; 15124 ASSERT(oop != NULL); 15125 open_owner_hold(oop); 15126 } 15127 15128 if (!oop) { 15129 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15130 "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, " 15131 "close type %d", (void *)rp, (void *)mi, (void *)cr, 15132 (void *)provided_osp, close_type)); 15133 ep->error = EIO; 15134 goto out; 15135 } 15136 15137 cred_otw = nfs4_get_otw_cred(cr, mi, oop); 15138 recov_retry: 15139 osp = NULL; 15140 close_failed = 0; 15141 force_close = (close_type == CLOSE_FORCE); 15142 retry = 0; 15143 did_start_op = 0; 15144 did_force_recovlock = 0; 15145 did_start_seqid_sync = 0; 15146 have_sync_lock = 0; 15147 recovonly = FALSE; 15148 recov_state.rs_flags = 0; 15149 recov_state.rs_num_retry_despite_err = 0; 15150 15151 /* 15152 * Second, synchronize with recovery. 15153 */ 15154 if (!isrecov) { 15155 ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE, 15156 &recov_state, &recovonly); 15157 if (!ep->error) { 15158 did_start_op = 1; 15159 } else { 15160 close_failed = 1; 15161 /* 15162 * If we couldn't get start_fop, but have to 15163 * clean up state, then at least acquire the 15164 * mi_recovlock so we can synchronize with 15165 * recovery. 15166 */ 15167 if (close_type == CLOSE_FORCE) { 15168 (void) nfs_rw_enter_sig(&mi->mi_recovlock, 15169 RW_READER, FALSE); 15170 did_force_recovlock = 1; 15171 } else 15172 goto out; 15173 } 15174 } 15175 15176 /* 15177 * We cannot attempt to get the open seqid sync if nfs4_start_fop 15178 * set 'recovonly' to TRUE since most likely this is due to 15179 * recovery being active (MI4_RECOV_ACTIV).
If recovery is active, 15180 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us 15181 * to retry, causing us to loop until recovery finishes. Plus we 15182 * don't need protection over the open seqid since we're not going 15183 * OTW, hence don't need to use the seqid. 15184 */ 15185 if (recovonly == FALSE) { 15186 /* need to grab the open owner sync before 'os_sync_lock' */ 15187 ep->error = nfs4_start_open_seqid_sync(oop, mi); 15188 if (ep->error == EAGAIN) { 15189 ASSERT(!isrecov); 15190 if (did_start_op) 15191 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15192 &recov_state, TRUE); 15193 if (did_force_recovlock) 15194 nfs_rw_exit(&mi->mi_recovlock); 15195 goto recov_retry; 15196 } 15197 did_start_seqid_sync = 1; 15198 } 15199 15200 /* 15201 * Third, get an open stream and acquire 'os_sync_lock' to 15202 * synchronize the opening/creating of an open stream with the 15203 * closing/destroying of an open stream. 15204 */ 15205 if (!provided_osp) { 15206 /* returns with 'os_sync_lock' held */ 15207 osp = find_open_stream(oop, rp); 15208 if (!osp) { 15209 ep->error = EIO; 15210 goto out; 15211 } 15212 } else { 15213 osp = provided_osp; 15214 open_stream_hold(osp); 15215 mutex_enter(&osp->os_sync_lock); 15216 } 15217 have_sync_lock = 1; 15218 15219 ASSERT(oop == osp->os_open_owner); 15220 15221 /* 15222 * Fourth, do any special pre-OTW CLOSE processing 15223 * based on the specific close type. 15224 */ 15225 if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) && 15226 !did_dec_count) { 15227 ASSERT(osp->os_open_ref_count > 0); 15228 osp->os_open_ref_count--; 15229 did_dec_count = 1; 15230 if (osp->os_open_ref_count == 0) 15231 osp->os_final_close = 1; 15232 } 15233 15234 if (close_type == CLOSE_FORCE) { 15235 /* see if somebody reopened the open stream. */ 15236 if (!osp->os_force_close) { 15237 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, 15238 "nfs4close_one: skip CLOSE_FORCE as osp %p " 15239 "was reopened, vp %p", (void *)osp, (void *)vp)); 15240 ep->error = 0; 15241 ep->stat = NFS4_OK; 15242 goto out; 15243 } 15244 15245 if (!osp->os_final_close && !did_dec_count) { 15246 osp->os_open_ref_count--; 15247 did_dec_count = 1; 15248 } 15249 15250 /* 15251 * We can't depend on os_open_ref_count being 0 due to the 15252 * way executables are opened (VN_RELE to match a VOP_OPEN). 15253 */ 15254 #ifdef NOTYET 15255 ASSERT(osp->os_open_ref_count == 0); 15256 #endif 15257 if (osp->os_open_ref_count != 0) { 15258 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, 15259 "nfs4close_one: should panic here on an " 15260 "ASSERT(osp->os_open_ref_count == 0). Ignoring " 15261 "since this is probably the exec problem.")); 15262 15263 osp->os_open_ref_count = 0; 15264 } 15265 15266 /* 15267 * There is the possibility that nfs4close_one() 15268 * for close_type == CLOSE_DELMAP couldn't find the 15269 * open stream, thus couldn't decrement its os_mapcnt; 15270 * therefore we can't use this ASSERT yet.
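 * (os_mapcnt is normally adjusted by the CLOSE_DELMAP case just below.)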
15271 */ 15272 #ifdef NOTYET 15273 ASSERT(osp->os_mapcnt == 0); 15274 #endif 15275 osp->os_mapcnt = 0; 15276 } 15277 15278 if (close_type == CLOSE_DELMAP && !did_dec_count) { 15279 ASSERT(osp->os_mapcnt >= btopr(len)); 15280 15281 if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 15282 osp->os_mmap_write -= btopr(len); 15283 if (maxprot & PROT_READ) 15284 osp->os_mmap_read -= btopr(len); 15285 if (maxprot & PROT_EXEC) 15286 osp->os_mmap_read -= btopr(len); 15287 /* mirror the PROT_NONE check in nfs4_addmap() */ 15288 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 15289 !(maxprot & PROT_EXEC)) 15290 osp->os_mmap_read -= btopr(len); 15291 osp->os_mapcnt -= btopr(len); 15292 did_dec_count = 1; 15293 } 15294 15295 if (recovonly) { 15296 nfs4_lost_rqst_t lost_rqst; 15297 15298 /* request should not already be in recovery queue */ 15299 ASSERT(lrp == NULL); 15300 nfs4_error_init(ep, EINTR); 15301 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop, 15302 osp, cred_otw, vp); 15303 mutex_exit(&osp->os_sync_lock); 15304 have_sync_lock = 0; 15305 (void) nfs4_start_recovery(ep, mi, vp, NULL, NULL, 15306 lost_rqst.lr_op == OP_CLOSE ? 15307 &lost_rqst : NULL, OP_CLOSE, NULL); 15308 close_failed = 1; 15309 force_close = 0; 15310 goto close_cleanup; 15311 } 15312 15313 /* 15314 * If a previous OTW call got NFS4ERR_BAD_SEQID, then 15315 * we stopped operating on the open owner's <old oo_name, old seqid> 15316 * space, which means we stopped operating on the open stream 15317 * too. So don't go OTW (as the seqid is likely bad, and the 15318 * stateid could be stale, potentially triggering a false 15319 * setclientid), and just clean up the client's internal state. 15320 */ 15321 if (osp->os_orig_oo_name != oop->oo_name) { 15322 NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug, 15323 (CE_NOTE, "nfs4close_one: skip OTW close for osp %p " 15324 "oop %p due to bad seqid (orig oo_name %" PRIx64 " current " 15325 "oo_name %" PRIx64")", 15326 (void *)osp, (void *)oop, osp->os_orig_oo_name, 15327 oop->oo_name)); 15328 close_failed = 1; 15329 } 15330 15331 /* If the file failed recovery, just quit. */ 15332 mutex_enter(&rp->r_statelock); 15333 if (rp->r_flags & R4RECOVERR) { 15334 close_failed = 1; 15335 } 15336 mutex_exit(&rp->r_statelock); 15337 15338 /* 15339 * If the force close path failed to obtain start_fop 15340 * then skip the OTW close and just remove the state. 15341 */ 15342 if (close_failed) 15343 goto close_cleanup; 15344 15345 /* 15346 * Fifth, check to see if there are still mapped pages or other 15347 * opens using this open stream. If there are then we can't 15348 * close yet but we can see if an OPEN_DOWNGRADE is necessary. 15349 */ 15350 if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) { 15351 nfs4_lost_rqst_t new_lost_rqst; 15352 bool_t needrecov = FALSE; 15353 cred_t *odg_cred_otw = NULL; 15354 seqid4 open_dg_seqid = 0; 15355 15356 if (osp->os_delegation) { 15357 /* 15358 * If this open stream was never OPENed OTW then we 15359 * surely can't DOWNGRADE it (especially since the 15360 * osp->open_stateid is really a delegation stateid 15361 * when os_delegation is 1). 
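 * Instead, just back out the share access and deny counts that the
 * OPEN set up, as done just below.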
15362 */ 15363 if (access_bits & FREAD) 15364 osp->os_share_acc_read--; 15365 if (access_bits & FWRITE) 15366 osp->os_share_acc_write--; 15367 osp->os_share_deny_none--; 15368 nfs4_error_zinit(ep); 15369 goto out; 15370 } 15371 nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr, 15372 lrp, ep, &odg_cred_otw, &open_dg_seqid); 15373 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp); 15374 if (needrecov && !isrecov) { 15375 bool_t abort; 15376 nfs4_bseqid_entry_t *bsep = NULL; 15377 15378 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) 15379 bsep = nfs4_create_bseqid_entry(oop, NULL, 15380 vp, 0, 15381 lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG, 15382 open_dg_seqid); 15383 15384 nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst, 15385 oop, osp, odg_cred_otw, vp, access_bits, 0); 15386 mutex_exit(&osp->os_sync_lock); 15387 have_sync_lock = 0; 15388 abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL, 15389 new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ? 15390 &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE, 15391 bsep); 15392 if (odg_cred_otw) 15393 crfree(odg_cred_otw); 15394 if (bsep) 15395 kmem_free(bsep, sizeof (*bsep)); 15396 15397 if (abort == TRUE) 15398 goto out; 15399 15400 if (did_start_seqid_sync) { 15401 nfs4_end_open_seqid_sync(oop); 15402 did_start_seqid_sync = 0; 15403 } 15404 open_stream_rele(osp, rp); 15405 15406 if (did_start_op) 15407 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15408 &recov_state, FALSE); 15409 if (did_force_recovlock) 15410 nfs_rw_exit(&mi->mi_recovlock); 15411 15412 goto recov_retry; 15413 } else { 15414 if (odg_cred_otw) 15415 crfree(odg_cred_otw); 15416 } 15417 goto out; 15418 } 15419 15420 /* 15421 * If this open stream was created as the result of an open 15422 * while holding a delegation, then just release it; no need 15423 * to do an OTW close. Otherwise do a "normal" OTW close. 15424 */ 15425 if (osp->os_delegation) { 15426 nfs4close_notw(vp, osp, &have_sync_lock); 15427 nfs4_error_zinit(ep); 15428 goto out; 15429 } 15430 15431 /* 15432 * If this stream is not valid, we're done. 15433 */ 15434 if (!osp->os_valid) { 15435 nfs4_error_zinit(ep); 15436 goto out; 15437 } 15438 15439 /* 15440 * Last open or mmap ref has vanished, need to do an OTW close. 15441 * First check to see if a close is still necessary. 15442 */ 15443 if (osp->os_failed_reopen) { 15444 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15445 "don't close OTW osp %p since reopen failed.", 15446 (void *)osp)); 15447 /* 15448 * Reopen of the open stream failed, hence the 15449 * stateid of the open stream is invalid/stale, and 15450 * sending this OTW would incorrectly cause another 15451 * round of recovery. In this case, we need to set 15452 * the 'os_valid' bit to 0 so another thread doesn't 15453 * come in and re-open this open stream before 15454 * this "closing" thread cleans up state (decrementing 15455 * the nfs4_server_t's state_ref_count and decrementing 15456 * the os_ref_count). 15457 */ 15458 osp->os_valid = 0; 15459 /* 15460 * This removes the reference obtained at OPEN; ie, 15461 * when the open stream structure was created. 15462 * 15463 * We don't have to worry about calling 'open_stream_rele' 15464 * since we are currently holding a reference to this 15465 * open stream, which means the count cannot go to 0 with 15466 * this decrement. 15467 */ 15468 ASSERT(osp->os_ref_count >= 2); 15469 osp->os_ref_count--; 15470 nfs4_error_zinit(ep); 15471 close_failed = 0; 15472 goto close_cleanup; 15473 } 15474 15475 ASSERT(osp->os_ref_count > 1); 15476 15477 /* 15478 * Sixth, try the CLOSE OTW.
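 * nfs4close_otw() may drop 'os_sync_lock' and the open seqid sync;
 * 'have_sync_lock' and 'did_start_seqid_sync' track what is still held.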
15479 */ 15480 nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync, 15481 close_type, ep, &have_sync_lock); 15482 15483 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) { 15484 /* 15485 * Let the recovery thread be responsible for 15486 * removing the state for CLOSE. 15487 */ 15488 close_failed = 1; 15489 force_close = 0; 15490 retry = 0; 15491 } 15492 15493 /* See if we need to retry with a different cred */ 15494 if ((ep->error == EACCES || 15495 (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) && 15496 cred_otw != cr) { 15497 crfree(cred_otw); 15498 cred_otw = cr; 15499 crhold(cred_otw); 15500 retry = 1; 15501 } 15502 15503 if (ep->error || ep->stat) 15504 close_failed = 1; 15505 15506 if (retry && !isrecov && num_retries-- > 0) { 15507 if (have_sync_lock) { 15508 mutex_exit(&osp->os_sync_lock); 15509 have_sync_lock = 0; 15510 } 15511 if (did_start_seqid_sync) { 15512 nfs4_end_open_seqid_sync(oop); 15513 did_start_seqid_sync = 0; 15514 } 15515 open_stream_rele(osp, rp); 15516 15517 if (did_start_op) 15518 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15519 &recov_state, FALSE); 15520 if (did_force_recovlock) 15521 nfs_rw_exit(&mi->mi_recovlock); 15522 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15523 "nfs4close_one: need to retry the close " 15524 "operation")); 15525 goto recov_retry; 15526 } 15527 close_cleanup: 15528 /* 15529 * Seventh and lastly, process our results. 15530 */ 15531 if (close_failed && force_close) { 15532 /* 15533 * It's ok to drop and regrab the 'os_sync_lock' since 15534 * nfs4close_notw() will recheck to make sure the 15535 * "close"/removal of state should happen. 15536 */ 15537 if (!have_sync_lock) { 15538 mutex_enter(&osp->os_sync_lock); 15539 have_sync_lock = 1; 15540 } 15541 /* 15542 * This is last call, remove the ref on the open 15543 * stream created by open and clean everything up. 15544 */ 15545 osp->os_pending_close = 0; 15546 nfs4close_notw(vp, osp, &have_sync_lock); 15547 nfs4_error_zinit(ep); 15548 } 15549 15550 if (!close_failed) { 15551 if (have_sync_lock) { 15552 osp->os_pending_close = 0; 15553 mutex_exit(&osp->os_sync_lock); 15554 have_sync_lock = 0; 15555 } else { 15556 mutex_enter(&osp->os_sync_lock); 15557 osp->os_pending_close = 0; 15558 mutex_exit(&osp->os_sync_lock); 15559 } 15560 if (did_start_op && recov_state.rs_sp != NULL) { 15561 mutex_enter(&recov_state.rs_sp->s_lock); 15562 nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi); 15563 mutex_exit(&recov_state.rs_sp->s_lock); 15564 } else { 15565 nfs4_dec_state_ref_count(mi); 15566 } 15567 nfs4_error_zinit(ep); 15568 } 15569 15570 out: 15571 if (have_sync_lock) 15572 mutex_exit(&osp->os_sync_lock); 15573 if (did_start_op) 15574 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state, 15575 recovonly ? TRUE : FALSE); 15576 if (did_force_recovlock) 15577 nfs_rw_exit(&mi->mi_recovlock); 15578 if (cred_otw) 15579 crfree(cred_otw); 15580 if (osp) 15581 open_stream_rele(osp, rp); 15582 if (oop) { 15583 if (did_start_seqid_sync) 15584 nfs4_end_open_seqid_sync(oop); 15585 open_owner_rele(oop); 15586 } 15587 } 15588 15589 /* 15590 * Convert information returned by the server in the LOCK4denied 15591 * structure to the form required by fcntl. 
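 * That is, fill in the l_type, l_whence, l_start, l_len, and l_pid
 * fields of the flock64_t from the denied lock's type, offset, length,
 * and owner.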
static void
denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args)
{
        nfs4_lo_name_t *lo;

#ifdef DEBUG
        if (denied_to_flk_debug) {
                lockt_denied_debug = lockt_denied;
                debug_enter("lockt_denied");
        }
#endif

        flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK;
        flk->l_whence = 0;      /* aka SEEK_SET */
        flk->l_start = lockt_denied->offset;
        flk->l_len = lockt_denied->length;

        /*
         * If the blocking clientid matches our client id, then we can
         * interpret the lockowner (since we built it).  If not, then
         * fabricate a sysid and pid.  Note that the l_sysid field
         * in *flk already has the local sysid.
         */

        if (lockt_denied->owner.clientid == lockt_args->owner.clientid) {

                if (lockt_denied->owner.owner_len == sizeof (*lo)) {
                        lo = (nfs4_lo_name_t *)
                            lockt_denied->owner.owner_val;

                        flk->l_pid = lo->ln_pid;
                } else {
                        NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
                            "denied_to_flk: bad lock owner length\n"));

                        flk->l_pid = lo_to_pid(&lockt_denied->owner);
                }
        } else {
                NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
                    "denied_to_flk: foreign clientid\n"));

                /*
                 * Construct a new sysid which should be different from
                 * sysids of other systems.
                 */

                flk->l_sysid++;
                flk->l_pid = lo_to_pid(&lockt_denied->owner);
        }
}

/*
 * Fabricate a pid for a foreign lock owner by summing the bytes of
 * the clientid and of the opaque owner identifier.  The result is not
 * a real pid on any system; it merely gives the fcntl() caller a
 * deterministic value to report for the conflicting lock.
 */
static pid_t
lo_to_pid(lock_owner4 *lop)
{
        pid_t pid = 0;
        uchar_t *cp;
        int i;

        cp = (uchar_t *)&lop->clientid;

        for (i = 0; i < sizeof (lop->clientid); i++)
                pid += (pid_t)*cp++;

        cp = (uchar_t *)lop->owner_val;

        for (i = 0; i < lop->owner_len; i++)
                pid += (pid_t)*cp++;

        return (pid);
}

/*
 * Given a lock, return the offset of the last byte it covers ("end").
 * An l_len of 0 means the lock extends to end-of-file, represented
 * here by MAXEND.
 */
static off64_t
lock_to_end(flock64_t *lock)
{
        off64_t lock_end;

        if (lock->l_len == 0)
                lock_end = (off64_t)MAXEND;
        else
                lock_end = lock->l_start + lock->l_len - 1;

        return (lock_end);
}

/*
 * Given the start and end of a lock, return the corresponding length
 * ("l_len") for that lock; an end of MAXEND maps back to a length of 0
 * (to end-of-file).
 */
static off64_t
end_to_len(off64_t start, off64_t end)
{
        off64_t lock_len;

        ASSERT(end >= start);
        if (end == MAXEND)
                lock_len = 0;
        else
                lock_len = end - start + 1;

        return (lock_len);
}

/*
 * Given the end of a lock, return the first valid start offset past
 * it: if the end is the last lockable offset (MAXEND), return it
 * unchanged; otherwise return end + 1.
 */
static off64_t
start_check(off64_t x)
{
        if (x == MAXEND)
                return (x);
        else
                return (x + 1);
}
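
/*
 * For example: a lock with l_start = 100 and l_len = 10 covers offsets
 * 100..109, so lock_to_end() returns 109 and end_to_len(100, 109)
 * returns 10.  An l_len of 0 denotes a lock to end-of-file:
 * lock_to_end() returns MAXEND and end_to_len() maps MAXEND back to a
 * length of 0.  start_check(109) returns 110, the first offset past
 * the lock, while start_check(MAXEND) returns MAXEND unchanged.
 */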

/*
 * See if these two locks overlap, and if so return 1;
 * otherwise, return 0.
 */
static int
locks_intersect(flock64_t *llfp, flock64_t *curfp)
{
        off64_t llfp_end, curfp_end;

        llfp_end = lock_to_end(llfp);
        curfp_end = lock_to_end(curfp);

        if (((llfp_end >= curfp->l_start) &&
            (llfp->l_start <= curfp->l_start)) ||
            ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start)))
                return (1);
        return (0);
}

/*
 * Determine what the intersecting lock region is, and add that to the
 * 'nl_llpp' locklist in increasing order (by l_start).
 */
static void
nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp,
    locklist_t **nl_llpp, vnode_t *vp)
{
        locklist_t *intersect_llp, *tmp_fllp, *cur_fllp;
        off64_t lost_flp_end, local_flp_end, len, start;

        NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:"));

        if (!locks_intersect(lost_flp, local_flp))
                return;

        NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
            "locks intersect"));

        lost_flp_end = lock_to_end(lost_flp);
        local_flp_end = lock_to_end(local_flp);

        /* Find the starting point of the intersecting region */
        if (local_flp->l_start > lost_flp->l_start)
                start = local_flp->l_start;
        else
                start = lost_flp->l_start;

        /* Find the length of the intersecting region */
        if (lost_flp_end < local_flp_end)
                len = end_to_len(start, lost_flp_end);
        else
                len = end_to_len(start, local_flp_end);

        /*
         * Prepare the flock structure for the intersection found and
         * insert it into the new list in increasing l_start order.  This
         * list contains intersections of locks registered by the client
         * with the local host and the lost lock.
         * The lock type of this lock is the same as that of the local_flp.
         */
        intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP);
        intersect_llp->ll_flock.l_start = start;
        intersect_llp->ll_flock.l_len = len;
        intersect_llp->ll_flock.l_type = local_flp->l_type;
        intersect_llp->ll_flock.l_pid = local_flp->l_pid;
        intersect_llp->ll_flock.l_sysid = local_flp->l_sysid;
        intersect_llp->ll_flock.l_whence = 0;   /* aka SEEK_SET */
        intersect_llp->ll_vp = vp;

        tmp_fllp = *nl_llpp;
        cur_fllp = NULL;
        while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start <
            intersect_llp->ll_flock.l_start) {
                cur_fllp = tmp_fllp;
                tmp_fllp = tmp_fllp->ll_next;
        }
        if (cur_fllp == NULL) {
                /* first on the list */
                intersect_llp->ll_next = *nl_llpp;
                *nl_llpp = intersect_llp;
        } else {
                intersect_llp->ll_next = cur_fllp->ll_next;
                cur_fllp->ll_next = intersect_llp;
        }

        NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
            "created lock region: start %"PRIx64" end %"PRIx64" : %s\n",
            intersect_llp->ll_flock.l_start,
            intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len,
            intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE"));
}
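
/*
 * For example, if the lost lock covers [100, 199] and a local lock for
 * the same pid covers [150, 299], the two intersect and the region
 * [150, 199] (l_start 150, l_len 50) is added to the list, carrying
 * the local lock's l_type, l_pid and l_sysid.
 */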

/*
 * Our current local locking state is potentially different from what
 * the NFSv4 server thinks we have due to a lost lock that was resent
 * and then received.  We need to reset our "NFSv4" locking state to
 * match the current local locking state for this pid, since that is
 * what the user/application sees as the state of the world.
 *
 * We cannot afford to drop the open/lock seqid sync since then we can
 * get confused about what the current local locking state "is" versus
 * "was".
 *
 * If we are unable to fix up the locks, we send SIGLOST to the affected
 * process.  This is not done if the filesystem has been forcibly
 * unmounted, in case the process has already exited and a new process
 * exists with the same pid.
 */
static void
nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr,
    nfs4_lock_owner_t *lop)
{
        locklist_t *locks, *llp, *ri_llp, *tmp_llp;
        mntinfo4_t *mi = VTOMI4(vp);
        const int cmd = F_SETLK;
        off64_t cur_start, llp_ll_flock_end, lost_flp_end;
        flock64_t ul_fl;

        NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
            "nfs4_reinstitute_local_lock_state"));

        /*
         * Find active locks for this vp from the local locking code.
         * Scan through this list and find the locks that intersect with
         * the lost lock.  Once we find a lock that intersects, add the
         * intersection area as a new lock to a new list, "ri_llp".  The
         * lock type of the intersection region lock added to ri_llp is
         * the same as that found in the active lock list, "locks".  The
         * intersecting region locks are added to ri_llp in increasing
         * l_start order.
         */
        ASSERT(nfs_zone() == mi->mi_zone);

        locks = flk_active_locks_for_vp(vp);
        ri_llp = NULL;

        for (llp = locks; llp != NULL; llp = llp->ll_next) {
                ASSERT(llp->ll_vp == vp);
                /*
                 * Pick locks that belong to this pid/lockowner
                 */
                if (llp->ll_flock.l_pid != lost_flp->l_pid)
                        continue;

                nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp);
        }

        /*
         * Now we have the list of intersections with the lost lock.
         * These are the locks that were/are active before the server
         * replied to the last/lost lock.  Issue these locks to the server
         * here.  Playing these locks to the server will re-establish our
         * current local locking state with the v4 server.
         * If we get an error, send SIGLOST to the application for that
         * lock.
         */

        for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
                NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
                    "nfs4_reinstitute_local_lock_state: need to issue "
                    "flock: [%"PRIx64" - %"PRIx64"] : %s",
                    llp->ll_flock.l_start,
                    llp->ll_flock.l_start + llp->ll_flock.l_len,
                    llp->ll_flock.l_type == F_RDLCK ? "READ" :
                    llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID"));
                /*
                 * No need to relock what we already have
                 */
                if (llp->ll_flock.l_type == lost_flp->l_type)
                        continue;

                push_reinstate(vp, cmd, &llp->ll_flock, cr, lop);
        }

        /*
         * Now, keeping the start of the lost lock as our reference, parse
         * the newly created ri_llp locklist to find the ranges that we
         * have locked with the v4 server but that are not in the current
         * local locking state.  We need to unlock these ranges.
         * These are the ranges where the lost lock does not overlap with
         * the locks in ri_llp, but which have been locked since the
         * server replied to the lost lock.
         */
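        /*
         * For example, if the lost lock covered [0, 99] and ri_llp holds
         * the intersections [10, 19] and [40, 49], the loop below unlocks
         * the gaps [0, 9] and [20, 39], and the final check after the
         * loop unlocks [50, 99], the tail of the lost lock range past the
         * last intersection.
         */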
        cur_start = lost_flp->l_start;
        lost_flp_end = lock_to_end(lost_flp);

        ul_fl.l_type = F_UNLCK;
        ul_fl.l_whence = 0;     /* aka SEEK_SET */
        ul_fl.l_sysid = lost_flp->l_sysid;
        ul_fl.l_pid = lost_flp->l_pid;

        for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
                llp_ll_flock_end = lock_to_end(&llp->ll_flock);

                if (llp->ll_flock.l_start <= cur_start) {
                        cur_start = start_check(llp_ll_flock_end);
                        continue;
                }
                NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
                    "nfs4_reinstitute_local_lock_state: "
                    "UNLOCK [%"PRIx64" - %"PRIx64"]",
                    cur_start, llp->ll_flock.l_start));

                ul_fl.l_start = cur_start;
                ul_fl.l_len = end_to_len(cur_start,
                    (llp->ll_flock.l_start - 1));

                push_reinstate(vp, cmd, &ul_fl, cr, lop);
                cur_start = start_check(llp_ll_flock_end);
        }

        /*
         * In the case where the lost lock ends after all intersecting
         * locks, unlock the last part of the lost lock range.
         */
        if (cur_start != start_check(lost_flp_end)) {
                NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
                    "nfs4_reinstitute_local_lock_state: UNLOCK end of the "
                    "lost lock region [%"PRIx64" - %"PRIx64"]",
                    cur_start, lost_flp->l_start + lost_flp->l_len));

                ul_fl.l_start = cur_start;
                /*
                 * Is this a to-EOF lock?  If so, unlock to the end.
                 */
                if (lost_flp->l_len == 0)
                        ul_fl.l_len = 0;
                else
                        ul_fl.l_len = start_check(lost_flp_end) - cur_start;

                push_reinstate(vp, cmd, &ul_fl, cr, lop);
        }

        if (locks != NULL)
                flk_free_locklist(locks);

        /* Free up our newly created locklist */
        for (llp = ri_llp; llp != NULL; ) {
                tmp_llp = llp->ll_next;
                kmem_free(llp, sizeof (locklist_t));
                llp = tmp_llp;
        }

        /*
         * Now return to the original caller, nfs4frlock(),
         * and let our seqid syncs drop naturally.
         */
}

/*
 * Create a lost state record for the given lock reinstantiation request
 * and push it onto the lost state queue.
 */
static void
push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr,
    nfs4_lock_owner_t *lop)
{
        nfs4_lost_rqst_t req;
        nfs_lock_type4 locktype;
        nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS };

        ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

        locktype = flk_to_locktype(cmd, flk->l_type);
        nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype,
            NULL, NULL, lop, flk, &req, cr, vp);
        (void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
            (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ?
            &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK,
            NULL);
}