1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2016 STRATO AG. All rights reserved. 24 */ 25 26 /* 27 * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 28 */ 29 30 /* 31 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 32 * Use is subject to license terms. 33 */ 34 35 /* 36 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T. 37 * All Rights Reserved 38 */ 39 40 /* 41 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
42 */ 43 44 #include <sys/param.h> 45 #include <sys/types.h> 46 #include <sys/systm.h> 47 #include <sys/cred.h> 48 #include <sys/time.h> 49 #include <sys/vnode.h> 50 #include <sys/vfs.h> 51 #include <sys/vfs_opreg.h> 52 #include <sys/file.h> 53 #include <sys/filio.h> 54 #include <sys/uio.h> 55 #include <sys/buf.h> 56 #include <sys/mman.h> 57 #include <sys/pathname.h> 58 #include <sys/dirent.h> 59 #include <sys/debug.h> 60 #include <sys/vmsystm.h> 61 #include <sys/fcntl.h> 62 #include <sys/flock.h> 63 #include <sys/swap.h> 64 #include <sys/errno.h> 65 #include <sys/strsubr.h> 66 #include <sys/sysmacros.h> 67 #include <sys/kmem.h> 68 #include <sys/cmn_err.h> 69 #include <sys/pathconf.h> 70 #include <sys/utsname.h> 71 #include <sys/dnlc.h> 72 #include <sys/acl.h> 73 #include <sys/systeminfo.h> 74 #include <sys/policy.h> 75 #include <sys/sdt.h> 76 #include <sys/list.h> 77 #include <sys/stat.h> 78 #include <sys/zone.h> 79 80 #include <rpc/types.h> 81 #include <rpc/auth.h> 82 #include <rpc/clnt.h> 83 84 #include <nfs/nfs.h> 85 #include <nfs/nfs_clnt.h> 86 #include <nfs/nfs_acl.h> 87 #include <nfs/lm.h> 88 #include <nfs/nfs4.h> 89 #include <nfs/nfs4_kprot.h> 90 #include <nfs/rnode4.h> 91 #include <nfs/nfs4_clnt.h> 92 93 #include <vm/hat.h> 94 #include <vm/as.h> 95 #include <vm/page.h> 96 #include <vm/pvn.h> 97 #include <vm/seg.h> 98 #include <vm/seg_map.h> 99 #include <vm/seg_kpm.h> 100 #include <vm/seg_vn.h> 101 102 #include <fs/fs_subr.h> 103 104 #include <sys/ddi.h> 105 #include <sys/int_fmtio.h> 106 #include <sys/fs/autofs.h> 107 108 typedef struct { 109 nfs4_ga_res_t *di_garp; 110 cred_t *di_cred; 111 hrtime_t di_time_call; 112 } dirattr_info_t; 113 114 typedef enum nfs4_acl_op { 115 NFS4_ACL_GET, 116 NFS4_ACL_SET 117 } nfs4_acl_op_t; 118 119 static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *); 120 121 static void nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *, 122 char *, dirattr_info_t *); 123 124 static void nfs4close_otw(rnode4_t *, cred_t *, 
nfs4_open_owner_t *, 125 nfs4_open_stream_t *, int *, int *, nfs4_close_type_t, 126 nfs4_error_t *, int *); 127 static int nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int, 128 cred_t *); 129 static int nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *, 130 stable_how4 *); 131 static int nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *, 132 cred_t *, bool_t, struct uio *); 133 static int nfs4setattr(vnode_t *, struct vattr *, int, cred_t *, 134 vsecattr_t *); 135 static int nfs4openattr(vnode_t *, vnode_t **, int, cred_t *); 136 static int nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int); 137 static int nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *); 138 static int nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *); 139 static int nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *); 140 static int nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl, 141 int, vnode_t **, cred_t *); 142 static int nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **, 143 cred_t *, int, int, enum createmode4, int); 144 static int nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *, 145 caller_context_t *); 146 static int nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *, 147 vnode_t *, char *, cred_t *, nfsstat4 *); 148 static int nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *, 149 vnode_t *, char *, cred_t *, nfsstat4 *); 150 static int do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 151 static void nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 152 static int nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t); 153 static int nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *, 154 page_t *[], size_t, struct seg *, caddr_t, 155 enum seg_rw, cred_t *); 156 static void nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *, 157 cred_t *); 158 static int nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t, 159 int, cred_t *); 160 static int nfs4_sync_pageio(vnode_t *, 
page_t *, u_offset_t, size_t, 161 int, cred_t *); 162 static int nfs4_commit(vnode_t *, offset4, count4, cred_t *); 163 static void nfs4_set_mod(vnode_t *); 164 static void nfs4_get_commit(vnode_t *); 165 static void nfs4_get_commit_range(vnode_t *, u_offset_t, size_t); 166 static int nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *); 167 static int nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int); 168 static int nfs4_sync_commit(vnode_t *, page_t *, offset3, count3, 169 cred_t *); 170 static void do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3, 171 cred_t *); 172 static int nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *, 173 hrtime_t, vnode_t *, cred_t *); 174 static int nfs4_open_non_reg_file(vnode_t **, int, cred_t *); 175 static int nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *); 176 static void nfs4_register_lock_locally(vnode_t *, struct flock64 *, int, 177 u_offset_t); 178 static int nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *); 179 static int nfs4_block_and_wait(clock_t *, rnode4_t *); 180 static cred_t *state_to_cred(nfs4_open_stream_t *); 181 static void denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *); 182 static pid_t lo_to_pid(lock_owner4 *); 183 static void nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *, 184 cred_t *, nfs4_lock_owner_t *); 185 static void push_reinstate(vnode_t *, int, flock64_t *, cred_t *, 186 nfs4_lock_owner_t *); 187 static int open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **); 188 static void nfs4_delmap_callback(struct as *, void *, uint_t); 189 static void nfs4_free_delmapcall(nfs4_delmapcall_t *); 190 static nfs4_delmapcall_t *nfs4_init_delmapcall(); 191 static int nfs4_find_and_delete_delmapcall(rnode4_t *, int *); 192 static int nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t); 193 static int nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *, 194 uid_t, gid_t, int); 195 196 /* 197 * Routines that implement the setting of v4 args for the misc. 
ops 198 */ 199 static void nfs4args_lock_free(nfs_argop4 *); 200 static void nfs4args_lockt_free(nfs_argop4 *); 201 static void nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *, 202 int, rnode4_t *, cred_t *, bitmap4, int *, 203 nfs4_stateid_types_t *); 204 static void nfs4args_setattr_free(nfs_argop4 *); 205 static int nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4, 206 bitmap4); 207 static void nfs4args_verify_free(nfs_argop4 *); 208 static void nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *, 209 WRITE4args **, nfs4_stateid_types_t *); 210 211 /* 212 * These are the vnode ops functions that implement the vnode interface to 213 * the networked file system. See more comments below at nfs4_vnodeops. 214 */ 215 static int nfs4_open(vnode_t **, int, cred_t *, caller_context_t *); 216 static int nfs4_close(vnode_t *, int, int, offset_t, cred_t *, 217 caller_context_t *); 218 static int nfs4_read(vnode_t *, struct uio *, int, cred_t *, 219 caller_context_t *); 220 static int nfs4_write(vnode_t *, struct uio *, int, cred_t *, 221 caller_context_t *); 222 static int nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *, 223 caller_context_t *); 224 static int nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *, 225 caller_context_t *); 226 static int nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *); 227 static int nfs4_readlink(vnode_t *, struct uio *, cred_t *, 228 caller_context_t *); 229 static int nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *); 230 static int nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl, 231 int, vnode_t **, cred_t *, int, caller_context_t *, 232 vsecattr_t *); 233 static int nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *, 234 int); 235 static int nfs4_link(vnode_t *, vnode_t *, char *, cred_t *, 236 caller_context_t *, int); 237 static int nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *, 238 caller_context_t *, int); 239 static int nfs4_mkdir(vnode_t 
*, char *, struct vattr *, vnode_t **, 240 cred_t *, caller_context_t *, int, vsecattr_t *); 241 static int nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *, 242 caller_context_t *, int); 243 static int nfs4_symlink(vnode_t *, char *, struct vattr *, char *, 244 cred_t *, caller_context_t *, int); 245 static int nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *, 246 caller_context_t *, int); 247 static int nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *); 248 static int nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *, 249 page_t *[], size_t, struct seg *, caddr_t, 250 enum seg_rw, cred_t *, caller_context_t *); 251 static int nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *, 252 caller_context_t *); 253 static int nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t, 254 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); 255 static int nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, 256 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); 257 static int nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *); 258 static int nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t, 259 struct flk_callback *, cred_t *, caller_context_t *); 260 static int nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t, 261 cred_t *, caller_context_t *); 262 static int nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, 263 uint_t, uint_t, uint_t, cred_t *, caller_context_t *); 264 static int nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int, 265 cred_t *, caller_context_t *); 266 static void nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *, 267 caller_context_t *); 268 static int nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 269 caller_context_t *); 270 /* 271 * These vnode ops are required to be called from outside this source file, 272 * e.g. by ephemeral mount stub vnode ops, and so may not be declared 273 * as static. 
274 */ 275 int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *, 276 caller_context_t *); 277 void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *); 278 int nfs4_lookup(vnode_t *, char *, vnode_t **, 279 struct pathname *, int, vnode_t *, cred_t *, 280 caller_context_t *, int *, pathname_t *); 281 int nfs4_fid(vnode_t *, fid_t *, caller_context_t *); 282 int nfs4_rwlock(vnode_t *, int, caller_context_t *); 283 void nfs4_rwunlock(vnode_t *, int, caller_context_t *); 284 int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *); 285 int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *, 286 caller_context_t *); 287 int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 288 caller_context_t *); 289 int nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *, 290 caller_context_t *); 291 292 /* 293 * Used for nfs4_commit_vp() to indicate if we should 294 * wait on pending writes. 295 */ 296 #define NFS4_WRITE_NOWAIT 0 297 #define NFS4_WRITE_WAIT 1 298 299 #define NFS4_BASE_WAIT_TIME 1 /* 1 second */ 300 301 /* 302 * Error flags used to pass information about certain special errors 303 * which need to be handled specially. 304 */ 305 #define NFS_EOF -98 306 #define NFS_VERF_MISMATCH -97 307 308 /* 309 * Flags used to differentiate between which operation drove the 310 * potential CLOSE OTW. 
(see nfs4_close_otw_if_necessary) 311 */ 312 #define NFS4_CLOSE_OP 0x1 313 #define NFS4_DELMAP_OP 0x2 314 #define NFS4_INACTIVE_OP 0x3 315 316 #define ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO)) 317 318 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */ 319 #define ALIGN64(x, ptr, sz) \ 320 x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1); \ 321 if (x) { \ 322 x = sizeof (uint64_t) - (x); \ 323 sz -= (x); \ 324 ptr += (x); \ 325 } 326 327 #ifdef DEBUG 328 int nfs4_client_attr_debug = 0; 329 int nfs4_client_state_debug = 0; 330 int nfs4_client_shadow_debug = 0; 331 int nfs4_client_lock_debug = 0; 332 int nfs4_seqid_sync = 0; 333 int nfs4_client_map_debug = 0; 334 static int nfs4_pageio_debug = 0; 335 int nfs4_client_inactive_debug = 0; 336 int nfs4_client_recov_debug = 0; 337 int nfs4_client_failover_debug = 0; 338 int nfs4_client_call_debug = 0; 339 int nfs4_client_lookup_debug = 0; 340 int nfs4_client_zone_debug = 0; 341 int nfs4_lost_rqst_debug = 0; 342 int nfs4_rdattrerr_debug = 0; 343 int nfs4_open_stream_debug = 0; 344 345 int nfs4read_error_inject; 346 347 static int nfs4_create_misses = 0; 348 349 static int nfs4_readdir_cache_shorts = 0; 350 static int nfs4_readdir_readahead = 0; 351 352 static int nfs4_bio_do_stop = 0; 353 354 static int nfs4_lostpage = 0; /* number of times we lost original page */ 355 356 int nfs4_mmap_debug = 0; 357 358 static int nfs4_pathconf_cache_hits = 0; 359 static int nfs4_pathconf_cache_misses = 0; 360 361 int nfs4close_all_cnt; 362 int nfs4close_one_debug = 0; 363 int nfs4close_notw_debug = 0; 364 365 int denied_to_flk_debug = 0; 366 void *lockt_denied_debug; 367 368 #endif 369 370 /* 371 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT 372 * or NFS4ERR_RESOURCE. 373 */ 374 static int confirm_retry_sec = 30; 375 376 static int nfs4_lookup_neg_cache = 1; 377 378 /* 379 * number of pages to read ahead 380 * optimized for 100 base-T. 
381 */ 382 static int nfs4_nra = 4; 383 384 static int nfs4_do_symlink_cache = 1; 385 386 static int nfs4_pathconf_disable_cache = 0; 387 388 /* 389 * These are the vnode ops routines which implement the vnode interface to 390 * the networked file system. These routines just take their parameters, 391 * make them look networkish by putting the right info into interface structs, 392 * and then calling the appropriate remote routine(s) to do the work. 393 * 394 * Note on directory name lookup cacheing: If we detect a stale fhandle, 395 * we purge the directory cache relative to that vnode. This way, the 396 * user won't get burned by the cache repeatedly. See <nfs/rnode4.h> for 397 * more details on rnode locking. 398 */ 399 400 struct vnodeops *nfs4_vnodeops; 401 402 const fs_operation_def_t nfs4_vnodeops_template[] = { 403 VOPNAME_OPEN, { .vop_open = nfs4_open }, 404 VOPNAME_CLOSE, { .vop_close = nfs4_close }, 405 VOPNAME_READ, { .vop_read = nfs4_read }, 406 VOPNAME_WRITE, { .vop_write = nfs4_write }, 407 VOPNAME_IOCTL, { .vop_ioctl = nfs4_ioctl }, 408 VOPNAME_GETATTR, { .vop_getattr = nfs4_getattr }, 409 VOPNAME_SETATTR, { .vop_setattr = nfs4_setattr }, 410 VOPNAME_ACCESS, { .vop_access = nfs4_access }, 411 VOPNAME_LOOKUP, { .vop_lookup = nfs4_lookup }, 412 VOPNAME_CREATE, { .vop_create = nfs4_create }, 413 VOPNAME_REMOVE, { .vop_remove = nfs4_remove }, 414 VOPNAME_LINK, { .vop_link = nfs4_link }, 415 VOPNAME_RENAME, { .vop_rename = nfs4_rename }, 416 VOPNAME_MKDIR, { .vop_mkdir = nfs4_mkdir }, 417 VOPNAME_RMDIR, { .vop_rmdir = nfs4_rmdir }, 418 VOPNAME_READDIR, { .vop_readdir = nfs4_readdir }, 419 VOPNAME_SYMLINK, { .vop_symlink = nfs4_symlink }, 420 VOPNAME_READLINK, { .vop_readlink = nfs4_readlink }, 421 VOPNAME_FSYNC, { .vop_fsync = nfs4_fsync }, 422 VOPNAME_INACTIVE, { .vop_inactive = nfs4_inactive }, 423 VOPNAME_FID, { .vop_fid = nfs4_fid }, 424 VOPNAME_RWLOCK, { .vop_rwlock = nfs4_rwlock }, 425 VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs4_rwunlock }, 426 
VOPNAME_SEEK, { .vop_seek = nfs4_seek }, 427 VOPNAME_FRLOCK, { .vop_frlock = nfs4_frlock }, 428 VOPNAME_SPACE, { .vop_space = nfs4_space }, 429 VOPNAME_REALVP, { .vop_realvp = nfs4_realvp }, 430 VOPNAME_GETPAGE, { .vop_getpage = nfs4_getpage }, 431 VOPNAME_PUTPAGE, { .vop_putpage = nfs4_putpage }, 432 VOPNAME_MAP, { .vop_map = nfs4_map }, 433 VOPNAME_ADDMAP, { .vop_addmap = nfs4_addmap }, 434 VOPNAME_DELMAP, { .vop_delmap = nfs4_delmap }, 435 /* no separate nfs4_dump */ 436 VOPNAME_DUMP, { .vop_dump = nfs_dump }, 437 VOPNAME_PATHCONF, { .vop_pathconf = nfs4_pathconf }, 438 VOPNAME_PAGEIO, { .vop_pageio = nfs4_pageio }, 439 VOPNAME_DISPOSE, { .vop_dispose = nfs4_dispose }, 440 VOPNAME_SETSECATTR, { .vop_setsecattr = nfs4_setsecattr }, 441 VOPNAME_GETSECATTR, { .vop_getsecattr = nfs4_getsecattr }, 442 VOPNAME_SHRLOCK, { .vop_shrlock = nfs4_shrlock }, 443 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 444 NULL, NULL 445 }; 446 447 /* 448 * The following are subroutines and definitions to set args or get res 449 * for the different nfsv4 ops 450 */ 451 452 void 453 nfs4args_lookup_free(nfs_argop4 *argop, int arglen) 454 { 455 int i; 456 457 for (i = 0; i < arglen; i++) { 458 if (argop[i].argop == OP_LOOKUP) { 459 kmem_free( 460 argop[i].nfs_argop4_u.oplookup. 461 objname.utf8string_val, 462 argop[i].nfs_argop4_u.oplookup. 
463 objname.utf8string_len); 464 } 465 } 466 } 467 468 static void 469 nfs4args_lock_free(nfs_argop4 *argop) 470 { 471 locker4 *locker = &argop->nfs_argop4_u.oplock.locker; 472 473 if (locker->new_lock_owner == TRUE) { 474 open_to_lock_owner4 *open_owner; 475 476 open_owner = &locker->locker4_u.open_owner; 477 if (open_owner->lock_owner.owner_val != NULL) { 478 kmem_free(open_owner->lock_owner.owner_val, 479 open_owner->lock_owner.owner_len); 480 } 481 } 482 } 483 484 static void 485 nfs4args_lockt_free(nfs_argop4 *argop) 486 { 487 lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner; 488 489 if (lowner->owner_val != NULL) { 490 kmem_free(lowner->owner_val, lowner->owner_len); 491 } 492 } 493 494 static void 495 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags, 496 rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error, 497 nfs4_stateid_types_t *sid_types) 498 { 499 fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes; 500 mntinfo4_t *mi; 501 502 argop->argop = OP_SETATTR; 503 /* 504 * The stateid is set to 0 if client is not modifying the size 505 * and otherwise to whatever nfs4_get_stateid() returns. 506 * 507 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no 508 * state struct could be found for the process/file pair. We may 509 * want to change this in the future (by OPENing the file). See 510 * bug # 4474852. 
511 */ 512 if (vap->va_mask & AT_SIZE) { 513 514 ASSERT(rp != NULL); 515 mi = VTOMI4(RTOV4(rp)); 516 517 argop->nfs_argop4_u.opsetattr.stateid = 518 nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 519 OP_SETATTR, sid_types, FALSE); 520 } else { 521 bzero(&argop->nfs_argop4_u.opsetattr.stateid, 522 sizeof (stateid4)); 523 } 524 525 *error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp); 526 if (*error) 527 bzero(attr, sizeof (*attr)); 528 } 529 530 static void 531 nfs4args_setattr_free(nfs_argop4 *argop) 532 { 533 nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes); 534 } 535 536 static int 537 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op, 538 bitmap4 supp) 539 { 540 fattr4 *attr; 541 int error = 0; 542 543 argop->argop = op; 544 switch (op) { 545 case OP_VERIFY: 546 attr = &argop->nfs_argop4_u.opverify.obj_attributes; 547 break; 548 case OP_NVERIFY: 549 attr = &argop->nfs_argop4_u.opnverify.obj_attributes; 550 break; 551 default: 552 return (EINVAL); 553 } 554 if (!error) 555 error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp); 556 if (error) 557 bzero(attr, sizeof (*attr)); 558 return (error); 559 } 560 561 static void 562 nfs4args_verify_free(nfs_argop4 *argop) 563 { 564 switch (argop->argop) { 565 case OP_VERIFY: 566 nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes); 567 break; 568 case OP_NVERIFY: 569 nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes); 570 break; 571 default: 572 break; 573 } 574 } 575 576 static void 577 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr, 578 WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp) 579 { 580 WRITE4args *wargs = &argop->nfs_argop4_u.opwrite; 581 mntinfo4_t *mi = VTOMI4(RTOV4(rp)); 582 583 argop->argop = OP_WRITE; 584 wargs->stable = stable; 585 wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id, 586 mi, OP_WRITE, sid_tp); 587 wargs->mblk = NULL; 588 *wargs_pp = wargs; 589 } 590 591 void 592 
nfs4args_copen_free(OPEN4cargs *open_args) 593 { 594 if (open_args->owner.owner_val) { 595 kmem_free(open_args->owner.owner_val, 596 open_args->owner.owner_len); 597 } 598 if ((open_args->opentype == OPEN4_CREATE) && 599 (open_args->mode != EXCLUSIVE4)) { 600 nfs4_fattr4_free(&open_args->createhow4_u.createattrs); 601 } 602 } 603 604 /* 605 * XXX: This is referenced in modstubs.s 606 */ 607 struct vnodeops * 608 nfs4_getvnodeops(void) 609 { 610 return (nfs4_vnodeops); 611 } 612 613 /* 614 * The OPEN operation opens a regular file. 615 */ 616 /*ARGSUSED3*/ 617 static int 618 nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 619 { 620 vnode_t *dvp = NULL; 621 rnode4_t *rp, *drp; 622 int error; 623 int just_been_created; 624 char fn[MAXNAMELEN]; 625 626 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: ")); 627 if (nfs_zone() != VTOMI4(*vpp)->mi_zone) 628 return (EIO); 629 rp = VTOR4(*vpp); 630 631 /* 632 * Check to see if opening something besides a regular file; 633 * if so skip the OTW call 634 */ 635 if ((*vpp)->v_type != VREG) { 636 error = nfs4_open_non_reg_file(vpp, flag, cr); 637 return (error); 638 } 639 640 /* 641 * XXX - would like a check right here to know if the file is 642 * executable or not, so as to skip OTW 643 */ 644 645 if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0) 646 return (error); 647 648 drp = VTOR4(dvp); 649 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 650 return (EINTR); 651 652 if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) { 653 nfs_rw_exit(&drp->r_rwlock); 654 return (error); 655 } 656 657 /* 658 * See if this file has just been CREATEd. 659 * If so, clear the flag and update the dnlc, which was previously 660 * skipped in nfs4_create. 661 * XXX need better serilization on this. 662 * XXX move this into the nf4open_otw call, after we have 663 * XXX acquired the open owner seqid sync. 
664 */ 665 mutex_enter(&rp->r_statev4_lock); 666 if (rp->created_v4) { 667 rp->created_v4 = 0; 668 mutex_exit(&rp->r_statev4_lock); 669 670 dnlc_update(dvp, fn, *vpp); 671 /* This is needed so we don't bump the open ref count */ 672 just_been_created = 1; 673 } else { 674 mutex_exit(&rp->r_statev4_lock); 675 just_been_created = 0; 676 } 677 678 /* 679 * If caller specified O_TRUNC/FTRUNC, then be sure to set 680 * FWRITE (to drive successful setattr(size=0) after open) 681 */ 682 if (flag & FTRUNC) 683 flag |= FWRITE; 684 685 error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0, 686 just_been_created); 687 688 if (!error && !((*vpp)->v_flag & VROOT)) 689 dnlc_update(dvp, fn, *vpp); 690 691 nfs_rw_exit(&drp->r_rwlock); 692 693 /* release the hold from vtodv */ 694 VN_RELE(dvp); 695 696 /* exchange the shadow for the master vnode, if needed */ 697 698 if (error == 0 && IS_SHADOW(*vpp, rp)) 699 sv_exchange(vpp); 700 701 return (error); 702 } 703 704 /* 705 * See if there's a "lost open" request to be saved and recovered. 706 */ 707 static void 708 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 709 nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp, 710 vnode_t *dvp, OPEN4cargs *open_args) 711 { 712 vfs_t *vfsp; 713 char *srccfp; 714 715 vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp); 716 717 if (error != ETIMEDOUT && error != EINTR && 718 !NFS4_FRC_UNMT_ERR(error, vfsp)) { 719 lost_rqstp->lr_op = 0; 720 return; 721 } 722 723 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 724 "nfs4open_save_lost_rqst: error %d", error)); 725 726 lost_rqstp->lr_op = OP_OPEN; 727 728 /* 729 * The vp (if it is not NULL) and dvp are held and rele'd via 730 * the recovery code. See nfs4_save_lost_rqst. 
731 */ 732 lost_rqstp->lr_vp = vp; 733 lost_rqstp->lr_dvp = dvp; 734 lost_rqstp->lr_oop = oop; 735 lost_rqstp->lr_osp = NULL; 736 lost_rqstp->lr_lop = NULL; 737 lost_rqstp->lr_cr = cr; 738 lost_rqstp->lr_flk = NULL; 739 lost_rqstp->lr_oacc = open_args->share_access; 740 lost_rqstp->lr_odeny = open_args->share_deny; 741 lost_rqstp->lr_oclaim = open_args->claim; 742 if (open_args->claim == CLAIM_DELEGATE_CUR) { 743 lost_rqstp->lr_ostateid = 744 open_args->open_claim4_u.delegate_cur_info.delegate_stateid; 745 srccfp = open_args->open_claim4_u.delegate_cur_info.cfile; 746 } else { 747 srccfp = open_args->open_claim4_u.cfile; 748 } 749 lost_rqstp->lr_ofile.utf8string_len = 0; 750 lost_rqstp->lr_ofile.utf8string_val = NULL; 751 (void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile); 752 lost_rqstp->lr_putfirst = FALSE; 753 } 754 755 struct nfs4_excl_time { 756 uint32 seconds; 757 uint32 nseconds; 758 }; 759 760 /* 761 * The OPEN operation creates and/or opens a regular file 762 * 763 * ARGSUSED 764 */ 765 static int 766 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va, 767 vnode_t **vpp, cred_t *cr, int create_flag, int open_flag, 768 enum createmode4 createmode, int file_just_been_created) 769 { 770 rnode4_t *rp; 771 rnode4_t *drp = VTOR4(dvp); 772 vnode_t *vp = NULL; 773 vnode_t *vpi = *vpp; 774 bool_t needrecov = FALSE; 775 776 int doqueue = 1; 777 778 COMPOUND4args_clnt args; 779 COMPOUND4res_clnt res; 780 nfs_argop4 *argop; 781 nfs_resop4 *resop; 782 int argoplist_size; 783 int idx_open, idx_fattr; 784 785 GETFH4res *gf_res = NULL; 786 OPEN4res *op_res = NULL; 787 nfs4_ga_res_t *garp; 788 fattr4 *attr = NULL; 789 struct nfs4_excl_time verf; 790 bool_t did_excl_setup = FALSE; 791 int created_osp; 792 793 OPEN4cargs *open_args; 794 nfs4_open_owner_t *oop = NULL; 795 nfs4_open_stream_t *osp = NULL; 796 seqid4 seqid = 0; 797 bool_t retry_open = FALSE; 798 nfs4_recov_state_t recov_state; 799 nfs4_lost_rqst_t lost_rqst; 800 nfs4_error_t e = { 0, NFS4_OK, 
RPC_SUCCESS }; 801 hrtime_t t; 802 int acc = 0; 803 cred_t *cred_otw = NULL; /* cred used to do the RPC call */ 804 cred_t *ncr = NULL; 805 806 nfs4_sharedfh_t *otw_sfh; 807 nfs4_sharedfh_t *orig_sfh; 808 int fh_differs = 0; 809 int numops, setgid_flag; 810 int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1; 811 812 /* 813 * Make sure we properly deal with setting the right gid on 814 * a newly created file to reflect the parent's setgid bit 815 */ 816 setgid_flag = 0; 817 if (create_flag && in_va) { 818 819 /* 820 * If there is grpid mount flag used or 821 * the parent's directory has the setgid bit set 822 * _and_ the client was able to get a valid mapping 823 * for the parent dir's owner_group, we want to 824 * append NVERIFY(owner_group == dva.va_gid) and 825 * SETATTR to the CREATE compound. 826 */ 827 mutex_enter(&drp->r_statelock); 828 if ((VTOMI4(dvp)->mi_flags & MI4_GRPID || 829 drp->r_attr.va_mode & VSGID) && 830 drp->r_attr.va_gid != GID_NOBODY) { 831 in_va->va_mask |= AT_GID; 832 in_va->va_gid = drp->r_attr.va_gid; 833 setgid_flag = 1; 834 } 835 mutex_exit(&drp->r_statelock); 836 } 837 838 /* 839 * Normal/non-create compound: 840 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) 841 * 842 * Open(create) compound no setgid: 843 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) + 844 * RESTOREFH + GETATTR 845 * 846 * Open(create) setgid: 847 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) + 848 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH + 849 * NVERIFY(grp) + SETATTR 850 */ 851 if (setgid_flag) { 852 numops = 10; 853 idx_open = 1; 854 idx_fattr = 3; 855 } else if (create_flag) { 856 numops = 7; 857 idx_open = 2; 858 idx_fattr = 4; 859 } else { 860 numops = 4; 861 idx_open = 1; 862 idx_fattr = 3; 863 } 864 865 args.array_len = numops; 866 argoplist_size = numops * sizeof (nfs_argop4); 867 argop = kmem_alloc(argoplist_size, KM_SLEEP); 868 869 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: " 870 "open %s open flag 0x%x cred 
%p", file_name, open_flag, 871 (void *)cr)); 872 873 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 874 if (create_flag) { 875 /* 876 * We are to create a file. Initialize the passed in vnode 877 * pointer. 878 */ 879 vpi = NULL; 880 } else { 881 /* 882 * Check to see if the client owns a read delegation and is 883 * trying to open for write. If so, then return the delegation 884 * to avoid the server doing a cb_recall and returning DELAY. 885 * NB - we don't use the statev4_lock here because we'd have 886 * to drop the lock anyway and the result would be stale. 887 */ 888 if ((open_flag & FWRITE) && 889 VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ) 890 (void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN); 891 892 /* 893 * If the file has a delegation, then do an access check up 894 * front. This avoids having to an access check later after 895 * we've already done start_op, which could deadlock. 896 */ 897 if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) { 898 if (open_flag & FREAD && 899 nfs4_access(vpi, VREAD, 0, cr, NULL) == 0) 900 acc |= VREAD; 901 if (open_flag & FWRITE && 902 nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0) 903 acc |= VWRITE; 904 } 905 } 906 907 drp = VTOR4(dvp); 908 909 recov_state.rs_flags = 0; 910 recov_state.rs_num_retry_despite_err = 0; 911 cred_otw = cr; 912 913 recov_retry: 914 fh_differs = 0; 915 nfs4_error_zinit(&e); 916 917 e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state); 918 if (e.error) { 919 if (ncr != NULL) 920 crfree(ncr); 921 kmem_free(argop, argoplist_size); 922 return (e.error); 923 } 924 925 args.ctag = TAG_OPEN; 926 args.array_len = numops; 927 args.array = argop; 928 929 /* putfh directory fh */ 930 argop[0].argop = OP_CPUTFH; 931 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 932 933 /* OPEN: either op 1 or op 2 depending upon create/setgid flags */ 934 argop[idx_open].argop = OP_COPEN; 935 open_args = &argop[idx_open].nfs_argop4_u.opcopen; 936 open_args->claim = CLAIM_NULL; 937 938 /* name of file */ 939 
open_args->open_claim4_u.cfile = file_name; 940 open_args->owner.owner_len = 0; 941 open_args->owner.owner_val = NULL; 942 943 if (create_flag) { 944 /* CREATE a file */ 945 open_args->opentype = OPEN4_CREATE; 946 open_args->mode = createmode; 947 if (createmode == EXCLUSIVE4) { 948 if (did_excl_setup == FALSE) { 949 verf.seconds = zone_get_hostid(NULL); 950 if (verf.seconds != 0) 951 verf.nseconds = newnum(); 952 else { 953 timestruc_t now; 954 955 gethrestime(&now); 956 verf.seconds = now.tv_sec; 957 verf.nseconds = now.tv_nsec; 958 } 959 /* 960 * Since the server will use this value for the 961 * mtime, make sure that it can't overflow. Zero 962 * out the MSB. The actual value does not matter 963 * here, only its uniqeness. 964 */ 965 verf.seconds &= INT32_MAX; 966 did_excl_setup = TRUE; 967 } 968 969 /* Now copy over verifier to OPEN4args. */ 970 open_args->createhow4_u.createverf = *(uint64_t *)&verf; 971 } else { 972 int v_error; 973 bitmap4 supp_attrs; 974 servinfo4_t *svp; 975 976 attr = &open_args->createhow4_u.createattrs; 977 978 svp = drp->r_server; 979 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 980 supp_attrs = svp->sv_supp_attrs; 981 nfs_rw_exit(&svp->sv_lock); 982 983 /* GUARDED4 or UNCHECKED4 */ 984 v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN, 985 supp_attrs); 986 if (v_error) { 987 bzero(attr, sizeof (*attr)); 988 nfs4args_copen_free(open_args); 989 nfs4_end_op(VTOMI4(dvp), dvp, vpi, 990 &recov_state, FALSE); 991 if (ncr != NULL) 992 crfree(ncr); 993 kmem_free(argop, argoplist_size); 994 return (v_error); 995 } 996 } 997 } else { 998 /* NO CREATE */ 999 open_args->opentype = OPEN4_NOCREATE; 1000 } 1001 1002 if (recov_state.rs_sp != NULL) { 1003 mutex_enter(&recov_state.rs_sp->s_lock); 1004 open_args->owner.clientid = recov_state.rs_sp->clientid; 1005 mutex_exit(&recov_state.rs_sp->s_lock); 1006 } else { 1007 /* XXX should we just fail here? 
*/ 1008 open_args->owner.clientid = 0; 1009 } 1010 1011 /* 1012 * This increments oop's ref count or creates a temporary 'just_created' 1013 * open owner that will become valid when this OPEN/OPEN_CONFIRM call 1014 * completes. 1015 */ 1016 mutex_enter(&VTOMI4(dvp)->mi_lock); 1017 1018 /* See if a permanent or just created open owner exists */ 1019 oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp)); 1020 if (!oop) { 1021 /* 1022 * This open owner does not exist so create a temporary 1023 * just created one. 1024 */ 1025 oop = create_open_owner(cr, VTOMI4(dvp)); 1026 ASSERT(oop != NULL); 1027 } 1028 mutex_exit(&VTOMI4(dvp)->mi_lock); 1029 1030 /* this length never changes, do alloc before seqid sync */ 1031 open_args->owner.owner_len = sizeof (oop->oo_name); 1032 open_args->owner.owner_val = 1033 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1034 1035 e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp)); 1036 if (e.error == EAGAIN) { 1037 open_owner_rele(oop); 1038 nfs4args_copen_free(open_args); 1039 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 1040 if (ncr != NULL) { 1041 crfree(ncr); 1042 ncr = NULL; 1043 } 1044 goto recov_retry; 1045 } 1046 1047 /* Check to see if we need to do the OTW call */ 1048 if (!create_flag) { 1049 if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi, 1050 file_just_been_created, &e.error, acc, &recov_state)) { 1051 1052 /* 1053 * The OTW open is not necessary. Either 1054 * the open can succeed without it (eg. 1055 * delegation, error == 0) or the open 1056 * must fail due to an access failure 1057 * (error != 0). In either case, tidy 1058 * up and return. 
1059 */ 1060 1061 nfs4_end_open_seqid_sync(oop); 1062 open_owner_rele(oop); 1063 nfs4args_copen_free(open_args); 1064 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE); 1065 if (ncr != NULL) 1066 crfree(ncr); 1067 kmem_free(argop, argoplist_size); 1068 return (e.error); 1069 } 1070 } 1071 1072 bcopy(&oop->oo_name, open_args->owner.owner_val, 1073 open_args->owner.owner_len); 1074 1075 seqid = nfs4_get_open_seqid(oop) + 1; 1076 open_args->seqid = seqid; 1077 open_args->share_access = 0; 1078 if (open_flag & FREAD) 1079 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1080 if (open_flag & FWRITE) 1081 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1082 open_args->share_deny = OPEN4_SHARE_DENY_NONE; 1083 1084 1085 1086 /* 1087 * getfh w/sanity check for idx_open/idx_fattr 1088 */ 1089 ASSERT((idx_open + 1) == (idx_fattr - 1)); 1090 argop[idx_open + 1].argop = OP_GETFH; 1091 1092 /* getattr */ 1093 argop[idx_fattr].argop = OP_GETATTR; 1094 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1095 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1096 1097 if (setgid_flag) { 1098 vattr_t _v; 1099 servinfo4_t *svp; 1100 bitmap4 supp_attrs; 1101 1102 svp = drp->r_server; 1103 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1104 supp_attrs = svp->sv_supp_attrs; 1105 nfs_rw_exit(&svp->sv_lock); 1106 1107 /* 1108 * For setgid case, we need to: 1109 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 1110 */ 1111 argop[4].argop = OP_SAVEFH; 1112 1113 argop[5].argop = OP_CPUTFH; 1114 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 1115 1116 argop[6].argop = OP_GETATTR; 1117 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1118 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1119 1120 argop[7].argop = OP_RESTOREFH; 1121 1122 /* 1123 * nverify 1124 */ 1125 _v.va_mask = AT_GID; 1126 _v.va_gid = in_va->va_gid; 1127 if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 1128 supp_attrs))) { 1129 1130 /* 
1131 * setattr 1132 * 1133 * We _know_ we're not messing with AT_SIZE or 1134 * AT_XTIME, so no need for stateid or flags. 1135 * Also we specify NULL rp since we're only 1136 * interested in setting owner_group attributes. 1137 */ 1138 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, 1139 supp_attrs, &e.error, 0); 1140 if (e.error) 1141 nfs4args_verify_free(&argop[8]); 1142 } 1143 1144 if (e.error) { 1145 /* 1146 * XXX - Revisit the last argument to nfs4_end_op() 1147 * once 5020486 is fixed. 1148 */ 1149 nfs4_end_open_seqid_sync(oop); 1150 open_owner_rele(oop); 1151 nfs4args_copen_free(open_args); 1152 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 1153 if (ncr != NULL) 1154 crfree(ncr); 1155 kmem_free(argop, argoplist_size); 1156 return (e.error); 1157 } 1158 } else if (create_flag) { 1159 argop[1].argop = OP_SAVEFH; 1160 1161 argop[5].argop = OP_RESTOREFH; 1162 1163 argop[6].argop = OP_GETATTR; 1164 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1165 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1166 } 1167 1168 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 1169 "nfs4open_otw: %s call, nm %s, rp %s", 1170 needrecov ? 
"recov" : "first", file_name, 1171 rnode4info(VTOR4(dvp)))); 1172 1173 t = gethrtime(); 1174 1175 rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e); 1176 1177 if (!e.error && nfs4_need_to_bump_seqid(&res)) 1178 nfs4_set_open_seqid(seqid, oop, args.ctag); 1179 1180 needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp); 1181 1182 if (e.error || needrecov) { 1183 bool_t abort = FALSE; 1184 1185 if (needrecov) { 1186 nfs4_bseqid_entry_t *bsep = NULL; 1187 1188 nfs4open_save_lost_rqst(e.error, &lost_rqst, oop, 1189 cred_otw, vpi, dvp, open_args); 1190 1191 if (!e.error && res.status == NFS4ERR_BAD_SEQID) { 1192 bsep = nfs4_create_bseqid_entry(oop, NULL, 1193 vpi, 0, args.ctag, open_args->seqid); 1194 num_bseqid_retry--; 1195 } 1196 1197 abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi, 1198 NULL, lost_rqst.lr_op == OP_OPEN ? 1199 &lost_rqst : NULL, OP_OPEN, bsep, NULL, NULL); 1200 1201 if (bsep) 1202 kmem_free(bsep, sizeof (*bsep)); 1203 /* give up if we keep getting BAD_SEQID */ 1204 if (num_bseqid_retry == 0) 1205 abort = TRUE; 1206 if (abort == TRUE && e.error == 0) 1207 e.error = geterrno4(res.status); 1208 } 1209 nfs4_end_open_seqid_sync(oop); 1210 open_owner_rele(oop); 1211 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1212 nfs4args_copen_free(open_args); 1213 if (setgid_flag) { 1214 nfs4args_verify_free(&argop[8]); 1215 nfs4args_setattr_free(&argop[9]); 1216 } 1217 if (!e.error) 1218 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1219 if (ncr != NULL) { 1220 crfree(ncr); 1221 ncr = NULL; 1222 } 1223 if (!needrecov || abort == TRUE || e.error == EINTR || 1224 NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) { 1225 kmem_free(argop, argoplist_size); 1226 return (e.error); 1227 } 1228 goto recov_retry; 1229 } 1230 1231 /* 1232 * Will check and update lease after checking the rflag for 1233 * OPEN_CONFIRM in the successful OPEN call. 
1234 */ 1235 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 1236 1237 /* 1238 * XXX what if we're crossing mount points from server1:/drp 1239 * to server2:/drp/rp. 1240 */ 1241 1242 /* Signal our end of use of the open seqid */ 1243 nfs4_end_open_seqid_sync(oop); 1244 1245 /* 1246 * This will destroy the open owner if it was just created, 1247 * and no one else has put a reference on it. 1248 */ 1249 open_owner_rele(oop); 1250 if (create_flag && (createmode != EXCLUSIVE4) && 1251 res.status == NFS4ERR_BADOWNER) 1252 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1253 1254 e.error = geterrno4(res.status); 1255 nfs4args_copen_free(open_args); 1256 if (setgid_flag) { 1257 nfs4args_verify_free(&argop[8]); 1258 nfs4args_setattr_free(&argop[9]); 1259 } 1260 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1261 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1262 /* 1263 * If the reply is NFS4ERR_ACCESS, it may be because 1264 * we are root (no root net access). If the real uid 1265 * is not root, then retry with the real uid instead. 
1266 */ 1267 if (ncr != NULL) { 1268 crfree(ncr); 1269 ncr = NULL; 1270 } 1271 if (res.status == NFS4ERR_ACCESS && 1272 (ncr = crnetadjust(cred_otw)) != NULL) { 1273 cred_otw = ncr; 1274 goto recov_retry; 1275 } 1276 kmem_free(argop, argoplist_size); 1277 return (e.error); 1278 } 1279 1280 resop = &res.array[idx_open]; /* open res */ 1281 op_res = &resop->nfs_resop4_u.opopen; 1282 1283 #ifdef DEBUG 1284 /* 1285 * verify attrset bitmap 1286 */ 1287 if (create_flag && 1288 (createmode == UNCHECKED4 || createmode == GUARDED4)) { 1289 /* make sure attrset returned is what we asked for */ 1290 /* XXX Ignore this 'error' for now */ 1291 if (attr->attrmask != op_res->attrset) 1292 /* EMPTY */; 1293 } 1294 #endif 1295 1296 if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) { 1297 mutex_enter(&VTOMI4(dvp)->mi_lock); 1298 VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK; 1299 mutex_exit(&VTOMI4(dvp)->mi_lock); 1300 } 1301 1302 resop = &res.array[idx_open + 1]; /* getfh res */ 1303 gf_res = &resop->nfs_resop4_u.opgetfh; 1304 1305 otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp)); 1306 1307 /* 1308 * The open stateid has been updated on the server but not 1309 * on the client yet. There is a path: makenfs4node->nfs4_attr_cache-> 1310 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW 1311 * WRITE call. That, however, will use the old stateid, so go ahead 1312 * and upate the open stateid now, before any call to makenfs4node. 1313 */ 1314 if (vpi) { 1315 nfs4_open_stream_t *tmp_osp; 1316 rnode4_t *tmp_rp = VTOR4(vpi); 1317 1318 tmp_osp = find_open_stream(oop, tmp_rp); 1319 if (tmp_osp) { 1320 tmp_osp->open_stateid = op_res->stateid; 1321 mutex_exit(&tmp_osp->os_sync_lock); 1322 open_stream_rele(tmp_osp, tmp_rp); 1323 } 1324 1325 /* 1326 * We must determine if the file handle given by the otw open 1327 * is the same as the file handle which was passed in with 1328 * *vpp. 
This case can be reached if the file we are trying 1329 * to open has been removed and another file has been created 1330 * having the same file name. The passed in vnode is released 1331 * later. 1332 */ 1333 orig_sfh = VTOR4(vpi)->r_fh; 1334 fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh); 1335 } 1336 1337 garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res; 1338 1339 if (create_flag || fh_differs) { 1340 int rnode_err = 0; 1341 1342 vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr, 1343 dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh)); 1344 1345 if (e.error) 1346 PURGE_ATTRCACHE4(vp); 1347 /* 1348 * For the newly created vp case, make sure the rnode 1349 * isn't bad before using it. 1350 */ 1351 mutex_enter(&(VTOR4(vp))->r_statelock); 1352 if (VTOR4(vp)->r_flags & R4RECOVERR) 1353 rnode_err = EIO; 1354 mutex_exit(&(VTOR4(vp))->r_statelock); 1355 1356 if (rnode_err) { 1357 nfs4_end_open_seqid_sync(oop); 1358 nfs4args_copen_free(open_args); 1359 if (setgid_flag) { 1360 nfs4args_verify_free(&argop[8]); 1361 nfs4args_setattr_free(&argop[9]); 1362 } 1363 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1364 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1365 needrecov); 1366 open_owner_rele(oop); 1367 VN_RELE(vp); 1368 if (ncr != NULL) 1369 crfree(ncr); 1370 sfh4_rele(&otw_sfh); 1371 kmem_free(argop, argoplist_size); 1372 return (EIO); 1373 } 1374 } else { 1375 vp = vpi; 1376 } 1377 sfh4_rele(&otw_sfh); 1378 1379 /* 1380 * It seems odd to get a full set of attrs and then not update 1381 * the object's attrcache in the non-create case. Create case uses 1382 * the attrs since makenfs4node checks to see if the attrs need to 1383 * be updated (and then updates them). The non-create case should 1384 * update attrs also. 1385 */ 1386 if (! create_flag && ! 
fh_differs && !e.error) { 1387 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 1388 } 1389 1390 nfs4_error_zinit(&e); 1391 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 1392 /* This does not do recovery for vp explicitly. */ 1393 nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE, 1394 &retry_open, oop, FALSE, &e, &num_bseqid_retry); 1395 1396 if (e.error || e.stat) { 1397 nfs4_end_open_seqid_sync(oop); 1398 nfs4args_copen_free(open_args); 1399 if (setgid_flag) { 1400 nfs4args_verify_free(&argop[8]); 1401 nfs4args_setattr_free(&argop[9]); 1402 } 1403 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1404 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1405 needrecov); 1406 open_owner_rele(oop); 1407 if (create_flag || fh_differs) { 1408 /* rele the makenfs4node */ 1409 VN_RELE(vp); 1410 } 1411 if (ncr != NULL) { 1412 crfree(ncr); 1413 ncr = NULL; 1414 } 1415 if (retry_open == TRUE) { 1416 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1417 "nfs4open_otw: retry the open since OPEN " 1418 "CONFIRM failed with error %d stat %d", 1419 e.error, e.stat)); 1420 if (create_flag && createmode == GUARDED4) { 1421 NFS4_DEBUG(nfs4_client_recov_debug, 1422 (CE_NOTE, "nfs4open_otw: switch " 1423 "createmode from GUARDED4 to " 1424 "UNCHECKED4")); 1425 createmode = UNCHECKED4; 1426 } 1427 goto recov_retry; 1428 } 1429 if (!e.error) { 1430 if (create_flag && (createmode != EXCLUSIVE4) && 1431 e.stat == NFS4ERR_BADOWNER) 1432 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1433 1434 e.error = geterrno4(e.stat); 1435 } 1436 kmem_free(argop, argoplist_size); 1437 return (e.error); 1438 } 1439 } 1440 1441 rp = VTOR4(vp); 1442 1443 mutex_enter(&rp->r_statev4_lock); 1444 if (create_flag) 1445 rp->created_v4 = 1; 1446 mutex_exit(&rp->r_statev4_lock); 1447 1448 mutex_enter(&oop->oo_lock); 1449 /* Doesn't matter if 'oo_just_created' already was set as this */ 1450 oop->oo_just_created = NFS4_PERM_CREATED; 1451 if (oop->oo_cred_otw) 1452 crfree(oop->oo_cred_otw); 1453 oop->oo_cred_otw = 
cred_otw; 1454 crhold(oop->oo_cred_otw); 1455 mutex_exit(&oop->oo_lock); 1456 1457 /* returns with 'os_sync_lock' held */ 1458 osp = find_or_create_open_stream(oop, rp, &created_osp); 1459 if (!osp) { 1460 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1461 "nfs4open_otw: failed to create an open stream")); 1462 NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: " 1463 "signal our end of use of the open seqid")); 1464 1465 nfs4_end_open_seqid_sync(oop); 1466 open_owner_rele(oop); 1467 nfs4args_copen_free(open_args); 1468 if (setgid_flag) { 1469 nfs4args_verify_free(&argop[8]); 1470 nfs4args_setattr_free(&argop[9]); 1471 } 1472 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1473 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1474 if (create_flag || fh_differs) 1475 VN_RELE(vp); 1476 if (ncr != NULL) 1477 crfree(ncr); 1478 1479 kmem_free(argop, argoplist_size); 1480 return (EINVAL); 1481 1482 } 1483 1484 osp->open_stateid = op_res->stateid; 1485 1486 if (open_flag & FREAD) 1487 osp->os_share_acc_read++; 1488 if (open_flag & FWRITE) 1489 osp->os_share_acc_write++; 1490 osp->os_share_deny_none++; 1491 1492 /* 1493 * Need to reset this bitfield for the possible case where we were 1494 * going to OTW CLOSE the file, got a non-recoverable error, and before 1495 * we could retry the CLOSE, OPENed the file again. 
1496 */ 1497 ASSERT(osp->os_open_owner->oo_seqid_inuse); 1498 osp->os_final_close = 0; 1499 osp->os_force_close = 0; 1500 #ifdef DEBUG 1501 if (osp->os_failed_reopen) 1502 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:" 1503 " clearing os_failed_reopen for osp %p, cr %p, rp %s", 1504 (void *)osp, (void *)cr, rnode4info(rp))); 1505 #endif 1506 osp->os_failed_reopen = 0; 1507 1508 mutex_exit(&osp->os_sync_lock); 1509 1510 nfs4_end_open_seqid_sync(oop); 1511 1512 if (created_osp && recov_state.rs_sp != NULL) { 1513 mutex_enter(&recov_state.rs_sp->s_lock); 1514 nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp)); 1515 mutex_exit(&recov_state.rs_sp->s_lock); 1516 } 1517 1518 /* get rid of our reference to find oop */ 1519 open_owner_rele(oop); 1520 1521 open_stream_rele(osp, rp); 1522 1523 /* accept delegation, if any */ 1524 nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw); 1525 1526 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1527 1528 if (createmode == EXCLUSIVE4 && 1529 (in_va->va_mask & ~(AT_GID | AT_SIZE))) { 1530 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:" 1531 " EXCLUSIVE4: sending a SETATTR")); 1532 /* 1533 * If doing an exclusive create, then generate 1534 * a SETATTR to set the initial attributes. 1535 * Try to set the mtime and the atime to the 1536 * server's current time. It is somewhat 1537 * expected that these fields will be used to 1538 * store the exclusive create cookie. If not, 1539 * server implementors will need to know that 1540 * a SETATTR will follow an exclusive create 1541 * and the cookie should be destroyed if 1542 * appropriate. 1543 * 1544 * The AT_GID and AT_SIZE bits are turned off 1545 * so that the SETATTR request will not attempt 1546 * to process these. The gid will be set 1547 * separately if appropriate. 
The size is turned 1548 * off because it is assumed that a new file will 1549 * be created empty and if the file wasn't empty, 1550 * then the exclusive create will have failed 1551 * because the file must have existed already. 1552 * Therefore, no truncate operation is needed. 1553 */ 1554 in_va->va_mask &= ~(AT_GID | AT_SIZE); 1555 in_va->va_mask |= (AT_MTIME | AT_ATIME); 1556 1557 e.error = nfs4setattr(vp, in_va, 0, cr, NULL); 1558 if (e.error) { 1559 /* 1560 * Couldn't correct the attributes of 1561 * the newly created file and the 1562 * attributes are wrong. Remove the 1563 * file and return an error to the 1564 * application. 1565 */ 1566 /* XXX will this take care of client state ? */ 1567 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1568 "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:" 1569 " remove file", e.error)); 1570 VN_RELE(vp); 1571 (void) nfs4_remove(dvp, file_name, cr, NULL, 0); 1572 /* 1573 * Since we've reled the vnode and removed 1574 * the file we now need to return the error. 1575 * At this point we don't want to update the 1576 * dircaches, call nfs4_waitfor_purge_complete 1577 * or set vpp to vp so we need to skip these 1578 * as well. 1579 */ 1580 goto skip_update_dircaches; 1581 } 1582 } 1583 1584 /* 1585 * If we created or found the correct vnode, due to create_flag or 1586 * fh_differs being set, then update directory cache attribute, readdir 1587 * and dnlc caches. 1588 */ 1589 if (create_flag || fh_differs) { 1590 dirattr_info_t dinfo, *dinfop; 1591 1592 /* 1593 * Make sure getattr succeeded before using results. 1594 * note: op 7 is getattr(dir) for both flavors of 1595 * open(create). 
1596 */ 1597 if (create_flag && res.status == NFS4_OK) { 1598 dinfo.di_time_call = t; 1599 dinfo.di_cred = cr; 1600 dinfo.di_garp = 1601 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 1602 dinfop = &dinfo; 1603 } else { 1604 dinfop = NULL; 1605 } 1606 1607 nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name, 1608 dinfop); 1609 } 1610 1611 /* 1612 * If the page cache for this file was flushed from actions 1613 * above, it was done asynchronously and if that is true, 1614 * there is a need to wait here for it to complete. This must 1615 * be done outside of start_fop/end_fop. 1616 */ 1617 (void) nfs4_waitfor_purge_complete(vp); 1618 1619 /* 1620 * It is implicit that we are in the open case (create_flag == 0) since 1621 * fh_differs can only be set to a non-zero value in the open case. 1622 */ 1623 if (fh_differs != 0 && vpi != NULL) 1624 VN_RELE(vpi); 1625 1626 /* 1627 * Be sure to set *vpp to the correct value before returning. 1628 */ 1629 *vpp = vp; 1630 1631 skip_update_dircaches: 1632 1633 nfs4args_copen_free(open_args); 1634 if (setgid_flag) { 1635 nfs4args_verify_free(&argop[8]); 1636 nfs4args_setattr_free(&argop[9]); 1637 } 1638 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1639 1640 if (ncr) 1641 crfree(ncr); 1642 kmem_free(argop, argoplist_size); 1643 return (e.error); 1644 } 1645 1646 /* 1647 * Reopen an open instance. cf. nfs4open_otw(). 1648 * 1649 * Errors are returned by the nfs4_error_t parameter. 1650 * - ep->error contains an errno value or zero. 1651 * - if it is zero, ep->stat is set to an NFS status code, if any. 1652 * If the file could not be reopened, but the caller should continue, the 1653 * file is marked dead and no error values are returned. If the caller 1654 * should stop recovering open files and start over, either the ep->error 1655 * value or ep->stat will indicate an error (either something that requires 1656 * recovery or EAGAIN). 
Note that some recovery (e.g., expired volatile 1657 * filehandles) may be handled silently by this routine. 1658 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state 1659 * will be started, so the caller should not do it. 1660 * 1661 * Gotos: 1662 * - kill_file : reopen failed in such a fashion to constitute marking the 1663 * file dead and setting the open stream's 'os_failed_reopen' as 1. This 1664 * is for cases where recovery is not possible. 1665 * - failed_reopen : same as above, except that the file has already been 1666 * marked dead, so no need to do it again. 1667 * - bailout : reopen failed but we are able to recover and retry the reopen - 1668 * either within this function immediately or via the calling function. 1669 */ 1670 1671 void 1672 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep, 1673 open_claim_type4 claim, bool_t frc_use_claim_previous, 1674 bool_t is_recov) 1675 { 1676 COMPOUND4args_clnt args; 1677 COMPOUND4res_clnt res; 1678 nfs_argop4 argop[4]; 1679 nfs_resop4 *resop; 1680 OPEN4res *op_res = NULL; 1681 OPEN4cargs *open_args; 1682 GETFH4res *gf_res; 1683 rnode4_t *rp = VTOR4(vp); 1684 int doqueue = 1; 1685 cred_t *cr = NULL, *cred_otw = NULL; 1686 nfs4_open_owner_t *oop = NULL; 1687 seqid4 seqid; 1688 nfs4_ga_res_t *garp; 1689 char fn[MAXNAMELEN]; 1690 nfs4_recov_state_t recov = {NULL, 0}; 1691 nfs4_lost_rqst_t lost_rqst; 1692 mntinfo4_t *mi = VTOMI4(vp); 1693 bool_t abort; 1694 char *failed_msg = ""; 1695 int fh_different; 1696 hrtime_t t; 1697 nfs4_bseqid_entry_t *bsep = NULL; 1698 1699 ASSERT(nfs4_consistent_type(vp)); 1700 ASSERT(nfs_zone() == mi->mi_zone); 1701 1702 nfs4_error_zinit(ep); 1703 1704 /* this is the cred used to find the open owner */ 1705 cr = state_to_cred(osp); 1706 if (cr == NULL) { 1707 failed_msg = "Couldn't reopen: no cred"; 1708 goto kill_file; 1709 } 1710 /* use this cred for OTW operations */ 1711 cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner); 1712 1713 top: 
1714 nfs4_error_zinit(ep); 1715 1716 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) { 1717 /* File system has been unmounted, quit */ 1718 ep->error = EIO; 1719 failed_msg = "Couldn't reopen: file system has been unmounted"; 1720 goto kill_file; 1721 } 1722 1723 oop = osp->os_open_owner; 1724 1725 ASSERT(oop != NULL); 1726 if (oop == NULL) { /* be defensive in non-DEBUG */ 1727 failed_msg = "can't reopen: no open owner"; 1728 goto kill_file; 1729 } 1730 open_owner_hold(oop); 1731 1732 ep->error = nfs4_start_open_seqid_sync(oop, mi); 1733 if (ep->error) { 1734 open_owner_rele(oop); 1735 oop = NULL; 1736 goto bailout; 1737 } 1738 1739 /* 1740 * If the rnode has a delegation and the delegation has been 1741 * recovered and the server didn't request a recall and the caller 1742 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during 1743 * recovery) and the rnode hasn't been marked dead, then install 1744 * the delegation stateid in the open stream. Otherwise, proceed 1745 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN. 1746 */ 1747 mutex_enter(&rp->r_statev4_lock); 1748 if (rp->r_deleg_type != OPEN_DELEGATE_NONE && 1749 !rp->r_deleg_return_pending && 1750 (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) && 1751 !rp->r_deleg_needs_recall && 1752 claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous && 1753 !(rp->r_flags & R4RECOVERR)) { 1754 mutex_enter(&osp->os_sync_lock); 1755 osp->os_delegation = 1; 1756 osp->open_stateid = rp->r_deleg_stateid; 1757 mutex_exit(&osp->os_sync_lock); 1758 mutex_exit(&rp->r_statev4_lock); 1759 goto bailout; 1760 } 1761 mutex_exit(&rp->r_statev4_lock); 1762 1763 /* 1764 * If the file failed recovery, just quit. This failure need not 1765 * affect other reopens, so don't return an error. 
1766 */ 1767 mutex_enter(&rp->r_statelock); 1768 if (rp->r_flags & R4RECOVERR) { 1769 mutex_exit(&rp->r_statelock); 1770 ep->error = 0; 1771 goto failed_reopen; 1772 } 1773 mutex_exit(&rp->r_statelock); 1774 1775 /* 1776 * argop is empty here 1777 * 1778 * PUTFH, OPEN, GETATTR 1779 */ 1780 args.ctag = TAG_REOPEN; 1781 args.array_len = 4; 1782 args.array = argop; 1783 1784 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 1785 "nfs4_reopen: file is type %d, id %s", 1786 vp->v_type, rnode4info(VTOR4(vp)))); 1787 1788 argop[0].argop = OP_CPUTFH; 1789 1790 if (claim != CLAIM_PREVIOUS) { 1791 /* 1792 * if this is a file mount then 1793 * use the mntinfo parentfh 1794 */ 1795 argop[0].nfs_argop4_u.opcputfh.sfh = 1796 (vp->v_flag & VROOT) ? mi->mi_srvparentfh : 1797 VTOSV(vp)->sv_dfh; 1798 } else { 1799 /* putfh fh to reopen */ 1800 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 1801 } 1802 1803 argop[1].argop = OP_COPEN; 1804 open_args = &argop[1].nfs_argop4_u.opcopen; 1805 open_args->claim = claim; 1806 1807 if (claim == CLAIM_NULL) { 1808 1809 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1810 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1811 "failed for vp 0x%p for CLAIM_NULL with %m", 1812 (void *)vp); 1813 failed_msg = "Couldn't reopen: vtoname failed for " 1814 "CLAIM_NULL"; 1815 /* nothing allocated yet */ 1816 goto kill_file; 1817 } 1818 1819 open_args->open_claim4_u.cfile = fn; 1820 } else if (claim == CLAIM_PREVIOUS) { 1821 1822 /* 1823 * We have two cases to deal with here: 1824 * 1) We're being called to reopen files in order to satisfy 1825 * a lock operation request which requires us to explicitly 1826 * reopen files which were opened under a delegation. If 1827 * we're in recovery, we *must* use CLAIM_PREVIOUS. In 1828 * that case, frc_use_claim_previous is TRUE and we must 1829 * use the rnode's current delegation type (r_deleg_type). 1830 * 2) We're reopening files during some form of recovery. 
1831 * In this case, frc_use_claim_previous is FALSE and we 1832 * use the delegation type appropriate for recovery 1833 * (r_deleg_needs_recovery). 1834 */ 1835 mutex_enter(&rp->r_statev4_lock); 1836 open_args->open_claim4_u.delegate_type = 1837 frc_use_claim_previous ? 1838 rp->r_deleg_type : 1839 rp->r_deleg_needs_recovery; 1840 mutex_exit(&rp->r_statev4_lock); 1841 1842 } else if (claim == CLAIM_DELEGATE_CUR) { 1843 1844 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1845 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1846 "failed for vp 0x%p for CLAIM_DELEGATE_CUR " 1847 "with %m", (void *)vp); 1848 failed_msg = "Couldn't reopen: vtoname failed for " 1849 "CLAIM_DELEGATE_CUR"; 1850 /* nothing allocated yet */ 1851 goto kill_file; 1852 } 1853 1854 mutex_enter(&rp->r_statev4_lock); 1855 open_args->open_claim4_u.delegate_cur_info.delegate_stateid = 1856 rp->r_deleg_stateid; 1857 mutex_exit(&rp->r_statev4_lock); 1858 1859 open_args->open_claim4_u.delegate_cur_info.cfile = fn; 1860 } 1861 open_args->opentype = OPEN4_NOCREATE; 1862 open_args->owner.clientid = mi2clientid(mi); 1863 open_args->owner.owner_len = sizeof (oop->oo_name); 1864 open_args->owner.owner_val = 1865 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1866 bcopy(&oop->oo_name, open_args->owner.owner_val, 1867 open_args->owner.owner_len); 1868 open_args->share_access = 0; 1869 open_args->share_deny = 0; 1870 1871 mutex_enter(&osp->os_sync_lock); 1872 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp " 1873 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: " 1874 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ", 1875 (void *)osp, (void *)rp, osp->os_share_acc_read, 1876 osp->os_share_acc_write, osp->os_open_ref_count, 1877 osp->os_mmap_read, osp->os_mmap_write, claim)); 1878 1879 if (osp->os_share_acc_read || osp->os_mmap_read) 1880 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1881 if (osp->os_share_acc_write || osp->os_mmap_write) 1882 
open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1883 if (osp->os_share_deny_read) 1884 open_args->share_deny |= OPEN4_SHARE_DENY_READ; 1885 if (osp->os_share_deny_write) 1886 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE; 1887 mutex_exit(&osp->os_sync_lock); 1888 1889 seqid = nfs4_get_open_seqid(oop) + 1; 1890 open_args->seqid = seqid; 1891 1892 /* Construct the getfh part of the compound */ 1893 argop[2].argop = OP_GETFH; 1894 1895 /* Construct the getattr part of the compound */ 1896 argop[3].argop = OP_GETATTR; 1897 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1898 argop[3].nfs_argop4_u.opgetattr.mi = mi; 1899 1900 t = gethrtime(); 1901 1902 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 1903 1904 if (ep->error) { 1905 if (!is_recov && !frc_use_claim_previous && 1906 (ep->error == EINTR || ep->error == ETIMEDOUT || 1907 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) { 1908 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop, 1909 cred_otw, vp, NULL, open_args); 1910 abort = nfs4_start_recovery(ep, 1911 VTOMI4(vp), vp, NULL, NULL, 1912 lost_rqst.lr_op == OP_OPEN ? 
1913 &lost_rqst : NULL, OP_OPEN, NULL, NULL, NULL); 1914 nfs4args_copen_free(open_args); 1915 goto bailout; 1916 } 1917 1918 nfs4args_copen_free(open_args); 1919 1920 if (ep->error == EACCES && cred_otw != cr) { 1921 crfree(cred_otw); 1922 cred_otw = cr; 1923 crhold(cred_otw); 1924 nfs4_end_open_seqid_sync(oop); 1925 open_owner_rele(oop); 1926 oop = NULL; 1927 goto top; 1928 } 1929 if (ep->error == ETIMEDOUT) 1930 goto bailout; 1931 failed_msg = "Couldn't reopen: rpc error"; 1932 goto kill_file; 1933 } 1934 1935 if (nfs4_need_to_bump_seqid(&res)) 1936 nfs4_set_open_seqid(seqid, oop, args.ctag); 1937 1938 switch (res.status) { 1939 case NFS4_OK: 1940 if (recov.rs_flags & NFS4_RS_DELAY_MSG) { 1941 mutex_enter(&rp->r_statelock); 1942 rp->r_delay_interval = 0; 1943 mutex_exit(&rp->r_statelock); 1944 } 1945 break; 1946 case NFS4ERR_BAD_SEQID: 1947 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0, 1948 args.ctag, open_args->seqid); 1949 1950 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, 1951 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst : 1952 NULL, OP_OPEN, bsep, NULL, NULL); 1953 1954 nfs4args_copen_free(open_args); 1955 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1956 nfs4_end_open_seqid_sync(oop); 1957 open_owner_rele(oop); 1958 oop = NULL; 1959 kmem_free(bsep, sizeof (*bsep)); 1960 1961 goto kill_file; 1962 case NFS4ERR_NO_GRACE: 1963 nfs4args_copen_free(open_args); 1964 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1965 nfs4_end_open_seqid_sync(oop); 1966 open_owner_rele(oop); 1967 oop = NULL; 1968 if (claim == CLAIM_PREVIOUS) { 1969 /* 1970 * Retry as a plain open. We don't need to worry about 1971 * checking the changeinfo: it is acceptable for a 1972 * client to re-open a file and continue processing 1973 * (in the absence of locks). 
1974 */ 1975 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1976 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; " 1977 "will retry as CLAIM_NULL")); 1978 claim = CLAIM_NULL; 1979 nfs4_mi_kstat_inc_no_grace(mi); 1980 goto top; 1981 } 1982 failed_msg = 1983 "Couldn't reopen: tried reclaim outside grace period. "; 1984 goto kill_file; 1985 case NFS4ERR_GRACE: 1986 nfs4_set_grace_wait(mi); 1987 nfs4args_copen_free(open_args); 1988 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1989 nfs4_end_open_seqid_sync(oop); 1990 open_owner_rele(oop); 1991 oop = NULL; 1992 ep->error = nfs4_wait_for_grace(mi, &recov); 1993 if (ep->error != 0) 1994 goto bailout; 1995 goto top; 1996 case NFS4ERR_DELAY: 1997 nfs4_set_delay_wait(vp); 1998 nfs4args_copen_free(open_args); 1999 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2000 nfs4_end_open_seqid_sync(oop); 2001 open_owner_rele(oop); 2002 oop = NULL; 2003 ep->error = nfs4_wait_for_delay(vp, &recov); 2004 nfs4_mi_kstat_inc_delay(mi); 2005 if (ep->error != 0) 2006 goto bailout; 2007 goto top; 2008 case NFS4ERR_FHEXPIRED: 2009 /* recover filehandle and retry */ 2010 abort = nfs4_start_recovery(ep, 2011 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL, NULL, NULL); 2012 nfs4args_copen_free(open_args); 2013 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2014 nfs4_end_open_seqid_sync(oop); 2015 open_owner_rele(oop); 2016 oop = NULL; 2017 if (abort == FALSE) 2018 goto top; 2019 failed_msg = "Couldn't reopen: recovery aborted"; 2020 goto kill_file; 2021 case NFS4ERR_RESOURCE: 2022 case NFS4ERR_STALE_CLIENTID: 2023 case NFS4ERR_WRONGSEC: 2024 case NFS4ERR_EXPIRED: 2025 /* 2026 * Do not mark the file dead and let the calling 2027 * function initiate recovery. 
2028 */ 2029 nfs4args_copen_free(open_args); 2030 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2031 nfs4_end_open_seqid_sync(oop); 2032 open_owner_rele(oop); 2033 oop = NULL; 2034 goto bailout; 2035 case NFS4ERR_ACCESS: 2036 if (cred_otw != cr) { 2037 crfree(cred_otw); 2038 cred_otw = cr; 2039 crhold(cred_otw); 2040 nfs4args_copen_free(open_args); 2041 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2042 nfs4_end_open_seqid_sync(oop); 2043 open_owner_rele(oop); 2044 oop = NULL; 2045 goto top; 2046 } 2047 /* fall through */ 2048 default: 2049 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 2050 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s", 2051 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv, 2052 rnode4info(VTOR4(vp)))); 2053 failed_msg = "Couldn't reopen: NFSv4 error"; 2054 nfs4args_copen_free(open_args); 2055 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2056 goto kill_file; 2057 } 2058 2059 resop = &res.array[1]; /* open res */ 2060 op_res = &resop->nfs_resop4_u.opopen; 2061 2062 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res; 2063 2064 /* 2065 * Check if the path we reopened really is the same 2066 * file. We could end up in a situation where the file 2067 * was removed and a new file created with the same name. 
2068 */ 2069 resop = &res.array[2]; 2070 gf_res = &resop->nfs_resop4_u.opgetfh; 2071 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 2072 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0); 2073 if (fh_different) { 2074 if (mi->mi_fh_expire_type == FH4_PERSISTENT || 2075 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) { 2076 /* Oops, we don't have the same file */ 2077 if (mi->mi_fh_expire_type == FH4_PERSISTENT) 2078 failed_msg = "Couldn't reopen: Persistent " 2079 "file handle changed"; 2080 else 2081 failed_msg = "Couldn't reopen: Volatile " 2082 "(no expire on open) file handle changed"; 2083 2084 nfs4args_copen_free(open_args); 2085 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2086 nfs_rw_exit(&mi->mi_fh_lock); 2087 goto kill_file; 2088 2089 } else { 2090 /* 2091 * We have volatile file handles that don't compare. 2092 * If the fids are the same then we assume that the 2093 * file handle expired but the rnode still refers to 2094 * the same file object. 2095 * 2096 * First check that we have fids or not. 2097 * If we don't we have a dumb server so we will 2098 * just assume every thing is ok for now. 2099 */ 2100 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID && 2101 rp->r_attr.va_mask & AT_NODEID && 2102 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) { 2103 /* 2104 * We have fids, but they don't 2105 * compare. So kill the file. 2106 */ 2107 failed_msg = 2108 "Couldn't reopen: file handle changed" 2109 " due to mismatched fids"; 2110 nfs4args_copen_free(open_args); 2111 xdr_free(xdr_COMPOUND4res_clnt, 2112 (caddr_t)&res); 2113 nfs_rw_exit(&mi->mi_fh_lock); 2114 goto kill_file; 2115 } else { 2116 /* 2117 * We have volatile file handles that refers 2118 * to the same file (at least they have the 2119 * same fid) or we don't have fids so we 2120 * can't tell. :(. We'll be a kind and accepting 2121 * client so we'll update the rnode's file 2122 * handle with the otw handle. 
2123 * 2124 * We need to drop mi->mi_fh_lock since 2125 * sh4_update acquires it. Since there is 2126 * only one recovery thread there is no 2127 * race. 2128 */ 2129 nfs_rw_exit(&mi->mi_fh_lock); 2130 sfh4_update(rp->r_fh, &gf_res->object); 2131 } 2132 } 2133 } else { 2134 nfs_rw_exit(&mi->mi_fh_lock); 2135 } 2136 2137 ASSERT(nfs4_consistent_type(vp)); 2138 2139 /* 2140 * If the server wanted an OPEN_CONFIRM but that fails, just start 2141 * over. Presumably if there is a persistent error it will show up 2142 * when we resend the OPEN. 2143 */ 2144 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 2145 bool_t retry_open = FALSE; 2146 2147 nfs4open_confirm(vp, &seqid, &op_res->stateid, 2148 cred_otw, is_recov, &retry_open, 2149 oop, FALSE, ep, NULL); 2150 if (ep->error || ep->stat) { 2151 nfs4args_copen_free(open_args); 2152 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2153 nfs4_end_open_seqid_sync(oop); 2154 open_owner_rele(oop); 2155 oop = NULL; 2156 goto top; 2157 } 2158 } 2159 2160 mutex_enter(&osp->os_sync_lock); 2161 osp->open_stateid = op_res->stateid; 2162 osp->os_delegation = 0; 2163 /* 2164 * Need to reset this bitfield for the possible case where we were 2165 * going to OTW CLOSE the file, got a non-recoverable error, and before 2166 * we could retry the CLOSE, OPENed the file again. 
2167 */ 2168 ASSERT(osp->os_open_owner->oo_seqid_inuse); 2169 osp->os_final_close = 0; 2170 osp->os_force_close = 0; 2171 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS) 2172 osp->os_dc_openacc = open_args->share_access; 2173 mutex_exit(&osp->os_sync_lock); 2174 2175 nfs4_end_open_seqid_sync(oop); 2176 2177 /* accept delegation, if any */ 2178 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw); 2179 2180 nfs4args_copen_free(open_args); 2181 2182 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 2183 2184 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2185 2186 ASSERT(nfs4_consistent_type(vp)); 2187 2188 open_owner_rele(oop); 2189 crfree(cr); 2190 crfree(cred_otw); 2191 return; 2192 2193 kill_file: 2194 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat); 2195 failed_reopen: 2196 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 2197 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s", 2198 (void *)osp, (void *)cr, rnode4info(rp))); 2199 mutex_enter(&osp->os_sync_lock); 2200 osp->os_failed_reopen = 1; 2201 mutex_exit(&osp->os_sync_lock); 2202 bailout: 2203 if (oop != NULL) { 2204 nfs4_end_open_seqid_sync(oop); 2205 open_owner_rele(oop); 2206 } 2207 if (cr != NULL) 2208 crfree(cr); 2209 if (cred_otw != NULL) 2210 crfree(cred_otw); 2211 } 2212 2213 /* for . and .. OPENs */ 2214 /* ARGSUSED */ 2215 static int 2216 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr) 2217 { 2218 rnode4_t *rp; 2219 nfs4_ga_res_t gar; 2220 2221 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone); 2222 2223 /* 2224 * If close-to-open consistency checking is turned off or 2225 * if there is no cached data, we can avoid 2226 * the over the wire getattr. Otherwise, force a 2227 * call to the server to get fresh attributes and to 2228 * check caches. This is required for close-to-open 2229 * consistency. 
2230 */ 2231 rp = VTOR4(*vpp); 2232 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO || 2233 (rp->r_dir == NULL && !nfs4_has_pages(*vpp))) 2234 return (0); 2235 2236 return (nfs4_getattr_otw(*vpp, &gar, cr, 0)); 2237 } 2238 2239 /* 2240 * CLOSE a file 2241 */ 2242 /* ARGSUSED */ 2243 static int 2244 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 2245 caller_context_t *ct) 2246 { 2247 rnode4_t *rp; 2248 int error = 0; 2249 int r_error = 0; 2250 int n4error = 0; 2251 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 2252 2253 /* 2254 * Remove client state for this (lockowner, file) pair. 2255 * Issue otw v4 call to have the server do the same. 2256 */ 2257 2258 rp = VTOR4(vp); 2259 2260 /* 2261 * zone_enter(2) prevents processes from changing zones with NFS files 2262 * open; if we happen to get here from the wrong zone we can't do 2263 * anything over the wire. 2264 */ 2265 if (VTOMI4(vp)->mi_zone != nfs_zone()) { 2266 /* 2267 * We could attempt to clean up locks, except we're sure 2268 * that the current process didn't acquire any locks on 2269 * the file: any attempt to lock a file belong to another zone 2270 * will fail, and one can't lock an NFS file and then change 2271 * zones, as that fails too. 2272 * 2273 * Returning an error here is the sane thing to do. A 2274 * subsequent call to VN_RELE() which translates to a 2275 * nfs4_inactive() will clean up state: if the zone of the 2276 * vnode's origin is still alive and kicking, the inactive 2277 * thread will handle the request (from the correct zone), and 2278 * everything (minus the OTW close call) should be OK. If the 2279 * zone is going away nfs4_async_inactive() will throw away 2280 * delegations, open streams and cached pages inline. 2281 */ 2282 return (EIO); 2283 } 2284 2285 /* 2286 * If we are using local locking for this filesystem, then 2287 * release all of the SYSV style record locks. 
Otherwise, 2288 * we are doing network locking and we need to release all 2289 * of the network locks. All of the locks held by this 2290 * process on this file are released no matter what the 2291 * incoming reference count is. 2292 */ 2293 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) { 2294 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 2295 cleanshares(vp, ttoproc(curthread)->p_pid); 2296 } else 2297 e.error = nfs4_lockrelease(vp, flag, offset, cr); 2298 2299 if (e.error) { 2300 struct lm_sysid *lmsid; 2301 lmsid = nfs4_find_sysid(VTOMI4(vp)); 2302 if (lmsid == NULL) { 2303 DTRACE_PROBE2(unknown__sysid, int, e.error, 2304 vnode_t *, vp); 2305 } else { 2306 cleanlocks(vp, ttoproc(curthread)->p_pid, 2307 (lm_sysidt(lmsid) | LM_SYSID_CLIENT)); 2308 2309 lm_rel_sysid(lmsid); 2310 } 2311 return (e.error); 2312 } 2313 2314 if (count > 1) 2315 return (0); 2316 2317 /* 2318 * If the file has been `unlinked', then purge the 2319 * DNLC so that this vnode will get reycled quicker 2320 * and the .nfs* file on the server will get removed. 2321 */ 2322 if (rp->r_unldvp != NULL) 2323 dnlc_purge_vp(vp); 2324 2325 /* 2326 * If the file was open for write and there are pages, 2327 * do a synchronous flush and commit of all of the 2328 * dirty and uncommitted pages. 2329 */ 2330 ASSERT(!e.error); 2331 if ((flag & FWRITE) && nfs4_has_pages(vp)) 2332 error = nfs4_putpage_commit(vp, 0, 0, cr); 2333 2334 mutex_enter(&rp->r_statelock); 2335 r_error = rp->r_error; 2336 rp->r_error = 0; 2337 mutex_exit(&rp->r_statelock); 2338 2339 /* 2340 * If this file type is one for which no explicit 'open' was 2341 * done, then bail now (ie. no need for protocol 'close'). If 2342 * there was an error w/the vm subsystem, return _that_ error, 2343 * otherwise, return any errors that may've been reported via 2344 * the rnode. 2345 */ 2346 if (vp->v_type != VREG) 2347 return (error ? 
error : r_error); 2348 2349 /* 2350 * The sync putpage commit may have failed above, but since 2351 * we're working w/a regular file, we need to do the protocol 2352 * 'close' (nfs4close_one will figure out if an otw close is 2353 * needed or not). Report any errors _after_ doing the protocol 2354 * 'close'. 2355 */ 2356 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0); 2357 n4error = e.error ? e.error : geterrno4(e.stat); 2358 2359 /* 2360 * Error reporting prio (Hi -> Lo) 2361 * 2362 * i) nfs4_putpage_commit (error) 2363 * ii) rnode's (r_error) 2364 * iii) nfs4close_one (n4error) 2365 */ 2366 return (error ? error : (r_error ? r_error : n4error)); 2367 } 2368 2369 /* 2370 * Initialize *lost_rqstp. 2371 */ 2372 2373 static void 2374 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 2375 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr, 2376 vnode_t *vp) 2377 { 2378 if (error != ETIMEDOUT && error != EINTR && 2379 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 2380 lost_rqstp->lr_op = 0; 2381 return; 2382 } 2383 2384 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 2385 "nfs4close_save_lost_rqst: error %d", error)); 2386 2387 lost_rqstp->lr_op = OP_CLOSE; 2388 /* 2389 * The vp is held and rele'd via the recovery code. 2390 * See nfs4_save_lost_rqst. 2391 */ 2392 lost_rqstp->lr_vp = vp; 2393 lost_rqstp->lr_dvp = NULL; 2394 lost_rqstp->lr_oop = oop; 2395 lost_rqstp->lr_osp = osp; 2396 ASSERT(osp != NULL); 2397 ASSERT(mutex_owned(&osp->os_sync_lock)); 2398 osp->os_pending_close = 1; 2399 lost_rqstp->lr_lop = NULL; 2400 lost_rqstp->lr_cr = cr; 2401 lost_rqstp->lr_flk = NULL; 2402 lost_rqstp->lr_putfirst = FALSE; 2403 } 2404 2405 /* 2406 * Assumes you already have the open seqid sync grabbed as well as the 2407 * 'os_sync_lock'. Note: this will release the open seqid sync and 2408 * 'os_sync_lock' if client recovery starts. Calling functions have to 2409 * be prepared to handle this. 
/*
 * Send the over-the-wire CLOSE for an open stream as the compound
 * PUTFH, GETATTR, CLOSE.
 *
 * Assumes you already have the open seqid sync grabbed as well as the
 * 'os_sync_lock'.  Note: this will release the open seqid sync and
 * 'os_sync_lock' if client recovery starts.  Calling functions have to
 * be prepared to handle this.  'os_sync_lock' is also released on the
 * fully successful path, before the attribute cache is updated.  Each
 * time a lock is dropped here the matching caller flag
 * (*have_sync_lockp, *did_start_seqid_syncp) is set to 0.
 *
 * 'recov' is returned as 1 if the CLOSE operation detected client recovery
 * was needed and was started, and that the calling function should retry
 * this function; otherwise it is returned as 0.
 *
 * 'close_type' selects the compound tag (CLOSE_RESEND, CLOSE_AFTER_RESEND
 * or a normal close) and, for the two resend types, suppresses the
 * retry indication.
 *
 * Errors are returned via the nfs4_error_t parameter.
 */
static void
nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
    nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
    nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	CLOSE4args *close_args;
	nfs_resop4 *resop;
	nfs_argop4 argop[3];
	int doqueue = 1;
	mntinfo4_t *mi;
	seqid4 seqid;
	vnode_t *vp;
	bool_t needrecov = FALSE;
	nfs4_lost_rqst_t lost_rqst;
	hrtime_t t;

	ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);

	ASSERT(MUTEX_HELD(&osp->os_sync_lock));

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));

	/* Only set this to 1 if recovery is started */
	*recov = 0;

	/* do the OTW call to close the file */

	/* The tag records whether this is a first send or a resend. */
	if (close_type == CLOSE_RESEND)
		args.ctag = TAG_CLOSE_LOST;
	else if (close_type == CLOSE_AFTER_RESEND)
		args.ctag = TAG_CLOSE_UNDO;
	else
		args.ctag = TAG_CLOSE;

	args.array_len = 3;
	args.array = argop;

	vp = RTOV4(rp);

	mi = VTOMI4(vp);

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* getattr, so the attribute cache can be refreshed on success */
	argop[1].argop = OP_GETATTR;
	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[1].nfs_argop4_u.opgetattr.mi = mi;

	argop[2].argop = OP_CLOSE;
	close_args = &argop[2].nfs_argop4_u.opclose;

	/* The CLOSE carries the open owner's current seqid plus one. */
	seqid = nfs4_get_open_seqid(oop) + 1;

	close_args->seqid = seqid;
	close_args->open_stateid = osp->open_stateid;

	/* needrecov is still FALSE here, so this always logs "first". */
	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(rp)));

	/* Timestamp of the call, handed to nfs4_attr_cache() below. */
	t = gethrtime();

	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);

	/* Record the new seqid only when the reply says to bump it. */
	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
		nfs4_set_open_seqid(seqid, oop, args.ctag);
	}

	needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
	if (ep->error && !needrecov) {
		/*
		 * if there was an error and no recovery is to be done
		 * then set up the file to flush its cache if
		 * needed for the next caller.
		 */
		mutex_enter(&rp->r_statelock);
		PURGE_ATTRCACHE4_LOCKED(rp);
		rp->r_flags &= ~R4WRITEMODIFIED;
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (needrecov) {
		bool_t abort;
		nfs4_bseqid_entry_t *bsep = NULL;

		/*
		 * lost_rqst is initialized only on this branch; the
		 * nfs4_start_recovery() call below reads it only under
		 * the same close_type test.
		 */
		if (close_type != CLOSE_RESEND)
			nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
			    osp, cred_otw, vp);

		if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
			bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
			    0, args.ctag, close_args->seqid);

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_otw: initiating recovery. error %d "
		    "res.status %d", ep->error, res.status));

		/*
		 * Drop the 'os_sync_lock' here so we don't hit
		 * a potential recursive mutex_enter via an
		 * 'open_stream_hold()'.
		 */
		mutex_exit(&osp->os_sync_lock);
		*have_sync_lockp = 0;
		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
		    (close_type != CLOSE_RESEND &&
		    lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
		    OP_CLOSE, bsep, NULL, NULL);

		/* drop open seq sync, and let the calling function regrab it */
		nfs4_end_open_seqid_sync(oop);
		*did_start_seqid_syncp = 0;

		if (bsep)
			kmem_free(bsep, sizeof (*bsep));
		/*
		 * For signals, the caller wants to quit, so don't say to
		 * retry.  For forced unmount, if it's a user thread, it
		 * wants to quit.  If it's a recovery thread, the retry
		 * will happen higher-up on the call stack.  Either way,
		 * don't say to retry.
		 */
		if (abort == FALSE && ep->error != EINTR &&
		    !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
		    close_type != CLOSE_RESEND &&
		    close_type != CLOSE_AFTER_RESEND)
			*recov = 1;
		else
			*recov = 0;

		/* Results are freed only when the RPC itself succeeded. */
		if (!ep->error)
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	/*
	 * NFS-level failure that does not call for recovery: free the
	 * results and return with 'os_sync_lock' still held.
	 */
	if (res.status) {
		xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	mutex_enter(&rp->r_statev4_lock);
	rp->created_v4 = 0;
	mutex_exit(&rp->r_statev4_lock);

	/* Result slot 2 is the CLOSE; keep the stateid it returned. */
	resop = &res.array[2];
	osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
	osp->os_valid = 0;

	/*
	 * This removes the reference obtained at OPEN; ie, when the
	 * open stream structure was created.
	 *
	 * We don't have to worry about calling 'open_stream_rele'
	 * since we are currently holding a reference to the open
	 * stream which means the count cannot go to 0 with this
	 * decrement.
	 */
	ASSERT(osp->os_ref_count >= 2);
	osp->os_ref_count--;

	/*
	 * Both early returns above cover every non-zero ep->error, so
	 * this test is expected to be true whenever it is reached.
	 */
	if (ep->error == 0) {
		/*
		 * Avoid a deadlock with the r_serial thread waiting for
		 * os_sync_lock in nfs4_get_otw_cred_by_osp() which might be
		 * held by us. We will wait in nfs4_attr_cache() for the
		 * completion of the r_serial thread.
		 */
		mutex_exit(&osp->os_sync_lock);
		*have_sync_lockp = 0;

		/* Result slot 1 is the GETATTR issued ahead of the CLOSE. */
		nfs4_attr_cache(vp,
		    &res.array[1].nfs_resop4_u.opgetattr.ga_res,
		    t, cred_otw, TRUE, NULL);
	}

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
	    " returning %d", ep->error));

	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}
/*
 * Read from a regular file into the caller's uio.
 *
 * The caller must hold rp->r_rwlock as reader (asserted below).
 *
 * Returns:
 *	0	success, including a zero-length request or a read that
 *		starts at or beyond the cached file size
 *	EISDIR	vp is not a regular file
 *	EIO	called from a zone other than the mount's zone, or the
 *		rnode is flagged R4RECOVERRP with no saved error
 *	EINVAL	negative offset, or offset + length is negative
 *	EINTR	interrupted while waiting for a cache purge to finish
 *	other	the rnode's saved r_error, or an error from cache
 *		validation, the data copy, or the mapping release
 *
 * When caching is bypassed the whole request goes to nfs4read();
 * otherwise data is copied through the page cache one MAXBSIZE
 * window at a time.
 */
/* ARGSUSED */
static int
nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
    caller_context_t *ct)
{
	rnode4_t *rp;
	u_offset_t off;
	offset_t diff;
	uint_t on;
	uint_t n;
	caddr_t base;
	uint_t flags;
	int error;
	mntinfo4_t *mi;

	rp = VTOR4(vp);

	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	/* For a shadow vnode, operate on the rnode's own vnode instead. */
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	if (vp->v_type != VREG)
		return (EISDIR);

	mi = VTOMI4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	if (uiop->uio_resid == 0)
		return (0);

	if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
		return (EINVAL);

	/* Fail up front if the rnode is flagged R4RECOVERRP. */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERRP)
		error = (rp->r_error ? rp->r_error : EIO);
	else
		error = 0;
	mutex_exit(&rp->r_statelock);
	if (error)
		return (error);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
		size_t resid = 0;

		return (nfs4read(vp, NULL, uiop->uio_loffset,
		    uiop->uio_resid, &resid, cr, FALSE, uiop));
	}

	error = 0;

	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		/* Revalidate the caches before every window. */
		if (error = nfs4_validate_caches(vp, cr))
			break;

		/*
		 * Wait while R4INCACHEPURGE is set.  A signal ends the
		 * read with EINTR, even if earlier windows were already
		 * copied out.
		 */
		mutex_enter(&rp->r_statelock);
		while (rp->r_flags & R4INCACHEPURGE) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				return (EINTR);
			}
		}
		/* Bytes left before the cached end of file. */
		diff = rp->r_size - uiop->uio_loffset;
		mutex_exit(&rp->r_statelock);
		if (diff <= 0)
			break;
		if (diff < n)
			n = (uint_t)diff;

		if (vpm_enable) {
			/*
			 * Copy data.
			 */
			error = vpm_data_copy(vp, off + on, n, uiop,
			    1, NULL, 0, S_READ);
		} else {
			base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
			    S_READ);

			error = uiomove(base + on, n, UIO_READ, uiop);
		}

		/*
		 * The window is released (or synced) on both outcomes; on
		 * failure the release's own status is discarded so the
		 * copy error is the one returned.
		 */
		if (!error) {
			/*
			 * If read a whole block or read to eof,
			 * won't need this buffer again soon.
			 */
			mutex_enter(&rp->r_statelock);
			if (n + on == MAXBSIZE ||
			    uiop->uio_loffset == rp->r_size)
				flags = SM_DONTNEED;
			else
				flags = 0;
			mutex_exit(&rp->r_statelock);
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
		}
	} while (!error && uiop->uio_resid > 0);

	return (error);
}
2776 */ 2777 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) { 2778 nfs_rw_exit(&rp->r_rwlock); 2779 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, 2780 INTR4(vp))) 2781 return (EINTR); 2782 } 2783 2784 va.va_mask = AT_SIZE; 2785 error = nfs4getattr(vp, &va, cr); 2786 if (error) 2787 return (error); 2788 uiop->uio_loffset = va.va_size; 2789 } 2790 2791 offset = uiop->uio_loffset + uiop->uio_resid; 2792 2793 if (uiop->uio_loffset < (offset_t)0 || offset < 0) 2794 return (EINVAL); 2795 2796 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 2797 limit = MAXOFFSET_T; 2798 2799 /* 2800 * Check to make sure that the process will not exceed 2801 * its limit on file size. It is okay to write up to 2802 * the limit, but not beyond. Thus, the write which 2803 * reaches the limit will be short and the next write 2804 * will return an error. 2805 */ 2806 remainder = 0; 2807 if (offset > uiop->uio_llimit) { 2808 remainder = offset - uiop->uio_llimit; 2809 uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset; 2810 if (uiop->uio_resid <= 0) { 2811 proc_t *p = ttoproc(curthread); 2812 2813 uiop->uio_resid += remainder; 2814 mutex_enter(&p->p_lock); 2815 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], 2816 p->p_rctls, p, RCA_UNSAFE_SIGINFO); 2817 mutex_exit(&p->p_lock); 2818 return (EFBIG); 2819 } 2820 } 2821 2822 /* update the change attribute, if we have a write delegation */ 2823 2824 mutex_enter(&rp->r_statev4_lock); 2825 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) 2826 rp->r_deleg_change++; 2827 2828 mutex_exit(&rp->r_statev4_lock); 2829 2830 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) 2831 return (EINTR); 2832 2833 /* 2834 * Bypass VM if caching has been disabled (e.g., locking) or if 2835 * using client-side direct I/O and the file is not mmap'd and 2836 * there are no cached pages. 
2837 */ 2838 if ((vp->v_flag & VNOCACHE) || 2839 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2840 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) { 2841 size_t bufsize; 2842 int count; 2843 u_offset_t org_offset; 2844 stable_how4 stab_comm; 2845 nfs4_fwrite: 2846 if (rp->r_flags & R4STALE) { 2847 resid = uiop->uio_resid; 2848 offset = uiop->uio_loffset; 2849 error = rp->r_error; 2850 /* 2851 * A close may have cleared r_error, if so, 2852 * propagate ESTALE error return properly 2853 */ 2854 if (error == 0) 2855 error = ESTALE; 2856 goto bottom; 2857 } 2858 2859 bufsize = MIN(uiop->uio_resid, mi->mi_stsize); 2860 base = kmem_alloc(bufsize, KM_SLEEP); 2861 do { 2862 if (ioflag & FDSYNC) 2863 stab_comm = DATA_SYNC4; 2864 else 2865 stab_comm = FILE_SYNC4; 2866 resid = uiop->uio_resid; 2867 offset = uiop->uio_loffset; 2868 count = MIN(uiop->uio_resid, bufsize); 2869 org_offset = uiop->uio_loffset; 2870 error = uiomove(base, count, UIO_WRITE, uiop); 2871 if (!error) { 2872 error = nfs4write(vp, base, org_offset, 2873 count, cr, &stab_comm); 2874 if (!error) { 2875 mutex_enter(&rp->r_statelock); 2876 if (rp->r_size < uiop->uio_loffset) 2877 rp->r_size = uiop->uio_loffset; 2878 mutex_exit(&rp->r_statelock); 2879 } 2880 } 2881 } while (!error && uiop->uio_resid > 0); 2882 kmem_free(base, bufsize); 2883 goto bottom; 2884 } 2885 2886 bsize = vp->v_vfsp->vfs_bsize; 2887 2888 do { 2889 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2890 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2891 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2892 2893 resid = uiop->uio_resid; 2894 offset = uiop->uio_loffset; 2895 2896 if (rp->r_flags & R4STALE) { 2897 error = rp->r_error; 2898 /* 2899 * A close may have cleared r_error, if so, 2900 * propagate ESTALE error return properly 2901 */ 2902 if (error == 0) 2903 error = ESTALE; 2904 break; 2905 } 2906 2907 /* 2908 * Don't create dirty pages faster than they 2909 * can be cleaned so that 
the system doesn't 2910 * get imbalanced. If the async queue is 2911 * maxed out, then wait for it to drain before 2912 * creating more dirty pages. Also, wait for 2913 * any threads doing pagewalks in the vop_getattr 2914 * entry points so that they don't block for 2915 * long periods. 2916 */ 2917 mutex_enter(&rp->r_statelock); 2918 while ((mi->mi_max_threads != 0 && 2919 rp->r_awcount > 2 * mi->mi_max_threads) || 2920 rp->r_gcount > 0) { 2921 if (INTR4(vp)) { 2922 klwp_t *lwp = ttolwp(curthread); 2923 2924 if (lwp != NULL) 2925 lwp->lwp_nostop++; 2926 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 2927 mutex_exit(&rp->r_statelock); 2928 if (lwp != NULL) 2929 lwp->lwp_nostop--; 2930 error = EINTR; 2931 goto bottom; 2932 } 2933 if (lwp != NULL) 2934 lwp->lwp_nostop--; 2935 } else 2936 cv_wait(&rp->r_cv, &rp->r_statelock); 2937 } 2938 mutex_exit(&rp->r_statelock); 2939 2940 /* 2941 * Touch the page and fault it in if it is not in core 2942 * before segmap_getmapflt or vpm_data_copy can lock it. 2943 * This is to avoid the deadlock if the buffer is mapped 2944 * to the same file through mmap which we want to write. 2945 */ 2946 uio_prefaultpages((long)n, uiop); 2947 2948 if (vpm_enable) { 2949 /* 2950 * It will use kpm mappings, so no need to 2951 * pass an address. 
2952 */ 2953 error = writerp4(rp, NULL, n, uiop, 0); 2954 } else { 2955 if (segmap_kpm) { 2956 int pon = uiop->uio_loffset & PAGEOFFSET; 2957 size_t pn = MIN(PAGESIZE - pon, 2958 uiop->uio_resid); 2959 int pagecreate; 2960 2961 mutex_enter(&rp->r_statelock); 2962 pagecreate = (pon == 0) && (pn == PAGESIZE || 2963 uiop->uio_loffset + pn >= rp->r_size); 2964 mutex_exit(&rp->r_statelock); 2965 2966 base = segmap_getmapflt(segkmap, vp, off + on, 2967 pn, !pagecreate, S_WRITE); 2968 2969 error = writerp4(rp, base + pon, n, uiop, 2970 pagecreate); 2971 2972 } else { 2973 base = segmap_getmapflt(segkmap, vp, off + on, 2974 n, 0, S_READ); 2975 error = writerp4(rp, base + on, n, uiop, 0); 2976 } 2977 } 2978 2979 if (!error) { 2980 if (mi->mi_flags & MI4_NOAC) 2981 flags = SM_WRITE; 2982 else if ((uiop->uio_loffset % bsize) == 0 || 2983 IS_SWAPVP(vp)) { 2984 /* 2985 * Have written a whole block. 2986 * Start an asynchronous write 2987 * and mark the buffer to 2988 * indicate that it won't be 2989 * needed again soon. 2990 */ 2991 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 2992 } else 2993 flags = 0; 2994 if ((ioflag & (FSYNC|FDSYNC)) || 2995 (rp->r_flags & R4OUTOFSPACE)) { 2996 flags &= ~SM_ASYNC; 2997 flags |= SM_WRITE; 2998 } 2999 if (vpm_enable) { 3000 error = vpm_sync_pages(vp, off, n, flags); 3001 } else { 3002 error = segmap_release(segkmap, base, flags); 3003 } 3004 } else { 3005 if (vpm_enable) { 3006 (void) vpm_sync_pages(vp, off, n, 0); 3007 } else { 3008 (void) segmap_release(segkmap, base, 0); 3009 } 3010 /* 3011 * In the event that we got an access error while 3012 * faulting in a page for a write-only file just 3013 * force a write. 
3014 */ 3015 if (error == EACCES) 3016 goto nfs4_fwrite; 3017 } 3018 } while (!error && uiop->uio_resid > 0); 3019 3020 bottom: 3021 if (error) { 3022 uiop->uio_resid = resid + remainder; 3023 uiop->uio_loffset = offset; 3024 } else { 3025 uiop->uio_resid += remainder; 3026 3027 mutex_enter(&rp->r_statev4_lock); 3028 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 3029 gethrestime(&rp->r_attr.va_mtime); 3030 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3031 } 3032 mutex_exit(&rp->r_statev4_lock); 3033 } 3034 3035 nfs_rw_exit(&rp->r_lkserlock); 3036 3037 return (error); 3038 } 3039 3040 /* 3041 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED} 3042 */ 3043 static int 3044 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 3045 int flags, cred_t *cr) 3046 { 3047 struct buf *bp; 3048 int error; 3049 page_t *savepp; 3050 uchar_t fsdata; 3051 stable_how4 stab_comm; 3052 3053 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3054 bp = pageio_setup(pp, len, vp, flags); 3055 ASSERT(bp != NULL); 3056 3057 /* 3058 * pageio_setup should have set b_addr to 0. This 3059 * is correct since we want to do I/O on a page 3060 * boundary. bp_mapin will use this addr to calculate 3061 * an offset, and then set b_addr to the kernel virtual 3062 * address it allocated for us. 
3063 */ 3064 ASSERT(bp->b_un.b_addr == 0); 3065 3066 bp->b_edev = 0; 3067 bp->b_dev = 0; 3068 bp->b_lblkno = lbtodb(off); 3069 bp->b_file = vp; 3070 bp->b_offset = (offset_t)off; 3071 bp_mapin(bp); 3072 3073 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) && 3074 freemem > desfree) 3075 stab_comm = UNSTABLE4; 3076 else 3077 stab_comm = FILE_SYNC4; 3078 3079 error = nfs4_bio(bp, &stab_comm, cr, FALSE); 3080 3081 bp_mapout(bp); 3082 pageio_done(bp); 3083 3084 if (stab_comm == UNSTABLE4) 3085 fsdata = C_DELAYCOMMIT; 3086 else 3087 fsdata = C_NOCOMMIT; 3088 3089 savepp = pp; 3090 do { 3091 pp->p_fsdata = fsdata; 3092 } while ((pp = pp->p_next) != savepp); 3093 3094 return (error); 3095 } 3096 3097 /* 3098 */ 3099 static int 3100 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr) 3101 { 3102 nfs4_open_owner_t *oop; 3103 nfs4_open_stream_t *osp; 3104 rnode4_t *rp = VTOR4(vp); 3105 mntinfo4_t *mi = VTOMI4(vp); 3106 int reopen_needed; 3107 3108 ASSERT(nfs_zone() == mi->mi_zone); 3109 3110 3111 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 3112 if (!oop) 3113 return (EIO); 3114 3115 /* returns with 'os_sync_lock' held */ 3116 osp = find_open_stream(oop, rp); 3117 if (!osp) { 3118 open_owner_rele(oop); 3119 return (EIO); 3120 } 3121 3122 if (osp->os_failed_reopen) { 3123 mutex_exit(&osp->os_sync_lock); 3124 open_stream_rele(osp, rp); 3125 open_owner_rele(oop); 3126 return (EIO); 3127 } 3128 3129 /* 3130 * Determine whether a reopen is needed. If this 3131 * is a delegation open stream, then the os_delegation bit 3132 * should be set. 
3133 */ 3134 3135 reopen_needed = osp->os_delegation; 3136 3137 mutex_exit(&osp->os_sync_lock); 3138 open_owner_rele(oop); 3139 3140 if (reopen_needed) { 3141 nfs4_error_zinit(ep); 3142 nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE); 3143 mutex_enter(&osp->os_sync_lock); 3144 if (ep->error || ep->stat || osp->os_failed_reopen) { 3145 mutex_exit(&osp->os_sync_lock); 3146 open_stream_rele(osp, rp); 3147 return (EIO); 3148 } 3149 mutex_exit(&osp->os_sync_lock); 3150 } 3151 open_stream_rele(osp, rp); 3152 3153 return (0); 3154 } 3155 3156 /* 3157 * Write to file. Writes to remote server in largest size 3158 * chunks that the server can handle. Write is synchronous. 3159 */ 3160 static int 3161 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr, 3162 stable_how4 *stab_comm) 3163 { 3164 mntinfo4_t *mi; 3165 COMPOUND4args_clnt args; 3166 COMPOUND4res_clnt res; 3167 WRITE4args *wargs; 3168 WRITE4res *wres; 3169 nfs_argop4 argop[2]; 3170 nfs_resop4 *resop; 3171 int tsize; 3172 stable_how4 stable; 3173 rnode4_t *rp; 3174 int doqueue = 1; 3175 bool_t needrecov; 3176 nfs4_recov_state_t recov_state; 3177 nfs4_stateid_types_t sid_types; 3178 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3179 int recov; 3180 3181 rp = VTOR4(vp); 3182 mi = VTOMI4(vp); 3183 3184 ASSERT(nfs_zone() == mi->mi_zone); 3185 3186 stable = *stab_comm; 3187 *stab_comm = FILE_SYNC4; 3188 3189 needrecov = FALSE; 3190 recov_state.rs_flags = 0; 3191 recov_state.rs_num_retry_despite_err = 0; 3192 nfs4_init_stateid_types(&sid_types); 3193 3194 /* Is curthread the recovery thread? */ 3195 mutex_enter(&mi->mi_lock); 3196 recov = (mi->mi_recovthread == curthread); 3197 mutex_exit(&mi->mi_lock); 3198 3199 recov_retry: 3200 args.ctag = TAG_WRITE; 3201 args.array_len = 2; 3202 args.array = argop; 3203 3204 if (!recov) { 3205 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3206 &recov_state, NULL); 3207 if (e.error) 3208 return (e.error); 3209 } 3210 3211 /* 0. 
putfh target fh */ 3212 argop[0].argop = OP_CPUTFH; 3213 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3214 3215 /* 1. write */ 3216 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types); 3217 3218 do { 3219 3220 wargs->offset = (offset4)offset; 3221 wargs->data_val = base; 3222 3223 if (mi->mi_io_kstats) { 3224 mutex_enter(&mi->mi_lock); 3225 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3226 mutex_exit(&mi->mi_lock); 3227 } 3228 3229 if ((vp->v_flag & VNOCACHE) || 3230 (rp->r_flags & R4DIRECTIO) || 3231 (mi->mi_flags & MI4_DIRECTIO)) 3232 tsize = MIN(mi->mi_stsize, count); 3233 else 3234 tsize = MIN(mi->mi_curwrite, count); 3235 wargs->data_len = (uint_t)tsize; 3236 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3237 3238 if (mi->mi_io_kstats) { 3239 mutex_enter(&mi->mi_lock); 3240 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3241 mutex_exit(&mi->mi_lock); 3242 } 3243 3244 if (!recov) { 3245 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3246 if (e.error && !needrecov) { 3247 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3248 &recov_state, needrecov); 3249 return (e.error); 3250 } 3251 } else { 3252 if (e.error) 3253 return (e.error); 3254 } 3255 3256 /* 3257 * Do handling of OLD_STATEID outside 3258 * of the normal recovery framework. 3259 * 3260 * If write receives a BAD stateid error while using a 3261 * delegation stateid, retry using the open stateid (if it 3262 * exists). If it doesn't have an open stateid, reopen the 3263 * file first, then retry. 
3264 */ 3265 if (!e.error && res.status == NFS4ERR_OLD_STATEID && 3266 sid_types.cur_sid_type != SPEC_SID) { 3267 nfs4_save_stateid(&wargs->stateid, &sid_types); 3268 if (!recov) 3269 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3270 &recov_state, needrecov); 3271 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3272 goto recov_retry; 3273 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3274 sid_types.cur_sid_type == DEL_SID) { 3275 nfs4_save_stateid(&wargs->stateid, &sid_types); 3276 mutex_enter(&rp->r_statev4_lock); 3277 rp->r_deleg_return_pending = TRUE; 3278 mutex_exit(&rp->r_statev4_lock); 3279 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3280 if (!recov) 3281 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3282 &recov_state, needrecov); 3283 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3284 return (EIO); 3285 } 3286 if (!recov) 3287 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3288 &recov_state, needrecov); 3289 /* hold needed for nfs4delegreturn_thread */ 3290 VN_HOLD(vp); 3291 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3292 NFS4_DR_DISCARD), FALSE); 3293 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3294 goto recov_retry; 3295 } 3296 3297 if (needrecov) { 3298 bool_t abort; 3299 3300 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3301 "nfs4write: client got error %d, res.status %d" 3302 ", so start recovery", e.error, res.status)); 3303 3304 abort = nfs4_start_recovery(&e, 3305 VTOMI4(vp), vp, NULL, &wargs->stateid, 3306 NULL, OP_WRITE, NULL, NULL, NULL); 3307 if (!e.error) { 3308 e.error = geterrno4(res.status); 3309 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3310 } 3311 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3312 &recov_state, needrecov); 3313 if (abort == FALSE) 3314 goto recov_retry; 3315 return (e.error); 3316 } 3317 3318 if (res.status) { 3319 e.error = geterrno4(res.status); 3320 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3321 if (!recov) 3322 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3323 &recov_state, needrecov); 3324 
return (e.error); 3325 } 3326 3327 resop = &res.array[1]; /* write res */ 3328 wres = &resop->nfs_resop4_u.opwrite; 3329 3330 if ((int)wres->count > tsize) { 3331 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3332 3333 zcmn_err(getzoneid(), CE_WARN, 3334 "nfs4write: server wrote %u, requested was %u", 3335 (int)wres->count, tsize); 3336 if (!recov) 3337 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3338 &recov_state, needrecov); 3339 return (EIO); 3340 } 3341 if (wres->committed == UNSTABLE4) { 3342 *stab_comm = UNSTABLE4; 3343 if (wargs->stable == DATA_SYNC4 || 3344 wargs->stable == FILE_SYNC4) { 3345 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3346 zcmn_err(getzoneid(), CE_WARN, 3347 "nfs4write: server %s did not commit " 3348 "to stable storage", 3349 rp->r_server->sv_hostname); 3350 if (!recov) 3351 nfs4_end_fop(VTOMI4(vp), vp, NULL, 3352 OH_WRITE, &recov_state, needrecov); 3353 return (EIO); 3354 } 3355 } 3356 3357 tsize = (int)wres->count; 3358 count -= tsize; 3359 base += tsize; 3360 offset += tsize; 3361 if (mi->mi_io_kstats) { 3362 mutex_enter(&mi->mi_lock); 3363 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++; 3364 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten += 3365 tsize; 3366 mutex_exit(&mi->mi_lock); 3367 } 3368 lwp_stat_update(LWP_STAT_OUBLK, 1); 3369 mutex_enter(&rp->r_statelock); 3370 if (rp->r_flags & R4HAVEVERF) { 3371 if (rp->r_writeverf != wres->writeverf) { 3372 nfs4_set_mod(vp); 3373 rp->r_writeverf = wres->writeverf; 3374 } 3375 } else { 3376 rp->r_writeverf = wres->writeverf; 3377 rp->r_flags |= R4HAVEVERF; 3378 } 3379 PURGE_ATTRCACHE4_LOCKED(rp); 3380 rp->r_flags |= R4WRITEMODIFIED; 3381 gethrestime(&rp->r_attr.va_mtime); 3382 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3383 mutex_exit(&rp->r_statelock); 3384 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3385 } while (count); 3386 3387 if (!recov) 3388 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state, 3389 needrecov); 3390 3391 return (e.error); 3392 } 3393 3394 /* 3395 * Read from a 
file. Reads data in largest chunks our interface can handle. 3396 */ 3397 static int 3398 nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count, 3399 size_t *residp, cred_t *cr, bool_t async, struct uio *uiop) 3400 { 3401 mntinfo4_t *mi; 3402 COMPOUND4args_clnt args; 3403 COMPOUND4res_clnt res; 3404 READ4args *rargs; 3405 nfs_argop4 argop[2]; 3406 int tsize; 3407 int doqueue; 3408 rnode4_t *rp; 3409 int data_len; 3410 bool_t is_eof; 3411 bool_t needrecov = FALSE; 3412 nfs4_recov_state_t recov_state; 3413 nfs4_stateid_types_t sid_types; 3414 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3415 3416 rp = VTOR4(vp); 3417 mi = VTOMI4(vp); 3418 doqueue = 1; 3419 3420 ASSERT(nfs_zone() == mi->mi_zone); 3421 3422 args.ctag = async ? TAG_READAHEAD : TAG_READ; 3423 3424 args.array_len = 2; 3425 args.array = argop; 3426 3427 nfs4_init_stateid_types(&sid_types); 3428 3429 recov_state.rs_flags = 0; 3430 recov_state.rs_num_retry_despite_err = 0; 3431 3432 recov_retry: 3433 e.error = nfs4_start_fop(mi, vp, NULL, OH_READ, 3434 &recov_state, NULL); 3435 if (e.error) 3436 return (e.error); 3437 3438 /* putfh target fh */ 3439 argop[0].argop = OP_CPUTFH; 3440 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3441 3442 /* read */ 3443 argop[1].argop = OP_READ; 3444 rargs = &argop[1].nfs_argop4_u.opread; 3445 rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 3446 OP_READ, &sid_types, async); 3447 3448 do { 3449 if (mi->mi_io_kstats) { 3450 mutex_enter(&mi->mi_lock); 3451 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3452 mutex_exit(&mi->mi_lock); 3453 } 3454 3455 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3456 "nfs4read: %s call, rp %s", 3457 needrecov ? 
"recov" : "first", 3458 rnode4info(rp))); 3459 3460 if ((vp->v_flag & VNOCACHE) || 3461 (rp->r_flags & R4DIRECTIO) || 3462 (mi->mi_flags & MI4_DIRECTIO)) 3463 tsize = MIN(mi->mi_tsize, count); 3464 else 3465 tsize = MIN(mi->mi_curread, count); 3466 3467 rargs->offset = (offset4)offset; 3468 rargs->count = (count4)tsize; 3469 rargs->res_data_val_alt = NULL; 3470 rargs->res_mblk = NULL; 3471 rargs->res_uiop = NULL; 3472 rargs->res_maxsize = 0; 3473 rargs->wlist = NULL; 3474 3475 if (uiop) 3476 rargs->res_uiop = uiop; 3477 else 3478 rargs->res_data_val_alt = base; 3479 rargs->res_maxsize = tsize; 3480 3481 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3482 #ifdef DEBUG 3483 if (nfs4read_error_inject) { 3484 res.status = nfs4read_error_inject; 3485 nfs4read_error_inject = 0; 3486 } 3487 #endif 3488 3489 if (mi->mi_io_kstats) { 3490 mutex_enter(&mi->mi_lock); 3491 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3492 mutex_exit(&mi->mi_lock); 3493 } 3494 3495 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3496 if (e.error != 0 && !needrecov) { 3497 nfs4_end_fop(mi, vp, NULL, OH_READ, 3498 &recov_state, needrecov); 3499 return (e.error); 3500 } 3501 3502 /* 3503 * Do proper retry for OLD and BAD stateid errors outside 3504 * of the normal recovery framework. There are two differences 3505 * between async and sync reads. The first is that we allow 3506 * retry on BAD_STATEID for async reads, but not sync reads. 3507 * The second is that we mark the file dead for a failed 3508 * attempt with a special stateid for sync reads, but just 3509 * return EIO for async reads. 3510 * 3511 * If a sync read receives a BAD stateid error while using a 3512 * delegation stateid, retry using the open stateid (if it 3513 * exists). If it doesn't have an open stateid, reopen the 3514 * file first, then retry. 
3515 */ 3516 if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID || 3517 res.status == NFS4ERR_BAD_STATEID) && async) { 3518 nfs4_end_fop(mi, vp, NULL, OH_READ, 3519 &recov_state, needrecov); 3520 if (sid_types.cur_sid_type == SPEC_SID) { 3521 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3522 return (EIO); 3523 } 3524 nfs4_save_stateid(&rargs->stateid, &sid_types); 3525 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3526 goto recov_retry; 3527 } else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3528 !async && sid_types.cur_sid_type != SPEC_SID) { 3529 nfs4_save_stateid(&rargs->stateid, &sid_types); 3530 nfs4_end_fop(mi, vp, NULL, OH_READ, 3531 &recov_state, needrecov); 3532 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3533 goto recov_retry; 3534 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3535 sid_types.cur_sid_type == DEL_SID) { 3536 nfs4_save_stateid(&rargs->stateid, &sid_types); 3537 mutex_enter(&rp->r_statev4_lock); 3538 rp->r_deleg_return_pending = TRUE; 3539 mutex_exit(&rp->r_statev4_lock); 3540 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3541 nfs4_end_fop(mi, vp, NULL, OH_READ, 3542 &recov_state, needrecov); 3543 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3544 return (EIO); 3545 } 3546 nfs4_end_fop(mi, vp, NULL, OH_READ, 3547 &recov_state, needrecov); 3548 /* hold needed for nfs4delegreturn_thread */ 3549 VN_HOLD(vp); 3550 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3551 NFS4_DR_DISCARD), FALSE); 3552 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3553 goto recov_retry; 3554 } 3555 if (needrecov) { 3556 bool_t abort; 3557 3558 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3559 "nfs4read: initiating recovery\n")); 3560 abort = nfs4_start_recovery(&e, 3561 mi, vp, NULL, &rargs->stateid, 3562 NULL, OP_READ, NULL, NULL, NULL); 3563 nfs4_end_fop(mi, vp, NULL, OH_READ, 3564 &recov_state, needrecov); 3565 /* 3566 * Do not retry if we got OLD_STATEID using a special 3567 * stateid. 
This avoids looping with a broken server. 3568 */ 3569 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3570 sid_types.cur_sid_type == SPEC_SID) 3571 abort = TRUE; 3572 3573 if (abort == FALSE) { 3574 /* 3575 * Need to retry all possible stateids in 3576 * case the recovery error wasn't stateid 3577 * related or the stateids have become 3578 * stale (server reboot). 3579 */ 3580 nfs4_init_stateid_types(&sid_types); 3581 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3582 goto recov_retry; 3583 } 3584 3585 if (!e.error) { 3586 e.error = geterrno4(res.status); 3587 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3588 } 3589 return (e.error); 3590 } 3591 3592 if (res.status) { 3593 e.error = geterrno4(res.status); 3594 nfs4_end_fop(mi, vp, NULL, OH_READ, 3595 &recov_state, needrecov); 3596 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3597 return (e.error); 3598 } 3599 3600 data_len = res.array[1].nfs_resop4_u.opread.data_len; 3601 count -= data_len; 3602 if (base) 3603 base += data_len; 3604 offset += data_len; 3605 if (mi->mi_io_kstats) { 3606 mutex_enter(&mi->mi_lock); 3607 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 3608 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len; 3609 mutex_exit(&mi->mi_lock); 3610 } 3611 lwp_stat_update(LWP_STAT_INBLK, 1); 3612 is_eof = res.array[1].nfs_resop4_u.opread.eof; 3613 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3614 3615 } while (count && !is_eof); 3616 3617 *residp = count; 3618 3619 nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov); 3620 3621 return (e.error); 3622 } 3623 3624 /* ARGSUSED */ 3625 static int 3626 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp, 3627 caller_context_t *ct) 3628 { 3629 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3630 return (EIO); 3631 switch (cmd) { 3632 case _FIODIRECTIO: 3633 return (nfs4_directio(vp, (int)arg, cr)); 3634 default: 3635 return (ENOTTY); 3636 } 3637 } 3638 3639 /* ARGSUSED */ 3640 int 3641 nfs4_getattr(vnode_t *vp, struct vattr 
*vap, int flags, cred_t *cr, 3642 caller_context_t *ct) 3643 { 3644 int error; 3645 rnode4_t *rp = VTOR4(vp); 3646 3647 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3648 return (EIO); 3649 /* 3650 * If it has been specified that the return value will 3651 * just be used as a hint, and we are only being asked 3652 * for size, fsid or rdevid, then return the client's 3653 * notion of these values without checking to make sure 3654 * that the attribute cache is up to date. 3655 * The whole point is to avoid an over the wire GETATTR 3656 * call. 3657 */ 3658 if (flags & ATTR_HINT) { 3659 if (!(vap->va_mask & ~(AT_SIZE | AT_FSID | AT_RDEV))) { 3660 mutex_enter(&rp->r_statelock); 3661 if (vap->va_mask & AT_SIZE) 3662 vap->va_size = rp->r_size; 3663 if (vap->va_mask & AT_FSID) 3664 vap->va_fsid = rp->r_attr.va_fsid; 3665 if (vap->va_mask & AT_RDEV) 3666 vap->va_rdev = rp->r_attr.va_rdev; 3667 mutex_exit(&rp->r_statelock); 3668 return (0); 3669 } 3670 } 3671 3672 /* 3673 * Only need to flush pages if asking for the mtime 3674 * and if there any dirty pages or any outstanding 3675 * asynchronous (write) requests for this file. 
3676 */ 3677 if (vap->va_mask & AT_MTIME) { 3678 rp = VTOR4(vp); 3679 if (nfs4_has_pages(vp)) { 3680 mutex_enter(&rp->r_statev4_lock); 3681 if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) { 3682 mutex_exit(&rp->r_statev4_lock); 3683 if (rp->r_flags & R4DIRTY || 3684 rp->r_awcount > 0) { 3685 mutex_enter(&rp->r_statelock); 3686 rp->r_gcount++; 3687 mutex_exit(&rp->r_statelock); 3688 error = 3689 nfs4_putpage(vp, (u_offset_t)0, 3690 0, 0, cr, NULL); 3691 mutex_enter(&rp->r_statelock); 3692 if (error && (error == ENOSPC || 3693 error == EDQUOT)) { 3694 if (!rp->r_error) 3695 rp->r_error = error; 3696 } 3697 if (--rp->r_gcount == 0) 3698 cv_broadcast(&rp->r_cv); 3699 mutex_exit(&rp->r_statelock); 3700 } 3701 } else { 3702 mutex_exit(&rp->r_statev4_lock); 3703 } 3704 } 3705 } 3706 return (nfs4getattr(vp, vap, cr)); 3707 } 3708 3709 int 3710 nfs4_compare_modes(mode_t from_server, mode_t on_client) 3711 { 3712 /* 3713 * If these are the only two bits cleared 3714 * on the server then return 0 (OK) else 3715 * return 1 (BAD). 3716 */ 3717 on_client &= ~(S_ISUID|S_ISGID); 3718 if (on_client == from_server) 3719 return (0); 3720 else 3721 return (1); 3722 } 3723 3724 /*ARGSUSED4*/ 3725 static int 3726 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3727 caller_context_t *ct) 3728 { 3729 int error; 3730 3731 if (vap->va_mask & AT_NOSET) 3732 return (EINVAL); 3733 3734 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3735 return (EIO); 3736 3737 /* 3738 * Don't call secpolicy_vnode_setattr, the client cannot 3739 * use its cached attributes to make security decisions 3740 * as the server may be faking mode bits or mapping uid/gid. 3741 * Always just let the server to the checking. 3742 * If we provide the ability to remove basic priviledges 3743 * to setattr (e.g. basic without chmod) then we will 3744 * need to add a check here before calling the server. 
3745 */ 3746 error = nfs4setattr(vp, vap, flags, cr, NULL); 3747 3748 if (error == 0 && (vap->va_mask & AT_SIZE) && vap->va_size == 0) 3749 vnevent_truncate(vp, ct); 3750 3751 return (error); 3752 } 3753 3754 /* 3755 * To replace the "guarded" version 3 setattr, we use two types of compound 3756 * setattr requests: 3757 * 1. The "normal" setattr, used when the size of the file isn't being 3758 * changed - { Putfh <fh>; Setattr; Getattr }/ 3759 * 2. If the size is changed, precede Setattr with: Getattr; Verify 3760 * with only ctime as the argument. If the server ctime differs from 3761 * what is cached on the client, the verify will fail, but we would 3762 * already have the ctime from the preceding getattr, so just set it 3763 * and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify; 3764 * Setattr; Getattr }. 3765 * 3766 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in 3767 * this setattr and NULL if they are not. 3768 */ 3769 static int 3770 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3771 vsecattr_t *vsap) 3772 { 3773 COMPOUND4args_clnt args; 3774 COMPOUND4res_clnt res, *resp = NULL; 3775 nfs4_ga_res_t *garp = NULL; 3776 int numops = 3; /* { Putfh; Setattr; Getattr } */ 3777 nfs_argop4 argop[5]; 3778 int verify_argop = -1; 3779 int setattr_argop = 1; 3780 nfs_resop4 *resop; 3781 vattr_t va; 3782 rnode4_t *rp; 3783 int doqueue = 1; 3784 uint_t mask = vap->va_mask; 3785 mode_t omode; 3786 vsecattr_t *vsp; 3787 timestruc_t ctime; 3788 bool_t needrecov = FALSE; 3789 nfs4_recov_state_t recov_state; 3790 nfs4_stateid_types_t sid_types; 3791 stateid4 stateid; 3792 hrtime_t t; 3793 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3794 servinfo4_t *svp; 3795 bitmap4 supp_attrs; 3796 3797 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3798 rp = VTOR4(vp); 3799 nfs4_init_stateid_types(&sid_types); 3800 3801 /* 3802 * Only need to flush pages if there are any pages and 3803 * if the file is marked as dirty in some 
fashion. The 3804 * file must be flushed so that we can accurately 3805 * determine the size of the file and the cached data 3806 * after the SETATTR returns. A file is considered to 3807 * be dirty if it is either marked with R4DIRTY, has 3808 * outstanding i/o's active, or is mmap'd. In this 3809 * last case, we can't tell whether there are dirty 3810 * pages, so we flush just to be sure. 3811 */ 3812 if (nfs4_has_pages(vp) && 3813 ((rp->r_flags & R4DIRTY) || 3814 rp->r_count > 0 || 3815 rp->r_mapcnt > 0)) { 3816 ASSERT(vp->v_type != VCHR); 3817 e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL); 3818 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 3819 mutex_enter(&rp->r_statelock); 3820 if (!rp->r_error) 3821 rp->r_error = e.error; 3822 mutex_exit(&rp->r_statelock); 3823 } 3824 } 3825 3826 if (mask & AT_SIZE) { 3827 /* 3828 * Verification setattr compound for non-deleg AT_SIZE: 3829 * { Putfh; Getattr; Verify; Setattr; Getattr } 3830 * Set ctime local here (outside the do_again label) 3831 * so that subsequent retries (after failed VERIFY) 3832 * will use ctime from GETATTR results (from failed 3833 * verify compound) as VERIFY arg. 3834 * If file has delegation, then VERIFY(time_metadata) 3835 * is of little added value, so don't bother. 
3836 */ 3837 mutex_enter(&rp->r_statev4_lock); 3838 if (rp->r_deleg_type == OPEN_DELEGATE_NONE || 3839 rp->r_deleg_return_pending) { 3840 numops = 5; 3841 ctime = rp->r_attr.va_ctime; 3842 } 3843 mutex_exit(&rp->r_statev4_lock); 3844 } 3845 3846 recov_state.rs_flags = 0; 3847 recov_state.rs_num_retry_despite_err = 0; 3848 3849 args.ctag = TAG_SETATTR; 3850 do_again: 3851 recov_retry: 3852 setattr_argop = numops - 2; 3853 3854 args.array = argop; 3855 args.array_len = numops; 3856 3857 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 3858 if (e.error) 3859 return (e.error); 3860 3861 3862 /* putfh target fh */ 3863 argop[0].argop = OP_CPUTFH; 3864 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3865 3866 if (numops == 5) { 3867 /* 3868 * We only care about the ctime, but need to get mtime 3869 * and size for proper cache update. 3870 */ 3871 /* getattr */ 3872 argop[1].argop = OP_GETATTR; 3873 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3874 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3875 3876 /* verify - set later in loop */ 3877 verify_argop = 2; 3878 } 3879 3880 /* setattr */ 3881 svp = rp->r_server; 3882 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3883 supp_attrs = svp->sv_supp_attrs; 3884 nfs_rw_exit(&svp->sv_lock); 3885 3886 nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr, 3887 supp_attrs, &e.error, &sid_types); 3888 stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid; 3889 if (e.error) { 3890 /* req time field(s) overflow - return immediately */ 3891 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 3892 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 
3893 opsetattr.obj_attributes); 3894 return (e.error); 3895 } 3896 omode = rp->r_attr.va_mode; 3897 3898 /* getattr */ 3899 argop[numops-1].argop = OP_GETATTR; 3900 argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3901 /* 3902 * If we are setting the ACL (indicated only by vsap != NULL), request 3903 * the ACL in this getattr. The ACL returned from this getattr will be 3904 * used in updating the ACL cache. 3905 */ 3906 if (vsap != NULL) 3907 argop[numops-1].nfs_argop4_u.opgetattr.attr_request |= 3908 FATTR4_ACL_MASK; 3909 argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3910 3911 /* 3912 * setattr iterates if the object size is set and the cached ctime 3913 * does not match the file ctime. In that case, verify the ctime first. 3914 */ 3915 3916 do { 3917 if (verify_argop != -1) { 3918 /* 3919 * Verify that the ctime match before doing setattr. 3920 */ 3921 va.va_mask = AT_CTIME; 3922 va.va_ctime = ctime; 3923 svp = rp->r_server; 3924 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3925 supp_attrs = svp->sv_supp_attrs; 3926 nfs_rw_exit(&svp->sv_lock); 3927 e.error = nfs4args_verify(&argop[verify_argop], &va, 3928 OP_VERIFY, supp_attrs); 3929 if (e.error) { 3930 /* req time field(s) overflow - return */ 3931 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3932 needrecov); 3933 break; 3934 } 3935 } 3936 3937 doqueue = 1; 3938 3939 t = gethrtime(); 3940 3941 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 3942 3943 /* 3944 * Purge the access cache and ACL cache if changing either the 3945 * owner of the file, the group owner, or the mode. These may 3946 * change the access permissions of the file, so purge old 3947 * information and start over again. 
3948 */ 3949 if (mask & (AT_UID | AT_GID | AT_MODE)) { 3950 (void) nfs4_access_purge_rp(rp); 3951 if (rp->r_secattr != NULL) { 3952 mutex_enter(&rp->r_statelock); 3953 vsp = rp->r_secattr; 3954 rp->r_secattr = NULL; 3955 mutex_exit(&rp->r_statelock); 3956 if (vsp != NULL) 3957 nfs4_acl_free_cache(vsp); 3958 } 3959 } 3960 3961 /* 3962 * If res.array_len == numops, then everything succeeded, 3963 * except for possibly the final getattr. If only the 3964 * last getattr failed, give up, and don't try recovery. 3965 */ 3966 if (res.array_len == numops) { 3967 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3968 needrecov); 3969 if (! e.error) 3970 resp = &res; 3971 break; 3972 } 3973 3974 /* 3975 * if either rpc call failed or completely succeeded - done 3976 */ 3977 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 3978 if (e.error) { 3979 PURGE_ATTRCACHE4(vp); 3980 if (!needrecov) { 3981 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3982 needrecov); 3983 break; 3984 } 3985 } 3986 3987 /* 3988 * Do proper retry for OLD_STATEID outside of the normal 3989 * recovery framework. 3990 */ 3991 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3992 sid_types.cur_sid_type != SPEC_SID && 3993 sid_types.cur_sid_type != NO_SID) { 3994 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3995 needrecov); 3996 nfs4_save_stateid(&stateid, &sid_types); 3997 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 3998 opsetattr.obj_attributes); 3999 if (verify_argop != -1) { 4000 nfs4args_verify_free(&argop[verify_argop]); 4001 verify_argop = -1; 4002 } 4003 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4004 goto recov_retry; 4005 } 4006 4007 if (needrecov) { 4008 bool_t abort; 4009 4010 abort = nfs4_start_recovery(&e, 4011 VTOMI4(vp), vp, NULL, NULL, NULL, 4012 OP_SETATTR, NULL, NULL, NULL); 4013 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 4014 needrecov); 4015 /* 4016 * Do not retry if we failed with OLD_STATEID using 4017 * a special stateid. 
This is done to avoid looping 4018 * with a broken server. 4019 */ 4020 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 4021 (sid_types.cur_sid_type == SPEC_SID || 4022 sid_types.cur_sid_type == NO_SID)) 4023 abort = TRUE; 4024 if (!e.error) { 4025 if (res.status == NFS4ERR_BADOWNER) 4026 nfs4_log_badowner(VTOMI4(vp), 4027 OP_SETATTR); 4028 4029 e.error = geterrno4(res.status); 4030 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4031 } 4032 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4033 opsetattr.obj_attributes); 4034 if (verify_argop != -1) { 4035 nfs4args_verify_free(&argop[verify_argop]); 4036 verify_argop = -1; 4037 } 4038 if (abort == FALSE) { 4039 /* 4040 * Need to retry all possible stateids in 4041 * case the recovery error wasn't stateid 4042 * related or the stateids have become 4043 * stale (server reboot). 4044 */ 4045 nfs4_init_stateid_types(&sid_types); 4046 goto recov_retry; 4047 } 4048 return (e.error); 4049 } 4050 4051 /* 4052 * Need to call nfs4_end_op before nfs4getattr to 4053 * avoid potential nfs4_start_op deadlock. See RFE 4054 * 4777612. Calls to nfs4_invalidate_pages() and 4055 * nfs4_purge_stale_fh() might also generate over the 4056 * wire calls which my cause nfs4_start_op() deadlock. 4057 */ 4058 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 4059 4060 /* 4061 * Check to update lease. 4062 */ 4063 resp = &res; 4064 if (res.status == NFS4_OK) { 4065 break; 4066 } 4067 4068 /* 4069 * Check if verify failed to see if try again 4070 */ 4071 if ((verify_argop == -1) || (res.array_len != 3)) { 4072 /* 4073 * can't continue... 4074 */ 4075 if (res.status == NFS4ERR_BADOWNER) 4076 nfs4_log_badowner(VTOMI4(vp), OP_SETATTR); 4077 4078 e.error = geterrno4(res.status); 4079 } else { 4080 /* 4081 * When the verify request fails, the client ctime is 4082 * not in sync with the server. This is the same as 4083 * the version 3 "not synchronized" error, and we 4084 * handle it in a similar manner (XXX do we need to???). 
4085 * Use the ctime returned in the first getattr for 4086 * the input to the next verify. 4087 * If we couldn't get the attributes, then we give up 4088 * because we can't complete the operation as required. 4089 */ 4090 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res; 4091 } 4092 if (e.error) { 4093 PURGE_ATTRCACHE4(vp); 4094 nfs4_purge_stale_fh(e.error, vp, cr); 4095 } else { 4096 /* 4097 * retry with a new verify value 4098 */ 4099 ctime = garp->n4g_va.va_ctime; 4100 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4101 resp = NULL; 4102 } 4103 if (!e.error) { 4104 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4105 opsetattr.obj_attributes); 4106 if (verify_argop != -1) { 4107 nfs4args_verify_free(&argop[verify_argop]); 4108 verify_argop = -1; 4109 } 4110 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4111 goto do_again; 4112 } 4113 } while (!e.error); 4114 4115 if (e.error) { 4116 /* 4117 * If we are here, rfs4call has an irrecoverable error - return 4118 */ 4119 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4120 opsetattr.obj_attributes); 4121 if (verify_argop != -1) { 4122 nfs4args_verify_free(&argop[verify_argop]); 4123 verify_argop = -1; 4124 } 4125 if (resp) 4126 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4127 return (e.error); 4128 } 4129 4130 4131 4132 /* 4133 * If changing the size of the file, invalidate 4134 * any local cached data which is no longer part 4135 * of the file. We also possibly invalidate the 4136 * last page in the file. We could use 4137 * pvn_vpzero(), but this would mark the page as 4138 * modified and require it to be written back to 4139 * the server for no particularly good reason. 4140 * This way, if we access it, then we bring it 4141 * back in. A read should be cheaper than a 4142 * write. 
4143 */ 4144 if (mask & AT_SIZE) { 4145 nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr); 4146 } 4147 4148 /* either no error or one of the postop getattr failed */ 4149 4150 /* 4151 * XXX Perform a simplified version of wcc checking. Instead of 4152 * have another getattr to get pre-op, just purge cache if 4153 * any of the ops prior to and including the getattr failed. 4154 * If the getattr succeeded then update the attrcache accordingly. 4155 */ 4156 4157 garp = NULL; 4158 if (res.status == NFS4_OK) { 4159 /* 4160 * Last getattr 4161 */ 4162 resop = &res.array[numops - 1]; 4163 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4164 } 4165 /* 4166 * In certain cases, nfs4_update_attrcache() will purge the attrcache, 4167 * rather than filling it. See the function itself for details. 4168 */ 4169 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4170 if (garp != NULL) { 4171 if (garp->n4g_resbmap & FATTR4_ACL_MASK) { 4172 nfs4_acl_fill_cache(rp, &garp->n4g_vsa); 4173 vs_ace4_destroy(&garp->n4g_vsa); 4174 } else { 4175 if (vsap != NULL) { 4176 /* 4177 * The ACL was supposed to be set and to be 4178 * returned in the last getattr of this 4179 * compound, but for some reason the getattr 4180 * result doesn't contain the ACL. In this 4181 * case, purge the ACL cache. 4182 */ 4183 if (rp->r_secattr != NULL) { 4184 mutex_enter(&rp->r_statelock); 4185 vsp = rp->r_secattr; 4186 rp->r_secattr = NULL; 4187 mutex_exit(&rp->r_statelock); 4188 if (vsp != NULL) 4189 nfs4_acl_free_cache(vsp); 4190 } 4191 } 4192 } 4193 } 4194 4195 if (res.status == NFS4_OK && (mask & AT_SIZE)) { 4196 /* 4197 * Set the size, rather than relying on getting it updated 4198 * via a GETATTR. With delegations the client tries to 4199 * suppress GETATTR calls. 
4200 */ 4201 mutex_enter(&rp->r_statelock); 4202 rp->r_size = vap->va_size; 4203 mutex_exit(&rp->r_statelock); 4204 } 4205 4206 /* 4207 * Can free up request args and res 4208 */ 4209 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4210 opsetattr.obj_attributes); 4211 if (verify_argop != -1) { 4212 nfs4args_verify_free(&argop[verify_argop]); 4213 verify_argop = -1; 4214 } 4215 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4216 4217 /* 4218 * Some servers will change the mode to clear the setuid 4219 * and setgid bits when changing the uid or gid. The 4220 * client needs to compensate appropriately. 4221 */ 4222 if (mask & (AT_UID | AT_GID)) { 4223 int terror, do_setattr; 4224 4225 do_setattr = 0; 4226 va.va_mask = AT_MODE; 4227 terror = nfs4getattr(vp, &va, cr); 4228 if (!terror && 4229 (((mask & AT_MODE) && va.va_mode != vap->va_mode) || 4230 (!(mask & AT_MODE) && va.va_mode != omode))) { 4231 va.va_mask = AT_MODE; 4232 if (mask & AT_MODE) { 4233 /* 4234 * We asked the mode to be changed and what 4235 * we just got from the server in getattr is 4236 * not what we wanted it to be, so set it now. 4237 */ 4238 va.va_mode = vap->va_mode; 4239 do_setattr = 1; 4240 } else { 4241 /* 4242 * We did not ask the mode to be changed, 4243 * Check to see that the server just cleared 4244 * I_SUID and I_GUID from it. If not then 4245 * set mode to omode with UID/GID cleared. 
4246 */ 4247 if (nfs4_compare_modes(va.va_mode, omode)) { 4248 omode &= ~(S_ISUID|S_ISGID); 4249 va.va_mode = omode; 4250 do_setattr = 1; 4251 } 4252 } 4253 4254 if (do_setattr) 4255 (void) nfs4setattr(vp, &va, 0, cr, NULL); 4256 } 4257 } 4258 4259 return (e.error); 4260 } 4261 4262 /* ARGSUSED */ 4263 static int 4264 nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) 4265 { 4266 COMPOUND4args_clnt args; 4267 COMPOUND4res_clnt res; 4268 int doqueue; 4269 uint32_t acc, resacc, argacc; 4270 rnode4_t *rp; 4271 cred_t *cred, *ncr, *ncrfree = NULL; 4272 nfs4_access_type_t cacc; 4273 int num_ops; 4274 nfs_argop4 argop[3]; 4275 nfs_resop4 *resop; 4276 bool_t needrecov = FALSE, do_getattr; 4277 nfs4_recov_state_t recov_state; 4278 int rpc_error; 4279 hrtime_t t; 4280 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4281 mntinfo4_t *mi = VTOMI4(vp); 4282 4283 if (nfs_zone() != mi->mi_zone) 4284 return (EIO); 4285 4286 acc = 0; 4287 if (mode & VREAD) 4288 acc |= ACCESS4_READ; 4289 if (mode & VWRITE) { 4290 if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type)) 4291 return (EROFS); 4292 if (vp->v_type == VDIR) 4293 acc |= ACCESS4_DELETE; 4294 acc |= ACCESS4_MODIFY | ACCESS4_EXTEND; 4295 } 4296 if (mode & VEXEC) { 4297 if (vp->v_type == VDIR) 4298 acc |= ACCESS4_LOOKUP; 4299 else 4300 acc |= ACCESS4_EXECUTE; 4301 } 4302 4303 if (VTOR4(vp)->r_acache != NULL) { 4304 e.error = nfs4_validate_caches(vp, cr); 4305 if (e.error) 4306 return (e.error); 4307 } 4308 4309 rp = VTOR4(vp); 4310 if (vp->v_type == VDIR) 4311 argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY | 4312 ACCESS4_EXTEND | ACCESS4_LOOKUP; 4313 else 4314 argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND | 4315 ACCESS4_EXECUTE; 4316 recov_state.rs_flags = 0; 4317 recov_state.rs_num_retry_despite_err = 0; 4318 4319 cred = cr; 4320 /* 4321 * ncr and ncrfree both initially 4322 * point to the memory area returned 4323 * by crnetadjust(); 4324 * ncrfree not NULL when exiting 
means 4325 * that we need to release it 4326 */ 4327 ncr = crnetadjust(cred); 4328 ncrfree = ncr; 4329 4330 tryagain: 4331 cacc = nfs4_access_check(rp, acc, cred); 4332 if (cacc == NFS4_ACCESS_ALLOWED) { 4333 if (ncrfree != NULL) 4334 crfree(ncrfree); 4335 return (0); 4336 } 4337 if (cacc == NFS4_ACCESS_DENIED) { 4338 /* 4339 * If the cred can be adjusted, try again 4340 * with the new cred. 4341 */ 4342 if (ncr != NULL) { 4343 cred = ncr; 4344 ncr = NULL; 4345 goto tryagain; 4346 } 4347 if (ncrfree != NULL) 4348 crfree(ncrfree); 4349 return (EACCES); 4350 } 4351 4352 recov_retry: 4353 /* 4354 * Don't take with r_statev4_lock here. r_deleg_type could 4355 * change as soon as lock is released. Since it is an int, 4356 * there is no atomicity issue. 4357 */ 4358 do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE); 4359 num_ops = do_getattr ? 3 : 2; 4360 4361 args.ctag = TAG_ACCESS; 4362 4363 args.array_len = num_ops; 4364 args.array = argop; 4365 4366 if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS, 4367 &recov_state, NULL)) { 4368 if (ncrfree != NULL) 4369 crfree(ncrfree); 4370 return (e.error); 4371 } 4372 4373 /* putfh target fh */ 4374 argop[0].argop = OP_CPUTFH; 4375 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4376 4377 /* access */ 4378 argop[1].argop = OP_ACCESS; 4379 argop[1].nfs_argop4_u.opaccess.access = argacc; 4380 4381 /* getattr */ 4382 if (do_getattr) { 4383 argop[2].argop = OP_GETATTR; 4384 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4385 argop[2].nfs_argop4_u.opgetattr.mi = mi; 4386 } 4387 4388 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4389 "nfs4_access: %s call, rp %s", needrecov ? 
"recov" : "first", 4390 rnode4info(VTOR4(vp)))); 4391 4392 doqueue = 1; 4393 t = gethrtime(); 4394 rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e); 4395 rpc_error = e.error; 4396 4397 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4398 if (needrecov) { 4399 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4400 "nfs4_access: initiating recovery\n")); 4401 4402 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4403 NULL, OP_ACCESS, NULL, NULL, NULL) == FALSE) { 4404 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS, 4405 &recov_state, needrecov); 4406 if (!e.error) 4407 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4408 goto recov_retry; 4409 } 4410 } 4411 nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov); 4412 4413 if (e.error) 4414 goto out; 4415 4416 if (res.status) { 4417 e.error = geterrno4(res.status); 4418 /* 4419 * This might generate over the wire calls throught 4420 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4421 * here to avoid a deadlock. 4422 */ 4423 nfs4_purge_stale_fh(e.error, vp, cr); 4424 goto out; 4425 } 4426 resop = &res.array[1]; /* access res */ 4427 4428 resacc = resop->nfs_resop4_u.opaccess.access; 4429 4430 if (do_getattr) { 4431 resop++; /* getattr res */ 4432 nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res, 4433 t, cr, FALSE, NULL); 4434 } 4435 4436 if (!e.error) { 4437 nfs4_access_cache(rp, argacc, resacc, cred); 4438 /* 4439 * we just cached results with cred; if cred is the 4440 * adjusted credentials from crnetadjust, we do not want 4441 * to release them before exiting: hence setting ncrfree 4442 * to NULL 4443 */ 4444 if (cred != cr) 4445 ncrfree = NULL; 4446 /* XXX check the supported bits too? */ 4447 if ((acc & resacc) != acc) { 4448 /* 4449 * The following code implements the semantic 4450 * that a setuid root program has *at least* the 4451 * permissions of the user that is running the 4452 * program. 
See rfs3call() for more portions 4453 * of the implementation of this functionality. 4454 */ 4455 /* XXX-LP */ 4456 if (ncr != NULL) { 4457 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4458 cred = ncr; 4459 ncr = NULL; 4460 goto tryagain; 4461 } 4462 e.error = EACCES; 4463 } 4464 } 4465 4466 out: 4467 if (!rpc_error) 4468 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4469 4470 if (ncrfree != NULL) 4471 crfree(ncrfree); 4472 4473 return (e.error); 4474 } 4475 4476 /* ARGSUSED */ 4477 static int 4478 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct) 4479 { 4480 COMPOUND4args_clnt args; 4481 COMPOUND4res_clnt res; 4482 int doqueue; 4483 rnode4_t *rp; 4484 nfs_argop4 argop[3]; 4485 nfs_resop4 *resop; 4486 READLINK4res *lr_res; 4487 nfs4_ga_res_t *garp; 4488 uint_t len; 4489 char *linkdata; 4490 bool_t needrecov = FALSE; 4491 nfs4_recov_state_t recov_state; 4492 hrtime_t t; 4493 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4494 4495 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4496 return (EIO); 4497 /* 4498 * Can't readlink anything other than a symbolic link. 4499 */ 4500 if (vp->v_type != VLNK) 4501 return (EINVAL); 4502 4503 rp = VTOR4(vp); 4504 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) { 4505 e.error = nfs4_validate_caches(vp, cr); 4506 if (e.error) 4507 return (e.error); 4508 mutex_enter(&rp->r_statelock); 4509 if (rp->r_symlink.contents != NULL) { 4510 e.error = uiomove(rp->r_symlink.contents, 4511 rp->r_symlink.len, UIO_READ, uiop); 4512 mutex_exit(&rp->r_statelock); 4513 return (e.error); 4514 } 4515 mutex_exit(&rp->r_statelock); 4516 } 4517 recov_state.rs_flags = 0; 4518 recov_state.rs_num_retry_despite_err = 0; 4519 4520 recov_retry: 4521 args.array_len = 3; 4522 args.array = argop; 4523 args.ctag = TAG_READLINK; 4524 4525 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 4526 if (e.error) { 4527 return (e.error); 4528 } 4529 4530 /* 0. 
putfh symlink fh */ 4531 argop[0].argop = OP_CPUTFH; 4532 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4533 4534 /* 1. readlink */ 4535 argop[1].argop = OP_READLINK; 4536 4537 /* 2. getattr */ 4538 argop[2].argop = OP_GETATTR; 4539 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4540 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 4541 4542 doqueue = 1; 4543 4544 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4545 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first", 4546 rnode4info(VTOR4(vp)))); 4547 4548 t = gethrtime(); 4549 4550 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 4551 4552 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4553 if (needrecov) { 4554 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4555 "nfs4_readlink: initiating recovery\n")); 4556 4557 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4558 NULL, OP_READLINK, NULL, NULL, NULL) == FALSE) { 4559 if (!e.error) 4560 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4561 4562 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 4563 needrecov); 4564 goto recov_retry; 4565 } 4566 } 4567 4568 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 4569 4570 if (e.error) 4571 return (e.error); 4572 4573 /* 4574 * There is an path in the code below which calls 4575 * nfs4_purge_stale_fh(), which may generate otw calls through 4576 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4577 * here to avoid nfs4_start_op() deadlock. 
4578 */ 4579 4580 if (res.status && (res.array_len < args.array_len)) { 4581 /* 4582 * either Putfh or Link failed 4583 */ 4584 e.error = geterrno4(res.status); 4585 nfs4_purge_stale_fh(e.error, vp, cr); 4586 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4587 return (e.error); 4588 } 4589 4590 resop = &res.array[1]; /* readlink res */ 4591 lr_res = &resop->nfs_resop4_u.opreadlink; 4592 4593 /* 4594 * treat symlink names as data 4595 */ 4596 linkdata = utf8_to_str((utf8string *)&lr_res->link, &len, NULL); 4597 if (linkdata != NULL) { 4598 int uio_len = len - 1; 4599 /* len includes null byte, which we won't uiomove */ 4600 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop); 4601 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 4602 mutex_enter(&rp->r_statelock); 4603 if (rp->r_symlink.contents == NULL) { 4604 rp->r_symlink.contents = linkdata; 4605 rp->r_symlink.len = uio_len; 4606 rp->r_symlink.size = len; 4607 mutex_exit(&rp->r_statelock); 4608 } else { 4609 mutex_exit(&rp->r_statelock); 4610 kmem_free(linkdata, len); 4611 } 4612 } else { 4613 kmem_free(linkdata, len); 4614 } 4615 } 4616 if (res.status == NFS4_OK) { 4617 resop++; /* getattr res */ 4618 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4619 } 4620 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4621 4622 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4623 4624 /* 4625 * The over the wire error for attempting to readlink something 4626 * other than a symbolic link is ENXIO. However, we need to 4627 * return EINVAL instead of ENXIO, so we map it here. 4628 */ 4629 return (e.error == ENXIO ? EINVAL : e.error); 4630 } 4631 4632 /* 4633 * Flush local dirty pages to stable storage on the server. 4634 * 4635 * If FNODSYNC is specified, then there is nothing to do because 4636 * metadata changes are not cached on the client before being 4637 * sent to the server. 
4638 */ 4639 /* ARGSUSED */ 4640 static int 4641 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 4642 { 4643 int error; 4644 4645 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp)) 4646 return (0); 4647 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4648 return (EIO); 4649 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr); 4650 if (!error) 4651 error = VTOR4(vp)->r_error; 4652 return (error); 4653 } 4654 4655 /* 4656 * Weirdness: if the file was removed or the target of a rename 4657 * operation while it was open, it got renamed instead. Here we 4658 * remove the renamed file. 4659 */ 4660 /* ARGSUSED */ 4661 void 4662 nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4663 { 4664 rnode4_t *rp; 4665 4666 ASSERT(vp != DNLC_NO_VNODE); 4667 4668 rp = VTOR4(vp); 4669 4670 if (IS_SHADOW(vp, rp)) { 4671 sv_inactive(vp); 4672 return; 4673 } 4674 4675 /* 4676 * If this is coming from the wrong zone, we let someone in the right 4677 * zone take care of it asynchronously. We can get here due to 4678 * VN_RELE() being called from pageout() or fsflush(). This call may 4679 * potentially turn into an expensive no-op if, for instance, v_count 4680 * gets incremented in the meantime, but it's still correct. 4681 */ 4682 if (nfs_zone() != VTOMI4(vp)->mi_zone) { 4683 nfs4_async_inactive(vp, cr); 4684 return; 4685 } 4686 4687 /* 4688 * Some of the cleanup steps might require over-the-wire 4689 * operations. Since VOP_INACTIVE can get called as a result of 4690 * other over-the-wire operations (e.g., an attribute cache update 4691 * can lead to a DNLC purge), doing those steps now would lead to a 4692 * nested call to the recovery framework, which can deadlock. So 4693 * do any over-the-wire cleanups asynchronously, in a separate 4694 * thread. 
4695 */ 4696 4697 mutex_enter(&rp->r_os_lock); 4698 mutex_enter(&rp->r_statelock); 4699 mutex_enter(&rp->r_statev4_lock); 4700 4701 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) { 4702 mutex_exit(&rp->r_statev4_lock); 4703 mutex_exit(&rp->r_statelock); 4704 mutex_exit(&rp->r_os_lock); 4705 nfs4_async_inactive(vp, cr); 4706 return; 4707 } 4708 4709 if (rp->r_deleg_type == OPEN_DELEGATE_READ || 4710 rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 4711 mutex_exit(&rp->r_statev4_lock); 4712 mutex_exit(&rp->r_statelock); 4713 mutex_exit(&rp->r_os_lock); 4714 nfs4_async_inactive(vp, cr); 4715 return; 4716 } 4717 4718 if (rp->r_unldvp != NULL) { 4719 mutex_exit(&rp->r_statev4_lock); 4720 mutex_exit(&rp->r_statelock); 4721 mutex_exit(&rp->r_os_lock); 4722 nfs4_async_inactive(vp, cr); 4723 return; 4724 } 4725 mutex_exit(&rp->r_statev4_lock); 4726 mutex_exit(&rp->r_statelock); 4727 mutex_exit(&rp->r_os_lock); 4728 4729 rp4_addfree(rp, cr); 4730 } 4731 4732 /* 4733 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up 4734 * various bits of state. The caller must not refer to vp after this call. 
4735 */ 4736 4737 void 4738 nfs4_inactive_otw(vnode_t *vp, cred_t *cr) 4739 { 4740 rnode4_t *rp = VTOR4(vp); 4741 nfs4_recov_state_t recov_state; 4742 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4743 vnode_t *unldvp; 4744 char *unlname; 4745 cred_t *unlcred; 4746 COMPOUND4args_clnt args; 4747 COMPOUND4res_clnt res, *resp; 4748 nfs_argop4 argop[2]; 4749 int doqueue; 4750 #ifdef DEBUG 4751 char *name; 4752 #endif 4753 4754 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 4755 ASSERT(!IS_SHADOW(vp, rp)); 4756 4757 #ifdef DEBUG 4758 name = fn_name(VTOSV(vp)->sv_name); 4759 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: " 4760 "release vnode %s", name)); 4761 kmem_free(name, MAXNAMELEN); 4762 #endif 4763 4764 if (vp->v_type == VREG) { 4765 bool_t recov_failed = FALSE; 4766 4767 e.error = nfs4close_all(vp, cr); 4768 if (e.error) { 4769 /* Check to see if recovery failed */ 4770 mutex_enter(&(VTOMI4(vp)->mi_lock)); 4771 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL) 4772 recov_failed = TRUE; 4773 mutex_exit(&(VTOMI4(vp)->mi_lock)); 4774 if (!recov_failed) { 4775 mutex_enter(&rp->r_statelock); 4776 if (rp->r_flags & R4RECOVERR) 4777 recov_failed = TRUE; 4778 mutex_exit(&rp->r_statelock); 4779 } 4780 if (recov_failed) { 4781 NFS4_DEBUG(nfs4_client_recov_debug, 4782 (CE_NOTE, "nfs4_inactive_otw: " 4783 "close failed (recovery failure)")); 4784 } 4785 } 4786 } 4787 4788 redo: 4789 if (rp->r_unldvp == NULL) { 4790 rp4_addfree(rp, cr); 4791 return; 4792 } 4793 4794 /* 4795 * Save the vnode pointer for the directory where the 4796 * unlinked-open file got renamed, then set it to NULL 4797 * to prevent another thread from getting here before 4798 * we're done with the remove. While we have the 4799 * statelock, make local copies of the pertinent rnode 4800 * fields. If we weren't to do this in an atomic way, the 4801 * the unl* fields could become inconsistent with respect 4802 * to each other due to a race condition between this 4803 * code and nfs_remove(). 
See bug report 1034328. 4804 */ 4805 mutex_enter(&rp->r_statelock); 4806 if (rp->r_unldvp == NULL) { 4807 mutex_exit(&rp->r_statelock); 4808 rp4_addfree(rp, cr); 4809 return; 4810 } 4811 4812 unldvp = rp->r_unldvp; 4813 rp->r_unldvp = NULL; 4814 unlname = rp->r_unlname; 4815 rp->r_unlname = NULL; 4816 unlcred = rp->r_unlcred; 4817 rp->r_unlcred = NULL; 4818 mutex_exit(&rp->r_statelock); 4819 4820 /* 4821 * If there are any dirty pages left, then flush 4822 * them. This is unfortunate because they just 4823 * may get thrown away during the remove operation, 4824 * but we have to do this for correctness. 4825 */ 4826 if (nfs4_has_pages(vp) && 4827 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 4828 ASSERT(vp->v_type != VCHR); 4829 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL); 4830 if (e.error) { 4831 mutex_enter(&rp->r_statelock); 4832 if (!rp->r_error) 4833 rp->r_error = e.error; 4834 mutex_exit(&rp->r_statelock); 4835 } 4836 } 4837 4838 recov_state.rs_flags = 0; 4839 recov_state.rs_num_retry_despite_err = 0; 4840 recov_retry_remove: 4841 /* 4842 * Do the remove operation on the renamed file 4843 */ 4844 args.ctag = TAG_INACTIVE; 4845 4846 /* 4847 * Remove ops: putfh dir; remove 4848 */ 4849 args.array_len = 2; 4850 args.array = argop; 4851 4852 e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state); 4853 if (e.error) { 4854 kmem_free(unlname, MAXNAMELEN); 4855 crfree(unlcred); 4856 VN_RELE(unldvp); 4857 /* 4858 * Try again; this time around r_unldvp will be NULL, so we'll 4859 * just call rp4_addfree() and return. 4860 */ 4861 goto redo; 4862 } 4863 4864 /* putfh directory */ 4865 argop[0].argop = OP_CPUTFH; 4866 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh; 4867 4868 /* remove */ 4869 argop[1].argop = OP_CREMOVE; 4870 argop[1].nfs_argop4_u.opcremove.ctarget = unlname; 4871 4872 doqueue = 1; 4873 resp = &res; 4874 4875 #if 0 /* notyet */ 4876 /* 4877 * Can't do this yet. 
We may be being called from 4878 * dnlc_purge_XXX while that routine is holding a 4879 * mutex lock to the nc_rele list. The calls to 4880 * nfs3_cache_wcc_data may result in calls to 4881 * dnlc_purge_XXX. This will result in a deadlock. 4882 */ 4883 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4884 if (e.error) { 4885 PURGE_ATTRCACHE4(unldvp); 4886 resp = NULL; 4887 } else if (res.status) { 4888 e.error = geterrno4(res.status); 4889 PURGE_ATTRCACHE4(unldvp); 4890 /* 4891 * This code is inactive right now 4892 * but if made active there should 4893 * be a nfs4_end_op() call before 4894 * nfs4_purge_stale_fh to avoid start_op() 4895 * deadlock. See BugId: 4948726 4896 */ 4897 nfs4_purge_stale_fh(error, unldvp, cr); 4898 } else { 4899 nfs_resop4 *resop; 4900 REMOVE4res *rm_res; 4901 4902 resop = &res.array[1]; 4903 rm_res = &resop->nfs_resop4_u.opremove; 4904 /* 4905 * Update directory cache attribute, 4906 * readdir and dnlc caches. 4907 */ 4908 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL); 4909 } 4910 #else 4911 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4912 4913 PURGE_ATTRCACHE4(unldvp); 4914 #endif 4915 4916 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) { 4917 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL, 4918 NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) { 4919 if (!e.error) 4920 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4921 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, 4922 &recov_state, TRUE); 4923 goto recov_retry_remove; 4924 } 4925 } 4926 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE); 4927 4928 /* 4929 * Release stuff held for the remove 4930 */ 4931 VN_RELE(unldvp); 4932 if (!e.error && resp) 4933 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4934 4935 kmem_free(unlname, MAXNAMELEN); 4936 crfree(unlcred); 4937 goto redo; 4938 } 4939 4940 /* 4941 * Remote file system operations having to do with directory manipulation. 
4942 */ 4943 /* ARGSUSED3 */ 4944 int 4945 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 4946 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, 4947 int *direntflags, pathname_t *realpnp) 4948 { 4949 int error; 4950 vnode_t *vp, *avp = NULL; 4951 rnode4_t *drp; 4952 4953 *vpp = NULL; 4954 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 4955 return (EPERM); 4956 /* 4957 * if LOOKUP_XATTR, must replace dvp (object) with 4958 * object's attrdir before continuing with lookup 4959 */ 4960 if (flags & LOOKUP_XATTR) { 4961 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr); 4962 if (error) 4963 return (error); 4964 4965 dvp = avp; 4966 4967 /* 4968 * If lookup is for "", just return dvp now. The attrdir 4969 * has already been activated (from nfs4lookup_xattr), and 4970 * the caller will RELE the original dvp -- not 4971 * the attrdir. So, set vpp and return. 4972 * Currently, when the LOOKUP_XATTR flag is 4973 * passed to VOP_LOOKUP, the name is always empty, and 4974 * shortcircuiting here avoids 3 unneeded lock/unlock 4975 * pairs. 4976 * 4977 * If a non-empty name was provided, then it is the 4978 * attribute name, and it will be looked up below. 4979 */ 4980 if (*nm == '\0') { 4981 *vpp = dvp; 4982 return (0); 4983 } 4984 4985 /* 4986 * The vfs layer never sends a name when asking for the 4987 * attrdir, so we should never get here (unless of course 4988 * name is passed at some time in future -- at which time 4989 * we'll blow up here). 4990 */ 4991 ASSERT(0); 4992 } 4993 4994 drp = VTOR4(dvp); 4995 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 4996 return (EINTR); 4997 4998 error = nfs4lookup(dvp, nm, vpp, cr, 0); 4999 nfs_rw_exit(&drp->r_rwlock); 5000 5001 /* 5002 * If vnode is a device, create special vnode. 
5003 */ 5004 if (!error && ISVDEV((*vpp)->v_type)) { 5005 vp = *vpp; 5006 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 5007 VN_RELE(vp); 5008 } 5009 5010 return (error); 5011 } 5012 5013 /* ARGSUSED */ 5014 static int 5015 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr) 5016 { 5017 int error; 5018 rnode4_t *drp; 5019 int cflag = ((flags & CREATE_XATTR_DIR) != 0); 5020 mntinfo4_t *mi; 5021 5022 mi = VTOMI4(dvp); 5023 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) && 5024 !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS)) 5025 return (EINVAL); 5026 5027 drp = VTOR4(dvp); 5028 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 5029 return (EINTR); 5030 5031 mutex_enter(&drp->r_statelock); 5032 /* 5033 * If the server doesn't support xattrs just return EINVAL 5034 */ 5035 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) { 5036 mutex_exit(&drp->r_statelock); 5037 nfs_rw_exit(&drp->r_rwlock); 5038 return (EINVAL); 5039 } 5040 5041 /* 5042 * If there is a cached xattr directory entry, 5043 * use it as long as the attributes are valid. If the 5044 * attributes are not valid, take the simple approach and 5045 * free the cached value and re-fetch a new value. 5046 * 5047 * We don't negative entry cache for now, if we did we 5048 * would need to check if the file has changed on every 5049 * lookup. But xattrs don't exist very often and failing 5050 * an openattr is not much more expensive than and NVERIFY or GETATTR 5051 * so do an openattr over the wire for now. 
5052 */ 5053 if (drp->r_xattr_dir != NULL) { 5054 if (ATTRCACHE4_VALID(dvp)) { 5055 VN_HOLD(drp->r_xattr_dir); 5056 *vpp = drp->r_xattr_dir; 5057 mutex_exit(&drp->r_statelock); 5058 nfs_rw_exit(&drp->r_rwlock); 5059 return (0); 5060 } 5061 VN_RELE(drp->r_xattr_dir); 5062 drp->r_xattr_dir = NULL; 5063 } 5064 mutex_exit(&drp->r_statelock); 5065 5066 error = nfs4openattr(dvp, vpp, cflag, cr); 5067 5068 nfs_rw_exit(&drp->r_rwlock); 5069 5070 return (error); 5071 } 5072 5073 static int 5074 nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc) 5075 { 5076 int error; 5077 rnode4_t *drp; 5078 5079 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5080 5081 /* 5082 * If lookup is for "", just return dvp. Don't need 5083 * to send it over the wire, look it up in the dnlc, 5084 * or perform any access checks. 5085 */ 5086 if (*nm == '\0') { 5087 VN_HOLD(dvp); 5088 *vpp = dvp; 5089 return (0); 5090 } 5091 5092 /* 5093 * Can't do lookups in non-directories. 5094 */ 5095 if (dvp->v_type != VDIR) 5096 return (ENOTDIR); 5097 5098 /* 5099 * If lookup is for ".", just return dvp. Don't need 5100 * to send it over the wire or look it up in the dnlc, 5101 * just need to check access. 5102 */ 5103 if (nm[0] == '.' && nm[1] == '\0') { 5104 error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5105 if (error) 5106 return (error); 5107 VN_HOLD(dvp); 5108 *vpp = dvp; 5109 return (0); 5110 } 5111 5112 drp = VTOR4(dvp); 5113 if (!(drp->r_flags & R4LOOKUP)) { 5114 mutex_enter(&drp->r_statelock); 5115 drp->r_flags |= R4LOOKUP; 5116 mutex_exit(&drp->r_statelock); 5117 } 5118 5119 *vpp = NULL; 5120 /* 5121 * Lookup this name in the DNLC. If there is no entry 5122 * lookup over the wire. 5123 */ 5124 if (!skipdnlc) 5125 *vpp = dnlc_lookup(dvp, nm); 5126 if (*vpp == NULL) { 5127 /* 5128 * We need to go over the wire to lookup the name. 
5129 */ 5130 return (nfs4lookupnew_otw(dvp, nm, vpp, cr)); 5131 } 5132 5133 /* 5134 * We hit on the dnlc 5135 */ 5136 if (*vpp != DNLC_NO_VNODE || 5137 (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) { 5138 /* 5139 * But our attrs may not be valid. 5140 */ 5141 if (ATTRCACHE4_VALID(dvp)) { 5142 error = nfs4_waitfor_purge_complete(dvp); 5143 if (error) { 5144 VN_RELE(*vpp); 5145 *vpp = NULL; 5146 return (error); 5147 } 5148 5149 /* 5150 * If after the purge completes, check to make sure 5151 * our attrs are still valid. 5152 */ 5153 if (ATTRCACHE4_VALID(dvp)) { 5154 /* 5155 * If we waited for a purge we may have 5156 * lost our vnode so look it up again. 5157 */ 5158 VN_RELE(*vpp); 5159 *vpp = dnlc_lookup(dvp, nm); 5160 if (*vpp == NULL) 5161 return (nfs4lookupnew_otw(dvp, 5162 nm, vpp, cr)); 5163 5164 /* 5165 * The access cache should almost always hit 5166 */ 5167 error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5168 5169 if (error) { 5170 VN_RELE(*vpp); 5171 *vpp = NULL; 5172 return (error); 5173 } 5174 if (*vpp == DNLC_NO_VNODE) { 5175 VN_RELE(*vpp); 5176 *vpp = NULL; 5177 return (ENOENT); 5178 } 5179 return (0); 5180 } 5181 } 5182 } 5183 5184 ASSERT(*vpp != NULL); 5185 5186 /* 5187 * We may have gotten here we have one of the following cases: 5188 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we 5189 * need to validate them. 5190 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always 5191 * must validate. 5192 * 5193 * Go to the server and check if the directory has changed, if 5194 * it hasn't we are done and can use the dnlc entry. 5195 */ 5196 return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr)); 5197 } 5198 5199 /* 5200 * Go to the server and check if the directory has changed, if 5201 * it hasn't we are done and can use the dnlc entry. If it 5202 * has changed we get a new copy of its attributes and check 5203 * the access for VEXEC, then relookup the filename and 5204 * get its filehandle and attributes. 
5205 * 5206 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR 5207 * if the NVERIFY failed we must 5208 * purge the caches 5209 * cache new attributes (will set r_time_attr_inval) 5210 * cache new access 5211 * recheck VEXEC access 5212 * add name to dnlc, possibly negative 5213 * if LOOKUP succeeded 5214 * cache new attributes 5215 * else 5216 * set a new r_time_attr_inval for dvp 5217 * check to make sure we have access 5218 * 5219 * The vpp returned is the vnode passed in if the directory is valid, 5220 * a new vnode if successful lookup, or NULL on error. 5221 */ 5222 static int 5223 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5224 { 5225 COMPOUND4args_clnt args; 5226 COMPOUND4res_clnt res; 5227 fattr4 *ver_fattr; 5228 fattr4_change dchange; 5229 int32_t *ptr; 5230 int argoplist_size = 7 * sizeof (nfs_argop4); 5231 nfs_argop4 *argop; 5232 int doqueue; 5233 mntinfo4_t *mi; 5234 nfs4_recov_state_t recov_state; 5235 hrtime_t t; 5236 int isdotdot; 5237 vnode_t *nvp; 5238 nfs_fh4 *fhp; 5239 nfs4_sharedfh_t *sfhp; 5240 nfs4_access_type_t cacc; 5241 rnode4_t *nrp; 5242 rnode4_t *drp = VTOR4(dvp); 5243 nfs4_ga_res_t *garp = NULL; 5244 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5245 5246 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5247 ASSERT(nm != NULL); 5248 ASSERT(nm[0] != '\0'); 5249 ASSERT(dvp->v_type == VDIR); 5250 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5251 ASSERT(*vpp != NULL); 5252 5253 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') { 5254 isdotdot = 1; 5255 args.ctag = TAG_LOOKUP_VPARENT; 5256 } else { 5257 /* 5258 * If dvp were a stub, it should have triggered and caused 5259 * a mount for us to get this far. 
5260 */ 5261 ASSERT(!RP_ISSTUB(VTOR4(dvp))); 5262 5263 isdotdot = 0; 5264 args.ctag = TAG_LOOKUP_VALID; 5265 } 5266 5267 mi = VTOMI4(dvp); 5268 recov_state.rs_flags = 0; 5269 recov_state.rs_num_retry_despite_err = 0; 5270 5271 nvp = NULL; 5272 5273 /* Save the original mount point security information */ 5274 (void) save_mnt_secinfo(mi->mi_curr_serv); 5275 5276 recov_retry: 5277 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5278 &recov_state, NULL); 5279 if (e.error) { 5280 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5281 VN_RELE(*vpp); 5282 *vpp = NULL; 5283 return (e.error); 5284 } 5285 5286 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5287 5288 /* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */ 5289 args.array_len = 7; 5290 args.array = argop; 5291 5292 /* 0. putfh file */ 5293 argop[0].argop = OP_CPUTFH; 5294 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5295 5296 /* 1. nverify the change info */ 5297 argop[1].argop = OP_NVERIFY; 5298 ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes; 5299 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5300 ver_fattr->attrlist4 = (char *)&dchange; 5301 ptr = (int32_t *)&dchange; 5302 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5303 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5304 5305 /* 2. getattr directory */ 5306 argop[2].argop = OP_GETATTR; 5307 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5308 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5309 5310 /* 3. access directory */ 5311 argop[3].argop = OP_ACCESS; 5312 argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5313 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5314 5315 /* 4. lookup name */ 5316 if (isdotdot) { 5317 argop[4].argop = OP_LOOKUPP; 5318 } else { 5319 argop[4].argop = OP_CLOOKUP; 5320 argop[4].nfs_argop4_u.opclookup.cname = nm; 5321 } 5322 5323 /* 5. resulting file handle */ 5324 argop[5].argop = OP_GETFH; 5325 5326 /* 6. 
resulting file attributes */ 5327 argop[6].argop = OP_GETATTR; 5328 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5329 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5330 5331 doqueue = 1; 5332 t = gethrtime(); 5333 5334 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5335 5336 if (!isdotdot && res.status == NFS4ERR_MOVED) { 5337 e.error = nfs4_setup_referral(dvp, nm, vpp, cr); 5338 if (e.error != 0 && *vpp != NULL) 5339 VN_RELE(*vpp); 5340 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5341 &recov_state, FALSE); 5342 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5343 kmem_free(argop, argoplist_size); 5344 return (e.error); 5345 } 5346 5347 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5348 /* 5349 * For WRONGSEC of a non-dotdot case, send secinfo directly 5350 * from this thread, do not go thru the recovery thread since 5351 * we need the nm information. 5352 * 5353 * Not doing dotdot case because there is no specification 5354 * for (PUTFH, SECINFO "..") yet. 
5355 */ 5356 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5357 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) 5358 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5359 &recov_state, FALSE); 5360 else 5361 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5362 &recov_state, TRUE); 5363 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5364 kmem_free(argop, argoplist_size); 5365 if (!e.error) 5366 goto recov_retry; 5367 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5368 VN_RELE(*vpp); 5369 *vpp = NULL; 5370 return (e.error); 5371 } 5372 5373 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5374 OP_LOOKUP, NULL, NULL, NULL) == FALSE) { 5375 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5376 &recov_state, TRUE); 5377 5378 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5379 kmem_free(argop, argoplist_size); 5380 goto recov_retry; 5381 } 5382 } 5383 5384 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5385 5386 if (e.error || res.array_len == 0) { 5387 /* 5388 * If e.error isn't set, then reply has no ops (or we couldn't 5389 * be here). The only legal way to reply without an op array 5390 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5391 * be in the reply for all other status values. 5392 * 5393 * For valid replies without an ops array, return ENOTSUP 5394 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5395 * return EIO -- don't trust status. 5396 */ 5397 if (e.error == 0) 5398 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5399 ENOTSUP : EIO; 5400 VN_RELE(*vpp); 5401 *vpp = NULL; 5402 kmem_free(argop, argoplist_size); 5403 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5404 return (e.error); 5405 } 5406 5407 if (res.status != NFS4ERR_SAME) { 5408 e.error = geterrno4(res.status); 5409 5410 /* 5411 * The NVERIFY "failed" so the directory has changed 5412 * First make sure PUTFH succeeded and NVERIFY "failed" 5413 * cleanly. 
5414 */ 5415 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5416 (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) { 5417 nfs4_purge_stale_fh(e.error, dvp, cr); 5418 VN_RELE(*vpp); 5419 *vpp = NULL; 5420 goto exit; 5421 } 5422 5423 /* 5424 * We know the NVERIFY "failed" so we must: 5425 * purge the caches (access and indirectly dnlc if needed) 5426 */ 5427 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5428 5429 if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5430 nfs4_purge_stale_fh(e.error, dvp, cr); 5431 VN_RELE(*vpp); 5432 *vpp = NULL; 5433 goto exit; 5434 } 5435 5436 /* 5437 * Install new cached attributes for the directory 5438 */ 5439 nfs4_attr_cache(dvp, 5440 &res.array[2].nfs_resop4_u.opgetattr.ga_res, 5441 t, cr, FALSE, NULL); 5442 5443 if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) { 5444 nfs4_purge_stale_fh(e.error, dvp, cr); 5445 VN_RELE(*vpp); 5446 *vpp = NULL; 5447 e.error = geterrno4(res.status); 5448 goto exit; 5449 } 5450 5451 /* 5452 * Now we know the directory is valid, 5453 * cache new directory access 5454 */ 5455 nfs4_access_cache(drp, 5456 args.array[3].nfs_argop4_u.opaccess.access, 5457 res.array[3].nfs_resop4_u.opaccess.access, cr); 5458 5459 /* 5460 * recheck VEXEC access 5461 */ 5462 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr); 5463 if (cacc != NFS4_ACCESS_ALLOWED) { 5464 /* 5465 * Directory permissions might have been revoked 5466 */ 5467 if (cacc == NFS4_ACCESS_DENIED) { 5468 e.error = EACCES; 5469 VN_RELE(*vpp); 5470 *vpp = NULL; 5471 goto exit; 5472 } 5473 5474 /* 5475 * Somehow we must not have asked for enough 5476 * so try a singleton ACCESS, should never happen. 
5477 */ 5478 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5479 if (e.error) { 5480 VN_RELE(*vpp); 5481 *vpp = NULL; 5482 goto exit; 5483 } 5484 } 5485 5486 e.error = geterrno4(res.status); 5487 if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) { 5488 /* 5489 * The lookup failed, probably no entry 5490 */ 5491 if (e.error == ENOENT && nfs4_lookup_neg_cache) { 5492 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5493 } else { 5494 /* 5495 * Might be some other error, so remove 5496 * the dnlc entry to make sure we start all 5497 * over again, next time. 5498 */ 5499 dnlc_remove(dvp, nm); 5500 } 5501 VN_RELE(*vpp); 5502 *vpp = NULL; 5503 goto exit; 5504 } 5505 5506 if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5507 /* 5508 * The file exists but we can't get its fh for 5509 * some unknown reason. Remove it from the dnlc 5510 * and error out to be safe. 5511 */ 5512 dnlc_remove(dvp, nm); 5513 VN_RELE(*vpp); 5514 *vpp = NULL; 5515 goto exit; 5516 } 5517 fhp = &res.array[5].nfs_resop4_u.opgetfh.object; 5518 if (fhp->nfs_fh4_len == 0) { 5519 /* 5520 * The file exists but a bogus fh 5521 * some unknown reason. Remove it from the dnlc 5522 * and error out to be safe. 5523 */ 5524 e.error = ENOENT; 5525 dnlc_remove(dvp, nm); 5526 VN_RELE(*vpp); 5527 *vpp = NULL; 5528 goto exit; 5529 } 5530 sfhp = sfh4_get(fhp, mi); 5531 5532 if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK) 5533 garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 5534 5535 /* 5536 * Make the new rnode 5537 */ 5538 if (isdotdot) { 5539 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 5540 if (e.error) { 5541 sfh4_rele(&sfhp); 5542 VN_RELE(*vpp); 5543 *vpp = NULL; 5544 goto exit; 5545 } 5546 /* 5547 * XXX if nfs4_make_dotdot uses an existing rnode 5548 * XXX it doesn't update the attributes. 
5549 * XXX for now just save them again to save an OTW 5550 */ 5551 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 5552 } else { 5553 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 5554 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 5555 /* 5556 * If v_type == VNON, then garp was NULL because 5557 * the last op in the compound failed and makenfs4node 5558 * could not find the vnode for sfhp. It created 5559 * a new vnode, so we have nothing to purge here. 5560 */ 5561 if (nvp->v_type == VNON) { 5562 vattr_t vattr; 5563 5564 vattr.va_mask = AT_TYPE; 5565 /* 5566 * N.B. We've already called nfs4_end_fop above. 5567 */ 5568 e.error = nfs4getattr(nvp, &vattr, cr); 5569 if (e.error) { 5570 sfh4_rele(&sfhp); 5571 VN_RELE(*vpp); 5572 *vpp = NULL; 5573 VN_RELE(nvp); 5574 goto exit; 5575 } 5576 nvp->v_type = vattr.va_type; 5577 } 5578 } 5579 sfh4_rele(&sfhp); 5580 5581 nrp = VTOR4(nvp); 5582 mutex_enter(&nrp->r_statev4_lock); 5583 if (!nrp->created_v4) { 5584 mutex_exit(&nrp->r_statev4_lock); 5585 dnlc_update(dvp, nm, nvp); 5586 } else 5587 mutex_exit(&nrp->r_statev4_lock); 5588 5589 VN_RELE(*vpp); 5590 *vpp = nvp; 5591 } else { 5592 hrtime_t now; 5593 hrtime_t delta = 0; 5594 5595 e.error = 0; 5596 5597 /* 5598 * Because the NVERIFY "succeeded" we know that the 5599 * directory attributes are still valid 5600 * so update r_time_attr_inval 5601 */ 5602 now = gethrtime(); 5603 mutex_enter(&drp->r_statelock); 5604 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5605 delta = now - drp->r_time_attr_saved; 5606 if (delta < mi->mi_acdirmin) 5607 delta = mi->mi_acdirmin; 5608 else if (delta > mi->mi_acdirmax) 5609 delta = mi->mi_acdirmax; 5610 } 5611 drp->r_time_attr_inval = now + delta; 5612 mutex_exit(&drp->r_statelock); 5613 dnlc_update(dvp, nm, *vpp); 5614 5615 /* 5616 * Even though we have a valid directory attr cache 5617 * and dnlc entry, we may not have access. 5618 * This should almost always hit the cache. 
5619 */ 5620 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5621 if (e.error) { 5622 VN_RELE(*vpp); 5623 *vpp = NULL; 5624 } 5625 5626 if (*vpp == DNLC_NO_VNODE) { 5627 VN_RELE(*vpp); 5628 *vpp = NULL; 5629 e.error = ENOENT; 5630 } 5631 } 5632 5633 exit: 5634 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5635 kmem_free(argop, argoplist_size); 5636 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5637 return (e.error); 5638 } 5639 5640 /* 5641 * We need to go over the wire to lookup the name, but 5642 * while we are there verify the directory has not 5643 * changed but if it has, get new attributes and check access 5644 * 5645 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH 5646 * NVERIFY GETATTR ACCESS 5647 * 5648 * With the results: 5649 * if the NVERIFY failed we must purge the caches, add new attributes, 5650 * and cache new access. 5651 * set a new r_time_attr_inval 5652 * add name to dnlc, possibly negative 5653 * if LOOKUP succeeded 5654 * cache new attributes 5655 */ 5656 static int 5657 nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5658 { 5659 COMPOUND4args_clnt args; 5660 COMPOUND4res_clnt res; 5661 fattr4 *ver_fattr; 5662 fattr4_change dchange; 5663 int32_t *ptr; 5664 nfs4_ga_res_t *garp = NULL; 5665 int argoplist_size = 9 * sizeof (nfs_argop4); 5666 nfs_argop4 *argop; 5667 int doqueue; 5668 mntinfo4_t *mi; 5669 nfs4_recov_state_t recov_state; 5670 hrtime_t t; 5671 int isdotdot; 5672 vnode_t *nvp; 5673 nfs_fh4 *fhp; 5674 nfs4_sharedfh_t *sfhp; 5675 nfs4_access_type_t cacc; 5676 rnode4_t *nrp; 5677 rnode4_t *drp = VTOR4(dvp); 5678 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5679 5680 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5681 ASSERT(nm != NULL); 5682 ASSERT(nm[0] != '\0'); 5683 ASSERT(dvp->v_type == VDIR); 5684 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5685 ASSERT(*vpp == NULL); 5686 5687 if (nm[0] == '.' && nm[1] == '.' 
&& nm[2] == '\0') { 5688 isdotdot = 1; 5689 args.ctag = TAG_LOOKUP_PARENT; 5690 } else { 5691 /* 5692 * If dvp were a stub, it should have triggered and caused 5693 * a mount for us to get this far. 5694 */ 5695 ASSERT(!RP_ISSTUB(VTOR4(dvp))); 5696 5697 isdotdot = 0; 5698 args.ctag = TAG_LOOKUP; 5699 } 5700 5701 mi = VTOMI4(dvp); 5702 recov_state.rs_flags = 0; 5703 recov_state.rs_num_retry_despite_err = 0; 5704 5705 nvp = NULL; 5706 5707 /* Save the original mount point security information */ 5708 (void) save_mnt_secinfo(mi->mi_curr_serv); 5709 5710 recov_retry: 5711 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5712 &recov_state, NULL); 5713 if (e.error) { 5714 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5715 return (e.error); 5716 } 5717 5718 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5719 5720 /* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */ 5721 args.array_len = 9; 5722 args.array = argop; 5723 5724 /* 0. putfh file */ 5725 argop[0].argop = OP_CPUTFH; 5726 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5727 5728 /* 1. savefh for the nverify */ 5729 argop[1].argop = OP_SAVEFH; 5730 5731 /* 2. lookup name */ 5732 if (isdotdot) { 5733 argop[2].argop = OP_LOOKUPP; 5734 } else { 5735 argop[2].argop = OP_CLOOKUP; 5736 argop[2].nfs_argop4_u.opclookup.cname = nm; 5737 } 5738 5739 /* 3. resulting file handle */ 5740 argop[3].argop = OP_GETFH; 5741 5742 /* 4. resulting file attributes */ 5743 argop[4].argop = OP_GETATTR; 5744 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5745 argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5746 5747 /* 5. restorefh back the directory for the nverify */ 5748 argop[5].argop = OP_RESTOREFH; 5749 5750 /* 6. 
nverify the change info */ 5751 argop[6].argop = OP_NVERIFY; 5752 ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes; 5753 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5754 ver_fattr->attrlist4 = (char *)&dchange; 5755 ptr = (int32_t *)&dchange; 5756 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5757 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5758 5759 /* 7. getattr directory */ 5760 argop[7].argop = OP_GETATTR; 5761 argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5762 argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5763 5764 /* 8. access directory */ 5765 argop[8].argop = OP_ACCESS; 5766 argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5767 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5768 5769 doqueue = 1; 5770 t = gethrtime(); 5771 5772 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5773 5774 if (!isdotdot && res.status == NFS4ERR_MOVED) { 5775 e.error = nfs4_setup_referral(dvp, nm, vpp, cr); 5776 if (e.error != 0 && *vpp != NULL) 5777 VN_RELE(*vpp); 5778 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5779 &recov_state, FALSE); 5780 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5781 kmem_free(argop, argoplist_size); 5782 return (e.error); 5783 } 5784 5785 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5786 /* 5787 * For WRONGSEC of a non-dotdot case, send secinfo directly 5788 * from this thread, do not go thru the recovery thread since 5789 * we need the nm information. 5790 * 5791 * Not doing dotdot case because there is no specification 5792 * for (PUTFH, SECINFO "..") yet. 
5793 */ 5794 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5795 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) 5796 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5797 &recov_state, FALSE); 5798 else 5799 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5800 &recov_state, TRUE); 5801 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5802 kmem_free(argop, argoplist_size); 5803 if (!e.error) 5804 goto recov_retry; 5805 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5806 return (e.error); 5807 } 5808 5809 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5810 OP_LOOKUP, NULL, NULL, NULL) == FALSE) { 5811 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5812 &recov_state, TRUE); 5813 5814 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5815 kmem_free(argop, argoplist_size); 5816 goto recov_retry; 5817 } 5818 } 5819 5820 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5821 5822 if (e.error || res.array_len == 0) { 5823 /* 5824 * If e.error isn't set, then reply has no ops (or we couldn't 5825 * be here). The only legal way to reply without an op array 5826 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5827 * be in the reply for all other status values. 5828 * 5829 * For valid replies without an ops array, return ENOTSUP 5830 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5831 * return EIO -- don't trust status. 5832 */ 5833 if (e.error == 0) 5834 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5835 ENOTSUP : EIO; 5836 5837 kmem_free(argop, argoplist_size); 5838 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5839 return (e.error); 5840 } 5841 5842 e.error = geterrno4(res.status); 5843 5844 /* 5845 * The PUTFH and SAVEFH may have failed. 
5846 */ 5847 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5848 (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) { 5849 nfs4_purge_stale_fh(e.error, dvp, cr); 5850 goto exit; 5851 } 5852 5853 /* 5854 * Check if the file exists, if it does delay entering 5855 * into the dnlc until after we update the directory 5856 * attributes so we don't cause it to get purged immediately. 5857 */ 5858 if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) { 5859 /* 5860 * The lookup failed, probably no entry 5861 */ 5862 if (e.error == ENOENT && nfs4_lookup_neg_cache) 5863 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5864 goto exit; 5865 } 5866 5867 if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5868 /* 5869 * The file exists but we can't get its fh for 5870 * some unknown reason. Error out to be safe. 5871 */ 5872 goto exit; 5873 } 5874 5875 fhp = &res.array[3].nfs_resop4_u.opgetfh.object; 5876 if (fhp->nfs_fh4_len == 0) { 5877 /* 5878 * The file exists but a bogus fh 5879 * some unknown reason. Error out to be safe. 5880 */ 5881 e.error = EIO; 5882 goto exit; 5883 } 5884 sfhp = sfh4_get(fhp, mi); 5885 5886 if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5887 sfh4_rele(&sfhp); 5888 goto exit; 5889 } 5890 garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 5891 5892 /* 5893 * The RESTOREFH may have failed 5894 */ 5895 if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) { 5896 sfh4_rele(&sfhp); 5897 e.error = EIO; 5898 goto exit; 5899 } 5900 5901 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) { 5902 /* 5903 * First make sure the NVERIFY failed as we expected, 5904 * if it didn't then be conservative and error out 5905 * as we can't trust the directory. 
5906 */ 5907 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) { 5908 sfh4_rele(&sfhp); 5909 e.error = EIO; 5910 goto exit; 5911 } 5912 5913 /* 5914 * We know the NVERIFY "failed" so the directory has changed, 5915 * so we must: 5916 * purge the caches (access and indirectly dnlc if needed) 5917 */ 5918 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5919 5920 if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5921 sfh4_rele(&sfhp); 5922 goto exit; 5923 } 5924 nfs4_attr_cache(dvp, 5925 &res.array[7].nfs_resop4_u.opgetattr.ga_res, 5926 t, cr, FALSE, NULL); 5927 5928 if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) { 5929 nfs4_purge_stale_fh(e.error, dvp, cr); 5930 sfh4_rele(&sfhp); 5931 e.error = geterrno4(res.status); 5932 goto exit; 5933 } 5934 5935 /* 5936 * Now we know the directory is valid, 5937 * cache new directory access 5938 */ 5939 nfs4_access_cache(drp, 5940 args.array[8].nfs_argop4_u.opaccess.access, 5941 res.array[8].nfs_resop4_u.opaccess.access, cr); 5942 5943 /* 5944 * recheck VEXEC access 5945 */ 5946 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr); 5947 if (cacc != NFS4_ACCESS_ALLOWED) { 5948 /* 5949 * Directory permissions might have been revoked 5950 */ 5951 if (cacc == NFS4_ACCESS_DENIED) { 5952 sfh4_rele(&sfhp); 5953 e.error = EACCES; 5954 goto exit; 5955 } 5956 5957 /* 5958 * Somehow we must not have asked for enough 5959 * so try a singleton ACCESS should never happen 5960 */ 5961 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5962 if (e.error) { 5963 sfh4_rele(&sfhp); 5964 goto exit; 5965 } 5966 } 5967 5968 e.error = geterrno4(res.status); 5969 } else { 5970 hrtime_t now; 5971 hrtime_t delta = 0; 5972 5973 e.error = 0; 5974 5975 /* 5976 * Because the NVERIFY "succeeded" we know that the 5977 * directory attributes are still valid 5978 * so update r_time_attr_inval 5979 */ 5980 now = gethrtime(); 5981 mutex_enter(&drp->r_statelock); 5982 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5983 
delta = now - drp->r_time_attr_saved; 5984 if (delta < mi->mi_acdirmin) 5985 delta = mi->mi_acdirmin; 5986 else if (delta > mi->mi_acdirmax) 5987 delta = mi->mi_acdirmax; 5988 } 5989 drp->r_time_attr_inval = now + delta; 5990 mutex_exit(&drp->r_statelock); 5991 5992 /* 5993 * Even though we have a valid directory attr cache, 5994 * we may not have access. 5995 * This should almost always hit the cache. 5996 */ 5997 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5998 if (e.error) { 5999 sfh4_rele(&sfhp); 6000 goto exit; 6001 } 6002 } 6003 6004 /* 6005 * Now we have successfully completed the lookup, if the 6006 * directory has changed we now have the valid attributes. 6007 * We also know we have directory access. 6008 * Create the new rnode and insert it in the dnlc. 6009 */ 6010 if (isdotdot) { 6011 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 6012 if (e.error) { 6013 sfh4_rele(&sfhp); 6014 goto exit; 6015 } 6016 /* 6017 * XXX if nfs4_make_dotdot uses an existing rnode 6018 * XXX it doesn't update the attributes. 
6019 * XXX for now just save them again to save an OTW 6020 */ 6021 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 6022 } else { 6023 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 6024 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 6025 } 6026 sfh4_rele(&sfhp); 6027 6028 nrp = VTOR4(nvp); 6029 mutex_enter(&nrp->r_statev4_lock); 6030 if (!nrp->created_v4) { 6031 mutex_exit(&nrp->r_statev4_lock); 6032 dnlc_update(dvp, nm, nvp); 6033 } else 6034 mutex_exit(&nrp->r_statev4_lock); 6035 6036 *vpp = nvp; 6037 6038 exit: 6039 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6040 kmem_free(argop, argoplist_size); 6041 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 6042 return (e.error); 6043 } 6044 6045 #ifdef DEBUG 6046 void 6047 nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt) 6048 { 6049 uint_t i, len; 6050 zoneid_t zoneid = getzoneid(); 6051 char *s; 6052 6053 zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where); 6054 for (i = 0; i < argcnt; i++) { 6055 nfs_argop4 *op = &argbase[i]; 6056 switch (op->argop) { 6057 case OP_CPUTFH: 6058 case OP_PUTFH: 6059 zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i); 6060 break; 6061 case OP_PUTROOTFH: 6062 zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i); 6063 break; 6064 case OP_CLOOKUP: 6065 s = op->nfs_argop4_u.opclookup.cname; 6066 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s); 6067 break; 6068 case OP_LOOKUP: 6069 s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname, 6070 &len, NULL); 6071 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s); 6072 kmem_free(s, len); 6073 break; 6074 case OP_LOOKUPP: 6075 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i); 6076 break; 6077 case OP_GETFH: 6078 zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i); 6079 break; 6080 case OP_GETATTR: 6081 zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i); 6082 break; 6083 case OP_OPENATTR: 6084 zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i); 6085 break; 6086 default: 6087 zcmn_err(zoneid, CE_NOTE, "\t op %d, 
opcode %d", i, 6088 op->argop); 6089 break; 6090 } 6091 } 6092 } 6093 #endif 6094 6095 /* 6096 * nfs4lookup_setup - constructs a multi-lookup compound request. 6097 * 6098 * Given the path "nm1/nm2/.../nmn", the following compound requests 6099 * may be created: 6100 * 6101 * Note: Getfh is not be needed because filehandle attr is mandatory, but it 6102 * is faster, for now. 6103 * 6104 * l4_getattrs indicates the type of compound requested. 6105 * 6106 * LKP4_NO_ATTRIBUTE - no attributes (used by secinfo): 6107 * 6108 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} } 6109 * 6110 * total number of ops is n + 1. 6111 * 6112 * LKP4_LAST_NAMED_ATTR - multi-component path for a named 6113 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR 6114 * before the last component, and only get attributes 6115 * for the last component. Note that the second-to-last 6116 * pathname component is XATTR_RPATH, which does NOT go 6117 * over-the-wire as a lookup. 6118 * 6119 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2}; 6120 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr } 6121 * 6122 * and total number of ops is n + 5. 6123 * 6124 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named 6125 * attribute directory: create lookups plus an OPENATTR 6126 * replacing the last lookup. Note that the last pathname 6127 * component is XATTR_RPATH, which does NOT go over-the-wire 6128 * as a lookup. 6129 * 6130 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr; 6131 * Openattr; Getfh; Getattr } 6132 * 6133 * and total number of ops is n + 5. 6134 * 6135 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate 6136 * nodes too. 6137 * 6138 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr; 6139 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr } 6140 * 6141 * and total number of ops is 3*n + 1. 
6142 * 6143 * All cases: returns the index in the arg array of the final LOOKUP op, or 6144 * -1 if no LOOKUPs were used. 6145 */ 6146 int 6147 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh) 6148 { 6149 enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs; 6150 nfs_argop4 *argbase, *argop; 6151 int arglen, argcnt; 6152 int n = 1; /* number of components */ 6153 int nga = 1; /* number of Getattr's in request */ 6154 char c = '\0', *s, *p; 6155 int lookup_idx = -1; 6156 int argoplist_size; 6157 6158 /* set lookuparg response result to 0 */ 6159 lookupargp->resp->status = NFS4_OK; 6160 6161 /* skip leading "/" or "." e.g. ".//./" if there is */ 6162 for (; ; nm++) { 6163 if (*nm != '/' && *nm != '.') 6164 break; 6165 6166 /* ".." is counted as 1 component */ 6167 if (*nm == '.' && *(nm + 1) != '/') 6168 break; 6169 } 6170 6171 /* 6172 * Find n = number of components - nm must be null terminated 6173 * Skip "." components. 6174 */ 6175 if (*nm != '\0') 6176 for (n = 1, s = nm; *s != '\0'; s++) { 6177 if ((*s == '/') && (*(s + 1) != '/') && 6178 (*(s + 1) != '\0') && 6179 !(*(s + 1) == '.' && (*(s + 2) == '/' || 6180 *(s + 2) == '\0'))) 6181 n++; 6182 } 6183 else 6184 n = 0; 6185 6186 /* 6187 * nga is number of components that need Getfh+Getattr 6188 */ 6189 switch (l4_getattrs) { 6190 case LKP4_NO_ATTRIBUTES: 6191 nga = 0; 6192 break; 6193 case LKP4_ALL_ATTRIBUTES: 6194 nga = n; 6195 /* 6196 * Always have at least 1 getfh, getattr pair 6197 */ 6198 if (nga == 0) 6199 nga++; 6200 break; 6201 case LKP4_LAST_ATTRDIR: 6202 case LKP4_LAST_NAMED_ATTR: 6203 nga = n+1; 6204 break; 6205 } 6206 6207 /* 6208 * If change to use the filehandle attr instead of getfh 6209 * the following line can be deleted. 
6210 */ 6211 nga *= 2; 6212 6213 /* 6214 * calculate number of ops in request as 6215 * header + trailer + lookups + getattrs 6216 */ 6217 arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga; 6218 6219 argoplist_size = arglen * sizeof (nfs_argop4); 6220 argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP); 6221 lookupargp->argsp->array = argop; 6222 6223 argcnt = lookupargp->header_len; 6224 argop += argcnt; 6225 6226 /* 6227 * loop and create a lookup op and possibly getattr/getfh for 6228 * each component. Skip "." components. 6229 */ 6230 for (s = nm; *s != '\0'; s = p) { 6231 /* 6232 * Set up a pathname struct for each component if needed 6233 */ 6234 while (*s == '/') 6235 s++; 6236 if (*s == '\0') 6237 break; 6238 6239 for (p = s; (*p != '/') && (*p != '\0'); p++) 6240 ; 6241 c = *p; 6242 *p = '\0'; 6243 6244 if (s[0] == '.' && s[1] == '\0') { 6245 *p = c; 6246 continue; 6247 } 6248 if (l4_getattrs == LKP4_LAST_ATTRDIR && 6249 strcmp(s, XATTR_RPATH) == 0) { 6250 /* getfh XXX may not be needed in future */ 6251 argop->argop = OP_GETFH; 6252 argop++; 6253 argcnt++; 6254 6255 /* getattr */ 6256 argop->argop = OP_GETATTR; 6257 argop->nfs_argop4_u.opgetattr.attr_request = 6258 lookupargp->ga_bits; 6259 argop->nfs_argop4_u.opgetattr.mi = 6260 lookupargp->mi; 6261 argop++; 6262 argcnt++; 6263 6264 /* openattr */ 6265 argop->argop = OP_OPENATTR; 6266 } else if (l4_getattrs == LKP4_LAST_NAMED_ATTR && 6267 strcmp(s, XATTR_RPATH) == 0) { 6268 /* openattr */ 6269 argop->argop = OP_OPENATTR; 6270 argop++; 6271 argcnt++; 6272 6273 /* getfh XXX may not be needed in future */ 6274 argop->argop = OP_GETFH; 6275 argop++; 6276 argcnt++; 6277 6278 /* getattr */ 6279 argop->argop = OP_GETATTR; 6280 argop->nfs_argop4_u.opgetattr.attr_request = 6281 lookupargp->ga_bits; 6282 argop->nfs_argop4_u.opgetattr.mi = 6283 lookupargp->mi; 6284 argop++; 6285 argcnt++; 6286 *p = c; 6287 continue; 6288 } else if (s[0] == '.' && s[1] == '.' 
&& s[2] == '\0') { 6289 /* lookupp */ 6290 argop->argop = OP_LOOKUPP; 6291 } else { 6292 /* lookup */ 6293 argop->argop = OP_LOOKUP; 6294 (void) str_to_utf8(s, 6295 &argop->nfs_argop4_u.oplookup.objname); 6296 } 6297 lookup_idx = argcnt; 6298 argop++; 6299 argcnt++; 6300 6301 *p = c; 6302 6303 if (l4_getattrs == LKP4_ALL_ATTRIBUTES) { 6304 /* getfh XXX may not be needed in future */ 6305 argop->argop = OP_GETFH; 6306 argop++; 6307 argcnt++; 6308 6309 /* getattr */ 6310 argop->argop = OP_GETATTR; 6311 argop->nfs_argop4_u.opgetattr.attr_request = 6312 lookupargp->ga_bits; 6313 argop->nfs_argop4_u.opgetattr.mi = 6314 lookupargp->mi; 6315 argop++; 6316 argcnt++; 6317 } 6318 } 6319 6320 if ((l4_getattrs != LKP4_NO_ATTRIBUTES) && 6321 ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) { 6322 if (needgetfh) { 6323 /* stick in a post-lookup getfh */ 6324 argop->argop = OP_GETFH; 6325 argcnt++; 6326 argop++; 6327 } 6328 /* post-lookup getattr */ 6329 argop->argop = OP_GETATTR; 6330 argop->nfs_argop4_u.opgetattr.attr_request = 6331 lookupargp->ga_bits; 6332 argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi; 6333 argcnt++; 6334 } 6335 argcnt += lookupargp->trailer_len; /* actual op count */ 6336 lookupargp->argsp->array_len = argcnt; 6337 lookupargp->arglen = arglen; 6338 6339 #ifdef DEBUG 6340 if (nfs4_client_lookup_debug) 6341 nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt); 6342 #endif 6343 6344 return (lookup_idx); 6345 } 6346 6347 static int 6348 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr) 6349 { 6350 COMPOUND4args_clnt args; 6351 COMPOUND4res_clnt res; 6352 GETFH4res *gf_res = NULL; 6353 nfs_argop4 argop[4]; 6354 nfs_resop4 *resop = NULL; 6355 nfs4_sharedfh_t *sfhp; 6356 hrtime_t t; 6357 nfs4_error_t e; 6358 6359 rnode4_t *drp; 6360 int doqueue = 1; 6361 vnode_t *vp; 6362 int needrecov = 0; 6363 nfs4_recov_state_t recov_state; 6364 6365 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 6366 6367 *avp = NULL; 6368 
recov_state.rs_flags = 0; 6369 recov_state.rs_num_retry_despite_err = 0; 6370 6371 recov_retry: 6372 /* COMPOUND: putfh, openattr, getfh, getattr */ 6373 args.array_len = 4; 6374 args.array = argop; 6375 args.ctag = TAG_OPENATTR; 6376 6377 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 6378 if (e.error) 6379 return (e.error); 6380 6381 drp = VTOR4(dvp); 6382 6383 /* putfh */ 6384 argop[0].argop = OP_CPUTFH; 6385 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6386 6387 /* openattr */ 6388 argop[1].argop = OP_OPENATTR; 6389 argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE); 6390 6391 /* getfh */ 6392 argop[2].argop = OP_GETFH; 6393 6394 /* getattr */ 6395 argop[3].argop = OP_GETATTR; 6396 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6397 argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 6398 6399 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 6400 "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first", 6401 rnode4info(drp))); 6402 6403 t = gethrtime(); 6404 6405 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 6406 6407 needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp); 6408 if (needrecov) { 6409 bool_t abort; 6410 6411 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 6412 "nfs4openattr: initiating recovery\n")); 6413 6414 abort = nfs4_start_recovery(&e, 6415 VTOMI4(dvp), dvp, NULL, NULL, NULL, 6416 OP_OPENATTR, NULL, NULL, NULL); 6417 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6418 if (!e.error) { 6419 e.error = geterrno4(res.status); 6420 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6421 } 6422 if (abort == FALSE) 6423 goto recov_retry; 6424 return (e.error); 6425 } 6426 6427 if (e.error) { 6428 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6429 return (e.error); 6430 } 6431 6432 if (res.status) { 6433 /* 6434 * If OTW errro is NOTSUPP, then it should be 6435 * translated to EINVAL. 
All Solaris file system 6436 * implementations return EINVAL to the syscall layer 6437 * when the attrdir cannot be created due to an 6438 * implementation restriction or noxattr mount option. 6439 */ 6440 if (res.status == NFS4ERR_NOTSUPP) { 6441 mutex_enter(&drp->r_statelock); 6442 if (drp->r_xattr_dir) 6443 VN_RELE(drp->r_xattr_dir); 6444 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP); 6445 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP; 6446 mutex_exit(&drp->r_statelock); 6447 6448 e.error = EINVAL; 6449 } else { 6450 e.error = geterrno4(res.status); 6451 } 6452 6453 if (e.error) { 6454 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6455 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 6456 needrecov); 6457 return (e.error); 6458 } 6459 } 6460 6461 resop = &res.array[0]; /* putfh res */ 6462 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK); 6463 6464 resop = &res.array[1]; /* openattr res */ 6465 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK); 6466 6467 resop = &res.array[2]; /* getfh res */ 6468 gf_res = &resop->nfs_resop4_u.opgetfh; 6469 if (gf_res->object.nfs_fh4_len == 0) { 6470 *avp = NULL; 6471 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6472 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6473 return (ENOENT); 6474 } 6475 6476 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp)); 6477 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res, 6478 dvp->v_vfsp, t, cr, dvp, 6479 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp)); 6480 sfh4_rele(&sfhp); 6481 6482 if (e.error) 6483 PURGE_ATTRCACHE4(vp); 6484 6485 mutex_enter(&vp->v_lock); 6486 vp->v_flag |= V_XATTRDIR; 6487 mutex_exit(&vp->v_lock); 6488 6489 *avp = vp; 6490 6491 mutex_enter(&drp->r_statelock); 6492 if (drp->r_xattr_dir) 6493 VN_RELE(drp->r_xattr_dir); 6494 VN_HOLD(vp); 6495 drp->r_xattr_dir = vp; 6496 6497 /* 6498 * Invalidate pathconf4 cache because r_xattr_dir is no longer 6499 * NULL. 
xattrs could be created at any time, and we have no 6500 * way to update pc4_xattr_exists in the base object if/when 6501 * it happens. 6502 */ 6503 drp->r_pathconf.pc4_xattr_valid = 0; 6504 6505 mutex_exit(&drp->r_statelock); 6506 6507 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6508 6509 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6510 6511 return (0); 6512 } 6513 6514 /* ARGSUSED */ 6515 static int 6516 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 6517 int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct, 6518 vsecattr_t *vsecp) 6519 { 6520 int error; 6521 vnode_t *vp = NULL; 6522 rnode4_t *rp; 6523 struct vattr vattr; 6524 rnode4_t *drp; 6525 vnode_t *tempvp; 6526 enum createmode4 createmode; 6527 bool_t must_trunc = FALSE; 6528 int truncating = 0; 6529 6530 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 6531 return (EPERM); 6532 if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) { 6533 return (EINVAL); 6534 } 6535 6536 /* . and .. have special meaning in the protocol, reject them. */ 6537 6538 if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0'))) 6539 return (EISDIR); 6540 6541 drp = VTOR4(dvp); 6542 6543 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 6544 return (EINTR); 6545 6546 top: 6547 /* 6548 * We make a copy of the attributes because the caller does not 6549 * expect us to change what va points to. 6550 */ 6551 vattr = *va; 6552 6553 /* 6554 * If the pathname is "", then dvp is the root vnode of 6555 * a remote file mounted over a local directory. 6556 * All that needs to be done is access 6557 * checking and truncation. Note that we avoid doing 6558 * open w/ create because the parent directory might 6559 * be in pseudo-fs and the open would fail. 
6560 */ 6561 if (*nm == '\0') { 6562 error = 0; 6563 VN_HOLD(dvp); 6564 vp = dvp; 6565 must_trunc = TRUE; 6566 } else { 6567 /* 6568 * We need to go over the wire, just to be sure whether the 6569 * file exists or not. Using the DNLC can be dangerous in 6570 * this case when making a decision regarding existence. 6571 */ 6572 error = nfs4lookup(dvp, nm, &vp, cr, 1); 6573 } 6574 6575 if (exclusive) 6576 createmode = EXCLUSIVE4; 6577 else 6578 createmode = GUARDED4; 6579 6580 /* 6581 * error would be set if the file does not exist on the 6582 * server, so lets go create it. 6583 */ 6584 if (error) { 6585 goto create_otw; 6586 } 6587 6588 /* 6589 * File does exist on the server 6590 */ 6591 if (exclusive == EXCL) 6592 error = EEXIST; 6593 else if (vp->v_type == VDIR && (mode & VWRITE)) 6594 error = EISDIR; 6595 else { 6596 /* 6597 * If vnode is a device, create special vnode. 6598 */ 6599 if (ISVDEV(vp->v_type)) { 6600 tempvp = vp; 6601 vp = specvp(vp, vp->v_rdev, vp->v_type, cr); 6602 VN_RELE(tempvp); 6603 } 6604 if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) { 6605 if ((vattr.va_mask & AT_SIZE) && 6606 vp->v_type == VREG) { 6607 rp = VTOR4(vp); 6608 /* 6609 * Check here for large file handled 6610 * by LF-unaware process (as 6611 * ufs_create() does) 6612 */ 6613 if (!(flags & FOFFMAX)) { 6614 mutex_enter(&rp->r_statelock); 6615 if (rp->r_size > MAXOFF32_T) 6616 error = EOVERFLOW; 6617 mutex_exit(&rp->r_statelock); 6618 } 6619 6620 /* if error is set then we need to return */ 6621 if (error) { 6622 nfs_rw_exit(&drp->r_rwlock); 6623 VN_RELE(vp); 6624 return (error); 6625 } 6626 6627 if (must_trunc) { 6628 vattr.va_mask = AT_SIZE; 6629 error = nfs4setattr(vp, &vattr, 0, cr, 6630 NULL); 6631 } else { 6632 /* 6633 * we know we have a regular file that already 6634 * exists and we may end up truncating the file 6635 * as a result of the open_otw, so flush out 6636 * any dirty pages for this file first. 
6637 */ 6638 if (nfs4_has_pages(vp) && 6639 ((rp->r_flags & R4DIRTY) || 6640 rp->r_count > 0 || 6641 rp->r_mapcnt > 0)) { 6642 error = nfs4_putpage(vp, 6643 (offset_t)0, 0, 0, cr, ct); 6644 if (error && (error == ENOSPC || 6645 error == EDQUOT)) { 6646 mutex_enter( 6647 &rp->r_statelock); 6648 if (!rp->r_error) 6649 rp->r_error = 6650 error; 6651 mutex_exit( 6652 &rp->r_statelock); 6653 } 6654 } 6655 vattr.va_mask = (AT_SIZE | 6656 AT_TYPE | AT_MODE); 6657 vattr.va_type = VREG; 6658 createmode = UNCHECKED4; 6659 truncating = 1; 6660 goto create_otw; 6661 } 6662 } 6663 } 6664 } 6665 nfs_rw_exit(&drp->r_rwlock); 6666 if (error) { 6667 VN_RELE(vp); 6668 } else { 6669 vnode_t *tvp; 6670 rnode4_t *trp; 6671 tvp = vp; 6672 if (vp->v_type == VREG) { 6673 trp = VTOR4(vp); 6674 if (IS_SHADOW(vp, trp)) 6675 tvp = RTOV4(trp); 6676 } 6677 6678 if (must_trunc) { 6679 /* 6680 * existing file got truncated, notify. 6681 */ 6682 vnevent_create(tvp, ct); 6683 } 6684 6685 *vpp = vp; 6686 } 6687 return (error); 6688 6689 create_otw: 6690 dnlc_remove(dvp, nm); 6691 6692 ASSERT(vattr.va_mask & AT_TYPE); 6693 6694 /* 6695 * If not a regular file let nfs4mknod() handle it. 6696 */ 6697 if (vattr.va_type != VREG) { 6698 error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr); 6699 nfs_rw_exit(&drp->r_rwlock); 6700 return (error); 6701 } 6702 6703 /* 6704 * It _is_ a regular file. 6705 */ 6706 ASSERT(vattr.va_mask & AT_MODE); 6707 if (MANDMODE(vattr.va_mode)) { 6708 nfs_rw_exit(&drp->r_rwlock); 6709 return (EACCES); 6710 } 6711 6712 /* 6713 * If this happens to be a mknod of a regular file, then flags will 6714 * have neither FREAD or FWRITE. However, we must set at least one 6715 * for the call to nfs4open_otw. If it's open(O_CREAT) driving 6716 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been 6717 * set (based on openmode specified by app). 
6718 */ 6719 if ((flags & (FREAD|FWRITE)) == 0) 6720 flags |= (FREAD|FWRITE); 6721 6722 error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0); 6723 6724 if (vp != NULL) { 6725 /* if create was successful, throw away the file's pages */ 6726 if (!error && (vattr.va_mask & AT_SIZE)) 6727 nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK), 6728 cr); 6729 /* release the lookup hold */ 6730 VN_RELE(vp); 6731 vp = NULL; 6732 } 6733 6734 /* 6735 * validate that we opened a regular file. This handles a misbehaving 6736 * server that returns an incorrect FH. 6737 */ 6738 if ((error == 0) && *vpp && (*vpp)->v_type != VREG) { 6739 error = EISDIR; 6740 VN_RELE(*vpp); 6741 } 6742 6743 /* 6744 * If this is not an exclusive create, then the CREATE 6745 * request will be made with the GUARDED mode set. This 6746 * means that the server will return EEXIST if the file 6747 * exists. The file could exist because of a retransmitted 6748 * request. In this case, we recover by starting over and 6749 * checking to see whether the file exists. This second 6750 * time through it should and a CREATE request will not be 6751 * sent. 6752 * 6753 * This handles the problem of a dangling CREATE request 6754 * which contains attributes which indicate that the file 6755 * should be truncated. This retransmitted request could 6756 * possibly truncate valid data in the file if not caught 6757 * by the duplicate request mechanism on the server or if 6758 * not caught by other means. The scenario is: 6759 * 6760 * Client transmits CREATE request with size = 0 6761 * Client times out, retransmits request. 6762 * Response to the first request arrives from the server 6763 * and the client proceeds on. 6764 * Client writes data to the file. 6765 * The server now processes retransmitted CREATE request 6766 * and truncates file. 
6767 * 6768 * The use of the GUARDED CREATE request prevents this from 6769 * happening because the retransmitted CREATE would fail 6770 * with EEXIST and would not truncate the file. 6771 */ 6772 if (error == EEXIST && exclusive == NONEXCL) { 6773 #ifdef DEBUG 6774 nfs4_create_misses++; 6775 #endif 6776 goto top; 6777 } 6778 nfs_rw_exit(&drp->r_rwlock); 6779 if (truncating && !error && *vpp) { 6780 vnode_t *tvp; 6781 rnode4_t *trp; 6782 /* 6783 * existing file got truncated, notify. 6784 */ 6785 tvp = *vpp; 6786 trp = VTOR4(tvp); 6787 if (IS_SHADOW(tvp, trp)) 6788 tvp = RTOV4(trp); 6789 vnevent_create(tvp, ct); 6790 } 6791 return (error); 6792 } 6793 6794 /* 6795 * Create compound (for mkdir, mknod, symlink): 6796 * { Putfh <dfh>; Create; Getfh; Getattr } 6797 * It's okay if setattr failed to set gid - this is not considered 6798 * an error, but purge attrs in that case. 6799 */ 6800 static int 6801 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va, 6802 vnode_t **vpp, cred_t *cr, nfs_ftype4 type) 6803 { 6804 int need_end_op = FALSE; 6805 COMPOUND4args_clnt args; 6806 COMPOUND4res_clnt res, *resp = NULL; 6807 nfs_argop4 *argop; 6808 nfs_resop4 *resop; 6809 int doqueue; 6810 mntinfo4_t *mi; 6811 rnode4_t *drp = VTOR4(dvp); 6812 change_info4 *cinfo; 6813 GETFH4res *gf_res; 6814 struct vattr vattr; 6815 vnode_t *vp; 6816 fattr4 *crattr; 6817 bool_t needrecov = FALSE; 6818 nfs4_recov_state_t recov_state; 6819 nfs4_sharedfh_t *sfhp = NULL; 6820 hrtime_t t; 6821 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 6822 int numops, argoplist_size, setgid_flag, idx_create, idx_fattr; 6823 dirattr_info_t dinfo, *dinfop; 6824 servinfo4_t *svp; 6825 bitmap4 supp_attrs; 6826 6827 ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK || 6828 type == NF4CHR || type == NF4SOCK || type == NF4FIFO); 6829 6830 mi = VTOMI4(dvp); 6831 6832 /* 6833 * Make sure we properly deal with setting the right gid 6834 * on a new directory to reflect the parent's setgid bit 
6835 */ 6836 setgid_flag = 0; 6837 if (type == NF4DIR) { 6838 struct vattr dva; 6839 6840 va->va_mode &= ~VSGID; 6841 dva.va_mask = AT_MODE | AT_GID; 6842 if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) { 6843 6844 /* 6845 * If the parent's directory has the setgid bit set 6846 * _and_ the client was able to get a valid mapping 6847 * for the parent dir's owner_group, we want to 6848 * append NVERIFY(owner_group == dva.va_gid) and 6849 * SETTATTR to the CREATE compound. 6850 */ 6851 if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) { 6852 setgid_flag = 1; 6853 va->va_mode |= VSGID; 6854 if (dva.va_gid != GID_NOBODY) { 6855 va->va_mask |= AT_GID; 6856 va->va_gid = dva.va_gid; 6857 } 6858 } 6859 } 6860 } 6861 6862 /* 6863 * Create ops: 6864 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new) 6865 * 5:restorefh(dir) 6:getattr(dir) 6866 * 6867 * if (setgid) 6868 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new) 6869 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 6870 * 8:nverify 9:setattr 6871 */ 6872 if (setgid_flag) { 6873 numops = 10; 6874 idx_create = 1; 6875 idx_fattr = 3; 6876 } else { 6877 numops = 7; 6878 idx_create = 2; 6879 idx_fattr = 4; 6880 } 6881 6882 ASSERT(nfs_zone() == mi->mi_zone); 6883 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) { 6884 return (EINTR); 6885 } 6886 recov_state.rs_flags = 0; 6887 recov_state.rs_num_retry_despite_err = 0; 6888 6889 argoplist_size = numops * sizeof (nfs_argop4); 6890 argop = kmem_alloc(argoplist_size, KM_SLEEP); 6891 6892 recov_retry: 6893 if (type == NF4LNK) 6894 args.ctag = TAG_SYMLINK; 6895 else if (type == NF4DIR) 6896 args.ctag = TAG_MKDIR; 6897 else 6898 args.ctag = TAG_MKNOD; 6899 6900 args.array_len = numops; 6901 args.array = argop; 6902 6903 if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) { 6904 nfs_rw_exit(&drp->r_rwlock); 6905 kmem_free(argop, argoplist_size); 6906 return (e.error); 6907 } 6908 need_end_op = TRUE; 6909 6910 6911 /* 0: putfh directory */ 
6912 argop[0].argop = OP_CPUTFH; 6913 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6914 6915 /* 1/2: Create object */ 6916 argop[idx_create].argop = OP_CCREATE; 6917 argop[idx_create].nfs_argop4_u.opccreate.cname = nm; 6918 argop[idx_create].nfs_argop4_u.opccreate.type = type; 6919 if (type == NF4LNK) { 6920 /* 6921 * symlink, treat name as data 6922 */ 6923 ASSERT(data != NULL); 6924 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata = 6925 (char *)data; 6926 } 6927 if (type == NF4BLK || type == NF4CHR) { 6928 ASSERT(data != NULL); 6929 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata = 6930 *((specdata4 *)data); 6931 } 6932 6933 crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs; 6934 6935 svp = drp->r_server; 6936 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 6937 supp_attrs = svp->sv_supp_attrs; 6938 nfs_rw_exit(&svp->sv_lock); 6939 6940 if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) { 6941 nfs_rw_exit(&drp->r_rwlock); 6942 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 6943 e.error = EINVAL; 6944 kmem_free(argop, argoplist_size); 6945 return (e.error); 6946 } 6947 6948 /* 2/3: getfh fh of created object */ 6949 ASSERT(idx_create + 1 == idx_fattr - 1); 6950 argop[idx_create + 1].argop = OP_GETFH; 6951 6952 /* 3/4: getattr of new object */ 6953 argop[idx_fattr].argop = OP_GETATTR; 6954 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6955 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi; 6956 6957 if (setgid_flag) { 6958 vattr_t _v; 6959 6960 argop[4].argop = OP_SAVEFH; 6961 6962 argop[5].argop = OP_CPUTFH; 6963 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6964 6965 argop[6].argop = OP_GETATTR; 6966 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6967 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6968 6969 argop[7].argop = OP_RESTOREFH; 6970 6971 /* 6972 * nverify 6973 * 6974 * XXX - Revisit the last argument to nfs4_end_op() 6975 * once 5020486 is fixed. 
6976 */ 6977 _v.va_mask = AT_GID; 6978 _v.va_gid = va->va_gid; 6979 if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 6980 supp_attrs)) { 6981 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6982 nfs_rw_exit(&drp->r_rwlock); 6983 nfs4_fattr4_free(crattr); 6984 kmem_free(argop, argoplist_size); 6985 return (e.error); 6986 } 6987 6988 /* 6989 * setattr 6990 * 6991 * We _know_ we're not messing with AT_SIZE or AT_XTIME, 6992 * so no need for stateid or flags. Also we specify NULL 6993 * rp since we're only interested in setting owner_group 6994 * attributes. 6995 */ 6996 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs, 6997 &e.error, 0); 6998 6999 if (e.error) { 7000 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 7001 nfs_rw_exit(&drp->r_rwlock); 7002 nfs4_fattr4_free(crattr); 7003 nfs4args_verify_free(&argop[8]); 7004 kmem_free(argop, argoplist_size); 7005 return (e.error); 7006 } 7007 } else { 7008 argop[1].argop = OP_SAVEFH; 7009 7010 argop[5].argop = OP_RESTOREFH; 7011 7012 argop[6].argop = OP_GETATTR; 7013 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7014 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7015 } 7016 7017 dnlc_remove(dvp, nm); 7018 7019 doqueue = 1; 7020 t = gethrtime(); 7021 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7022 7023 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7024 if (e.error) { 7025 PURGE_ATTRCACHE4(dvp); 7026 if (!needrecov) 7027 goto out; 7028 } 7029 7030 if (needrecov) { 7031 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 7032 OP_CREATE, NULL, NULL, NULL) == FALSE) { 7033 nfs4_end_op(mi, dvp, NULL, &recov_state, 7034 needrecov); 7035 need_end_op = FALSE; 7036 nfs4_fattr4_free(crattr); 7037 if (setgid_flag) { 7038 nfs4args_verify_free(&argop[8]); 7039 nfs4args_setattr_free(&argop[9]); 7040 } 7041 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 7042 goto recov_retry; 7043 } 7044 } 7045 7046 resp = &res; 7047 7048 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 
7049 7050 if (res.status == NFS4ERR_BADOWNER) 7051 nfs4_log_badowner(mi, OP_CREATE); 7052 7053 e.error = geterrno4(res.status); 7054 7055 /* 7056 * This check is left over from when create was implemented 7057 * using a setattr op (instead of createattrs). If the 7058 * putfh/create/getfh failed, the error was returned. If 7059 * setattr/getattr failed, we keep going. 7060 * 7061 * It might be better to get rid of the GETFH also, and just 7062 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory. 7063 * Then if any of the operations failed, we could return the 7064 * error now, and remove much of the error code below. 7065 */ 7066 if (res.array_len <= idx_fattr) { 7067 /* 7068 * Either Putfh, Create or Getfh failed. 7069 */ 7070 PURGE_ATTRCACHE4(dvp); 7071 /* 7072 * nfs4_purge_stale_fh() may generate otw calls through 7073 * nfs4_invalidate_pages. Hence the need to call 7074 * nfs4_end_op() here to avoid nfs4_start_op() deadlock. 7075 */ 7076 nfs4_end_op(mi, dvp, NULL, &recov_state, 7077 needrecov); 7078 need_end_op = FALSE; 7079 nfs4_purge_stale_fh(e.error, dvp, cr); 7080 goto out; 7081 } 7082 } 7083 7084 resop = &res.array[idx_create]; /* create res */ 7085 cinfo = &resop->nfs_resop4_u.opcreate.cinfo; 7086 7087 resop = &res.array[idx_create + 1]; /* getfh res */ 7088 gf_res = &resop->nfs_resop4_u.opgetfh; 7089 7090 sfhp = sfh4_get(&gf_res->object, mi); 7091 if (e.error) { 7092 *vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp, 7093 fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 7094 if (vp->v_type == VNON) { 7095 vattr.va_mask = AT_TYPE; 7096 /* 7097 * Need to call nfs4_end_op before nfs4getattr to avoid 7098 * potential nfs4_start_op deadlock. See RFE 4777612. 
7099 */ 7100 nfs4_end_op(mi, dvp, NULL, &recov_state, 7101 needrecov); 7102 need_end_op = FALSE; 7103 e.error = nfs4getattr(vp, &vattr, cr); 7104 if (e.error) { 7105 VN_RELE(vp); 7106 *vpp = NULL; 7107 goto out; 7108 } 7109 vp->v_type = vattr.va_type; 7110 } 7111 e.error = 0; 7112 } else { 7113 *vpp = vp = makenfs4node(sfhp, 7114 &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res, 7115 dvp->v_vfsp, t, cr, 7116 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 7117 } 7118 7119 /* 7120 * If compound succeeded, then update dir attrs 7121 */ 7122 if (res.status == NFS4_OK) { 7123 dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 7124 dinfo.di_cred = cr; 7125 dinfo.di_time_call = t; 7126 dinfop = &dinfo; 7127 } else 7128 dinfop = NULL; 7129 7130 /* Update directory cache attribute, readdir and dnlc caches */ 7131 nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop); 7132 7133 out: 7134 if (sfhp != NULL) 7135 sfh4_rele(&sfhp); 7136 nfs_rw_exit(&drp->r_rwlock); 7137 nfs4_fattr4_free(crattr); 7138 if (setgid_flag) { 7139 nfs4args_verify_free(&argop[8]); 7140 nfs4args_setattr_free(&argop[9]); 7141 } 7142 if (resp) 7143 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7144 if (need_end_op) 7145 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 7146 7147 kmem_free(argop, argoplist_size); 7148 return (e.error); 7149 } 7150 7151 /* ARGSUSED */ 7152 static int 7153 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 7154 int mode, vnode_t **vpp, cred_t *cr) 7155 { 7156 int error; 7157 vnode_t *vp; 7158 nfs_ftype4 type; 7159 specdata4 spec, *specp = NULL; 7160 7161 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 7162 7163 switch (va->va_type) { 7164 case VCHR: 7165 case VBLK: 7166 type = (va->va_type == VCHR) ? 
NF4CHR : NF4BLK; 7167 spec.specdata1 = getmajor(va->va_rdev); 7168 spec.specdata2 = getminor(va->va_rdev); 7169 specp = &spec; 7170 break; 7171 7172 case VFIFO: 7173 type = NF4FIFO; 7174 break; 7175 case VSOCK: 7176 type = NF4SOCK; 7177 break; 7178 7179 default: 7180 return (EINVAL); 7181 } 7182 7183 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type); 7184 if (error) { 7185 return (error); 7186 } 7187 7188 /* 7189 * This might not be needed any more; special case to deal 7190 * with problematic v2/v3 servers. Since create was unable 7191 * to set group correctly, not sure what hope setattr has. 7192 */ 7193 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) { 7194 va->va_mask = AT_GID; 7195 (void) nfs4setattr(vp, va, 0, cr, NULL); 7196 } 7197 7198 /* 7199 * If vnode is a device create special vnode 7200 */ 7201 if (ISVDEV(vp->v_type)) { 7202 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 7203 VN_RELE(vp); 7204 } else { 7205 *vpp = vp; 7206 } 7207 return (error); 7208 } 7209 7210 /* 7211 * Remove requires that the current fh be the target directory. 7212 * After the operation, the current fh is unchanged. 7213 * The compound op structure is: 7214 * PUTFH(targetdir), REMOVE 7215 * 7216 * Weirdness: if the vnode to be removed is open 7217 * we rename it instead of removing it and nfs_inactive 7218 * will remove the new name. 
7219 */ 7220 /* ARGSUSED */ 7221 static int 7222 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags) 7223 { 7224 COMPOUND4args_clnt args; 7225 COMPOUND4res_clnt res, *resp = NULL; 7226 REMOVE4res *rm_res; 7227 nfs_argop4 argop[3]; 7228 nfs_resop4 *resop; 7229 vnode_t *vp; 7230 char *tmpname; 7231 int doqueue; 7232 mntinfo4_t *mi; 7233 rnode4_t *rp; 7234 rnode4_t *drp; 7235 int needrecov = 0; 7236 nfs4_recov_state_t recov_state; 7237 int isopen; 7238 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7239 dirattr_info_t dinfo; 7240 7241 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 7242 return (EPERM); 7243 drp = VTOR4(dvp); 7244 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 7245 return (EINTR); 7246 7247 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 7248 if (e.error) { 7249 nfs_rw_exit(&drp->r_rwlock); 7250 return (e.error); 7251 } 7252 7253 if (vp->v_type == VDIR) { 7254 VN_RELE(vp); 7255 nfs_rw_exit(&drp->r_rwlock); 7256 return (EISDIR); 7257 } 7258 7259 /* 7260 * First just remove the entry from the name cache, as it 7261 * is most likely the only entry for this vp. 7262 */ 7263 dnlc_remove(dvp, nm); 7264 7265 rp = VTOR4(vp); 7266 7267 /* 7268 * For regular file types, check to see if the file is open by looking 7269 * at the open streams. 7270 * For all other types, check the reference count on the vnode. Since 7271 * they are not opened OTW they never have an open stream. 7272 * 7273 * If the file is open, rename it to .nfsXXXX. 7274 */ 7275 if (vp->v_type != VREG) { 7276 /* 7277 * If the file has a v_count > 1 then there may be more than one 7278 * entry in the name cache due multiple links or an open file, 7279 * but we don't have the real reference count so flush all 7280 * possible entries. 7281 */ 7282 if (vp->v_count > 1) 7283 dnlc_purge_vp(vp); 7284 7285 /* 7286 * Now we have the real reference count. 
7287 */ 7288 isopen = vp->v_count > 1; 7289 } else { 7290 mutex_enter(&rp->r_os_lock); 7291 isopen = list_head(&rp->r_open_streams) != NULL; 7292 mutex_exit(&rp->r_os_lock); 7293 } 7294 7295 mutex_enter(&rp->r_statelock); 7296 if (isopen && 7297 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) { 7298 mutex_exit(&rp->r_statelock); 7299 tmpname = newname(); 7300 e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct); 7301 if (e.error) 7302 kmem_free(tmpname, MAXNAMELEN); 7303 else { 7304 mutex_enter(&rp->r_statelock); 7305 if (rp->r_unldvp == NULL) { 7306 VN_HOLD(dvp); 7307 rp->r_unldvp = dvp; 7308 if (rp->r_unlcred != NULL) 7309 crfree(rp->r_unlcred); 7310 crhold(cr); 7311 rp->r_unlcred = cr; 7312 rp->r_unlname = tmpname; 7313 } else { 7314 kmem_free(rp->r_unlname, MAXNAMELEN); 7315 rp->r_unlname = tmpname; 7316 } 7317 mutex_exit(&rp->r_statelock); 7318 } 7319 VN_RELE(vp); 7320 nfs_rw_exit(&drp->r_rwlock); 7321 return (e.error); 7322 } 7323 /* 7324 * Actually remove the file/dir 7325 */ 7326 mutex_exit(&rp->r_statelock); 7327 7328 /* 7329 * We need to flush any dirty pages which happen to 7330 * be hanging around before removing the file. 7331 * This shouldn't happen very often since in NFSv4 7332 * we should be close to open consistent. 
7333 */ 7334 if (nfs4_has_pages(vp) && 7335 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 7336 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct); 7337 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 7338 mutex_enter(&rp->r_statelock); 7339 if (!rp->r_error) 7340 rp->r_error = e.error; 7341 mutex_exit(&rp->r_statelock); 7342 } 7343 } 7344 7345 mi = VTOMI4(dvp); 7346 7347 (void) nfs4delegreturn(rp, NFS4_DR_REOPEN); 7348 recov_state.rs_flags = 0; 7349 recov_state.rs_num_retry_despite_err = 0; 7350 7351 recov_retry: 7352 /* 7353 * Remove ops: putfh dir; remove 7354 */ 7355 args.ctag = TAG_REMOVE; 7356 args.array_len = 3; 7357 args.array = argop; 7358 7359 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 7360 if (e.error) { 7361 nfs_rw_exit(&drp->r_rwlock); 7362 VN_RELE(vp); 7363 return (e.error); 7364 } 7365 7366 /* putfh directory */ 7367 argop[0].argop = OP_CPUTFH; 7368 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 7369 7370 /* remove */ 7371 argop[1].argop = OP_CREMOVE; 7372 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 7373 7374 /* getattr dir */ 7375 argop[2].argop = OP_GETATTR; 7376 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7377 argop[2].nfs_argop4_u.opgetattr.mi = mi; 7378 7379 doqueue = 1; 7380 dinfo.di_time_call = gethrtime(); 7381 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7382 7383 PURGE_ATTRCACHE4(vp); 7384 7385 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7386 if (e.error) 7387 PURGE_ATTRCACHE4(dvp); 7388 7389 if (needrecov) { 7390 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, 7391 NULL, NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) { 7392 if (!e.error) 7393 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 7394 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 7395 needrecov); 7396 goto recov_retry; 7397 } 7398 } 7399 7400 /* 7401 * Matching nfs4_end_op() for start_op() above. 
7402 * There is a path in the code below which calls 7403 * nfs4_purge_stale_fh(), which may generate otw calls through 7404 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 7405 * here to avoid nfs4_start_op() deadlock. 7406 */ 7407 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 7408 7409 if (!e.error) { 7410 resp = &res; 7411 7412 if (res.status) { 7413 e.error = geterrno4(res.status); 7414 PURGE_ATTRCACHE4(dvp); 7415 nfs4_purge_stale_fh(e.error, dvp, cr); 7416 } else { 7417 resop = &res.array[1]; /* remove res */ 7418 rm_res = &resop->nfs_resop4_u.opremove; 7419 7420 dinfo.di_garp = 7421 &res.array[2].nfs_resop4_u.opgetattr.ga_res; 7422 dinfo.di_cred = cr; 7423 7424 /* Update directory attr, readdir and dnlc caches */ 7425 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 7426 &dinfo); 7427 } 7428 } 7429 nfs_rw_exit(&drp->r_rwlock); 7430 if (resp) 7431 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7432 7433 if (e.error == 0) { 7434 vnode_t *tvp; 7435 rnode4_t *trp; 7436 trp = VTOR4(vp); 7437 tvp = vp; 7438 if (IS_SHADOW(vp, trp)) 7439 tvp = RTOV4(trp); 7440 vnevent_remove(tvp, dvp, nm, ct); 7441 } 7442 VN_RELE(vp); 7443 return (e.error); 7444 } 7445 7446 /* 7447 * Link requires that the current fh be the target directory and the 7448 * saved fh be the source fh. After the operation, the current fh is unchanged. 
7449 * Thus the compound op structure is: 7450 * PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, RESTOREFH, 7451 * GETATTR(file) 7452 */ 7453 /* ARGSUSED */ 7454 static int 7455 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr, 7456 caller_context_t *ct, int flags) 7457 { 7458 COMPOUND4args_clnt args; 7459 COMPOUND4res_clnt res, *resp = NULL; 7460 LINK4res *ln_res; 7461 int argoplist_size = 7 * sizeof (nfs_argop4); 7462 nfs_argop4 *argop; 7463 nfs_resop4 *resop; 7464 vnode_t *realvp, *nvp; 7465 int doqueue; 7466 mntinfo4_t *mi; 7467 rnode4_t *tdrp; 7468 bool_t needrecov = FALSE; 7469 nfs4_recov_state_t recov_state; 7470 hrtime_t t; 7471 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7472 dirattr_info_t dinfo; 7473 7474 ASSERT(*tnm != '\0'); 7475 ASSERT(tdvp->v_type == VDIR); 7476 ASSERT(nfs4_consistent_type(tdvp)); 7477 ASSERT(nfs4_consistent_type(svp)); 7478 7479 if (nfs_zone() != VTOMI4(tdvp)->mi_zone) 7480 return (EPERM); 7481 if (VOP_REALVP(svp, &realvp, ct) == 0) { 7482 svp = realvp; 7483 ASSERT(nfs4_consistent_type(svp)); 7484 } 7485 7486 tdrp = VTOR4(tdvp); 7487 mi = VTOMI4(svp); 7488 7489 if (!(mi->mi_flags & MI4_LINK)) { 7490 return (EOPNOTSUPP); 7491 } 7492 recov_state.rs_flags = 0; 7493 recov_state.rs_num_retry_despite_err = 0; 7494 7495 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp))) 7496 return (EINTR); 7497 7498 recov_retry: 7499 argop = kmem_alloc(argoplist_size, KM_SLEEP); 7500 7501 args.ctag = TAG_LINK; 7502 7503 /* 7504 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir); 7505 * restorefh; getattr(fl) 7506 */ 7507 args.array_len = 7; 7508 args.array = argop; 7509 7510 e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state); 7511 if (e.error) { 7512 kmem_free(argop, argoplist_size); 7513 nfs_rw_exit(&tdrp->r_rwlock); 7514 return (e.error); 7515 } 7516 7517 /* 0. putfh file */ 7518 argop[0].argop = OP_CPUTFH; 7519 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh; 7520 7521 /* 1. 
save current fh to free up the space for the dir */ 7522 argop[1].argop = OP_SAVEFH; 7523 7524 /* 2. putfh targetdir */ 7525 argop[2].argop = OP_CPUTFH; 7526 argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh; 7527 7528 /* 3. link: current_fh is targetdir, saved_fh is source */ 7529 argop[3].argop = OP_CLINK; 7530 argop[3].nfs_argop4_u.opclink.cnewname = tnm; 7531 7532 /* 4. Get attributes of dir */ 7533 argop[4].argop = OP_GETATTR; 7534 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7535 argop[4].nfs_argop4_u.opgetattr.mi = mi; 7536 7537 /* 5. If link was successful, restore current vp to file */ 7538 argop[5].argop = OP_RESTOREFH; 7539 7540 /* 6. Get attributes of linked object */ 7541 argop[6].argop = OP_GETATTR; 7542 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7543 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7544 7545 dnlc_remove(tdvp, tnm); 7546 7547 doqueue = 1; 7548 t = gethrtime(); 7549 7550 rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e); 7551 7552 needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp); 7553 if (e.error != 0 && !needrecov) { 7554 PURGE_ATTRCACHE4(tdvp); 7555 PURGE_ATTRCACHE4(svp); 7556 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7557 goto out; 7558 } 7559 7560 if (needrecov) { 7561 bool_t abort; 7562 7563 abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp, 7564 NULL, NULL, OP_LINK, NULL, NULL, NULL); 7565 if (abort == FALSE) { 7566 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, 7567 needrecov); 7568 kmem_free(argop, argoplist_size); 7569 if (!e.error) 7570 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 7571 goto recov_retry; 7572 } else { 7573 if (e.error != 0) { 7574 PURGE_ATTRCACHE4(tdvp); 7575 PURGE_ATTRCACHE4(svp); 7576 nfs4_end_op(VTOMI4(svp), svp, tdvp, 7577 &recov_state, needrecov); 7578 goto out; 7579 } 7580 /* fall through for res.status case */ 7581 } 7582 } 7583 7584 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7585 7586 resp = &res; 7587 if 
(res.status) { 7588 /* If link succeeded, then don't return error */ 7589 e.error = geterrno4(res.status); 7590 if (res.array_len <= 4) { 7591 /* 7592 * Either Putfh, Savefh, Putfh dir, or Link failed 7593 */ 7594 PURGE_ATTRCACHE4(svp); 7595 PURGE_ATTRCACHE4(tdvp); 7596 if (e.error == EOPNOTSUPP) { 7597 mutex_enter(&mi->mi_lock); 7598 mi->mi_flags &= ~MI4_LINK; 7599 mutex_exit(&mi->mi_lock); 7600 } 7601 /* Remap EISDIR to EPERM for non-root user for SVVS */ 7602 /* XXX-LP */ 7603 if (e.error == EISDIR && crgetuid(cr) != 0) 7604 e.error = EPERM; 7605 goto out; 7606 } 7607 } 7608 7609 /* either no error or one of the postop getattr failed */ 7610 7611 /* 7612 * XXX - if LINK succeeded, but no attrs were returned for link 7613 * file, purge its cache. 7614 * 7615 * XXX Perform a simplified version of wcc checking. Instead of 7616 * have another getattr to get pre-op, just purge cache if 7617 * any of the ops prior to and including the getattr failed. 7618 * If the getattr succeeded then update the attrcache accordingly. 7619 */ 7620 7621 /* 7622 * update cache with link file postattrs. 7623 * Note: at this point resop points to link res. 7624 */ 7625 resop = &res.array[3]; /* link res */ 7626 ln_res = &resop->nfs_resop4_u.oplink; 7627 if (res.status == NFS4_OK) 7628 e.error = nfs4_update_attrcache(res.status, 7629 &res.array[6].nfs_resop4_u.opgetattr.ga_res, 7630 t, svp, cr); 7631 7632 /* 7633 * Call makenfs4node to create the new shadow vp for tnm. 7634 * We pass NULL attrs because we just cached attrs for 7635 * the src object. All we're trying to accomplish is to 7636 * to create the new shadow vnode. 
7637 */ 7638 nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr, 7639 tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm, VTOR4(svp)->r_fh)); 7640 7641 /* Update target cache attribute, readdir and dnlc caches */ 7642 dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 7643 dinfo.di_time_call = t; 7644 dinfo.di_cred = cr; 7645 7646 nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo); 7647 ASSERT(nfs4_consistent_type(tdvp)); 7648 ASSERT(nfs4_consistent_type(svp)); 7649 ASSERT(nfs4_consistent_type(nvp)); 7650 VN_RELE(nvp); 7651 7652 if (!e.error) { 7653 vnode_t *tvp; 7654 rnode4_t *trp; 7655 /* 7656 * Notify the source file of this link operation. 7657 */ 7658 trp = VTOR4(svp); 7659 tvp = svp; 7660 if (IS_SHADOW(svp, trp)) 7661 tvp = RTOV4(trp); 7662 vnevent_link(tvp, ct); 7663 } 7664 out: 7665 kmem_free(argop, argoplist_size); 7666 if (resp) 7667 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7668 7669 nfs_rw_exit(&tdrp->r_rwlock); 7670 7671 return (e.error); 7672 } 7673 7674 /* ARGSUSED */ 7675 static int 7676 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 7677 caller_context_t *ct, int flags) 7678 { 7679 vnode_t *realvp; 7680 7681 if (nfs_zone() != VTOMI4(odvp)->mi_zone) 7682 return (EPERM); 7683 if (VOP_REALVP(ndvp, &realvp, ct) == 0) 7684 ndvp = realvp; 7685 7686 return (nfs4rename(odvp, onm, ndvp, nnm, cr, ct)); 7687 } 7688 7689 /* 7690 * nfs4rename does the real work of renaming in NFS Version 4. 7691 * 7692 * A file handle is considered volatile for renaming purposes if either 7693 * of the volatile bits are turned on. However, the compound may differ 7694 * based on the likelihood of the filehandle to change during rename. 
7695 */ 7696 static int 7697 nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 7698 caller_context_t *ct) 7699 { 7700 int error; 7701 mntinfo4_t *mi; 7702 vnode_t *nvp = NULL; 7703 vnode_t *ovp = NULL; 7704 char *tmpname = NULL; 7705 rnode4_t *rp; 7706 rnode4_t *odrp; 7707 rnode4_t *ndrp; 7708 int did_link = 0; 7709 int do_link = 1; 7710 nfsstat4 stat = NFS4_OK; 7711 7712 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 7713 ASSERT(nfs4_consistent_type(odvp)); 7714 ASSERT(nfs4_consistent_type(ndvp)); 7715 7716 if (onm[0] == '.' && (onm[1] == '\0' || 7717 (onm[1] == '.' && onm[2] == '\0'))) 7718 return (EINVAL); 7719 7720 if (nnm[0] == '.' && (nnm[1] == '\0' || 7721 (nnm[1] == '.' && nnm[2] == '\0'))) 7722 return (EINVAL); 7723 7724 odrp = VTOR4(odvp); 7725 ndrp = VTOR4(ndvp); 7726 if ((intptr_t)odrp < (intptr_t)ndrp) { 7727 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) 7728 return (EINTR); 7729 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) { 7730 nfs_rw_exit(&odrp->r_rwlock); 7731 return (EINTR); 7732 } 7733 } else { 7734 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) 7735 return (EINTR); 7736 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) { 7737 nfs_rw_exit(&ndrp->r_rwlock); 7738 return (EINTR); 7739 } 7740 } 7741 7742 /* 7743 * Lookup the target file. If it exists, it needs to be 7744 * checked to see whether it is a mount point and whether 7745 * it is active (open). 7746 */ 7747 error = nfs4lookup(ndvp, nnm, &nvp, cr, 0); 7748 if (!error) { 7749 int isactive; 7750 7751 ASSERT(nfs4_consistent_type(nvp)); 7752 /* 7753 * If this file has been mounted on, then just 7754 * return busy because renaming to it would remove 7755 * the mounted file system from the name space. 
7756 */ 7757 if (vn_ismntpt(nvp)) { 7758 VN_RELE(nvp); 7759 nfs_rw_exit(&odrp->r_rwlock); 7760 nfs_rw_exit(&ndrp->r_rwlock); 7761 return (EBUSY); 7762 } 7763 7764 /* 7765 * First just remove the entry from the name cache, as it 7766 * is most likely the only entry for this vp. 7767 */ 7768 dnlc_remove(ndvp, nnm); 7769 7770 rp = VTOR4(nvp); 7771 7772 if (nvp->v_type != VREG) { 7773 /* 7774 * Purge the name cache of all references to this vnode 7775 * so that we can check the reference count to infer 7776 * whether it is active or not. 7777 */ 7778 if (nvp->v_count > 1) 7779 dnlc_purge_vp(nvp); 7780 7781 isactive = nvp->v_count > 1; 7782 } else { 7783 mutex_enter(&rp->r_os_lock); 7784 isactive = list_head(&rp->r_open_streams) != NULL; 7785 mutex_exit(&rp->r_os_lock); 7786 } 7787 7788 /* 7789 * If the vnode is active and is not a directory, 7790 * arrange to rename it to a 7791 * temporary file so that it will continue to be 7792 * accessible. This implements the "unlink-open-file" 7793 * semantics for the target of a rename operation. 7794 * Before doing this though, make sure that the 7795 * source and target files are not already the same. 7796 */ 7797 if (isactive && nvp->v_type != VDIR) { 7798 /* 7799 * Lookup the source name. 7800 */ 7801 error = nfs4lookup(odvp, onm, &ovp, cr, 0); 7802 7803 /* 7804 * The source name *should* already exist. 7805 */ 7806 if (error) { 7807 VN_RELE(nvp); 7808 nfs_rw_exit(&odrp->r_rwlock); 7809 nfs_rw_exit(&ndrp->r_rwlock); 7810 return (error); 7811 } 7812 7813 ASSERT(nfs4_consistent_type(ovp)); 7814 7815 /* 7816 * Compare the two vnodes. If they are the same, 7817 * just release all held vnodes and return success. 7818 */ 7819 if (VN_CMP(ovp, nvp)) { 7820 VN_RELE(ovp); 7821 VN_RELE(nvp); 7822 nfs_rw_exit(&odrp->r_rwlock); 7823 nfs_rw_exit(&ndrp->r_rwlock); 7824 return (0); 7825 } 7826 7827 /* 7828 * Can't mix and match directories and non- 7829 * directories in rename operations. 
We already 7830 * know that the target is not a directory. If 7831 * the source is a directory, return an error. 7832 */ 7833 if (ovp->v_type == VDIR) { 7834 VN_RELE(ovp); 7835 VN_RELE(nvp); 7836 nfs_rw_exit(&odrp->r_rwlock); 7837 nfs_rw_exit(&ndrp->r_rwlock); 7838 return (ENOTDIR); 7839 } 7840 link_call: 7841 /* 7842 * The target file exists, is not the same as 7843 * the source file, and is active. We first 7844 * try to Link it to a temporary filename to 7845 * avoid having the server removing the file 7846 * completely (which could cause data loss to 7847 * the user's POV in the event the Rename fails 7848 * -- see bug 1165874). 7849 */ 7850 /* 7851 * The do_link and did_link booleans are 7852 * introduced in the event we get NFS4ERR_FILE_OPEN 7853 * returned for the Rename. Some servers can 7854 * not Rename over an Open file, so they return 7855 * this error. The client needs to Remove the 7856 * newly created Link and do two Renames, just 7857 * as if the server didn't support LINK. 
7858 */ 7859 tmpname = newname(); 7860 error = 0; 7861 7862 if (do_link) { 7863 error = nfs4_link(ndvp, nvp, tmpname, cr, 7864 NULL, 0); 7865 } 7866 if (error == EOPNOTSUPP || !do_link) { 7867 error = nfs4_rename(ndvp, nnm, ndvp, tmpname, 7868 cr, NULL, 0); 7869 did_link = 0; 7870 } else { 7871 did_link = 1; 7872 } 7873 if (error) { 7874 kmem_free(tmpname, MAXNAMELEN); 7875 VN_RELE(ovp); 7876 VN_RELE(nvp); 7877 nfs_rw_exit(&odrp->r_rwlock); 7878 nfs_rw_exit(&ndrp->r_rwlock); 7879 return (error); 7880 } 7881 7882 mutex_enter(&rp->r_statelock); 7883 if (rp->r_unldvp == NULL) { 7884 VN_HOLD(ndvp); 7885 rp->r_unldvp = ndvp; 7886 if (rp->r_unlcred != NULL) 7887 crfree(rp->r_unlcred); 7888 crhold(cr); 7889 rp->r_unlcred = cr; 7890 rp->r_unlname = tmpname; 7891 } else { 7892 if (rp->r_unlname) 7893 kmem_free(rp->r_unlname, MAXNAMELEN); 7894 rp->r_unlname = tmpname; 7895 } 7896 mutex_exit(&rp->r_statelock); 7897 } 7898 7899 (void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN); 7900 7901 ASSERT(nfs4_consistent_type(nvp)); 7902 } 7903 7904 if (ovp == NULL) { 7905 /* 7906 * When renaming directories to be a subdirectory of a 7907 * different parent, the dnlc entry for ".." will no 7908 * longer be valid, so it must be removed. 7909 * 7910 * We do a lookup here to determine whether we are renaming 7911 * a directory and we need to check if we are renaming 7912 * an unlinked file. This might have already been done 7913 * in previous code, so we check ovp == NULL to avoid 7914 * doing it twice. 7915 */ 7916 error = nfs4lookup(odvp, onm, &ovp, cr, 0); 7917 /* 7918 * The source name *should* already exist. 7919 */ 7920 if (error) { 7921 nfs_rw_exit(&odrp->r_rwlock); 7922 nfs_rw_exit(&ndrp->r_rwlock); 7923 if (nvp) { 7924 VN_RELE(nvp); 7925 } 7926 return (error); 7927 } 7928 ASSERT(ovp != NULL); 7929 ASSERT(nfs4_consistent_type(ovp)); 7930 } 7931 7932 /* 7933 * Is the object being renamed a dir, and if so, is 7934 * it being renamed to a child of itself? 
The underlying 7935 * fs should ultimately return EINVAL for this case; 7936 * however, buggy beta non-Solaris NFSv4 servers at 7937 * interop testing events have allowed this behavior, 7938 * and it caused our client to panic due to a recursive 7939 * mutex_enter in fn_move. 7940 * 7941 * The tedious locking in fn_move could be changed to 7942 * deal with this case, and the client could avoid the 7943 * panic; however, the client would just confuse itself 7944 * later and misbehave. A better way to handle the broken 7945 * server is to detect this condition and return EINVAL 7946 * without ever sending the the bogus rename to the server. 7947 * We know the rename is invalid -- just fail it now. 7948 */ 7949 if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) { 7950 VN_RELE(ovp); 7951 nfs_rw_exit(&odrp->r_rwlock); 7952 nfs_rw_exit(&ndrp->r_rwlock); 7953 if (nvp) { 7954 VN_RELE(nvp); 7955 } 7956 return (EINVAL); 7957 } 7958 7959 (void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN); 7960 7961 /* 7962 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is 7963 * possible for the filehandle to change due to the rename. 7964 * If neither of these bits is set, but FH4_VOL_MIGRATION is set, 7965 * the fh will not change because of the rename, but we still need 7966 * to update its rnode entry with the new name for 7967 * an eventual fh change due to migration. The FH4_NOEXPIRE_ON_OPEN 7968 * has no effect on these for now, but for future improvements, 7969 * we might want to use it too to simplify handling of files 7970 * that are open with that flag on. 
(XXX) 7971 */ 7972 mi = VTOMI4(odvp); 7973 if (NFS4_VOLATILE_FH(mi)) 7974 error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr, 7975 &stat); 7976 else 7977 error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr, 7978 &stat); 7979 7980 ASSERT(nfs4_consistent_type(odvp)); 7981 ASSERT(nfs4_consistent_type(ndvp)); 7982 ASSERT(nfs4_consistent_type(ovp)); 7983 7984 if (stat == NFS4ERR_FILE_OPEN && did_link) { 7985 do_link = 0; 7986 /* 7987 * Before the 'link_call' code, we did a nfs4_lookup 7988 * that puts a VN_HOLD on nvp. After the nfs4_link 7989 * call we call VN_RELE to match that hold. We need 7990 * to place an additional VN_HOLD here since we will 7991 * be hitting that VN_RELE again. 7992 */ 7993 VN_HOLD(nvp); 7994 7995 (void) nfs4_remove(ndvp, tmpname, cr, NULL, 0); 7996 7997 /* Undo the unlinked file naming stuff we just did */ 7998 mutex_enter(&rp->r_statelock); 7999 if (rp->r_unldvp) { 8000 VN_RELE(ndvp); 8001 rp->r_unldvp = NULL; 8002 if (rp->r_unlcred != NULL) 8003 crfree(rp->r_unlcred); 8004 rp->r_unlcred = NULL; 8005 /* rp->r_unlanme points to tmpname */ 8006 if (rp->r_unlname) 8007 kmem_free(rp->r_unlname, MAXNAMELEN); 8008 rp->r_unlname = NULL; 8009 } 8010 mutex_exit(&rp->r_statelock); 8011 8012 if (nvp) { 8013 VN_RELE(nvp); 8014 } 8015 goto link_call; 8016 } 8017 8018 if (error) { 8019 VN_RELE(ovp); 8020 nfs_rw_exit(&odrp->r_rwlock); 8021 nfs_rw_exit(&ndrp->r_rwlock); 8022 if (nvp) { 8023 VN_RELE(nvp); 8024 } 8025 return (error); 8026 } 8027 8028 /* 8029 * when renaming directories to be a subdirectory of a 8030 * different parent, the dnlc entry for ".." will no 8031 * longer be valid, so it must be removed 8032 */ 8033 rp = VTOR4(ovp); 8034 if (ndvp != odvp) { 8035 if (ovp->v_type == VDIR) { 8036 dnlc_remove(ovp, ".."); 8037 if (rp->r_dir != NULL) 8038 nfs4_purge_rddir_cache(ovp); 8039 } 8040 } 8041 8042 /* 8043 * If we are renaming the unlinked file, update the 8044 * r_unldvp and r_unlname as needed. 
8045 */ 8046 mutex_enter(&rp->r_statelock); 8047 if (rp->r_unldvp != NULL) { 8048 if (strcmp(rp->r_unlname, onm) == 0) { 8049 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN); 8050 rp->r_unlname[MAXNAMELEN - 1] = '\0'; 8051 if (ndvp != rp->r_unldvp) { 8052 VN_RELE(rp->r_unldvp); 8053 rp->r_unldvp = ndvp; 8054 VN_HOLD(ndvp); 8055 } 8056 } 8057 } 8058 mutex_exit(&rp->r_statelock); 8059 8060 /* 8061 * Notify the rename vnevents to source vnode, and to the target 8062 * vnode if it already existed. 8063 */ 8064 if (error == 0) { 8065 vnode_t *tvp; 8066 rnode4_t *trp; 8067 /* 8068 * Notify the vnode. Each links is represented by 8069 * a different vnode, in nfsv4. 8070 */ 8071 if (nvp) { 8072 trp = VTOR4(nvp); 8073 tvp = nvp; 8074 if (IS_SHADOW(nvp, trp)) 8075 tvp = RTOV4(trp); 8076 vnevent_rename_dest(tvp, ndvp, nnm, ct); 8077 } 8078 8079 /* 8080 * if the source and destination directory are not the 8081 * same notify the destination directory. 8082 */ 8083 if (VTOR4(odvp) != VTOR4(ndvp)) { 8084 trp = VTOR4(ndvp); 8085 tvp = ndvp; 8086 if (IS_SHADOW(ndvp, trp)) 8087 tvp = RTOV4(trp); 8088 vnevent_rename_dest_dir(tvp, ct); 8089 } 8090 8091 trp = VTOR4(ovp); 8092 tvp = ovp; 8093 if (IS_SHADOW(ovp, trp)) 8094 tvp = RTOV4(trp); 8095 vnevent_rename_src(tvp, odvp, onm, ct); 8096 } 8097 8098 if (nvp) { 8099 VN_RELE(nvp); 8100 } 8101 VN_RELE(ovp); 8102 8103 nfs_rw_exit(&odrp->r_rwlock); 8104 nfs_rw_exit(&ndrp->r_rwlock); 8105 8106 return (error); 8107 } 8108 8109 /* 8110 * When the parent directory has changed, sv_dfh must be updated 8111 */ 8112 static void 8113 update_parentdir_sfh(vnode_t *vp, vnode_t *ndvp) 8114 { 8115 svnode_t *sv = VTOSV(vp); 8116 nfs4_sharedfh_t *old_dfh = sv->sv_dfh; 8117 nfs4_sharedfh_t *new_dfh = VTOR4(ndvp)->r_fh; 8118 8119 sfh4_hold(new_dfh); 8120 sv->sv_dfh = new_dfh; 8121 sfh4_rele(&old_dfh); 8122 } 8123 8124 /* 8125 * nfs4rename_persistent does the otw portion of renaming in NFS Version 4, 8126 * when it is known that the filehandle is persistent 
through rename. 8127 * 8128 * Rename requires that the current fh be the target directory and the 8129 * saved fh be the source directory. After the operation, the current fh 8130 * is unchanged. 8131 * The compound op structure for persistent fh rename is: 8132 * PUTFH(sourcdir), SAVEFH, PUTFH(targetdir), RENAME 8133 * Rather than bother with the directory postop args, we'll simply 8134 * update that a change occurred in the cache, so no post-op getattrs. 8135 */ 8136 static int 8137 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp, 8138 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 8139 { 8140 COMPOUND4args_clnt args; 8141 COMPOUND4res_clnt res, *resp = NULL; 8142 nfs_argop4 *argop; 8143 nfs_resop4 *resop; 8144 int doqueue, argoplist_size; 8145 mntinfo4_t *mi; 8146 rnode4_t *odrp = VTOR4(odvp); 8147 rnode4_t *ndrp = VTOR4(ndvp); 8148 RENAME4res *rn_res; 8149 bool_t needrecov; 8150 nfs4_recov_state_t recov_state; 8151 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8152 dirattr_info_t dinfo, *dinfop; 8153 8154 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 8155 8156 recov_state.rs_flags = 0; 8157 recov_state.rs_num_retry_despite_err = 0; 8158 8159 /* 8160 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir 8161 * 8162 * If source/target are different dirs, then append putfh(src); getattr 8163 */ 8164 args.array_len = (odvp == ndvp) ? 
5 : 7; 8165 argoplist_size = args.array_len * sizeof (nfs_argop4); 8166 args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP); 8167 8168 recov_retry: 8169 *statp = NFS4_OK; 8170 8171 /* No need to Lookup the file, persistent fh */ 8172 args.ctag = TAG_RENAME; 8173 8174 mi = VTOMI4(odvp); 8175 e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state); 8176 if (e.error) { 8177 kmem_free(argop, argoplist_size); 8178 return (e.error); 8179 } 8180 8181 /* 0: putfh source directory */ 8182 argop[0].argop = OP_CPUTFH; 8183 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8184 8185 /* 1: Save source fh to free up current for target */ 8186 argop[1].argop = OP_SAVEFH; 8187 8188 /* 2: putfh targetdir */ 8189 argop[2].argop = OP_CPUTFH; 8190 argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8191 8192 /* 3: current_fh is targetdir, saved_fh is sourcedir */ 8193 argop[3].argop = OP_CRENAME; 8194 argop[3].nfs_argop4_u.opcrename.coldname = onm; 8195 argop[3].nfs_argop4_u.opcrename.cnewname = nnm; 8196 8197 /* 4: getattr (targetdir) */ 8198 argop[4].argop = OP_GETATTR; 8199 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8200 argop[4].nfs_argop4_u.opgetattr.mi = mi; 8201 8202 if (ndvp != odvp) { 8203 8204 /* 5: putfh (sourcedir) */ 8205 argop[5].argop = OP_CPUTFH; 8206 argop[5].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8207 8208 /* 6: getattr (sourcedir) */ 8209 argop[6].argop = OP_GETATTR; 8210 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8211 argop[6].nfs_argop4_u.opgetattr.mi = mi; 8212 } 8213 8214 dnlc_remove(odvp, onm); 8215 dnlc_remove(ndvp, nnm); 8216 8217 doqueue = 1; 8218 dinfo.di_time_call = gethrtime(); 8219 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8220 8221 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8222 if (e.error) { 8223 PURGE_ATTRCACHE4(odvp); 8224 PURGE_ATTRCACHE4(ndvp); 8225 } else { 8226 *statp = res.status; 8227 } 8228 8229 if (needrecov) { 8230 if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 8231 
OP_RENAME, NULL, NULL, NULL) == FALSE) { 8232 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 8233 if (!e.error) 8234 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 8235 goto recov_retry; 8236 } 8237 } 8238 8239 if (!e.error) { 8240 resp = &res; 8241 /* 8242 * as long as OP_RENAME 8243 */ 8244 if (res.status != NFS4_OK && res.array_len <= 4) { 8245 e.error = geterrno4(res.status); 8246 PURGE_ATTRCACHE4(odvp); 8247 PURGE_ATTRCACHE4(ndvp); 8248 /* 8249 * System V defines rename to return EEXIST, not 8250 * ENOTEMPTY if the target directory is not empty. 8251 * Over the wire, the error is NFSERR_ENOTEMPTY 8252 * which geterrno4 maps to ENOTEMPTY. 8253 */ 8254 if (e.error == ENOTEMPTY) 8255 e.error = EEXIST; 8256 } else { 8257 8258 resop = &res.array[3]; /* rename res */ 8259 rn_res = &resop->nfs_resop4_u.oprename; 8260 8261 if (res.status == NFS4_OK) { 8262 /* 8263 * Update target attribute, readdir and dnlc 8264 * caches. 8265 */ 8266 dinfo.di_garp = 8267 &res.array[4].nfs_resop4_u.opgetattr.ga_res; 8268 dinfo.di_cred = cr; 8269 dinfop = &dinfo; 8270 } else 8271 dinfop = NULL; 8272 8273 nfs4_update_dircaches(&rn_res->target_cinfo, 8274 ndvp, NULL, NULL, dinfop); 8275 8276 /* 8277 * Update source attribute, readdir and dnlc caches 8278 * 8279 */ 8280 if (ndvp != odvp) { 8281 update_parentdir_sfh(renvp, ndvp); 8282 8283 if (dinfop) 8284 dinfo.di_garp = 8285 &(res.array[6].nfs_resop4_u. 8286 opgetattr.ga_res); 8287 8288 nfs4_update_dircaches(&rn_res->source_cinfo, 8289 odvp, NULL, NULL, dinfop); 8290 } 8291 8292 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, 8293 nnm); 8294 } 8295 } 8296 8297 if (resp) 8298 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8299 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 8300 kmem_free(argop, argoplist_size); 8301 8302 return (e.error); 8303 } 8304 8305 /* 8306 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when 8307 * it is possible for the filehandle to change due to the rename. 
8308 * 8309 * The compound req in this case includes a post-rename lookup and getattr 8310 * to ensure that we have the correct fh and attributes for the object. 8311 * 8312 * Rename requires that the current fh be the target directory and the 8313 * saved fh be the source directory. After the operation, the current fh 8314 * is unchanged. 8315 * 8316 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can 8317 * update the filehandle for the renamed object. We also get the old 8318 * filehandle for historical reasons; this should be taken out sometime. 8319 * This results in a rather cumbersome compound... 8320 * 8321 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old), 8322 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR 8323 * 8324 */ 8325 static int 8326 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp, 8327 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 8328 { 8329 COMPOUND4args_clnt args; 8330 COMPOUND4res_clnt res, *resp = NULL; 8331 int argoplist_size; 8332 nfs_argop4 *argop; 8333 nfs_resop4 *resop; 8334 int doqueue; 8335 mntinfo4_t *mi; 8336 rnode4_t *odrp = VTOR4(odvp); /* old directory */ 8337 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */ 8338 rnode4_t *orp = VTOR4(ovp); /* object being renamed */ 8339 RENAME4res *rn_res; 8340 GETFH4res *ngf_res; 8341 bool_t needrecov; 8342 nfs4_recov_state_t recov_state; 8343 hrtime_t t; 8344 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8345 dirattr_info_t dinfo, *dinfop = &dinfo; 8346 8347 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 8348 8349 recov_state.rs_flags = 0; 8350 recov_state.rs_num_retry_despite_err = 0; 8351 8352 recov_retry: 8353 *statp = NFS4_OK; 8354 8355 /* 8356 * There is a window between the RPC and updating the path and 8357 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery 8358 * code, so that it doesn't try to use the old path during that 8359 * window. 
8360 */ 8361 mutex_enter(&orp->r_statelock); 8362 while (orp->r_flags & R4RECEXPFH) { 8363 klwp_t *lwp = ttolwp(curthread); 8364 8365 if (lwp != NULL) 8366 lwp->lwp_nostop++; 8367 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) { 8368 mutex_exit(&orp->r_statelock); 8369 if (lwp != NULL) 8370 lwp->lwp_nostop--; 8371 return (EINTR); 8372 } 8373 if (lwp != NULL) 8374 lwp->lwp_nostop--; 8375 } 8376 orp->r_flags |= R4RECEXPFH; 8377 mutex_exit(&orp->r_statelock); 8378 8379 mi = VTOMI4(odvp); 8380 8381 args.ctag = TAG_RENAME_VFH; 8382 args.array_len = (odvp == ndvp) ? 10 : 12; 8383 argoplist_size = args.array_len * sizeof (nfs_argop4); 8384 argop = kmem_alloc(argoplist_size, KM_SLEEP); 8385 8386 /* 8387 * Rename ops: 8388 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old), 8389 * PUTFH(targetdir), RENAME, GETATTR(targetdir) 8390 * LOOKUP(trgt), GETFH(new), GETATTR, 8391 * 8392 * if (odvp != ndvp) 8393 * add putfh(sourcedir), getattr(sourcedir) } 8394 */ 8395 args.array = argop; 8396 8397 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8398 &recov_state, NULL); 8399 if (e.error) { 8400 kmem_free(argop, argoplist_size); 8401 mutex_enter(&orp->r_statelock); 8402 orp->r_flags &= ~R4RECEXPFH; 8403 cv_broadcast(&orp->r_cv); 8404 mutex_exit(&orp->r_statelock); 8405 return (e.error); 8406 } 8407 8408 /* 0: putfh source directory */ 8409 argop[0].argop = OP_CPUTFH; 8410 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8411 8412 /* 1: Save source fh to free up current for target */ 8413 argop[1].argop = OP_SAVEFH; 8414 8415 /* 2: Lookup pre-rename fh of renamed object */ 8416 argop[2].argop = OP_CLOOKUP; 8417 argop[2].nfs_argop4_u.opclookup.cname = onm; 8418 8419 /* 3: getfh fh of renamed object (before rename) */ 8420 argop[3].argop = OP_GETFH; 8421 8422 /* 4: putfh targetdir */ 8423 argop[4].argop = OP_CPUTFH; 8424 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8425 8426 /* 5: current_fh is targetdir, saved_fh is sourcedir */ 8427 argop[5].argop = OP_CRENAME; 8428 
argop[5].nfs_argop4_u.opcrename.coldname = onm; 8429 argop[5].nfs_argop4_u.opcrename.cnewname = nnm; 8430 8431 /* 6: getattr of target dir (post op attrs) */ 8432 argop[6].argop = OP_GETATTR; 8433 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8434 argop[6].nfs_argop4_u.opgetattr.mi = mi; 8435 8436 /* 7: Lookup post-rename fh of renamed object */ 8437 argop[7].argop = OP_CLOOKUP; 8438 argop[7].nfs_argop4_u.opclookup.cname = nnm; 8439 8440 /* 8: getfh fh of renamed object (after rename) */ 8441 argop[8].argop = OP_GETFH; 8442 8443 /* 9: getattr of renamed object */ 8444 argop[9].argop = OP_GETATTR; 8445 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8446 argop[9].nfs_argop4_u.opgetattr.mi = mi; 8447 8448 /* 8449 * If source/target dirs are different, then get new post-op 8450 * attrs for source dir also. 8451 */ 8452 if (ndvp != odvp) { 8453 /* 10: putfh (sourcedir) */ 8454 argop[10].argop = OP_CPUTFH; 8455 argop[10].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8456 8457 /* 11: getattr (sourcedir) */ 8458 argop[11].argop = OP_GETATTR; 8459 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8460 argop[11].nfs_argop4_u.opgetattr.mi = mi; 8461 } 8462 8463 dnlc_remove(odvp, onm); 8464 dnlc_remove(ndvp, nnm); 8465 8466 doqueue = 1; 8467 t = gethrtime(); 8468 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8469 8470 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8471 if (e.error) { 8472 PURGE_ATTRCACHE4(odvp); 8473 PURGE_ATTRCACHE4(ndvp); 8474 if (!needrecov) { 8475 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8476 &recov_state, needrecov); 8477 goto out; 8478 } 8479 } else { 8480 *statp = res.status; 8481 } 8482 8483 if (needrecov) { 8484 bool_t abort; 8485 8486 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 8487 OP_RENAME, NULL, NULL, NULL); 8488 if (abort == FALSE) { 8489 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8490 &recov_state, needrecov); 8491 kmem_free(argop, argoplist_size); 8492 if (!e.error) 
8493 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 8494 mutex_enter(&orp->r_statelock); 8495 orp->r_flags &= ~R4RECEXPFH; 8496 cv_broadcast(&orp->r_cv); 8497 mutex_exit(&orp->r_statelock); 8498 goto recov_retry; 8499 } else { 8500 if (e.error != 0) { 8501 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8502 &recov_state, needrecov); 8503 goto out; 8504 } 8505 /* fall through for res.status case */ 8506 } 8507 } 8508 8509 resp = &res; 8510 /* 8511 * If OP_RENAME (or any prev op) failed, then return an error. 8512 * OP_RENAME is index 5, so if array len <= 6 we return an error. 8513 */ 8514 if ((res.status != NFS4_OK) && (res.array_len <= 6)) { 8515 /* 8516 * Error in an op other than last Getattr 8517 */ 8518 e.error = geterrno4(res.status); 8519 PURGE_ATTRCACHE4(odvp); 8520 PURGE_ATTRCACHE4(ndvp); 8521 /* 8522 * System V defines rename to return EEXIST, not 8523 * ENOTEMPTY if the target directory is not empty. 8524 * Over the wire, the error is NFSERR_ENOTEMPTY 8525 * which geterrno4 maps to ENOTEMPTY. 8526 */ 8527 if (e.error == ENOTEMPTY) 8528 e.error = EEXIST; 8529 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, 8530 needrecov); 8531 goto out; 8532 } 8533 8534 /* rename results */ 8535 rn_res = &res.array[5].nfs_resop4_u.oprename; 8536 8537 if (res.status == NFS4_OK) { 8538 /* Update target attribute, readdir and dnlc caches */ 8539 dinfo.di_garp = 8540 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 8541 dinfo.di_cred = cr; 8542 dinfo.di_time_call = t; 8543 } else 8544 dinfop = NULL; 8545 8546 /* Update source cache attribute, readdir and dnlc caches */ 8547 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop); 8548 8549 /* Update source cache attribute, readdir and dnlc caches */ 8550 if (ndvp != odvp) { 8551 update_parentdir_sfh(ovp, ndvp); 8552 8553 /* 8554 * If dinfop is non-NULL, then compound succeded, so 8555 * set di_garp to attrs for source dir. dinfop is only 8556 * set to NULL when compound fails. 
8557 */ 8558 if (dinfop) 8559 dinfo.di_garp = 8560 &res.array[11].nfs_resop4_u.opgetattr.ga_res; 8561 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL, 8562 dinfop); 8563 } 8564 8565 /* 8566 * Update the rnode with the new component name and args, 8567 * and if the file handle changed, also update it with the new fh. 8568 * This is only necessary if the target object has an rnode 8569 * entry and there is no need to create one for it. 8570 */ 8571 resop = &res.array[8]; /* getfh new res */ 8572 ngf_res = &resop->nfs_resop4_u.opgetfh; 8573 8574 /* 8575 * Update the path and filehandle for the renamed object. 8576 */ 8577 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm); 8578 8579 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov); 8580 8581 if (res.status == NFS4_OK) { 8582 resop++; /* getattr res */ 8583 e.error = nfs4_update_attrcache(res.status, 8584 &resop->nfs_resop4_u.opgetattr.ga_res, 8585 t, ovp, cr); 8586 } 8587 8588 out: 8589 kmem_free(argop, argoplist_size); 8590 if (resp) 8591 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8592 mutex_enter(&orp->r_statelock); 8593 orp->r_flags &= ~R4RECEXPFH; 8594 cv_broadcast(&orp->r_cv); 8595 mutex_exit(&orp->r_statelock); 8596 8597 return (e.error); 8598 } 8599 8600 /* ARGSUSED */ 8601 static int 8602 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr, 8603 caller_context_t *ct, int flags, vsecattr_t *vsecp) 8604 { 8605 int error; 8606 vnode_t *vp; 8607 8608 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8609 return (EPERM); 8610 /* 8611 * As ".." has special meaning and rather than send a mkdir 8612 * over the wire to just let the server freak out, we just 8613 * short circuit it here and return EEXIST 8614 */ 8615 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8616 return (EEXIST); 8617 8618 /* 8619 * Decision to get the right gid and setgid bit of the 8620 * new directory is now made in call_nfs4_create_req. 
8621 */ 8622 va->va_mask |= AT_MODE; 8623 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR); 8624 if (error) 8625 return (error); 8626 8627 *vpp = vp; 8628 return (0); 8629 } 8630 8631 8632 /* 8633 * rmdir is using the same remove v4 op as does remove. 8634 * Remove requires that the current fh be the target directory. 8635 * After the operation, the current fh is unchanged. 8636 * The compound op structure is: 8637 * PUTFH(targetdir), REMOVE 8638 */ 8639 /*ARGSUSED4*/ 8640 static int 8641 nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, 8642 caller_context_t *ct, int flags) 8643 { 8644 int need_end_op = FALSE; 8645 COMPOUND4args_clnt args; 8646 COMPOUND4res_clnt res, *resp = NULL; 8647 REMOVE4res *rm_res; 8648 nfs_argop4 argop[3]; 8649 nfs_resop4 *resop; 8650 vnode_t *vp; 8651 int doqueue; 8652 mntinfo4_t *mi; 8653 rnode4_t *drp; 8654 bool_t needrecov = FALSE; 8655 nfs4_recov_state_t recov_state; 8656 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8657 dirattr_info_t dinfo, *dinfop; 8658 8659 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8660 return (EPERM); 8661 /* 8662 * As ".." has special meaning and rather than send a rmdir 8663 * over the wire to just let the server freak out, we just 8664 * short circuit it here and return EEXIST 8665 */ 8666 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8667 return (EEXIST); 8668 8669 drp = VTOR4(dvp); 8670 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 8671 return (EINTR); 8672 8673 /* 8674 * Attempt to prevent a rmdir(".") from succeeding. 8675 */ 8676 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 8677 if (e.error) { 8678 nfs_rw_exit(&drp->r_rwlock); 8679 return (e.error); 8680 } 8681 if (vp == cdir) { 8682 VN_RELE(vp); 8683 nfs_rw_exit(&drp->r_rwlock); 8684 return (EINVAL); 8685 } 8686 8687 /* 8688 * Since nfsv4 remove op works on both files and directories, 8689 * check that the removed object is indeed a directory. 
8690 */ 8691 if (vp->v_type != VDIR) { 8692 VN_RELE(vp); 8693 nfs_rw_exit(&drp->r_rwlock); 8694 return (ENOTDIR); 8695 } 8696 8697 /* 8698 * First just remove the entry from the name cache, as it 8699 * is most likely an entry for this vp. 8700 */ 8701 dnlc_remove(dvp, nm); 8702 8703 /* 8704 * If there vnode reference count is greater than one, then 8705 * there may be additional references in the DNLC which will 8706 * need to be purged. First, trying removing the entry for 8707 * the parent directory and see if that removes the additional 8708 * reference(s). If that doesn't do it, then use dnlc_purge_vp 8709 * to completely remove any references to the directory which 8710 * might still exist in the DNLC. 8711 */ 8712 if (vp->v_count > 1) { 8713 dnlc_remove(vp, ".."); 8714 if (vp->v_count > 1) 8715 dnlc_purge_vp(vp); 8716 } 8717 8718 mi = VTOMI4(dvp); 8719 recov_state.rs_flags = 0; 8720 recov_state.rs_num_retry_despite_err = 0; 8721 8722 recov_retry: 8723 args.ctag = TAG_RMDIR; 8724 8725 /* 8726 * Rmdir ops: putfh dir; remove 8727 */ 8728 args.array_len = 3; 8729 args.array = argop; 8730 8731 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 8732 if (e.error) { 8733 nfs_rw_exit(&drp->r_rwlock); 8734 return (e.error); 8735 } 8736 need_end_op = TRUE; 8737 8738 /* putfh directory */ 8739 argop[0].argop = OP_CPUTFH; 8740 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 8741 8742 /* remove */ 8743 argop[1].argop = OP_CREMOVE; 8744 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 8745 8746 /* getattr (postop attrs for dir that contained removed dir) */ 8747 argop[2].argop = OP_GETATTR; 8748 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8749 argop[2].nfs_argop4_u.opgetattr.mi = mi; 8750 8751 dinfo.di_time_call = gethrtime(); 8752 doqueue = 1; 8753 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8754 8755 PURGE_ATTRCACHE4(vp); 8756 8757 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8758 if (e.error) { 8759 PURGE_ATTRCACHE4(dvp); 
8760 } 8761 8762 if (needrecov) { 8763 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL, 8764 NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) { 8765 if (!e.error) 8766 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 8767 8768 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 8769 needrecov); 8770 need_end_op = FALSE; 8771 goto recov_retry; 8772 } 8773 } 8774 8775 if (!e.error) { 8776 resp = &res; 8777 8778 /* 8779 * Only return error if first 2 ops (OP_REMOVE or earlier) 8780 * failed. 8781 */ 8782 if (res.status != NFS4_OK && res.array_len <= 2) { 8783 e.error = geterrno4(res.status); 8784 PURGE_ATTRCACHE4(dvp); 8785 nfs4_end_op(VTOMI4(dvp), dvp, NULL, 8786 &recov_state, needrecov); 8787 need_end_op = FALSE; 8788 nfs4_purge_stale_fh(e.error, dvp, cr); 8789 /* 8790 * System V defines rmdir to return EEXIST, not 8791 * ENOTEMPTY if the directory is not empty. Over 8792 * the wire, the error is NFSERR_ENOTEMPTY which 8793 * geterrno4 maps to ENOTEMPTY. 8794 */ 8795 if (e.error == ENOTEMPTY) 8796 e.error = EEXIST; 8797 } else { 8798 resop = &res.array[1]; /* remove res */ 8799 rm_res = &resop->nfs_resop4_u.opremove; 8800 8801 if (res.status == NFS4_OK) { 8802 resop = &res.array[2]; /* dir attrs */ 8803 dinfo.di_garp = 8804 &resop->nfs_resop4_u.opgetattr.ga_res; 8805 dinfo.di_cred = cr; 8806 dinfop = &dinfo; 8807 } else 8808 dinfop = NULL; 8809 8810 /* Update dir attribute, readdir and dnlc caches */ 8811 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 8812 dinfop); 8813 8814 /* destroy rddir cache for dir that was removed */ 8815 if (VTOR4(vp)->r_dir != NULL) 8816 nfs4_purge_rddir_cache(vp); 8817 } 8818 } 8819 8820 if (need_end_op) 8821 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 8822 8823 nfs_rw_exit(&drp->r_rwlock); 8824 8825 if (resp) 8826 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8827 8828 if (e.error == 0) { 8829 vnode_t *tvp; 8830 rnode4_t *trp; 8831 trp = VTOR4(vp); 8832 tvp = vp; 8833 if (IS_SHADOW(vp, trp)) 8834 tvp = 
RTOV4(trp); 8835 vnevent_rmdir(tvp, dvp, nm, ct); 8836 } 8837 8838 VN_RELE(vp); 8839 8840 return (e.error); 8841 } 8842 8843 /* ARGSUSED */ 8844 static int 8845 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr, 8846 caller_context_t *ct, int flags) 8847 { 8848 int error; 8849 vnode_t *vp; 8850 rnode4_t *rp; 8851 char *contents; 8852 mntinfo4_t *mi = VTOMI4(dvp); 8853 8854 if (nfs_zone() != mi->mi_zone) 8855 return (EPERM); 8856 if (!(mi->mi_flags & MI4_SYMLINK)) 8857 return (EOPNOTSUPP); 8858 8859 error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK); 8860 if (error) 8861 return (error); 8862 8863 ASSERT(nfs4_consistent_type(vp)); 8864 rp = VTOR4(vp); 8865 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 8866 8867 contents = kmem_alloc(MAXPATHLEN, KM_SLEEP); 8868 8869 if (contents != NULL) { 8870 mutex_enter(&rp->r_statelock); 8871 if (rp->r_symlink.contents == NULL) { 8872 rp->r_symlink.len = strlen(tnm); 8873 bcopy(tnm, contents, rp->r_symlink.len); 8874 rp->r_symlink.contents = contents; 8875 rp->r_symlink.size = MAXPATHLEN; 8876 mutex_exit(&rp->r_statelock); 8877 } else { 8878 mutex_exit(&rp->r_statelock); 8879 kmem_free((void *)contents, MAXPATHLEN); 8880 } 8881 } 8882 } 8883 VN_RELE(vp); 8884 8885 return (error); 8886 } 8887 8888 8889 /* 8890 * Read directory entries. 8891 * There are some weird things to look out for here. The uio_loffset 8892 * field is either 0 or it is the offset returned from a previous 8893 * readdir. It is an opaque value used by the server to find the 8894 * correct directory block to read. The count field is the number 8895 * of blocks to read on the server. This is advisory only, the server 8896 * may return only one block's worth of entries. Entries may be compressed 8897 * on the server. 
*/
/*
 * Directory read entry point.  Results are served from the per-rnode
 * readdir cache: an entry is looked up by (cookie, count), filled over the
 * wire by nfs4readdir() if it is still marked RDDIRREQ, and then copied out
 * to the caller with uiomove().
 *
 * vp	directory being read
 * uiop	destination; uio_loffset carries the cookie in and, on a successful
 *	copyout, the next cookie out
 * cr	credentials used for cache validation and the over-the-wire call
 * eofp	if non-NULL, set to the entry's eof indication on success
 *
 * The caller must already hold r_rwlock as reader (asserted below).
 *
 * Returns 0 or an errno value: EIO when called from a zone other than the
 * one owning the mount, EINTR when no cache entry could be obtained,
 * otherwise the error from cache validation, the fill or the copyout.
 */
/* ARGSUSED */
static int
nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	int error;
	uint_t count;
	rnode4_t *rp;
	rddir4_cache *rdc;
	rddir4_cache *rrdc;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	rp = VTOR4(vp);

	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	/*
	 * Make sure that the directory cache is valid.
	 */
	if (rp->r_dir != NULL) {
		if (nfs_disable_rddir_cache != 0) {
			/*
			 * Setting nfs_disable_rddir_cache in /etc/system
			 * allows interoperability with servers that do not
			 * properly update the attributes of directories.
			 * Any cached information gets purged before an
			 * access is made to it.
			 */
			nfs4_purge_rddir_cache(vp);
		}

		error = nfs4_validate_caches(vp, cr);
		if (error)
			return (error);
	}

	/* Only the first iovec is considered, capped at MAXBSIZE. */
	count = MIN(uiop->uio_iov->iov_len, MAXBSIZE);

	/*
	 * Short circuit last readdir which always returns 0 bytes.
	 * This can be done after the directory has been read through
	 * completely at least once. This will set r_direof which
	 * can be used to find the value of the last cookie.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_direof != NULL &&
	    uiop->uio_loffset == rp->r_direof->nfs4_ncookie) {
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs4_readdir_cache_shorts++;
#endif
		if (eofp)
			*eofp = 1;
		return (0);
	}

	/*
	 * Look for a cache entry. Cache entries are identified
	 * by the NFS cookie value and the byte count requested.
	 * The lookup is made with r_statelock held.
	 */
	rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count);

	/*
	 * If rdc is NULL then the lookup resulted in an unrecoverable error.
	 */
	if (rdc == NULL) {
		mutex_exit(&rp->r_statelock);
		return (EINTR);
	}

	/*
	 * Check to see if we need to fill this entry in.
	 * RDDIRREQ is traded for RDDIR while this thread does the fill;
	 * r_statelock is dropped around the over-the-wire call.
	 */
	if (rdc->flags & RDDIRREQ) {
		rdc->flags &= ~RDDIRREQ;
		rdc->flags |= RDDIR;
		mutex_exit(&rp->r_statelock);

		/*
		 * Do the readdir.
		 */
		nfs4readdir(vp, rdc, cr);

		/*
		 * Reacquire the lock, so that we can continue
		 */
		mutex_enter(&rp->r_statelock);
		/*
		 * The entry is now complete
		 */
		rdc->flags &= ~RDDIR;
	}

	ASSERT(!(rdc->flags & RDDIR));

	/*
	 * If an error occurred while attempting
	 * to fill the cache entry, mark the entry invalid and
	 * just return the error.
	 */
	if (rdc->error) {
		error = rdc->error;
		rdc->flags |= RDDIRREQ;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * The cache entry is complete and good,
	 * copyout the dirent structs to the calling
	 * thread.  Note that r_statelock is still held across the copy.
	 */
	error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop);

	/*
	 * If no error occurred during the copyout,
	 * update the offset in the uio struct to
	 * contain the value of the next NFS 4 cookie
	 * and set the eof value appropriately.
	 */
	if (!error) {
		uiop->uio_loffset = rdc->nfs4_ncookie;
		if (eofp)
			*eofp = rdc->eof;
	}

	/*
	 * Decide whether to do readahead. Don't if we
	 * have already read to the end of directory.
	 */
	if (rdc->eof) {
		/*
		 * Make the entry the direof only if it is cached
		 */
		if (rdc->flags & RDDIRCACHED)
			rp->r_direof = rdc;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/* Determine if a readdir readahead should be done */
	if (!(rp->r_flags & R4LOOKUP)) {
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * Now look for a readahead entry.
	 *
	 * Check to see whether we found an entry for the readahead.
	 * If so, we don't need to do anything further, so free the new
	 * entry if one was allocated. Otherwise, allocate a new entry, add
	 * it to the cache, and then initiate an asynchronous readdir
	 * operation to fill it.
	 */
	rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count);

	/*
	 * A readdir cache entry could not be obtained for the readahead. In
	 * this case we skip the readahead and return.
	 */
	if (rrdc == NULL) {
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
		return (error);
	}

	/*
	 * Check to see if we need to fill this entry in.
	 * The hold on rrdc is not dropped on this path; do_nfs4readdir()
	 * releases the entry once the fill has finished.
	 */
	if (rrdc->flags & RDDIRREQ) {
		rrdc->flags &= ~RDDIRREQ;
		rrdc->flags |= RDDIR;
		rddir4_cache_rele(rp, rdc);
		mutex_exit(&rp->r_statelock);
#ifdef DEBUG
		nfs4_readdir_readahead++;
#endif
		/*
		 * Do the readdir.
		 */
		nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir);
		return (error);
	}

	/* Readahead entry is already filled or being filled; nothing to do. */
	rddir4_cache_rele(rp, rrdc);
	rddir4_cache_rele(rp, rdc);
	mutex_exit(&rp->r_statelock);
	return (error);
}

/*
 * Readahead worker handed to nfs4_async_readdir() by nfs4_readdir().
 *
 * Fills the cache entry rdc over the wire, clears RDDIR to mark the fill
 * as finished, sets RDDIRREQ again if the fill failed (presumably so that a
 * later lookup retries it), and drops the hold on the entry.
 *
 * Returns the entry's error value, 0 on success.
 */
static int
do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
{
	int error;
	rnode4_t *rp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	/*
	 * Obtain the readdir results for the caller.
	 */
	nfs4readdir(vp, rdc, cr);

	mutex_enter(&rp->r_statelock);
	/*
	 * The entry is now complete
	 */
	rdc->flags &= ~RDDIR;

	error = rdc->error;
	if (error)
		rdc->flags |= RDDIRREQ;
	rddir4_cache_rele(rp, rdc);
	mutex_exit(&rp->r_statelock);

	return (error);
}

/*
 * Read directory entries.
 * There are some weird things to look out for here. The uio_loffset
 * field is either 0 or it is the offset returned from a previous
 * readdir. It is an opaque value used by the server to find the
 * correct directory block to read. The count field is the number
 * of blocks to read on the server. This is advisory only, the server
 * may return only one block's worth of entries. Entries may be compressed
 * on the server.
 *
 * Generates the following compound request:
 * 1. If readdir offset is zero and no dnlc entry for parent exists,
 *    must include a Lookupp as well. In this case, send:
 *    { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr }
 * 2. Otherwise just do: { Putfh <fh>; Readdir }
 *
 * Get complete attributes and filehandles for entries if this is the
 * first read of the directory. Otherwise, just get fileid's.
*/
static void
nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	READDIR4args *rargs;
	READDIR4res_clnt *rd_res;
	bitmap4 rd_bitsval;
	nfs_argop4 argop[5];
	nfs_resop4 *resop;
	rnode4_t *rp = VTOR4(vp);
	mntinfo4_t *mi = VTOMI4(vp);
	int doqueue;
	u_longlong_t nodeid, pnodeid;	/* id's of dir and its parents */
	vnode_t *dvp;
	nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie;
	int num_ops, res_opcnt;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	/*
	 * This function returns nothing; the outcome is reported through
	 * rdc->error.  The caller must have marked the entry RDDIR and the
	 * entry must not have a buffer yet.
	 */
	ASSERT(nfs_zone() == mi->mi_zone);
	ASSERT(rdc->flags & RDDIR);
	ASSERT(rdc->entries == NULL);

	/*
	 * If rp were a stub, it should have triggered and caused
	 * a mount for us to get this far.
	 */
	ASSERT(!RP_ISSTUB(rp));

	/* Default compound is PUTFH + READDIR. */
	num_ops = 2;
	if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) {
		/*
		 * Since nfsv4 readdir may not return entries for "." and "..",
		 * the client must recreate them:
		 * To find the correct nodeid, do the following:
		 * For current node, get nodeid from dnlc.
		 * - if current node is rootvp, set pnodeid to nodeid.
		 * - else if parent is in the dnlc, get its nodeid from there.
		 * - else add LOOKUPP+GETATTR to compound.
		 */
		nodeid = rp->r_attr.va_nodeid;
		if (vp->v_flag & VROOT) {
			pnodeid = nodeid;	/* root of mount point */
		} else {
			dvp = dnlc_lookup(vp, "..");
			if (dvp != NULL && dvp != DNLC_NO_VNODE) {
				/* parent in dnlc cache - no need for otw */
				pnodeid = VTOR4(dvp)->r_attr.va_nodeid;
			} else {
				/*
				 * parent not in dnlc cache,
				 * do lookupp to get its id
				 */
				num_ops = 5;
				pnodeid = 0; /* set later by getattr parent */
			}
			if (dvp)
				VN_RELE(dvp);
		}
	}
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/* Save the original mount point security flavor */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

	/* The whole request is rebuilt from here after a recovery. */
recov_retry:
	args.ctag = TAG_READDIR;

	args.array = argop;
	args.array_len = num_ops;

	if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
	    &recov_state, NULL)) {
		/*
		 * If readdir a node that is a stub for a crossed mount point,
		 * keep the original secinfo flavor for the current file
		 * system, not the crossed one.
		 */
		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
		rdc->error = e.error;
		return;
	}

	/*
	 * Determine which attrs to request for dirents. This code
	 * must be protected by nfs4_start/end_fop because of r_server
	 * (which will change during failover recovery).
	 *
	 */
	if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) {
		/*
		 * Get all vattr attrs plus filehandle and rdattr_error
		 */
		rd_bitsval = NFS4_VATTR_MASK |
		    FATTR4_RDATTR_ERROR_MASK |
		    FATTR4_FILEHANDLE_MASK;

		/* R4READDIRWATTR is a one-shot request; consume it. */
		if (rp->r_flags & R4READDIRWATTR) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags &= ~R4READDIRWATTR;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		servinfo4_t *svp = rp->r_server;

		/*
		 * Already read directory. Use readdir with
		 * no attrs (except for mounted_on_fileid) for updates.
		 */
		rd_bitsval = FATTR4_RDATTR_ERROR_MASK;

		/*
		 * request mounted on fileid if supported, else request
		 * fileid. maybe we should verify that fileid is supported
		 * and request something else if not.
		 *
		 * NOTE(review): the code below has no "else" arm; when
		 * mounted_on_fileid is unsupported nothing is added to
		 * rd_bitsval here.  Confirm whether fileid is requested
		 * elsewhere.
		 */
		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
		if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK)
			rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK;
		nfs_rw_exit(&svp->sv_lock);
	}

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	argop[1].argop = OP_READDIR;
	rargs = &argop[1].nfs_argop4_u.opreaddir;
	/*
	 * 1 and 2 are reserved for client "." and ".." entry offset.
	 * cookie 0 should be used over-the-wire to start reading at
	 * the beginning of the directory excluding "." and "..".
	 */
	if (rdc->nfs4_cookie == 0 ||
	    rdc->nfs4_cookie == 1 ||
	    rdc->nfs4_cookie == 2) {
		rargs->cookie = (nfs_cookie4)0;
		rargs->cookieverf = 0;
	} else {
		rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie;
		mutex_enter(&rp->r_statelock);
		rargs->cookieverf = rp->r_cookieverf4;
		mutex_exit(&rp->r_statelock);
	}
	rargs->dircount = MIN(rdc->buflen, mi->mi_tsize);
	rargs->maxcount = mi->mi_tsize;
	rargs->attr_request = rd_bitsval;
	rargs->rdc = rdc;
	rargs->dvp = vp;
	rargs->mi = mi;
	rargs->cr = cr;


	/*
	 * If count < than the minimum required, we return no entries
	 * and fail with EINVAL
	 */
	if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) {
		rdc->error = EINVAL;
		goto out;
	}

	if (args.array_len == 5) {
		/*
		 * Add lookupp and getattr for parent nodeid.
		 */
		argop[2].argop = OP_LOOKUPP;

		argop[3].argop = OP_GETFH;

		/* getattr parent */
		argop[4].argop = OP_GETATTR;
		argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
		argop[4].nfs_argop4_u.opgetattr.mi = mi;
	}

	doqueue = 1;

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	/* capture the time of this call */
	rargs->t = t = gethrtime();

	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
		mutex_exit(&mi->mi_lock);
	}

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);

	/*
	 * If RPC error occurred and it isn't an error that
	 * triggers recovery, then go ahead and fail now.
	 */
	if (e.error != 0 && !needrecov) {
		rdc->error = e.error;
		goto out;
	}

	if (needrecov) {
		bool_t abort;

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4readdir: initiating recovery.\n"));

		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_READDIR, NULL, NULL, NULL);
		if (abort == FALSE) {
			/*
			 * Recovery was started: end this operation, throw
			 * away anything decoded so far and retry the whole
			 * request.
			 */
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
			    &recov_state, needrecov);
			if (!e.error)
				xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			if (rdc->entries != NULL) {
				kmem_free(rdc->entries, rdc->entlen);
				rdc->entries = NULL;
			}
			goto recov_retry;
		}

		if (e.error != 0) {
			rdc->error = e.error;
			goto out;
		}

		/* fall through for res.status case */
	}

	res_opcnt = res.array_len;

	/*
	 * If compound failed first 2 ops (PUTFH+READDIR), then return
	 * failure here. Subsequent ops are for filling out dot-dot
	 * dirent, and if they fail, we still want to give the caller
	 * the dirents returned by (the successful) READDIR op, so we need
	 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR).
	 *
	 * One example where PUTFH+READDIR ops would succeed but
	 * LOOKUPP+GETATTR would fail would be a dir that has r perm
	 * but lacks x. In this case, a POSIX server's VOP_READDIR
	 * would succeed; however, VOP_LOOKUP(..) would fail since no
	 * x perm. We need to come up with a non-vendor-specific way
	 * for a POSIX server to return d_ino from dotdot's dirent if
	 * client only requests mounted_on_fileid, and just say the
	 * LOOKUPP succeeded and fill out the GETATTR. However, if
	 * client requested any mandatory attrs, server would be required
	 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR
	 * for dotdot.
	 */

	if (res.status) {
		if (res_opcnt <= 2) {
			/*
			 * This path does its own end_fop and secinfo
			 * check and returns directly instead of going
			 * through the "out" label.
			 */
			e.error = geterrno4(res.status);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
			    &recov_state, needrecov);
			nfs4_purge_stale_fh(e.error, vp, cr);
			rdc->error = e.error;
			xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			if (rdc->entries != NULL) {
				kmem_free(rdc->entries, rdc->entlen);
				rdc->entries = NULL;
			}
			/*
			 * If readdir a node that is a stub for a
			 * crossed mount point, keep the original
			 * secinfo flavor for the current file system,
			 * not the crossed one.
			 */
			(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
			return;
		}
	}

	resop = &res.array[1];	/* readdir res */
	rd_res = &resop->nfs_resop4_u.opreaddirclnt;

	/* Remember the verifier for the next non-initial READDIR. */
	mutex_enter(&rp->r_statelock);
	rp->r_cookieverf4 = rd_res->cookieverf;
	mutex_exit(&rp->r_statelock);

	/*
	 * For "." and ".." entries
	 * e.g.
	 *	seek(cookie=0) -> "." entry with d_off = 1
	 *	seek(cookie=1) -> ".." entry with d_off = 2
	 */
	if (cookie == (nfs_cookie4) 0) {
		if (rd_res->dotp)
			rd_res->dotp->d_ino = nodeid;
		if (rd_res->dotdotp)
			rd_res->dotdotp->d_ino = pnodeid;
	}
	if (cookie == (nfs_cookie4) 1) {
		if (rd_res->dotdotp)
			rd_res->dotdotp->d_ino = pnodeid;
	}


	/* LOOKUPP+GETATTR attempted */
	if (args.array_len == 5 && rd_res->dotdotp) {
		if (res.status == NFS4_OK && res_opcnt == 5) {
			nfs_fh4 *fhp;
			nfs4_sharedfh_t *sfhp;
			vnode_t *pvp;
			nfs4_ga_res_t *garp;

			resop++;	/* lookupp */
			resop++;	/* getfh */
			fhp = &resop->nfs_resop4_u.opgetfh.object;

			resop++;	/* getattr of parent */

			/*
			 * First, take care of finishing the
			 * readdir results.
			 */
			garp = &resop->nfs_resop4_u.opgetattr.ga_res;
			/*
			 * The d_ino of .. must be the inode number
			 * of the mounted filesystem.
			 */
			if (garp->n4g_va.va_mask & AT_NODEID)
				rd_res->dotdotp->d_ino =
				    garp->n4g_va.va_nodeid;


			/*
			 * Next, create the ".." dnlc entry
			 */
			sfhp = sfh4_get(fhp, mi);
			if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) {
				dnlc_update(vp, "..", pvp);
				VN_RELE(pvp);
			}
			sfh4_rele(&sfhp);
		}
	}

	if (mi->mi_io_kstats) {
		mutex_enter(&mi->mi_lock);
		KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
		KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen;
		mutex_exit(&mi->mi_lock);
	}

	xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

out:
	/*
	 * If readdir a node that is a stub for a crossed mount point,
	 * keep the original secinfo flavor for the current file system,
	 * not the crossed one.
	 */
	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);

	nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov);
}


/*
 * Perform the over-the-wire I/O described by the buf bp.
 *
 * Reads (B_READ set) call nfs4read() for b_bcount bytes at the byte offset
 * derived from b_lblkno.  Any unread tail of the buffer is zeroed, and
 * NFS_EOF is returned when nothing at all was read and the offset is at or
 * beyond r_size.
 *
 * Writes call nfs4write() for at most (r_size - offset) bytes, unless the
 * rnode is already marked R4STALE, in which case the saved r_error (or
 * ESTALE if that is 0) is returned without going over the wire.
 *
 * In both directions the credential comes from nfs4_get_otw_cred_by_osp(),
 * and an EACCES result is retried with the next credential until that
 * routine reports last_time.
 *
 * stab_comm is passed through to nfs4write(); readahead is passed through
 * to nfs4read().
 *
 * Returns 0, NFS_EOF or an errno value.  B_ERROR is set on bp for any error
 * other than NFS_EOF.
 */
static int
nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead)
{
	rnode4_t *rp = VTOR4(bp->b_vp);
	int count;
	int error;
	cred_t *cred_otw = NULL;
	offset_t offset;
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting otw cred */
	bool_t last_time = FALSE;	/* last time getting otw cred */

	ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone);

	DTRACE_IO1(start, struct buf *, bp);
	offset = ldbtob(bp->b_lblkno);

	if (bp->b_flags & B_READ) {
read_again:
		/*
		 * Releases the osp, if it is provided.
		 * Puts a hold on the cred_otw and the new osp (if found).
		 */
		cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
		    &first_time, &last_time);
		error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr,
		    offset, bp->b_bcount, &bp->b_resid, cred_otw,
		    readahead, NULL);
		crfree(cred_otw);
		if (!error) {
			if (bp->b_resid) {
				/*
				 * Didn't get it all because we hit EOF,
				 * zero all the memory beyond the EOF.
				 */
				bzero(bp->b_un.b_addr +
				    bp->b_bcount - bp->b_resid, bp->b_resid);
			}
			mutex_enter(&rp->r_statelock);
			if (bp->b_resid == bp->b_bcount &&
			    offset >= rp->r_size) {
				/*
				 * We didn't read anything at all as we are
				 * past EOF. Return an error indicator back
				 * but don't destroy the pages (yet).
				 */
				error = NFS_EOF;
			}
			mutex_exit(&rp->r_statelock);
		} else if (error == EACCES && last_time == FALSE) {
				goto read_again;
		}
	} else {
		if (!(rp->r_flags & R4STALE)) {
write_again:
			/*
			 * Releases the osp, if it is provided.
			 * Puts a hold on the cred_otw and the new
			 * osp (if found).
			 */
			cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
			    &first_time, &last_time);
			/* Never write past what we believe is EOF. */
			mutex_enter(&rp->r_statelock);
			count = MIN(bp->b_bcount, rp->r_size - offset);
			mutex_exit(&rp->r_statelock);
			if (count < 0)
				cmn_err(CE_PANIC, "nfs4_bio: write count < 0");
#ifdef DEBUG
			if (count == 0) {
				zoneid_t zoneid = getzoneid();

				zcmn_err(zoneid, CE_WARN,
				    "nfs4_bio: zero length write at %lld",
				    offset);
				zcmn_err(zoneid, CE_CONT, "flags=0x%x, "
				    "b_bcount=%ld, file size=%lld",
				    rp->r_flags, (long)bp->b_bcount,
				    rp->r_size);
				sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh);
				if (nfs4_bio_do_stop)
					debug_enter("nfs4_bio");
			}
#endif
			error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset,
			    count, cred_otw, stab_comm);
			if (error == EACCES && last_time == FALSE) {
				crfree(cred_otw);
				goto write_again;
			}
			bp->b_error = error;
			if (error && error != EINTR &&
			    !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
				/*
				 * Don't print EDQUOT errors on the console.
				 * Don't print asynchronous EACCES errors.
				 * Don't print EFBIG errors.
				 * Print all other write errors.
				 */
				if (error != EDQUOT && error != EFBIG &&
				    (error != EACCES ||
				    !(bp->b_flags & B_ASYNC)))
					nfs4_write_error(bp->b_vp,
					    error, cred_otw);
				/*
				 * Update r_error and r_flags as appropriate.
				 * If the error was ESTALE, then mark the
				 * rnode as not being writeable and save
				 * the error status. Otherwise, save any
				 * errors which occur from asynchronous
				 * page invalidations. Any errors occurring
				 * from other operations should be saved
				 * by the caller.
				 */
				mutex_enter(&rp->r_statelock);
				if (error == ESTALE) {
					rp->r_flags |= R4STALE;
					if (!rp->r_error)
						rp->r_error = error;
				} else if (!rp->r_error &&
				    (bp->b_flags &
				    (B_INVAL|B_FORCE|B_ASYNC)) ==
				    (B_INVAL|B_FORCE|B_ASYNC)) {
					rp->r_error = error;
				}
				mutex_exit(&rp->r_statelock);
			}
			crfree(cred_otw);
		} else {
			error = rp->r_error;
			/*
			 * A close may have cleared r_error, if so,
			 * propagate ESTALE error return properly
			 */
			if (error == 0)
				error = ESTALE;
		}
	}

	if (error != 0 && error != NFS_EOF)
		bp->b_flags |= B_ERROR;

	if (osp)
		open_stream_rele(osp, rp);

	DTRACE_IO1(done, struct buf *, bp);

	return (error);
}

/*
 * File identifier request for an NFSv4 vnode.  Always returns EREMOTE;
 * fidp is never written.
 */
/* ARGSUSED */
int
nfs4_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
{
	return (EREMOTE);
}

/*
 * Acquire the rnode's r_rwlock ahead of a read or write.
 *
 * Readers (write_lock == 0) take the lock as RW_READER.  Writers normally
 * take it as RW_WRITER.  However, when direct I/O is enabled on the rnode
 * (R4DIRECTIO) or on the mount (MI4_DIRECTIO) and the file has no mappings
 * (r_mapcnt == 0) and no cached pages, a writer keeps only the reader lock.
 *
 * Returns V_WRITELOCK_TRUE when the lock is held as writer and
 * V_WRITELOCK_FALSE when it is held as reader.  The return value of
 * nfs_rw_enter_sig() is deliberately ignored.
 */
/* ARGSUSED2 */
int
nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
{
	rnode4_t *rp = VTOR4(vp);

	if (!write_lock) {
		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
		return (V_WRITELOCK_FALSE);
	}

	if ((rp->r_flags & R4DIRECTIO) ||
	    (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) {
		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
		if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp))
			return (V_WRITELOCK_FALSE);
		/* Mapped or cached: fall back to the exclusive lock. */
		nfs_rw_exit(&rp->r_rwlock);
	}

	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
	return (V_WRITELOCK_TRUE);
}

/*
 * Release r_rwlock taken by nfs4_rwlock().  write_lock is not consulted;
 * the same exit call is used for reader and writer holds.
 */
/* ARGSUSED */
void
nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
{
	rnode4_t *rp = VTOR4(vp);

	nfs_rw_exit(&rp->r_rwlock);
}

/*
 * Validate a new file offset.  *noffp is only checked, never modified.
 *
 * Returns EIO when called from a zone other than the one owning the mount,
 * EINVAL for a negative offset on a non-directory, and 0 otherwise.
 */
/* ARGSUSED */
static int
nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
{
	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	/*
	 * Because we stuff the readdir cookie into the offset field
	 * someone may attempt to do an lseek with the cookie which
	 * we want to succeed.
	 */
	if (vp->v_type == VDIR)
		return (0);
	if (*noffp < 0)
		return (EINVAL);
	return (0);
}


/*
 * Return all the pages from [off..off+len) in file
 *
 * A shadow vnode is first replaced by the rnode's real vnode.  The caches
 * are validated, and the per-page work is done by nfs4_getapage() through
 * pvn_getpages().  If protp is non-NULL it is set to PROT_ALL.
 *
 * Returns 0 or an errno value: EIO for a zone mismatch, ENOSYS if the vnode
 * is VNOMAP, EFAULT if the range lies beyond EOF for a non-segkmap segment,
 * otherwise the error from cache validation or pvn_getpages().  An NFS_EOF
 * result purges the caches and retries rather than being returned.
 */
/* ARGSUSED */
static int
nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
	rnode4_t *rp;
	int error;
	mntinfo4_t *mi;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);
	rp = VTOR4(vp);
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	if (vp->v_flag & VNOMAP)
		return (ENOSYS);

	if (protp != NULL)
		*protp = PROT_ALL;

	/*
	 * Now validate that the caches are up to date.
	 */
	if (error = nfs4_validate_caches(vp, cr))
		return (error);

	mi = VTOMI4(vp);
retry:
	mutex_enter(&rp->r_statelock);

	/*
	 * Don't create dirty pages faster than they
	 * can be cleaned so that the system doesn't
	 * get imbalanced. If the async queue is
	 * maxed out, then wait for it to drain before
	 * creating more dirty pages. Also, wait for
	 * any threads doing pagewalks in the vop_getattr
	 * entry points so that they don't block for
	 * long periods.
	 */
	if (rw == S_CREATE) {
		while ((mi->mi_max_threads != 0 &&
		    rp->r_awcount > 2 * mi->mi_max_threads) ||
		    rp->r_gcount > 0)
			cv_wait(&rp->r_cv, &rp->r_statelock);
	}

	/*
	 * If we are getting called as a side effect of an nfs_write()
	 * operation the local file size might not be extended yet.
	 * In this case we want to be able to return pages of zeroes.
	 */
	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
		NFS4_DEBUG(nfs4_pageio_debug,
		    (CE_NOTE, "getpage beyond EOF: off=%lld, "
		    "len=%llu, size=%llu, attrsize =%llu", off,
		    (u_longlong_t)len, rp->r_size, rp->r_attr.va_size));
		mutex_exit(&rp->r_statelock);
		return (EFAULT);		/* beyond EOF */
	}

	mutex_exit(&rp->r_statelock);

	error = pvn_getpages(nfs4_getapage, vp, off, len, protp,
	    pl, plsz, seg, addr, rw, cr);
	NFS4_DEBUG(nfs4_pageio_debug && error,
	    (CE_NOTE, "getpages error %d; off=%lld, len=%lld",
	    error, off, (u_longlong_t)len));

	/* Any error not listed here is returned to the caller unchanged. */
	switch (error) {
	case NFS_EOF:
		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE);
		goto retry;
	case ESTALE:
		nfs4_purge_stale_fh(error, vp, cr);
	}

	return (error);
}

/*
 * Called from pvn_getpages to get a particular page.
 *
 * Depending on the request this either
 *  - finds the page already present and returns it locked in pl[],
 *  - creates a fresh page without reading it (rw == S_CREATE),
 *  - reads the containing block from the server through nfs4_bio(), or
 *  - only queues an asynchronous readahead (pl == NULL).
 * Before any of that, asynchronous readaheads are queued when the access
 * pattern recorded in r_nextr suggests it.
 *
 * Returns 0 or an errno value; an NFS_EOF result from nfs4_bio() becomes 0
 * for segkmap and EFAULT for any other segment.
 */
/* ARGSUSED */
static int
nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr)
{
	rnode4_t *rp;
	uint_t bsize;
	struct buf *bp;
	page_t *pp;
	u_offset_t lbn;
	u_offset_t io_off;
	u_offset_t blkoff;
	u_offset_t rablkoff;
	size_t io_len;
	uint_t blksize;
	int error;
	int readahead;
	int readahead_issued = 0;
	int ra_window; /* readahead window */
	page_t *pagefound;
	page_t *savepp;

	if (nfs_zone() != VTOMI4(vp)->mi_zone)
		return (EIO);

	rp = VTOR4(vp);
	ASSERT(!IS_SHADOW(vp, rp));
	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

	/* Restart point when a page found earlier has since disappeared. */
reread:
	bp = NULL;
	pp = NULL;
	pagefound = NULL;

	if (pl != NULL)
		pl[0] = NULL;

	error = 0;
	lbn = off / bsize;
	blkoff = lbn * bsize;

	/*
	 * Queueing up the readahead before doing the synchronous read
	 * results in a significant increase in read throughput because
	 * of the increased parallelism between the async threads and
	 * the process context.
	 */
	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
	    rw != S_CREATE &&
	    !(vp->v_flag & VNOCACHE)) {
		mutex_enter(&rp->r_statelock);

		/*
		 * Calculate the number of readaheads to do.
		 * a) No readaheads at offset = 0.
		 * b) Do maximum(nfs4_nra) readaheads when the readahead
		 *    window is closed.
		 * c) Do readaheads between 1 to (nfs4_nra - 1) depending
		 *    upon how far the readahead window is open or close.
		 * d) No readaheads if rp->r_nextr is not within the scope
		 *    of the readahead window (random i/o).
		 */

		if (off == 0)
			readahead = 0;
		else if (blkoff == rp->r_nextr)
			readahead = nfs4_nra;
		else if (rp->r_nextr > blkoff &&
		    ((ra_window = (rp->r_nextr - blkoff) / bsize)
		    <= (nfs4_nra - 1)))
			readahead = nfs4_nra - ra_window;
		else
			readahead = 0;

		/* r_statelock is dropped around each async request. */
		rablkoff = rp->r_nextr;
		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
			mutex_exit(&rp->r_statelock);
			if (nfs4_async_readahead(vp, rablkoff + bsize,
			    addr + (rablkoff + bsize - off),
			    seg, cr, nfs4_readahead) < 0) {
				mutex_enter(&rp->r_statelock);
				break;
			}
			readahead--;
			rablkoff += bsize;
			/*
			 * Indicate that we did a readahead so
			 * readahead offset is not updated
			 * by the synchronous read below.
			 */
			readahead_issued = 1;
			mutex_enter(&rp->r_statelock);
			/*
			 * set readahead offset to
			 * offset of last async readahead
			 * request.
			 */
			rp->r_nextr = rablkoff;
		}
		mutex_exit(&rp->r_statelock);
	}

again:
	if ((pagefound = page_exists(vp, off)) == NULL) {
		if (pl == NULL) {
			/* No page list wanted: just schedule the read. */
			(void) nfs4_async_readahead(vp, blkoff, addr, seg, cr,
			    nfs4_readahead);
		} else if (rw == S_CREATE) {
			/*
			 * Block for this page is not allocated, or the offset
			 * is beyond the current allocation size, or we're
			 * allocating a swap slot and the page was not found,
			 * so allocate it and return a zero page.
			 */
			if ((pp = page_create_va(vp, off,
			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
				cmn_err(CE_PANIC, "nfs4_getapage: page_create");
			io_len = PAGESIZE;
			mutex_enter(&rp->r_statelock);
			rp->r_nextr = off + PAGESIZE;
			mutex_exit(&rp->r_statelock);
		} else {
			/*
			 * Need to go to server to get a block
			 */
			mutex_enter(&rp->r_statelock);
			if (blkoff < rp->r_size &&
			    blkoff + bsize > rp->r_size) {
				/*
				 * If less than a block left in
				 * file read less than a block.
				 */
				if (rp->r_size <= off) {
					/*
					 * Trying to access beyond EOF,
					 * set up to get at least one page.
					 */
					blksize = off + PAGESIZE - blkoff;
				} else
					blksize = rp->r_size - blkoff;
			} else if ((off == 0) ||
			    (off != rp->r_nextr && !readahead_issued)) {
				blksize = PAGESIZE;
				blkoff = off; /* block = page here */
			} else
				blksize = bsize;
			mutex_exit(&rp->r_statelock);

			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
			    &io_len, blkoff, blksize, 0);

			/*
			 * Some other thread has entered the page,
			 * so just use it.
			 */
			if (pp == NULL)
				goto again;

			/*
			 * Now round the request size up to page boundaries.
			 * This ensures that the entire page will be
			 * initialized to zeroes if EOF is encountered.
			 */
			io_len = ptob(btopr(io_len));

			bp = pageio_setup(pp, io_len, vp, B_READ);
			ASSERT(bp != NULL);

			/*
			 * pageio_setup should have set b_addr to 0. This
			 * is correct since we want to do I/O on a page
			 * boundary. bp_mapin will use this addr to calculate
			 * an offset, and then set b_addr to the kernel virtual
			 * address it allocated for us.
			 */
			ASSERT(bp->b_un.b_addr == 0);

			bp->b_edev = 0;
			bp->b_dev = 0;
			bp->b_lblkno = lbtodb(io_off);
			bp->b_file = vp;
			bp->b_offset = (offset_t)off;
			bp_mapin(bp);

			/*
			 * If doing a write beyond what we believe is EOF,
			 * don't bother trying to read the pages from the
			 * server, we'll just zero the pages here. We
			 * don't check that the rw flag is S_WRITE here
			 * because some implementations may attempt a
			 * read access to the buffer before copying data.
			 */
			mutex_enter(&rp->r_statelock);
			if (io_off >= rp->r_size && seg == segkmap) {
				mutex_exit(&rp->r_statelock);
				bzero(bp->b_un.b_addr, io_len);
			} else {
				mutex_exit(&rp->r_statelock);
				error = nfs4_bio(bp, NULL, cr, FALSE);
			}

			/*
			 * Unmap the buffer before freeing it.
			 */
			bp_mapout(bp);
			pageio_done(bp);

			/* Tag every page on the circular list just read. */
			savepp = pp;
			do {
				pp->p_fsdata = C_NOCOMMIT;
			} while ((pp = pp->p_next) != savepp);

			if (error == NFS_EOF) {
				/*
				 * If doing a write system call just return
				 * zeroed pages, else user tried to get pages
				 * beyond EOF, return error. We don't check
				 * that the rw flag is S_WRITE here because
				 * some implementations may attempt a read
				 * access to the buffer before copying data.
				 */
				if (seg == segkmap)
					error = 0;
				else
					error = EFAULT;
			}

			if (!readahead_issued && !error) {
				mutex_enter(&rp->r_statelock);
				rp->r_nextr = io_off + io_len;
				mutex_exit(&rp->r_statelock);
			}
		}
	}

	/* NOTE(review): no goto in this function targets the "out" label. */
out:
	if (pl == NULL)
		return (error);

	if (error) {
		if (pp != NULL)
			pvn_read_done(pp, B_ERROR);
		return (error);
	}

	if (pagefound) {
		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);

		/*
		 * Page exists in the cache, acquire the appropriate lock.
		 * If this fails, start all over again.
		 */
		if ((pp = page_lookup(vp, off, se)) == NULL) {
#ifdef DEBUG
			nfs4_lostpage++;
#endif
			goto reread;
		}
		pl[0] = pp;
		pl[1] = NULL;
		return (0);
	}

	if (pp != NULL)
		pvn_plist_init(pp, pl, plsz, off, io_len, rw);

	return (error);
}

static void
nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
    cred_t *cr)
{
	int error;
	page_t *pp;
	u_offset_t io_off;
	size_t io_len;
	struct buf *bp;
	uint_t bsize, blksize;
	rnode4_t *rp = VTOR4(vp);
	page_t *savepp;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);

	mutex_enter(&rp->r_statelock);
	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
		/*
		 * If less than a block left in file read less
		 * than a block.
10101 */ 10102 blksize = rp->r_size - blkoff; 10103 } else 10104 blksize = bsize; 10105 mutex_exit(&rp->r_statelock); 10106 10107 pp = pvn_read_kluster(vp, blkoff, segkmap, addr, 10108 &io_off, &io_len, blkoff, blksize, 1); 10109 /* 10110 * The isra flag passed to the kluster function is 1, we may have 10111 * gotten a return value of NULL for a variety of reasons (# of free 10112 * pages < minfree, someone entered the page on the vnode etc). In all 10113 * cases, we want to punt on the readahead. 10114 */ 10115 if (pp == NULL) 10116 return; 10117 10118 /* 10119 * Now round the request size up to page boundaries. 10120 * This ensures that the entire page will be 10121 * initialized to zeroes if EOF is encountered. 10122 */ 10123 io_len = ptob(btopr(io_len)); 10124 10125 bp = pageio_setup(pp, io_len, vp, B_READ); 10126 ASSERT(bp != NULL); 10127 10128 /* 10129 * pageio_setup should have set b_addr to 0. This is correct since 10130 * we want to do I/O on a page boundary. bp_mapin() will use this addr 10131 * to calculate an offset, and then set b_addr to the kernel virtual 10132 * address it allocated for us. 10133 */ 10134 ASSERT(bp->b_un.b_addr == 0); 10135 10136 bp->b_edev = 0; 10137 bp->b_dev = 0; 10138 bp->b_lblkno = lbtodb(io_off); 10139 bp->b_file = vp; 10140 bp->b_offset = (offset_t)blkoff; 10141 bp_mapin(bp); 10142 10143 /* 10144 * If doing a write beyond what we believe is EOF, don't bother trying 10145 * to read the pages from the server, we'll just zero the pages here. 10146 * We don't check that the rw flag is S_WRITE here because some 10147 * implementations may attempt a read access to the buffer before 10148 * copying data. 
10149 */ 10150 mutex_enter(&rp->r_statelock); 10151 if (io_off >= rp->r_size && seg == segkmap) { 10152 mutex_exit(&rp->r_statelock); 10153 bzero(bp->b_un.b_addr, io_len); 10154 error = 0; 10155 } else { 10156 mutex_exit(&rp->r_statelock); 10157 error = nfs4_bio(bp, NULL, cr, TRUE); 10158 if (error == NFS_EOF) 10159 error = 0; 10160 } 10161 10162 /* 10163 * Unmap the buffer before freeing it. 10164 */ 10165 bp_mapout(bp); 10166 pageio_done(bp); 10167 10168 savepp = pp; 10169 do { 10170 pp->p_fsdata = C_NOCOMMIT; 10171 } while ((pp = pp->p_next) != savepp); 10172 10173 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 10174 10175 /* 10176 * In case of error set readahead offset 10177 * to the lowest offset. 10178 * pvn_read_done() calls VN_DISPOSE to destroy the pages 10179 */ 10180 if (error && rp->r_nextr > io_off) { 10181 mutex_enter(&rp->r_statelock); 10182 if (rp->r_nextr > io_off) 10183 rp->r_nextr = io_off; 10184 mutex_exit(&rp->r_statelock); 10185 } 10186 } 10187 10188 /* 10189 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 10190 * If len == 0, do from off to EOF. 10191 * 10192 * The normal cases should be len == 0 && off == 0 (entire vp list) or 10193 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 10194 * (from pageout). 10195 */ 10196 /* ARGSUSED */ 10197 static int 10198 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, 10199 caller_context_t *ct) 10200 { 10201 int error; 10202 rnode4_t *rp; 10203 10204 ASSERT(cr != NULL); 10205 10206 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 10207 return (EIO); 10208 10209 rp = VTOR4(vp); 10210 if (IS_SHADOW(vp, rp)) 10211 vp = RTOV4(rp); 10212 10213 /* 10214 * XXX - Why should this check be made here? 
10215 */ 10216 if (vp->v_flag & VNOMAP) 10217 return (ENOSYS); 10218 10219 if (len == 0 && !(flags & B_INVAL) && 10220 (vp->v_vfsp->vfs_flag & VFS_RDONLY)) 10221 return (0); 10222 10223 mutex_enter(&rp->r_statelock); 10224 rp->r_count++; 10225 mutex_exit(&rp->r_statelock); 10226 error = nfs4_putpages(vp, off, len, flags, cr); 10227 mutex_enter(&rp->r_statelock); 10228 rp->r_count--; 10229 cv_broadcast(&rp->r_cv); 10230 mutex_exit(&rp->r_statelock); 10231 10232 return (error); 10233 } 10234 10235 /* 10236 * Write out a single page, possibly klustering adjacent dirty pages. 10237 */ 10238 int 10239 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, 10240 int flags, cred_t *cr) 10241 { 10242 u_offset_t io_off; 10243 u_offset_t lbn_off; 10244 u_offset_t lbn; 10245 size_t io_len; 10246 uint_t bsize; 10247 int error; 10248 rnode4_t *rp; 10249 10250 ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY)); 10251 ASSERT(pp != NULL); 10252 ASSERT(cr != NULL); 10253 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone); 10254 10255 rp = VTOR4(vp); 10256 ASSERT(rp->r_count > 0); 10257 ASSERT(!IS_SHADOW(vp, rp)); 10258 10259 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 10260 lbn = pp->p_offset / bsize; 10261 lbn_off = lbn * bsize; 10262 10263 /* 10264 * Find a kluster that fits in one block, or in 10265 * one page if pages are bigger than blocks. If 10266 * there is less file space allocated than a whole 10267 * page, we'll shorten the i/o request below. 10268 */ 10269 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off, 10270 roundup(bsize, PAGESIZE), flags); 10271 10272 /* 10273 * pvn_write_kluster shouldn't have returned a page with offset 10274 * behind the original page we were given. Verify that. 10275 */ 10276 ASSERT((pp->p_offset / bsize) >= lbn); 10277 10278 /* 10279 * Now pp will have the list of kept dirty pages marked for 10280 * write back. It will also handle invalidation and freeing 10281 * of pages that are not dirty. 
Check for page length rounding 10282 * problems. 10283 */ 10284 if (io_off + io_len > lbn_off + bsize) { 10285 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE); 10286 io_len = lbn_off + bsize - io_off; 10287 } 10288 /* 10289 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10290 * consistent value of r_size. R4MODINPROGRESS is set in writerp4(). 10291 * When R4MODINPROGRESS is set it indicates that a uiomove() is in 10292 * progress and the r_size has not been made consistent with the 10293 * new size of the file. When the uiomove() completes the r_size is 10294 * updated and the R4MODINPROGRESS flag is cleared. 10295 * 10296 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10297 * consistent value of r_size. Without this handshaking, it is 10298 * possible that nfs4_bio() picks up the old value of r_size 10299 * before the uiomove() in writerp4() completes. This will result 10300 * in the write through nfs4_bio() being dropped. 10301 * 10302 * More precisely, there is a window between the time the uiomove() 10303 * completes and the time the r_size is updated. If a VOP_PUTPAGE() 10304 * operation intervenes in this window, the page will be picked up, 10305 * because it is dirty (it will be unlocked, unless it was 10306 * pagecreate'd). When the page is picked up as dirty, the dirty 10307 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is 10308 * checked. This will still be the old size. Therefore the page will 10309 * not be written out. When segmap_release() calls VOP_PUTPAGE(), 10310 * the page will be found to be clean and the write will be dropped. 10311 */ 10312 if (rp->r_flags & R4MODINPROGRESS) { 10313 mutex_enter(&rp->r_statelock); 10314 if ((rp->r_flags & R4MODINPROGRESS) && 10315 rp->r_modaddr + MAXBSIZE > io_off && 10316 rp->r_modaddr < io_off + io_len) { 10317 page_t *plist; 10318 /* 10319 * A write is in progress for this region of the file. 
10320 * If we did not detect R4MODINPROGRESS here then this 10321 * path through nfs_putapage() would eventually go to 10322 * nfs4_bio() and may not write out all of the data 10323 * in the pages. We end up losing data. So we decide 10324 * to set the modified bit on each page in the page 10325 * list and mark the rnode with R4DIRTY. This write 10326 * will be restarted at some later time. 10327 */ 10328 plist = pp; 10329 while (plist != NULL) { 10330 pp = plist; 10331 page_sub(&plist, pp); 10332 hat_setmod(pp); 10333 page_io_unlock(pp); 10334 page_unlock(pp); 10335 } 10336 rp->r_flags |= R4DIRTY; 10337 mutex_exit(&rp->r_statelock); 10338 if (offp) 10339 *offp = io_off; 10340 if (lenp) 10341 *lenp = io_len; 10342 return (0); 10343 } 10344 mutex_exit(&rp->r_statelock); 10345 } 10346 10347 if (flags & B_ASYNC) { 10348 error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr, 10349 nfs4_sync_putapage); 10350 } else 10351 error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr); 10352 10353 if (offp) 10354 *offp = io_off; 10355 if (lenp) 10356 *lenp = io_len; 10357 return (error); 10358 } 10359 10360 static int 10361 nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 10362 int flags, cred_t *cr) 10363 { 10364 int error; 10365 rnode4_t *rp; 10366 10367 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10368 10369 flags |= B_WRITE; 10370 10371 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 10372 10373 rp = VTOR4(vp); 10374 10375 if ((error == ENOSPC || error == EDQUOT || error == EFBIG || 10376 error == EACCES) && 10377 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) { 10378 if (!(rp->r_flags & R4OUTOFSPACE)) { 10379 mutex_enter(&rp->r_statelock); 10380 rp->r_flags |= R4OUTOFSPACE; 10381 mutex_exit(&rp->r_statelock); 10382 } 10383 flags |= B_ERROR; 10384 pvn_write_done(pp, flags); 10385 /* 10386 * If this was not an async thread, then try again to 10387 * write out the pages, but this time, also destroy 10388 * them whether or 
not the write is successful. This 10389 * will prevent memory from filling up with these 10390 * pages and destroying them is the only alternative 10391 * if they can't be written out. 10392 * 10393 * Don't do this if this is an async thread because 10394 * when the pages are unlocked in pvn_write_done, 10395 * some other thread could have come along, locked 10396 * them, and queued for an async thread. It would be 10397 * possible for all of the async threads to be tied 10398 * up waiting to lock the pages again and they would 10399 * all already be locked and waiting for an async 10400 * thread to handle them. Deadlock. 10401 */ 10402 if (!(flags & B_ASYNC)) { 10403 error = nfs4_putpage(vp, io_off, io_len, 10404 B_INVAL | B_FORCE, cr, NULL); 10405 } 10406 } else { 10407 if (error) 10408 flags |= B_ERROR; 10409 else if (rp->r_flags & R4OUTOFSPACE) { 10410 mutex_enter(&rp->r_statelock); 10411 rp->r_flags &= ~R4OUTOFSPACE; 10412 mutex_exit(&rp->r_statelock); 10413 } 10414 pvn_write_done(pp, flags); 10415 if (freemem < desfree) 10416 (void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr, 10417 NFS4_WRITE_NOWAIT); 10418 } 10419 10420 return (error); 10421 } 10422 10423 #ifdef DEBUG 10424 int nfs4_force_open_before_mmap = 0; 10425 #endif 10426 10427 /* ARGSUSED */ 10428 static int 10429 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 10430 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 10431 caller_context_t *ct) 10432 { 10433 struct segvn_crargs vn_a; 10434 int error = 0; 10435 rnode4_t *rp = VTOR4(vp); 10436 mntinfo4_t *mi = VTOMI4(vp); 10437 10438 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10439 return (EIO); 10440 10441 if (vp->v_flag & VNOMAP) 10442 return (ENOSYS); 10443 10444 if (off < 0 || (off + len) < 0) 10445 return (ENXIO); 10446 10447 if (vp->v_type != VREG) 10448 return (ENODEV); 10449 10450 /* 10451 * If the file is delegated to the client don't do anything. 10452 * If the file is not delegated, then validate the data cache. 
10453 */ 10454 mutex_enter(&rp->r_statev4_lock); 10455 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) { 10456 mutex_exit(&rp->r_statev4_lock); 10457 error = nfs4_validate_caches(vp, cr); 10458 if (error) 10459 return (error); 10460 } else { 10461 mutex_exit(&rp->r_statev4_lock); 10462 } 10463 10464 /* 10465 * Check to see if the vnode is currently marked as not cachable. 10466 * This means portions of the file are locked (through VOP_FRLOCK). 10467 * In this case the map request must be refused. We use 10468 * rp->r_lkserlock to avoid a race with concurrent lock requests. 10469 * 10470 * Atomically increment r_inmap after acquiring r_rwlock. The 10471 * idea here is to acquire r_rwlock to block read/write and 10472 * not to protect r_inmap. r_inmap will inform nfs4_read/write() 10473 * that we are in nfs4_map(). Now, r_rwlock is acquired in order 10474 * and we can prevent the deadlock that would have occurred 10475 * when nfs4_addmap() would have acquired it out of order. 10476 * 10477 * Since we are not protecting r_inmap by any lock, we do not 10478 * hold any lock when we decrement it. We atomically decrement 10479 * r_inmap after we release r_lkserlock. 10480 */ 10481 10482 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR4(vp))) 10483 return (EINTR); 10484 atomic_inc_uint(&rp->r_inmap); 10485 nfs_rw_exit(&rp->r_rwlock); 10486 10487 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) { 10488 atomic_dec_uint(&rp->r_inmap); 10489 return (EINTR); 10490 } 10491 10492 if (vp->v_flag & VNOCACHE) { 10493 error = EAGAIN; 10494 goto done; 10495 } 10496 10497 /* 10498 * Don't allow concurrent locks and mapping if mandatory locking is 10499 * enabled. 
10500 */ 10501 if (flk_has_remote_locks(vp)) { 10502 struct vattr va; 10503 va.va_mask = AT_MODE; 10504 error = nfs4getattr(vp, &va, cr); 10505 if (error != 0) 10506 goto done; 10507 if (MANDLOCK(vp, va.va_mode)) { 10508 error = EAGAIN; 10509 goto done; 10510 } 10511 } 10512 10513 /* 10514 * It is possible that the rnode has a lost lock request that we 10515 * are still trying to recover, and that the request conflicts with 10516 * this map request. 10517 * 10518 * An alternative approach would be for nfs4_safemap() to consider 10519 * queued lock requests when deciding whether to set or clear 10520 * VNOCACHE. This would require the frlock code path to call 10521 * nfs4_safemap() after enqueing a lost request. 10522 */ 10523 if (nfs4_map_lost_lock_conflict(vp)) { 10524 error = EAGAIN; 10525 goto done; 10526 } 10527 10528 as_rangelock(as); 10529 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 10530 if (error != 0) { 10531 as_rangeunlock(as); 10532 goto done; 10533 } 10534 10535 if (vp->v_type == VREG) { 10536 /* 10537 * We need to retrieve the open stream 10538 */ 10539 nfs4_open_stream_t *osp = NULL; 10540 nfs4_open_owner_t *oop = NULL; 10541 10542 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10543 if (oop != NULL) { 10544 /* returns with 'os_sync_lock' held */ 10545 osp = find_open_stream(oop, rp); 10546 open_owner_rele(oop); 10547 } 10548 if (osp == NULL) { 10549 #ifdef DEBUG 10550 if (nfs4_force_open_before_mmap) { 10551 error = EIO; 10552 goto done; 10553 } 10554 #endif 10555 /* returns with 'os_sync_lock' held */ 10556 error = open_and_get_osp(vp, cr, &osp); 10557 if (osp == NULL) { 10558 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10559 "nfs4_map: we tried to OPEN the file " 10560 "but again no osp, so fail with EIO")); 10561 goto done; 10562 } 10563 } 10564 10565 if (osp->os_failed_reopen) { 10566 mutex_exit(&osp->os_sync_lock); 10567 open_stream_rele(osp, rp); 10568 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 10569 "nfs4_map: 
os_failed_reopen set on " 10570 "osp %p, cr %p, rp %s", (void *)osp, 10571 (void *)cr, rnode4info(rp))); 10572 error = EIO; 10573 goto done; 10574 } 10575 mutex_exit(&osp->os_sync_lock); 10576 open_stream_rele(osp, rp); 10577 } 10578 10579 vn_a.vp = vp; 10580 vn_a.offset = off; 10581 vn_a.type = (flags & MAP_TYPE); 10582 vn_a.prot = (uchar_t)prot; 10583 vn_a.maxprot = (uchar_t)maxprot; 10584 vn_a.flags = (flags & ~MAP_TYPE); 10585 vn_a.cred = cr; 10586 vn_a.amp = NULL; 10587 vn_a.szc = 0; 10588 vn_a.lgrp_mem_policy_flags = 0; 10589 10590 error = as_map(as, *addrp, len, segvn_create, &vn_a); 10591 as_rangeunlock(as); 10592 10593 done: 10594 nfs_rw_exit(&rp->r_lkserlock); 10595 atomic_dec_uint(&rp->r_inmap); 10596 return (error); 10597 } 10598 10599 /* 10600 * We're most likely dealing with a kernel module that likes to READ 10601 * and mmap without OPENing the file (ie: lookup/read/mmap), so lets 10602 * officially OPEN the file to create the necessary client state 10603 * for bookkeeping of os_mmap_read/write counts. 10604 * 10605 * Since VOP_MAP only passes in a pointer to the vnode rather than 10606 * a double pointer, we can't handle the case where nfs4open_otw() 10607 * returns a different vnode than the one passed into VOP_MAP (since 10608 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case, 10609 * we return NULL and let nfs4_map() fail. Note: the only case where 10610 * this should happen is if the file got removed and replaced with the 10611 * same name on the server (in addition to the fact that we're trying 10612 * to VOP_MAP withouth VOP_OPENing the file in the first place). 
10613 */ 10614 static int 10615 open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp) 10616 { 10617 rnode4_t *rp, *drp; 10618 vnode_t *dvp, *open_vp; 10619 char file_name[MAXNAMELEN]; 10620 int just_created; 10621 nfs4_open_stream_t *osp; 10622 nfs4_open_owner_t *oop; 10623 int error; 10624 10625 *ospp = NULL; 10626 open_vp = map_vp; 10627 10628 rp = VTOR4(open_vp); 10629 if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0) 10630 return (error); 10631 drp = VTOR4(dvp); 10632 10633 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) { 10634 VN_RELE(dvp); 10635 return (EINTR); 10636 } 10637 10638 if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) { 10639 nfs_rw_exit(&drp->r_rwlock); 10640 VN_RELE(dvp); 10641 return (error); 10642 } 10643 10644 mutex_enter(&rp->r_statev4_lock); 10645 if (rp->created_v4) { 10646 rp->created_v4 = 0; 10647 mutex_exit(&rp->r_statev4_lock); 10648 10649 dnlc_update(dvp, file_name, open_vp); 10650 /* This is needed so we don't bump the open ref count */ 10651 just_created = 1; 10652 } else { 10653 mutex_exit(&rp->r_statev4_lock); 10654 just_created = 0; 10655 } 10656 10657 VN_HOLD(map_vp); 10658 10659 error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0, 10660 just_created); 10661 if (error) { 10662 nfs_rw_exit(&drp->r_rwlock); 10663 VN_RELE(dvp); 10664 VN_RELE(map_vp); 10665 return (error); 10666 } 10667 10668 nfs_rw_exit(&drp->r_rwlock); 10669 VN_RELE(dvp); 10670 10671 /* 10672 * If nfs4open_otw() returned a different vnode then "undo" 10673 * the open and return failure to the caller. 10674 */ 10675 if (!VN_CMP(open_vp, map_vp)) { 10676 nfs4_error_t e; 10677 10678 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10679 "open returned a different vnode")); 10680 /* 10681 * If there's an error, ignore it, 10682 * and let VOP_INACTIVE handle it. 
10683 */ 10684 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10685 CLOSE_NORM, 0, 0, 0); 10686 VN_RELE(map_vp); 10687 return (EIO); 10688 } 10689 10690 VN_RELE(map_vp); 10691 10692 oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp)); 10693 if (!oop) { 10694 nfs4_error_t e; 10695 10696 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10697 "no open owner")); 10698 /* 10699 * If there's an error, ignore it, 10700 * and let VOP_INACTIVE handle it. 10701 */ 10702 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10703 CLOSE_NORM, 0, 0, 0); 10704 return (EIO); 10705 } 10706 osp = find_open_stream(oop, rp); 10707 open_owner_rele(oop); 10708 *ospp = osp; 10709 return (0); 10710 } 10711 10712 /* 10713 * Please be aware that when this function is called, the address space write 10714 * a_lock is held. Do not put over the wire calls in this function. 10715 */ 10716 /* ARGSUSED */ 10717 static int 10718 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10719 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 10720 caller_context_t *ct) 10721 { 10722 rnode4_t *rp; 10723 int error = 0; 10724 mntinfo4_t *mi; 10725 10726 mi = VTOMI4(vp); 10727 rp = VTOR4(vp); 10728 10729 if (nfs_zone() != mi->mi_zone) 10730 return (EIO); 10731 if (vp->v_flag & VNOMAP) 10732 return (ENOSYS); 10733 10734 /* 10735 * Don't need to update the open stream first, since this 10736 * mmap can't add any additional share access that isn't 10737 * already contained in the open stream (for the case where we 10738 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't 10739 * take into account os_mmap_read[write] counts). 10740 */ 10741 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len)); 10742 10743 if (vp->v_type == VREG) { 10744 /* 10745 * We need to retrieve the open stream and update the counts. 10746 * If there is no open stream here, something is wrong. 
10747 */ 10748 nfs4_open_stream_t *osp = NULL; 10749 nfs4_open_owner_t *oop = NULL; 10750 10751 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10752 if (oop != NULL) { 10753 /* returns with 'os_sync_lock' held */ 10754 osp = find_open_stream(oop, rp); 10755 open_owner_rele(oop); 10756 } 10757 if (osp == NULL) { 10758 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10759 "nfs4_addmap: we should have an osp" 10760 "but we don't, so fail with EIO")); 10761 error = EIO; 10762 goto out; 10763 } 10764 10765 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p," 10766 " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot)); 10767 10768 /* 10769 * Update the map count in the open stream. 10770 * This is necessary in the case where we 10771 * open/mmap/close/, then the server reboots, and we 10772 * attempt to reopen. If the mmap doesn't add share 10773 * access then we send an invalid reopen with 10774 * access = NONE. 10775 * 10776 * We need to specifically check each PROT_* so a mmap 10777 * call of (PROT_WRITE | PROT_EXEC) will ensure us both 10778 * read and write access. A simple comparison of prot 10779 * to ~PROT_WRITE to determine read access is insufficient 10780 * since prot can be |= with PROT_USER, etc. 10781 */ 10782 10783 /* 10784 * Unless we're MAP_SHARED, no sense in adding os_mmap_write 10785 */ 10786 if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 10787 osp->os_mmap_write += btopr(len); 10788 if (maxprot & PROT_READ) 10789 osp->os_mmap_read += btopr(len); 10790 if (maxprot & PROT_EXEC) 10791 osp->os_mmap_read += btopr(len); 10792 /* 10793 * Ensure that os_mmap_read gets incremented, even if 10794 * maxprot were to look like PROT_NONE. 
10795 */ 10796 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 10797 !(maxprot & PROT_EXEC)) 10798 osp->os_mmap_read += btopr(len); 10799 osp->os_mapcnt += btopr(len); 10800 mutex_exit(&osp->os_sync_lock); 10801 open_stream_rele(osp, rp); 10802 } 10803 10804 out: 10805 /* 10806 * If we got an error, then undo our 10807 * incrementing of 'r_mapcnt'. 10808 */ 10809 10810 if (error) { 10811 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len)); 10812 ASSERT(rp->r_mapcnt >= 0); 10813 } 10814 return (error); 10815 } 10816 10817 /* ARGSUSED */ 10818 static int 10819 nfs4_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) 10820 { 10821 10822 return (VTOR4(vp1) == VTOR4(vp2)); 10823 } 10824 10825 /* ARGSUSED */ 10826 static int 10827 nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10828 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr, 10829 caller_context_t *ct) 10830 { 10831 int rc; 10832 u_offset_t start, end; 10833 rnode4_t *rp; 10834 int error = 0, intr = INTR4(vp); 10835 nfs4_error_t e; 10836 10837 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10838 return (EIO); 10839 10840 /* check for valid cmd parameter */ 10841 if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW) 10842 return (EINVAL); 10843 10844 /* Verify l_type. */ 10845 switch (bfp->l_type) { 10846 case F_RDLCK: 10847 if (cmd != F_GETLK && !(flag & FREAD)) 10848 return (EBADF); 10849 break; 10850 case F_WRLCK: 10851 if (cmd != F_GETLK && !(flag & FWRITE)) 10852 return (EBADF); 10853 break; 10854 case F_UNLCK: 10855 intr = 0; 10856 break; 10857 10858 default: 10859 return (EINVAL); 10860 } 10861 10862 /* check the validity of the lock range */ 10863 if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset)) 10864 return (rc); 10865 if (rc = flk_check_lock_data(start, end, MAXEND)) 10866 return (rc); 10867 10868 /* 10869 * If the filesystem is mounted using local locking, pass the 10870 * request off to the local locking code. 
10871 */ 10872 if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) { 10873 if (cmd == F_SETLK || cmd == F_SETLKW) { 10874 /* 10875 * For complete safety, we should be holding 10876 * r_lkserlock. However, we can't call 10877 * nfs4_safelock and then fs_frlock while 10878 * holding r_lkserlock, so just invoke 10879 * nfs4_safelock and expect that this will 10880 * catch enough of the cases. 10881 */ 10882 if (!nfs4_safelock(vp, bfp, cr)) 10883 return (EAGAIN); 10884 } 10885 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 10886 } 10887 10888 rp = VTOR4(vp); 10889 10890 /* 10891 * Check whether the given lock request can proceed, given the 10892 * current file mappings. 10893 */ 10894 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr)) 10895 return (EINTR); 10896 if (cmd == F_SETLK || cmd == F_SETLKW) { 10897 if (!nfs4_safelock(vp, bfp, cr)) { 10898 rc = EAGAIN; 10899 goto done; 10900 } 10901 } 10902 10903 /* 10904 * Flush the cache after waiting for async I/O to finish. For new 10905 * locks, this is so that the process gets the latest bits from the 10906 * server. For unlocks, this is so that other clients see the 10907 * latest bits once the file has been unlocked. If currently dirty 10908 * pages can't be flushed, then don't allow a lock to be set. But 10909 * allow unlocks to succeed, to avoid having orphan locks on the 10910 * server. 
10911 */ 10912 if (cmd != F_GETLK) { 10913 mutex_enter(&rp->r_statelock); 10914 while (rp->r_count > 0) { 10915 if (intr) { 10916 klwp_t *lwp = ttolwp(curthread); 10917 10918 if (lwp != NULL) 10919 lwp->lwp_nostop++; 10920 if (cv_wait_sig(&rp->r_cv, 10921 &rp->r_statelock) == 0) { 10922 if (lwp != NULL) 10923 lwp->lwp_nostop--; 10924 rc = EINTR; 10925 break; 10926 } 10927 if (lwp != NULL) 10928 lwp->lwp_nostop--; 10929 } else { 10930 cv_wait(&rp->r_cv, &rp->r_statelock); 10931 } 10932 } 10933 mutex_exit(&rp->r_statelock); 10934 if (rc != 0) 10935 goto done; 10936 error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct); 10937 if (error) { 10938 if (error == ENOSPC || error == EDQUOT) { 10939 mutex_enter(&rp->r_statelock); 10940 if (!rp->r_error) 10941 rp->r_error = error; 10942 mutex_exit(&rp->r_statelock); 10943 } 10944 if (bfp->l_type != F_UNLCK) { 10945 rc = ENOLCK; 10946 goto done; 10947 } 10948 } 10949 } 10950 10951 /* 10952 * Call the lock manager to do the real work of contacting 10953 * the server and obtaining the lock. 10954 */ 10955 nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset, 10956 cr, &e, NULL, NULL); 10957 rc = e.error; 10958 10959 if (rc == 0) 10960 nfs4_lockcompletion(vp, cmd); 10961 10962 done: 10963 nfs_rw_exit(&rp->r_lkserlock); 10964 10965 return (rc); 10966 } 10967 10968 /* 10969 * Free storage space associated with the specified vnode. The portion 10970 * to be freed is specified by bfp->l_start and bfp->l_len (already 10971 * normalized to a "whence" of 0). 10972 * 10973 * This is an experimental facility whose continued existence is not 10974 * guaranteed. Currently, we only support the special case 10975 * of l_len == 0, meaning free to end of file. 
10976 */ 10977 /* ARGSUSED */ 10978 static int 10979 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10980 offset_t offset, cred_t *cr, caller_context_t *ct) 10981 { 10982 int error; 10983 10984 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10985 return (EIO); 10986 ASSERT(vp->v_type == VREG); 10987 if (cmd != F_FREESP) 10988 return (EINVAL); 10989 10990 error = convoff(vp, bfp, 0, offset); 10991 if (!error) { 10992 ASSERT(bfp->l_start >= 0); 10993 if (bfp->l_len == 0) { 10994 struct vattr va; 10995 10996 va.va_mask = AT_SIZE; 10997 va.va_size = bfp->l_start; 10998 error = nfs4setattr(vp, &va, 0, cr, NULL); 10999 11000 if (error == 0 && bfp->l_start == 0) 11001 vnevent_truncate(vp, ct); 11002 } else 11003 error = EINVAL; 11004 } 11005 11006 return (error); 11007 } 11008 11009 /* ARGSUSED */ 11010 int 11011 nfs4_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) 11012 { 11013 rnode4_t *rp; 11014 rp = VTOR4(vp); 11015 11016 if (vp->v_type == VREG && IS_SHADOW(vp, rp)) { 11017 vp = RTOV4(rp); 11018 } 11019 *vpp = vp; 11020 return (0); 11021 } 11022 11023 /* 11024 * Setup and add an address space callback to do the work of the delmap call. 11025 * The callback will (and must be) deleted in the actual callback function. 11026 * 11027 * This is done in order to take care of the problem that we have with holding 11028 * the address space's a_lock for a long period of time (e.g. if the NFS server 11029 * is down). Callbacks will be executed in the address space code while the 11030 * a_lock is not held. Holding the address space's a_lock causes things such 11031 * as ps and fork to hang because they are trying to acquire this lock as well. 
11032 */ 11033 /* ARGSUSED */ 11034 static int 11035 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 11036 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 11037 caller_context_t *ct) 11038 { 11039 int caller_found; 11040 int error; 11041 rnode4_t *rp; 11042 nfs4_delmap_args_t *dmapp; 11043 nfs4_delmapcall_t *delmap_call; 11044 11045 if (vp->v_flag & VNOMAP) 11046 return (ENOSYS); 11047 11048 /* 11049 * A process may not change zones if it has NFS pages mmap'ed 11050 * in, so we can't legitimately get here from the wrong zone. 11051 */ 11052 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11053 11054 rp = VTOR4(vp); 11055 11056 /* 11057 * The way that the address space of this process deletes its mapping 11058 * of this file is via the following call chains: 11059 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 11060 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 11061 * 11062 * With the use of address space callbacks we are allowed to drop the 11063 * address space lock, a_lock, while executing the NFS operations that 11064 * need to go over the wire. Returning EAGAIN to the caller of this 11065 * function is what drives the execution of the callback that we add 11066 * below. The callback will be executed by the address space code 11067 * after dropping the a_lock. When the callback is finished, since 11068 * we dropped the a_lock, it must be re-acquired and segvn_unmap() 11069 * is called again on the same segment to finish the rest of the work 11070 * that needs to happen during unmapping. 11071 * 11072 * This action of calling back into the segment driver causes 11073 * nfs4_delmap() to get called again, but since the callback was 11074 * already executed at this point, it already did the work and there 11075 * is nothing left for us to do. 
11076 * 11077 * To Summarize: 11078 * - The first time nfs4_delmap is called by the current thread is when 11079 * we add the caller associated with this delmap to the delmap caller 11080 * list, add the callback, and return EAGAIN. 11081 * - The second time in this call chain when nfs4_delmap is called we 11082 * will find this caller in the delmap caller list and realize there 11083 * is no more work to do thus removing this caller from the list and 11084 * returning the error that was set in the callback execution. 11085 */ 11086 caller_found = nfs4_find_and_delete_delmapcall(rp, &error); 11087 if (caller_found) { 11088 /* 11089 * 'error' is from the actual delmap operations. To avoid 11090 * hangs, we need to handle the return of EAGAIN differently 11091 * since this is what drives the callback execution. 11092 * In this case, we don't want to return EAGAIN and do the 11093 * callback execution because there are none to execute. 11094 */ 11095 if (error == EAGAIN) 11096 return (0); 11097 else 11098 return (error); 11099 } 11100 11101 /* current caller was not in the list */ 11102 delmap_call = nfs4_init_delmapcall(); 11103 11104 mutex_enter(&rp->r_statelock); 11105 list_insert_tail(&rp->r_indelmap, delmap_call); 11106 mutex_exit(&rp->r_statelock); 11107 11108 dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP); 11109 11110 dmapp->vp = vp; 11111 dmapp->off = off; 11112 dmapp->addr = addr; 11113 dmapp->len = len; 11114 dmapp->prot = prot; 11115 dmapp->maxprot = maxprot; 11116 dmapp->flags = flags; 11117 dmapp->cr = cr; 11118 dmapp->caller = delmap_call; 11119 11120 error = as_add_callback(as, nfs4_delmap_callback, dmapp, 11121 AS_UNMAP_EVENT, addr, len, KM_SLEEP); 11122 11123 return (error ? 
error : EAGAIN); 11124 } 11125 11126 static nfs4_delmapcall_t * 11127 nfs4_init_delmapcall() 11128 { 11129 nfs4_delmapcall_t *delmap_call; 11130 11131 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP); 11132 delmap_call->call_id = curthread; 11133 delmap_call->error = 0; 11134 11135 return (delmap_call); 11136 } 11137 11138 static void 11139 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call) 11140 { 11141 kmem_free(delmap_call, sizeof (nfs4_delmapcall_t)); 11142 } 11143 11144 /* 11145 * Searches for the current delmap caller (based on curthread) in the list of 11146 * callers. If it is found, we remove it and free the delmap caller. 11147 * Returns: 11148 * 0 if the caller wasn't found 11149 * 1 if the caller was found, removed and freed. *errp will be set 11150 * to what the result of the delmap was. 11151 */ 11152 static int 11153 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp) 11154 { 11155 nfs4_delmapcall_t *delmap_call; 11156 11157 /* 11158 * If the list doesn't exist yet, we create it and return 11159 * that the caller wasn't found. No list = no callers. 
11160 */ 11161 mutex_enter(&rp->r_statelock); 11162 if (!(rp->r_flags & R4DELMAPLIST)) { 11163 /* The list does not exist */ 11164 list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t), 11165 offsetof(nfs4_delmapcall_t, call_node)); 11166 rp->r_flags |= R4DELMAPLIST; 11167 mutex_exit(&rp->r_statelock); 11168 return (0); 11169 } else { 11170 /* The list exists so search it */ 11171 for (delmap_call = list_head(&rp->r_indelmap); 11172 delmap_call != NULL; 11173 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 11174 if (delmap_call->call_id == curthread) { 11175 /* current caller is in the list */ 11176 *errp = delmap_call->error; 11177 list_remove(&rp->r_indelmap, delmap_call); 11178 mutex_exit(&rp->r_statelock); 11179 nfs4_free_delmapcall(delmap_call); 11180 return (1); 11181 } 11182 } 11183 } 11184 mutex_exit(&rp->r_statelock); 11185 return (0); 11186 } 11187 11188 /* 11189 * Remove some pages from an mmap'd vnode. Just update the 11190 * count of pages. If doing close-to-open, then flush and 11191 * commit all of the pages associated with this file. 11192 * Otherwise, start an asynchronous page flush to write out 11193 * any dirty pages. This will also associate a credential 11194 * with the rnode which can be used to write the pages. 11195 */ 11196 /* ARGSUSED */ 11197 static void 11198 nfs4_delmap_callback(struct as *as, void *arg, uint_t event) 11199 { 11200 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 11201 rnode4_t *rp; 11202 mntinfo4_t *mi; 11203 nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg; 11204 11205 rp = VTOR4(dmapp->vp); 11206 mi = VTOMI4(dmapp->vp); 11207 11208 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len)); 11209 ASSERT(rp->r_mapcnt >= 0); 11210 11211 /* 11212 * Initiate a page flush and potential commit if there are 11213 * pages, the file system was not mounted readonly, the segment 11214 * was mapped shared, and the pages themselves were writeable. 
11215 */ 11216 if (nfs4_has_pages(dmapp->vp) && 11217 !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) && 11218 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) { 11219 mutex_enter(&rp->r_statelock); 11220 rp->r_flags |= R4DIRTY; 11221 mutex_exit(&rp->r_statelock); 11222 e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off, 11223 dmapp->len, dmapp->cr); 11224 if (!e.error) { 11225 mutex_enter(&rp->r_statelock); 11226 e.error = rp->r_error; 11227 rp->r_error = 0; 11228 mutex_exit(&rp->r_statelock); 11229 } 11230 } else 11231 e.error = 0; 11232 11233 if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) 11234 (void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len, 11235 B_INVAL, dmapp->cr, NULL); 11236 11237 if (e.error) { 11238 e.stat = puterrno4(e.error); 11239 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 11240 OP_COMMIT, FALSE, NULL, 0, dmapp->vp); 11241 dmapp->caller->error = e.error; 11242 } 11243 11244 /* Check to see if we need to close the file */ 11245 11246 if (dmapp->vp->v_type == VREG) { 11247 nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e, 11248 CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags); 11249 11250 if (e.error != 0 || e.stat != NFS4_OK) { 11251 /* 11252 * Since it is possible that e.error == 0 and 11253 * e.stat != NFS4_OK (and vice versa), 11254 * we do the proper checking in order to get both 11255 * e.error and e.stat reporting the correct info. 
11256 */ 11257 if (e.stat == NFS4_OK) 11258 e.stat = puterrno4(e.error); 11259 if (e.error == 0) 11260 e.error = geterrno4(e.stat); 11261 11262 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 11263 OP_CLOSE, FALSE, NULL, 0, dmapp->vp); 11264 dmapp->caller->error = e.error; 11265 } 11266 } 11267 11268 (void) as_delete_callback(as, arg); 11269 kmem_free(dmapp, sizeof (nfs4_delmap_args_t)); 11270 } 11271 11272 11273 static uint_t 11274 fattr4_maxfilesize_to_bits(uint64_t ll) 11275 { 11276 uint_t l = 1; 11277 11278 if (ll == 0) { 11279 return (0); 11280 } 11281 11282 if (ll & 0xffffffff00000000) { 11283 l += 32; ll >>= 32; 11284 } 11285 if (ll & 0xffff0000) { 11286 l += 16; ll >>= 16; 11287 } 11288 if (ll & 0xff00) { 11289 l += 8; ll >>= 8; 11290 } 11291 if (ll & 0xf0) { 11292 l += 4; ll >>= 4; 11293 } 11294 if (ll & 0xc) { 11295 l += 2; ll >>= 2; 11296 } 11297 if (ll & 0x2) { 11298 l += 1; 11299 } 11300 return (l); 11301 } 11302 11303 static int 11304 nfs4_have_xattrs(vnode_t *vp, ulong_t *valp, cred_t *cr) 11305 { 11306 vnode_t *avp = NULL; 11307 int error; 11308 11309 if ((error = nfs4lookup_xattr(vp, "", &avp, 11310 LOOKUP_XATTR, cr)) == 0) 11311 error = do_xattr_exists_check(avp, valp, cr); 11312 if (avp) 11313 VN_RELE(avp); 11314 11315 return (error); 11316 } 11317 11318 /* ARGSUSED */ 11319 int 11320 nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 11321 caller_context_t *ct) 11322 { 11323 int error; 11324 hrtime_t t; 11325 rnode4_t *rp; 11326 nfs4_ga_res_t gar; 11327 nfs4_ga_ext_res_t ger; 11328 11329 gar.n4g_ext_res = &ger; 11330 11331 if (nfs_zone() != VTOMI4(vp)->mi_zone) 11332 return (EIO); 11333 if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) { 11334 *valp = MAXPATHLEN; 11335 return (0); 11336 } 11337 if (cmd == _PC_ACL_ENABLED) { 11338 *valp = _ACL_ACE_ENABLED; 11339 return (0); 11340 } 11341 11342 rp = VTOR4(vp); 11343 if (cmd == _PC_XATTR_EXISTS) { 11344 /* 11345 * The existence of the xattr directory is not sufficient 11346 * for 
determining whether generic user attributes exists. 11347 * The attribute directory could only be a transient directory 11348 * used for Solaris sysattr support. Do a small readdir 11349 * to verify if the only entries are sysattrs or not. 11350 * 11351 * pc4_xattr_valid can be only be trusted when r_xattr_dir 11352 * is NULL. Once the xadir vp exists, we can create xattrs, 11353 * and we don't have any way to update the "base" object's 11354 * pc4_xattr_exists from the xattr or xadir. Maybe FEM 11355 * could help out. 11356 */ 11357 if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid && 11358 rp->r_xattr_dir == NULL) { 11359 return (nfs4_have_xattrs(vp, valp, cr)); 11360 } 11361 } else { /* OLD CODE */ 11362 if (ATTRCACHE4_VALID(vp)) { 11363 mutex_enter(&rp->r_statelock); 11364 if (rp->r_pathconf.pc4_cache_valid) { 11365 error = 0; 11366 switch (cmd) { 11367 case _PC_FILESIZEBITS: 11368 *valp = 11369 rp->r_pathconf.pc4_filesizebits; 11370 break; 11371 case _PC_LINK_MAX: 11372 *valp = 11373 rp->r_pathconf.pc4_link_max; 11374 break; 11375 case _PC_NAME_MAX: 11376 *valp = 11377 rp->r_pathconf.pc4_name_max; 11378 break; 11379 case _PC_CHOWN_RESTRICTED: 11380 *valp = 11381 rp->r_pathconf.pc4_chown_restricted; 11382 break; 11383 case _PC_NO_TRUNC: 11384 *valp = 11385 rp->r_pathconf.pc4_no_trunc; 11386 break; 11387 default: 11388 error = EINVAL; 11389 break; 11390 } 11391 mutex_exit(&rp->r_statelock); 11392 #ifdef DEBUG 11393 nfs4_pathconf_cache_hits++; 11394 #endif 11395 return (error); 11396 } 11397 mutex_exit(&rp->r_statelock); 11398 } 11399 } 11400 #ifdef DEBUG 11401 nfs4_pathconf_cache_misses++; 11402 #endif 11403 11404 t = gethrtime(); 11405 11406 error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr); 11407 11408 if (error) { 11409 mutex_enter(&rp->r_statelock); 11410 rp->r_pathconf.pc4_cache_valid = FALSE; 11411 rp->r_pathconf.pc4_xattr_valid = FALSE; 11412 mutex_exit(&rp->r_statelock); 11413 return (error); 11414 } 11415 11416 /* interpret 
the max filesize */ 11417 gar.n4g_ext_res->n4g_pc4.pc4_filesizebits = 11418 fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize); 11419 11420 /* Store the attributes we just received */ 11421 nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL); 11422 11423 switch (cmd) { 11424 case _PC_FILESIZEBITS: 11425 *valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits; 11426 break; 11427 case _PC_LINK_MAX: 11428 *valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max; 11429 break; 11430 case _PC_NAME_MAX: 11431 *valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max; 11432 break; 11433 case _PC_CHOWN_RESTRICTED: 11434 *valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted; 11435 break; 11436 case _PC_NO_TRUNC: 11437 *valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc; 11438 break; 11439 case _PC_XATTR_EXISTS: 11440 if (gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists) { 11441 if (error = nfs4_have_xattrs(vp, valp, cr)) 11442 return (error); 11443 } 11444 break; 11445 default: 11446 return (EINVAL); 11447 } 11448 11449 return (0); 11450 } 11451 11452 /* 11453 * Called by async thread to do synchronous pageio. Do the i/o, wait 11454 * for it to complete, and cleanup the page list when done. 11455 */ 11456 static int 11457 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11458 int flags, cred_t *cr) 11459 { 11460 int error; 11461 11462 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11463 11464 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11465 if (flags & B_READ) 11466 pvn_read_done(pp, (error ? B_ERROR : 0) | flags); 11467 else 11468 pvn_write_done(pp, (error ? 
B_ERROR : 0) | flags); 11469 return (error); 11470 } 11471 11472 /* ARGSUSED */ 11473 static int 11474 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11475 int flags, cred_t *cr, caller_context_t *ct) 11476 { 11477 int error; 11478 rnode4_t *rp; 11479 11480 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 11481 return (EIO); 11482 11483 if (pp == NULL) 11484 return (EINVAL); 11485 11486 rp = VTOR4(vp); 11487 mutex_enter(&rp->r_statelock); 11488 rp->r_count++; 11489 mutex_exit(&rp->r_statelock); 11490 11491 if (flags & B_ASYNC) { 11492 error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr, 11493 nfs4_sync_pageio); 11494 } else 11495 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11496 mutex_enter(&rp->r_statelock); 11497 rp->r_count--; 11498 cv_broadcast(&rp->r_cv); 11499 mutex_exit(&rp->r_statelock); 11500 return (error); 11501 } 11502 11503 /* ARGSUSED */ 11504 static void 11505 nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr, 11506 caller_context_t *ct) 11507 { 11508 int error; 11509 rnode4_t *rp; 11510 page_t *plist; 11511 page_t *pptr; 11512 offset3 offset; 11513 count3 len; 11514 k_sigset_t smask; 11515 11516 /* 11517 * We should get called with fl equal to either B_FREE or 11518 * B_INVAL. Any other value is illegal. 11519 * 11520 * The page that we are either supposed to free or destroy 11521 * should be exclusive locked and its io lock should not 11522 * be held. 11523 */ 11524 ASSERT(fl == B_FREE || fl == B_INVAL); 11525 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 11526 11527 rp = VTOR4(vp); 11528 11529 /* 11530 * If the page doesn't need to be committed or we shouldn't 11531 * even bother attempting to commit it, then just make sure 11532 * that the p_fsdata byte is clear and then either free or 11533 * destroy the page as appropriate. 
11534 */ 11535 if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) { 11536 pp->p_fsdata = C_NOCOMMIT; 11537 if (fl == B_FREE) 11538 page_free(pp, dn); 11539 else 11540 page_destroy(pp, dn); 11541 return; 11542 } 11543 11544 /* 11545 * If there is a page invalidation operation going on, then 11546 * if this is one of the pages being destroyed, then just 11547 * clear the p_fsdata byte and then either free or destroy 11548 * the page as appropriate. 11549 */ 11550 mutex_enter(&rp->r_statelock); 11551 if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) { 11552 mutex_exit(&rp->r_statelock); 11553 pp->p_fsdata = C_NOCOMMIT; 11554 if (fl == B_FREE) 11555 page_free(pp, dn); 11556 else 11557 page_destroy(pp, dn); 11558 return; 11559 } 11560 11561 /* 11562 * If we are freeing this page and someone else is already 11563 * waiting to do a commit, then just unlock the page and 11564 * return. That other thread will take care of commiting 11565 * this page. The page can be freed sometime after the 11566 * commit has finished. Otherwise, if the page is marked 11567 * as delay commit, then we may be getting called from 11568 * pvn_write_done, one page at a time. This could result 11569 * in one commit per page, so we end up doing lots of small 11570 * commits instead of fewer larger commits. This is bad, 11571 * we want do as few commits as possible. 11572 */ 11573 if (fl == B_FREE) { 11574 if (rp->r_flags & R4COMMITWAIT) { 11575 page_unlock(pp); 11576 mutex_exit(&rp->r_statelock); 11577 return; 11578 } 11579 if (pp->p_fsdata == C_DELAYCOMMIT) { 11580 pp->p_fsdata = C_COMMIT; 11581 page_unlock(pp); 11582 mutex_exit(&rp->r_statelock); 11583 return; 11584 } 11585 } 11586 11587 /* 11588 * Check to see if there is a signal which would prevent an 11589 * attempt to commit the pages from being successful. If so, 11590 * then don't bother with all of the work to gather pages and 11591 * generate the unsuccessful RPC. 
Just return from here and 11592 * let the page be committed at some later time. 11593 */ 11594 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT); 11595 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) { 11596 sigunintr(&smask); 11597 page_unlock(pp); 11598 mutex_exit(&rp->r_statelock); 11599 return; 11600 } 11601 sigunintr(&smask); 11602 11603 /* 11604 * We are starting to need to commit pages, so let's try 11605 * to commit as many as possible at once to reduce the 11606 * overhead. 11607 * 11608 * Set the `commit inprogress' state bit. We must 11609 * first wait until any current one finishes. Then 11610 * we initialize the c_pages list with this page. 11611 */ 11612 while (rp->r_flags & R4COMMIT) { 11613 rp->r_flags |= R4COMMITWAIT; 11614 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 11615 rp->r_flags &= ~R4COMMITWAIT; 11616 } 11617 rp->r_flags |= R4COMMIT; 11618 mutex_exit(&rp->r_statelock); 11619 ASSERT(rp->r_commit.c_pages == NULL); 11620 rp->r_commit.c_pages = pp; 11621 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11622 rp->r_commit.c_commlen = PAGESIZE; 11623 11624 /* 11625 * Gather together all other pages which can be committed. 11626 * They will all be chained off r_commit.c_pages. 11627 */ 11628 nfs4_get_commit(vp); 11629 11630 /* 11631 * Clear the `commit inprogress' status and disconnect 11632 * the list of pages to be committed from the rnode. 11633 * At this same time, we also save the starting offset 11634 * and length of data to be committed on the server. 
11635 */ 11636 plist = rp->r_commit.c_pages; 11637 rp->r_commit.c_pages = NULL; 11638 offset = rp->r_commit.c_commbase; 11639 len = rp->r_commit.c_commlen; 11640 mutex_enter(&rp->r_statelock); 11641 rp->r_flags &= ~R4COMMIT; 11642 cv_broadcast(&rp->r_commit.c_cv); 11643 mutex_exit(&rp->r_statelock); 11644 11645 if (curproc == proc_pageout || curproc == proc_fsflush || 11646 nfs_zone() != VTOMI4(vp)->mi_zone) { 11647 nfs4_async_commit(vp, plist, offset, len, 11648 cr, do_nfs4_async_commit); 11649 return; 11650 } 11651 11652 /* 11653 * Actually generate the COMMIT op over the wire operation. 11654 */ 11655 error = nfs4_commit(vp, (offset4)offset, (count4)len, cr); 11656 11657 /* 11658 * If we got an error during the commit, just unlock all 11659 * of the pages. The pages will get retransmitted to the 11660 * server during a putpage operation. 11661 */ 11662 if (error) { 11663 while (plist != NULL) { 11664 pptr = plist; 11665 page_sub(&plist, pptr); 11666 page_unlock(pptr); 11667 } 11668 return; 11669 } 11670 11671 /* 11672 * We've tried as hard as we can to commit the data to stable 11673 * storage on the server. We just unlock the rest of the pages 11674 * and clear the commit required state. They will be put 11675 * onto the tail of the cachelist if they are nolonger 11676 * mapped. 11677 */ 11678 while (plist != pp) { 11679 pptr = plist; 11680 page_sub(&plist, pptr); 11681 pptr->p_fsdata = C_NOCOMMIT; 11682 page_unlock(pptr); 11683 } 11684 11685 /* 11686 * It is possible that nfs4_commit didn't return error but 11687 * some other thread has modified the page we are going 11688 * to free/destroy. 11689 * In this case we need to rewrite the page. Do an explicit check 11690 * before attempting to free/destroy the page. If modified, needs to 11691 * be rewritten so unlock the page and return. 
11692 */ 11693 if (hat_ismod(pp)) { 11694 pp->p_fsdata = C_NOCOMMIT; 11695 page_unlock(pp); 11696 return; 11697 } 11698 11699 /* 11700 * Now, as appropriate, either free or destroy the page 11701 * that we were called with. 11702 */ 11703 pp->p_fsdata = C_NOCOMMIT; 11704 if (fl == B_FREE) 11705 page_free(pp, dn); 11706 else 11707 page_destroy(pp, dn); 11708 } 11709 11710 /* 11711 * Commit requires that the current fh be the file written to. 11712 * The compound op structure is: 11713 * PUTFH(file), COMMIT 11714 */ 11715 static int 11716 nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr) 11717 { 11718 COMPOUND4args_clnt args; 11719 COMPOUND4res_clnt res; 11720 COMMIT4res *cm_res; 11721 nfs_argop4 argop[2]; 11722 nfs_resop4 *resop; 11723 int doqueue; 11724 mntinfo4_t *mi; 11725 rnode4_t *rp; 11726 cred_t *cred_otw = NULL; 11727 bool_t needrecov = FALSE; 11728 nfs4_recov_state_t recov_state; 11729 nfs4_open_stream_t *osp = NULL; 11730 bool_t first_time = TRUE; /* first time getting OTW cred */ 11731 bool_t last_time = FALSE; /* last time getting OTW cred */ 11732 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 11733 11734 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11735 11736 rp = VTOR4(vp); 11737 11738 mi = VTOMI4(vp); 11739 recov_state.rs_flags = 0; 11740 recov_state.rs_num_retry_despite_err = 0; 11741 get_commit_cred: 11742 /* 11743 * Releases the osp, if a valid open stream is provided. 11744 * Puts a hold on the cred_otw and the new osp (if found). 
11745 */ 11746 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 11747 &first_time, &last_time); 11748 args.ctag = TAG_COMMIT; 11749 recov_retry: 11750 /* 11751 * Commit ops: putfh file; commit 11752 */ 11753 args.array_len = 2; 11754 args.array = argop; 11755 11756 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11757 &recov_state, NULL); 11758 if (e.error) { 11759 crfree(cred_otw); 11760 if (osp != NULL) 11761 open_stream_rele(osp, rp); 11762 return (e.error); 11763 } 11764 11765 /* putfh directory */ 11766 argop[0].argop = OP_CPUTFH; 11767 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 11768 11769 /* commit */ 11770 argop[1].argop = OP_COMMIT; 11771 argop[1].nfs_argop4_u.opcommit.offset = offset; 11772 argop[1].nfs_argop4_u.opcommit.count = count; 11773 11774 doqueue = 1; 11775 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e); 11776 11777 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 11778 if (!needrecov && e.error) { 11779 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, 11780 needrecov); 11781 crfree(cred_otw); 11782 if (e.error == EACCES && last_time == FALSE) 11783 goto get_commit_cred; 11784 if (osp != NULL) 11785 open_stream_rele(osp, rp); 11786 return (e.error); 11787 } 11788 11789 if (needrecov) { 11790 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 11791 NULL, OP_COMMIT, NULL, NULL, NULL) == FALSE) { 11792 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11793 &recov_state, needrecov); 11794 if (!e.error) 11795 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11796 goto recov_retry; 11797 } 11798 if (e.error) { 11799 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11800 &recov_state, needrecov); 11801 crfree(cred_otw); 11802 if (osp != NULL) 11803 open_stream_rele(osp, rp); 11804 return (e.error); 11805 } 11806 /* fall through for res.status case */ 11807 } 11808 11809 if (res.status) { 11810 e.error = geterrno4(res.status); 11811 if (e.error == EACCES && last_time == FALSE) { 11812 crfree(cred_otw); 11813 
nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11814 &recov_state, needrecov); 11815 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11816 goto get_commit_cred; 11817 } 11818 /* 11819 * Can't do a nfs4_purge_stale_fh here because this 11820 * can cause a deadlock. nfs4_commit can 11821 * be called from nfs4_dispose which can be called 11822 * indirectly via pvn_vplist_dirty. nfs4_purge_stale_fh 11823 * can call back to pvn_vplist_dirty. 11824 */ 11825 if (e.error == ESTALE) { 11826 mutex_enter(&rp->r_statelock); 11827 rp->r_flags |= R4STALE; 11828 if (!rp->r_error) 11829 rp->r_error = e.error; 11830 mutex_exit(&rp->r_statelock); 11831 PURGE_ATTRCACHE4(vp); 11832 } else { 11833 mutex_enter(&rp->r_statelock); 11834 if (!rp->r_error) 11835 rp->r_error = e.error; 11836 mutex_exit(&rp->r_statelock); 11837 } 11838 } else { 11839 ASSERT(rp->r_flags & R4HAVEVERF); 11840 resop = &res.array[1]; /* commit res */ 11841 cm_res = &resop->nfs_resop4_u.opcommit; 11842 mutex_enter(&rp->r_statelock); 11843 if (cm_res->writeverf == rp->r_writeverf) { 11844 mutex_exit(&rp->r_statelock); 11845 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11846 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 11847 &recov_state, needrecov); 11848 crfree(cred_otw); 11849 if (osp != NULL) 11850 open_stream_rele(osp, rp); 11851 return (0); 11852 } 11853 nfs4_set_mod(vp); 11854 rp->r_writeverf = cm_res->writeverf; 11855 mutex_exit(&rp->r_statelock); 11856 e.error = NFS_VERF_MISMATCH; 11857 } 11858 11859 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 11860 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov); 11861 crfree(cred_otw); 11862 if (osp != NULL) 11863 open_stream_rele(osp, rp); 11864 11865 return (e.error); 11866 } 11867 11868 static void 11869 nfs4_set_mod(vnode_t *vp) 11870 { 11871 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11872 11873 /* make sure we're looking at the master vnode, not a shadow */ 11874 pvn_vplist_setdirty(RTOV4(VTOR4(vp)), nfs_setmod_check); 11875 } 11876 
11877 /* 11878 * This function is used to gather a page list of the pages which 11879 * can be committed on the server. 11880 * 11881 * The calling thread must have set R4COMMIT. This bit is used to 11882 * serialize access to the commit structure in the rnode. As long 11883 * as the thread has set R4COMMIT, then it can manipulate the commit 11884 * structure without requiring any other locks. 11885 * 11886 * When this function is called from nfs4_dispose() the page passed 11887 * into nfs4_dispose() will be SE_EXCL locked, and so this function 11888 * will skip it. This is not a problem since we initially add the 11889 * page to the r_commit page list. 11890 * 11891 */ 11892 static void 11893 nfs4_get_commit(vnode_t *vp) 11894 { 11895 rnode4_t *rp; 11896 page_t *pp; 11897 kmutex_t *vphm; 11898 11899 rp = VTOR4(vp); 11900 11901 ASSERT(rp->r_flags & R4COMMIT); 11902 11903 /* make sure we're looking at the master vnode, not a shadow */ 11904 11905 if (IS_SHADOW(vp, rp)) 11906 vp = RTOV4(rp); 11907 11908 vphm = page_vnode_mutex(vp); 11909 mutex_enter(vphm); 11910 11911 /* 11912 * If there are no pages associated with this vnode, then 11913 * just return. 11914 */ 11915 if ((pp = vp->v_pages) == NULL) { 11916 mutex_exit(vphm); 11917 return; 11918 } 11919 11920 /* 11921 * Step through all of the pages associated with this vnode 11922 * looking for pages which need to be committed. 11923 */ 11924 do { 11925 /* Skip marker pages. */ 11926 if (pp->p_hash == PVN_VPLIST_HASH_TAG) 11927 continue; 11928 11929 /* 11930 * First short-cut everything (without the page_lock) 11931 * and see if this page does not need to be committed 11932 * or is modified if so then we'll just skip it. 11933 */ 11934 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) 11935 continue; 11936 11937 /* 11938 * Attempt to lock the page. If we can't, then 11939 * someone else is messing with it or we have been 11940 * called from nfs4_dispose and this is the page that 11941 * nfs4_dispose was called with.. 
anyway just skip it. 11942 */ 11943 if (!page_trylock(pp, SE_EXCL)) 11944 continue; 11945 11946 /* 11947 * Lets check again now that we have the page lock. 11948 */ 11949 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 11950 page_unlock(pp); 11951 continue; 11952 } 11953 11954 /* this had better not be a free page */ 11955 ASSERT(PP_ISFREE(pp) == 0); 11956 11957 /* 11958 * The page needs to be committed and we locked it. 11959 * Update the base and length parameters and add it 11960 * to r_pages. 11961 */ 11962 if (rp->r_commit.c_pages == NULL) { 11963 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11964 rp->r_commit.c_commlen = PAGESIZE; 11965 } else if (pp->p_offset < rp->r_commit.c_commbase) { 11966 rp->r_commit.c_commlen = rp->r_commit.c_commbase - 11967 (offset3)pp->p_offset + rp->r_commit.c_commlen; 11968 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11969 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen) 11970 <= pp->p_offset) { 11971 rp->r_commit.c_commlen = (offset3)pp->p_offset - 11972 rp->r_commit.c_commbase + PAGESIZE; 11973 } 11974 page_add(&rp->r_commit.c_pages, pp); 11975 } while ((pp = pp->p_vpnext) != vp->v_pages); 11976 11977 mutex_exit(vphm); 11978 } 11979 11980 /* 11981 * This routine is used to gather together a page list of the pages 11982 * which are to be committed on the server. This routine must not 11983 * be called if the calling thread holds any locked pages. 11984 * 11985 * The calling thread must have set R4COMMIT. This bit is used to 11986 * serialize access to the commit structure in the rnode. As long 11987 * as the thread has set R4COMMIT, then it can manipulate the commit 11988 * structure without requiring any other locks. 
11989 */ 11990 static void 11991 nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len) 11992 { 11993 11994 rnode4_t *rp; 11995 page_t *pp; 11996 u_offset_t end; 11997 u_offset_t off; 11998 ASSERT(len != 0); 11999 rp = VTOR4(vp); 12000 ASSERT(rp->r_flags & R4COMMIT); 12001 12002 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12003 12004 /* make sure we're looking at the master vnode, not a shadow */ 12005 12006 if (IS_SHADOW(vp, rp)) 12007 vp = RTOV4(rp); 12008 12009 /* 12010 * If there are no pages associated with this vnode, then 12011 * just return. 12012 */ 12013 if ((pp = vp->v_pages) == NULL) 12014 return; 12015 /* 12016 * Calculate the ending offset. 12017 */ 12018 end = soff + len; 12019 for (off = soff; off < end; off += PAGESIZE) { 12020 /* 12021 * Lookup each page by vp, offset. 12022 */ 12023 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL) 12024 continue; 12025 /* 12026 * If this page does not need to be committed or is 12027 * modified, then just skip it. 12028 */ 12029 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 12030 page_unlock(pp); 12031 continue; 12032 } 12033 12034 ASSERT(PP_ISFREE(pp) == 0); 12035 /* 12036 * The page needs to be committed and we locked it. 12037 * Update the base and length parameters and add it 12038 * to r_pages. 12039 */ 12040 if (rp->r_commit.c_pages == NULL) { 12041 rp->r_commit.c_commbase = (offset3)pp->p_offset; 12042 rp->r_commit.c_commlen = PAGESIZE; 12043 } else { 12044 rp->r_commit.c_commlen = (offset3)pp->p_offset - 12045 rp->r_commit.c_commbase + PAGESIZE; 12046 } 12047 page_add(&rp->r_commit.c_pages, pp); 12048 } 12049 } 12050 12051 /* 12052 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap(). 12053 * Flushes and commits data to the server. 
12054 */ 12055 static int 12056 nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr) 12057 { 12058 int error; 12059 verifier4 write_verf; 12060 rnode4_t *rp = VTOR4(vp); 12061 12062 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12063 12064 /* 12065 * Flush the data portion of the file and then commit any 12066 * portions which need to be committed. This may need to 12067 * be done twice if the server has changed state since 12068 * data was last written. The data will need to be 12069 * rewritten to the server and then a new commit done. 12070 * 12071 * In fact, this may need to be done several times if the 12072 * server is having problems and crashing while we are 12073 * attempting to do this. 12074 */ 12075 12076 top: 12077 /* 12078 * Do a flush based on the poff and plen arguments. This 12079 * will synchronously write out any modified pages in the 12080 * range specified by (poff, plen). This starts all of the 12081 * i/o operations which will be waited for in the next 12082 * call to nfs4_putpage 12083 */ 12084 12085 mutex_enter(&rp->r_statelock); 12086 write_verf = rp->r_writeverf; 12087 mutex_exit(&rp->r_statelock); 12088 12089 error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr, NULL); 12090 if (error == EAGAIN) 12091 error = 0; 12092 12093 /* 12094 * Do a flush based on the poff and plen arguments. This 12095 * will synchronously write out any modified pages in the 12096 * range specified by (poff, plen) and wait until all of 12097 * the asynchronous i/o's in that range are done as well. 12098 */ 12099 if (!error) 12100 error = nfs4_putpage(vp, poff, plen, 0, cr, NULL); 12101 12102 if (error) 12103 return (error); 12104 12105 mutex_enter(&rp->r_statelock); 12106 if (rp->r_writeverf != write_verf) { 12107 mutex_exit(&rp->r_statelock); 12108 goto top; 12109 } 12110 mutex_exit(&rp->r_statelock); 12111 12112 /* 12113 * Now commit any pages which might need to be committed. 
12114 * If the error, NFS_VERF_MISMATCH, is returned, then 12115 * start over with the flush operation. 12116 */ 12117 error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT); 12118 12119 if (error == NFS_VERF_MISMATCH) 12120 goto top; 12121 12122 return (error); 12123 } 12124 12125 /* 12126 * nfs4_commit_vp() will wait for other pending commits and 12127 * will either commit the whole file or a range, plen dictates 12128 * if we commit whole file. a value of zero indicates the whole 12129 * file. Called from nfs4_putpage_commit() or nfs4_sync_putapage() 12130 */ 12131 static int 12132 nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, 12133 cred_t *cr, int wait_on_writes) 12134 { 12135 rnode4_t *rp; 12136 page_t *plist; 12137 offset3 offset; 12138 count3 len; 12139 12140 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12141 12142 rp = VTOR4(vp); 12143 12144 /* 12145 * before we gather commitable pages make 12146 * sure there are no outstanding async writes 12147 */ 12148 if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) { 12149 mutex_enter(&rp->r_statelock); 12150 while (rp->r_count > 0) { 12151 cv_wait(&rp->r_cv, &rp->r_statelock); 12152 } 12153 mutex_exit(&rp->r_statelock); 12154 } 12155 12156 /* 12157 * Set the `commit inprogress' state bit. We must 12158 * first wait until any current one finishes. 12159 */ 12160 mutex_enter(&rp->r_statelock); 12161 while (rp->r_flags & R4COMMIT) { 12162 rp->r_flags |= R4COMMITWAIT; 12163 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 12164 rp->r_flags &= ~R4COMMITWAIT; 12165 } 12166 rp->r_flags |= R4COMMIT; 12167 mutex_exit(&rp->r_statelock); 12168 12169 /* 12170 * Gather all of the pages which need to be 12171 * committed. 12172 */ 12173 if (plen == 0) 12174 nfs4_get_commit(vp); 12175 else 12176 nfs4_get_commit_range(vp, poff, plen); 12177 12178 /* 12179 * Clear the `commit inprogress' bit and disconnect the 12180 * page list which was gathered by nfs4_get_commit. 
12181 */ 12182 plist = rp->r_commit.c_pages; 12183 rp->r_commit.c_pages = NULL; 12184 offset = rp->r_commit.c_commbase; 12185 len = rp->r_commit.c_commlen; 12186 mutex_enter(&rp->r_statelock); 12187 rp->r_flags &= ~R4COMMIT; 12188 cv_broadcast(&rp->r_commit.c_cv); 12189 mutex_exit(&rp->r_statelock); 12190 12191 /* 12192 * If any pages need to be committed, commit them and 12193 * then unlock them so that they can be freed some 12194 * time later. 12195 */ 12196 if (plist == NULL) 12197 return (0); 12198 12199 /* 12200 * No error occurred during the flush portion 12201 * of this operation, so now attempt to commit 12202 * the data to stable storage on the server. 12203 * 12204 * This will unlock all of the pages on the list. 12205 */ 12206 return (nfs4_sync_commit(vp, plist, offset, len, cr)); 12207 } 12208 12209 static int 12210 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 12211 cred_t *cr) 12212 { 12213 int error; 12214 page_t *pp; 12215 12216 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12217 12218 error = nfs4_commit(vp, (offset4)offset, (count3)count, cr); 12219 12220 /* 12221 * If we got an error, then just unlock all of the pages 12222 * on the list. 12223 */ 12224 if (error) { 12225 while (plist != NULL) { 12226 pp = plist; 12227 page_sub(&plist, pp); 12228 page_unlock(pp); 12229 } 12230 return (error); 12231 } 12232 /* 12233 * We've tried as hard as we can to commit the data to stable 12234 * storage on the server. We just unlock the pages and clear 12235 * the commit required state. They will get freed later. 
12236 */ 12237 while (plist != NULL) { 12238 pp = plist; 12239 page_sub(&plist, pp); 12240 pp->p_fsdata = C_NOCOMMIT; 12241 page_unlock(pp); 12242 } 12243 12244 return (error); 12245 } 12246 12247 static void 12248 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 12249 cred_t *cr) 12250 { 12251 12252 (void) nfs4_sync_commit(vp, plist, offset, count, cr); 12253 } 12254 12255 /*ARGSUSED*/ 12256 static int 12257 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 12258 caller_context_t *ct) 12259 { 12260 int error = 0; 12261 mntinfo4_t *mi; 12262 vattr_t va; 12263 vsecattr_t nfsace4_vsap; 12264 12265 mi = VTOMI4(vp); 12266 if (nfs_zone() != mi->mi_zone) 12267 return (EIO); 12268 if (mi->mi_flags & MI4_ACL) { 12269 /* if we have a delegation, return it */ 12270 if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE) 12271 (void) nfs4delegreturn(VTOR4(vp), 12272 NFS4_DR_REOPEN|NFS4_DR_PUSH); 12273 12274 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, 12275 NFS4_ACL_SET); 12276 if (error) /* EINVAL */ 12277 return (error); 12278 12279 if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) { 12280 /* 12281 * These are aclent_t type entries. 12282 */ 12283 error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap, 12284 vp->v_type == VDIR, FALSE); 12285 if (error) 12286 return (error); 12287 } else { 12288 /* 12289 * These are ace_t type entries. 
12290 */ 12291 error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap, 12292 FALSE); 12293 if (error) 12294 return (error); 12295 } 12296 bzero(&va, sizeof (va)); 12297 error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap); 12298 vs_ace4_destroy(&nfsace4_vsap); 12299 return (error); 12300 } 12301 return (ENOSYS); 12302 } 12303 12304 /* ARGSUSED */ 12305 int 12306 nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 12307 caller_context_t *ct) 12308 { 12309 int error; 12310 mntinfo4_t *mi; 12311 nfs4_ga_res_t gar; 12312 rnode4_t *rp = VTOR4(vp); 12313 12314 mi = VTOMI4(vp); 12315 if (nfs_zone() != mi->mi_zone) 12316 return (EIO); 12317 12318 bzero(&gar, sizeof (gar)); 12319 gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask; 12320 12321 /* 12322 * vsecattr->vsa_mask holds the original acl request mask. 12323 * This is needed when determining what to return. 12324 * (See: nfs4_create_getsecattr_return()) 12325 */ 12326 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET); 12327 if (error) /* EINVAL */ 12328 return (error); 12329 12330 /* 12331 * If this is a referral stub, don't try to go OTW for an ACL 12332 */ 12333 if (RP_ISSTUB_REFERRAL(VTOR4(vp))) 12334 return (fs_fab_acl(vp, vsecattr, flag, cr, ct)); 12335 12336 if (mi->mi_flags & MI4_ACL) { 12337 /* 12338 * Check if the data is cached and the cache is valid. If it 12339 * is we don't go over the wire. 12340 */ 12341 if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) { 12342 mutex_enter(&rp->r_statelock); 12343 if (rp->r_secattr != NULL) { 12344 error = nfs4_create_getsecattr_return( 12345 rp->r_secattr, vsecattr, rp->r_attr.va_uid, 12346 rp->r_attr.va_gid, 12347 vp->v_type == VDIR); 12348 if (!error) { /* error == 0 - Success! 
*/ 12349 mutex_exit(&rp->r_statelock); 12350 return (error); 12351 } 12352 } 12353 mutex_exit(&rp->r_statelock); 12354 } 12355 12356 /* 12357 * The getattr otw call will always get both the acl, in 12358 * the form of a list of nfsace4's, and the number of acl 12359 * entries; independent of the value of gar.n4g_va.va_mask. 12360 */ 12361 error = nfs4_getattr_otw(vp, &gar, cr, 1); 12362 if (error) { 12363 vs_ace4_destroy(&gar.n4g_vsa); 12364 if (error == ENOTSUP || error == EOPNOTSUPP) 12365 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12366 return (error); 12367 } 12368 12369 if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) { 12370 /* 12371 * No error was returned, but according to the response 12372 * bitmap, neither was an acl. 12373 */ 12374 vs_ace4_destroy(&gar.n4g_vsa); 12375 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12376 return (error); 12377 } 12378 12379 /* 12380 * Update the cache with the ACL. 12381 */ 12382 nfs4_acl_fill_cache(rp, &gar.n4g_vsa); 12383 12384 error = nfs4_create_getsecattr_return(&gar.n4g_vsa, 12385 vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid, 12386 vp->v_type == VDIR); 12387 vs_ace4_destroy(&gar.n4g_vsa); 12388 if ((error) && (vsecattr->vsa_mask & 12389 (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) && 12390 (error != EACCES)) { 12391 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12392 } 12393 return (error); 12394 } 12395 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12396 return (error); 12397 } 12398 12399 /* 12400 * The function returns: 12401 * - 0 (zero) if the passed in "acl_mask" is a valid request. 12402 * - EINVAL if the passed in "acl_mask" is an invalid request. 12403 * 12404 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if: 12405 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12406 * 12407 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if: 12408 * - We have a mixture of ACE and ACL requests (e.g. 
VSA_ACL | VSA_ACE) 12409 * - We have a count field set without the corresponding acl field set. (e.g. - 12410 * VSA_ACECNT is set, but VSA_ACE is not) 12411 */ 12412 static int 12413 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op) 12414 { 12415 /* Shortcut the masks that are always valid. */ 12416 if (acl_mask == (VSA_ACE | VSA_ACECNT)) 12417 return (0); 12418 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) 12419 return (0); 12420 12421 if (acl_mask & (VSA_ACE | VSA_ACECNT)) { 12422 /* 12423 * We can't have any VSA_ACL type stuff in the mask now. 12424 */ 12425 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12426 VSA_DFACLCNT)) 12427 return (EINVAL); 12428 12429 if (op == NFS4_ACL_SET) { 12430 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE)) 12431 return (EINVAL); 12432 } 12433 } 12434 12435 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) { 12436 /* 12437 * We can't have any VSA_ACE type stuff in the mask now. 12438 */ 12439 if (acl_mask & (VSA_ACE | VSA_ACECNT)) 12440 return (EINVAL); 12441 12442 if (op == NFS4_ACL_SET) { 12443 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL)) 12444 return (EINVAL); 12445 12446 if ((acl_mask & VSA_DFACLCNT) && 12447 !(acl_mask & VSA_DFACL)) 12448 return (EINVAL); 12449 } 12450 } 12451 return (0); 12452 } 12453 12454 /* 12455 * The theory behind creating the correct getsecattr return is simply this: 12456 * "Don't return anything that the caller is not expecting to have to free." 12457 */ 12458 static int 12459 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap, 12460 uid_t uid, gid_t gid, int isdir) 12461 { 12462 int error = 0; 12463 /* Save the mask since the translators modify it. 
*/ 12464 uint_t orig_mask = vsap->vsa_mask; 12465 12466 if (orig_mask & (VSA_ACE | VSA_ACECNT)) { 12467 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, FALSE); 12468 12469 if (error) 12470 return (error); 12471 12472 /* 12473 * If the caller only asked for the ace count (VSA_ACECNT) 12474 * don't give them the full acl (VSA_ACE), free it. 12475 */ 12476 if (!orig_mask & VSA_ACE) { 12477 if (vsap->vsa_aclentp != NULL) { 12478 kmem_free(vsap->vsa_aclentp, 12479 vsap->vsa_aclcnt * sizeof (ace_t)); 12480 vsap->vsa_aclentp = NULL; 12481 } 12482 } 12483 vsap->vsa_mask = orig_mask; 12484 12485 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12486 VSA_DFACLCNT)) { 12487 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid, 12488 isdir, FALSE); 12489 12490 if (error) 12491 return (error); 12492 12493 /* 12494 * If the caller only asked for the acl count (VSA_ACLCNT) 12495 * and/or the default acl count (VSA_DFACLCNT) don't give them 12496 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it. 
12497 */ 12498 if (!orig_mask & VSA_ACL) { 12499 if (vsap->vsa_aclentp != NULL) { 12500 kmem_free(vsap->vsa_aclentp, 12501 vsap->vsa_aclcnt * sizeof (aclent_t)); 12502 vsap->vsa_aclentp = NULL; 12503 } 12504 } 12505 12506 if (!orig_mask & VSA_DFACL) { 12507 if (vsap->vsa_dfaclentp != NULL) { 12508 kmem_free(vsap->vsa_dfaclentp, 12509 vsap->vsa_dfaclcnt * sizeof (aclent_t)); 12510 vsap->vsa_dfaclentp = NULL; 12511 } 12512 } 12513 vsap->vsa_mask = orig_mask; 12514 } 12515 return (0); 12516 } 12517 12518 /* ARGSUSED */ 12519 int 12520 nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr, 12521 caller_context_t *ct) 12522 { 12523 int error; 12524 12525 if (nfs_zone() != VTOMI4(vp)->mi_zone) 12526 return (EIO); 12527 /* 12528 * check for valid cmd parameter 12529 */ 12530 if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS) 12531 return (EINVAL); 12532 12533 /* 12534 * Check access permissions 12535 */ 12536 if ((cmd & F_SHARE) && 12537 (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) || 12538 (shr->s_access == F_WRACC && (flag & FWRITE) == 0))) 12539 return (EBADF); 12540 12541 /* 12542 * If the filesystem is mounted using local locking, pass the 12543 * request off to the local share code. 12544 */ 12545 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) 12546 return (fs_shrlock(vp, cmd, shr, flag, cr, ct)); 12547 12548 switch (cmd) { 12549 case F_SHARE: 12550 case F_UNSHARE: 12551 /* 12552 * This will be properly implemented later, 12553 * see RFE: 4823948 . 
12554 */ 12555 error = EAGAIN; 12556 break; 12557 12558 case F_HASREMOTELOCKS: 12559 /* 12560 * NFS client can't store remote locks itself 12561 */ 12562 shr->s_access = 0; 12563 error = 0; 12564 break; 12565 12566 default: 12567 error = EINVAL; 12568 break; 12569 } 12570 12571 return (error); 12572 } 12573 12574 /* 12575 * Common code called by directory ops to update the attrcache 12576 */ 12577 static int 12578 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp, 12579 hrtime_t t, vnode_t *vp, cred_t *cr) 12580 { 12581 int error = 0; 12582 12583 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12584 12585 if (status != NFS4_OK) { 12586 /* getattr not done or failed */ 12587 PURGE_ATTRCACHE4(vp); 12588 return (error); 12589 } 12590 12591 if (garp) { 12592 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL); 12593 } else { 12594 PURGE_ATTRCACHE4(vp); 12595 } 12596 return (error); 12597 } 12598 12599 /* 12600 * Update directory caches for directory modification ops (link, rename, etc.) 12601 * When dinfo is NULL, manage dircaches in the old way. 12602 */ 12603 static void 12604 nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm, 12605 dirattr_info_t *dinfo) 12606 { 12607 rnode4_t *drp = VTOR4(dvp); 12608 12609 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 12610 12611 /* Purge rddir cache for dir since it changed */ 12612 if (drp->r_dir != NULL) 12613 nfs4_purge_rddir_cache(dvp); 12614 12615 /* 12616 * If caller provided dinfo, then use it to manage dir caches. 
12617 */ 12618 if (dinfo != NULL) { 12619 if (vp != NULL) { 12620 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12621 if (!VTOR4(vp)->created_v4) { 12622 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12623 dnlc_update(dvp, nm, vp); 12624 } else { 12625 /* 12626 * XXX don't update if the created_v4 flag is 12627 * set 12628 */ 12629 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12630 NFS4_DEBUG(nfs4_client_state_debug, 12631 (CE_NOTE, "nfs4_update_dircaches: " 12632 "don't update dnlc: created_v4 flag")); 12633 } 12634 } 12635 12636 nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call, 12637 dinfo->di_cred, FALSE, cinfo); 12638 12639 return; 12640 } 12641 12642 /* 12643 * Caller didn't provide dinfo, then check change_info4 to update DNLC. 12644 * Since caller modified dir but didn't receive post-dirmod-op dir 12645 * attrs, the dir's attrs must be purged. 12646 * 12647 * XXX this check and dnlc update/purge should really be atomic, 12648 * XXX but can't use rnode statelock because it'll deadlock in 12649 * XXX dnlc_purge_vp, however, the risk is minimal even if a race 12650 * XXX does occur. 12651 * 12652 * XXX We also may want to check that atomic is true in the 12653 * XXX change_info struct. If it is not, the change_info may 12654 * XXX reflect changes by more than one clients which means that 12655 * XXX our cache may not be valid. 
12656 */ 12657 PURGE_ATTRCACHE4(dvp); 12658 if (drp->r_change == cinfo->before) { 12659 /* no changes took place in the directory prior to our link */ 12660 if (vp != NULL) { 12661 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12662 if (!VTOR4(vp)->created_v4) { 12663 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12664 dnlc_update(dvp, nm, vp); 12665 } else { 12666 /* 12667 * XXX dont' update if the created_v4 flag 12668 * is set 12669 */ 12670 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12671 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 12672 "nfs4_update_dircaches: don't" 12673 " update dnlc: created_v4 flag")); 12674 } 12675 } 12676 } else { 12677 /* Another client modified directory - purge its dnlc cache */ 12678 dnlc_purge_vp(dvp); 12679 } 12680 } 12681 12682 /* 12683 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a 12684 * file. 12685 * 12686 * The 'reopening_file' boolean should be set to TRUE if we are reopening this 12687 * file (ie: client recovery) and otherwise set to FALSE. 12688 * 12689 * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery 12690 * initiated) calling functions. 12691 * 12692 * 'resend' is set to TRUE if this is a OPEN_CONFIRM issued as a result 12693 * of resending a 'lost' open request. 12694 * 12695 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken 12696 * server that hands out BAD_SEQID on open confirm. 12697 * 12698 * Errors are returned via the nfs4_error_t parameter. 
12699 */ 12700 void 12701 nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr, 12702 bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop, 12703 bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp) 12704 { 12705 COMPOUND4args_clnt args; 12706 COMPOUND4res_clnt res; 12707 nfs_argop4 argop[2]; 12708 nfs_resop4 *resop; 12709 int doqueue = 1; 12710 mntinfo4_t *mi; 12711 OPEN_CONFIRM4args *open_confirm_args; 12712 int needrecov; 12713 12714 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12715 #if DEBUG 12716 mutex_enter(&oop->oo_lock); 12717 ASSERT(oop->oo_seqid_inuse); 12718 mutex_exit(&oop->oo_lock); 12719 #endif 12720 12721 recov_retry_confirm: 12722 nfs4_error_zinit(ep); 12723 *retry_open = FALSE; 12724 12725 if (resend) 12726 args.ctag = TAG_OPEN_CONFIRM_LOST; 12727 else 12728 args.ctag = TAG_OPEN_CONFIRM; 12729 12730 args.array_len = 2; 12731 args.array = argop; 12732 12733 /* putfh target fh */ 12734 argop[0].argop = OP_CPUTFH; 12735 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 12736 12737 argop[1].argop = OP_OPEN_CONFIRM; 12738 open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm; 12739 12740 (*seqid) += 1; 12741 open_confirm_args->seqid = *seqid; 12742 open_confirm_args->open_stateid = *stateid; 12743 12744 mi = VTOMI4(vp); 12745 12746 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep); 12747 12748 if (!ep->error && nfs4_need_to_bump_seqid(&res)) { 12749 nfs4_set_open_seqid((*seqid), oop, args.ctag); 12750 } 12751 12752 needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp); 12753 if (!needrecov && ep->error) 12754 return; 12755 12756 if (needrecov) { 12757 bool_t abort = FALSE; 12758 12759 if (reopening_file == FALSE) { 12760 nfs4_bseqid_entry_t *bsep = NULL; 12761 12762 if (!ep->error && res.status == NFS4ERR_BAD_SEQID) 12763 bsep = nfs4_create_bseqid_entry(oop, NULL, 12764 vp, 0, args.ctag, 12765 open_confirm_args->seqid); 12766 12767 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, 12768 NULL, NULL, 
OP_OPEN_CONFIRM, bsep, NULL, NULL); 12769 if (bsep) { 12770 kmem_free(bsep, sizeof (*bsep)); 12771 if (num_bseqid_retryp && 12772 --(*num_bseqid_retryp) == 0) 12773 abort = TRUE; 12774 } 12775 } 12776 if ((ep->error == ETIMEDOUT || 12777 res.status == NFS4ERR_RESOURCE) && 12778 abort == FALSE && resend == FALSE) { 12779 if (!ep->error) 12780 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12781 12782 delay(SEC_TO_TICK(confirm_retry_sec)); 12783 goto recov_retry_confirm; 12784 } 12785 /* State may have changed so retry the entire OPEN op */ 12786 if (abort == FALSE) 12787 *retry_open = TRUE; 12788 else 12789 *retry_open = FALSE; 12790 if (!ep->error) 12791 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12792 return; 12793 } 12794 12795 if (res.status) { 12796 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12797 return; 12798 } 12799 12800 resop = &res.array[1]; /* open confirm res */ 12801 bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid, 12802 stateid, sizeof (*stateid)); 12803 12804 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12805 } 12806 12807 /* 12808 * Return the credentials associated with a client state object. The 12809 * caller is responsible for freeing the credentials. 12810 */ 12811 12812 static cred_t * 12813 state_to_cred(nfs4_open_stream_t *osp) 12814 { 12815 cred_t *cr; 12816 12817 /* 12818 * It's ok to not lock the open stream and open owner to get 12819 * the oo_cred since this is only written once (upon creation) 12820 * and will not change. 12821 */ 12822 cr = osp->os_open_owner->oo_cred; 12823 crhold(cr); 12824 12825 return (cr); 12826 } 12827 12828 /* 12829 * nfs4_find_sysid 12830 * 12831 * Find the sysid for the knetconfig associated with the given mi. 
12832 */ 12833 static struct lm_sysid * 12834 nfs4_find_sysid(mntinfo4_t *mi) 12835 { 12836 ASSERT(nfs_zone() == mi->mi_zone); 12837 12838 /* 12839 * Switch from RDMA knconf to original mount knconf 12840 */ 12841 return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr, 12842 mi->mi_curr_serv->sv_hostname, NULL)); 12843 } 12844 12845 #ifdef DEBUG 12846 /* 12847 * Return a string version of the call type for easy reading. 12848 */ 12849 static char * 12850 nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype) 12851 { 12852 switch (ctype) { 12853 case NFS4_LCK_CTYPE_NORM: 12854 return ("NORMAL"); 12855 case NFS4_LCK_CTYPE_RECLAIM: 12856 return ("RECLAIM"); 12857 case NFS4_LCK_CTYPE_RESEND: 12858 return ("RESEND"); 12859 case NFS4_LCK_CTYPE_REINSTATE: 12860 return ("REINSTATE"); 12861 default: 12862 cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal " 12863 "type %d", ctype); 12864 return (""); 12865 } 12866 } 12867 #endif 12868 12869 /* 12870 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type 12871 * Unlock requests don't have an over-the-wire locktype, so we just return 12872 * something non-threatening. 12873 */ 12874 12875 static nfs_lock_type4 12876 flk_to_locktype(int cmd, int l_type) 12877 { 12878 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK); 12879 12880 switch (l_type) { 12881 case F_UNLCK: 12882 return (READ_LT); 12883 case F_RDLCK: 12884 if (cmd == F_SETLK) 12885 return (READ_LT); 12886 else 12887 return (READW_LT); 12888 case F_WRLCK: 12889 if (cmd == F_SETLK) 12890 return (WRITE_LT); 12891 else 12892 return (WRITEW_LT); 12893 } 12894 panic("flk_to_locktype"); 12895 /*NOTREACHED*/ 12896 } 12897 12898 /* 12899 * Do some preliminary checks for nfs4frlock. 
12900 */ 12901 static int 12902 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp, 12903 u_offset_t offset) 12904 { 12905 int error = 0; 12906 12907 /* 12908 * If we are setting a lock, check that the file is opened 12909 * with the correct mode. 12910 */ 12911 if (cmd == F_SETLK || cmd == F_SETLKW) { 12912 if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) || 12913 (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) { 12914 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12915 "nfs4frlock_validate_args: file was opened with " 12916 "incorrect mode")); 12917 return (EBADF); 12918 } 12919 } 12920 12921 /* Convert the offset. It may need to be restored before returning. */ 12922 if (error = convoff(vp, flk, 0, offset)) { 12923 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12924 "nfs4frlock_validate_args: convoff => error= %d\n", 12925 error)); 12926 return (error); 12927 } 12928 12929 return (error); 12930 } 12931 12932 /* 12933 * Set the flock64's lm_sysid for nfs4frlock. 12934 */ 12935 static int 12936 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk) 12937 { 12938 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12939 12940 /* Find the lm_sysid */ 12941 *lspp = nfs4_find_sysid(VTOMI4(vp)); 12942 12943 if (*lspp == NULL) { 12944 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 12945 "nfs4frlock_get_sysid: no sysid, return ENOLCK")); 12946 return (ENOLCK); 12947 } 12948 12949 flk->l_sysid = lm_sysidt(*lspp); 12950 12951 return (0); 12952 } 12953 12954 /* 12955 * Do the remaining preliminary setup for nfs4frlock. 12956 */ 12957 static void 12958 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep, 12959 flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr, 12960 cred_t **cred_otw) 12961 { 12962 /* 12963 * set tick_delay to the base delay time. 
12964 * (NFS4_BASE_WAIT_TIME is in secs) 12965 */ 12966 12967 *tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000); 12968 12969 /* 12970 * If lock is relative to EOF, we need the newest length of the 12971 * file. Therefore invalidate the ATTR_CACHE. 12972 */ 12973 12974 *whencep = flk->l_whence; 12975 12976 if (*whencep == 2) /* SEEK_END */ 12977 PURGE_ATTRCACHE4(vp); 12978 12979 recov_statep->rs_flags = 0; 12980 recov_statep->rs_num_retry_despite_err = 0; 12981 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL); 12982 } 12983 12984 /* 12985 * Initialize and allocate the data structures necessary for 12986 * the nfs4frlock call. 12987 * Allocates argsp's op array. 12988 */ 12989 static void 12990 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp, 12991 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd, 12992 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp, 12993 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp) 12994 { 12995 int argoplist_size; 12996 int num_ops = 2; 12997 12998 *retry = FALSE; 12999 *did_start_fop = FALSE; 13000 *skip_get_err = FALSE; 13001 lost_rqstp->lr_op = 0; 13002 argoplist_size = num_ops * sizeof (nfs_argop4); 13003 /* fill array with zero */ 13004 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP); 13005 13006 *argspp = argsp; 13007 *respp = NULL; 13008 13009 argsp->array_len = num_ops; 13010 argsp->array = *argopp; 13011 13012 /* initialize in case of error; will get real value down below */ 13013 argsp->ctag = TAG_NONE; 13014 13015 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) 13016 *op_hintp = OH_LOCKU; 13017 else 13018 *op_hintp = OH_OTHER; 13019 } 13020 13021 /* 13022 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign 13023 * the proper nfs4_server_t for this instance of nfs4frlock. 13024 * Returns 0 (success) or an errno value. 
13025 */ 13026 static int 13027 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp, 13028 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep, 13029 bool_t *did_start_fop, bool_t *startrecovp) 13030 { 13031 int error = 0; 13032 rnode4_t *rp; 13033 13034 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13035 13036 if (ctype == NFS4_LCK_CTYPE_NORM) { 13037 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint, 13038 recov_statep, startrecovp); 13039 if (error) 13040 return (error); 13041 *did_start_fop = TRUE; 13042 } else { 13043 *did_start_fop = FALSE; 13044 *startrecovp = FALSE; 13045 } 13046 13047 if (!error) { 13048 rp = VTOR4(vp); 13049 13050 /* If the file failed recovery, just quit. */ 13051 mutex_enter(&rp->r_statelock); 13052 if (rp->r_flags & R4RECOVERR) { 13053 error = EIO; 13054 } 13055 mutex_exit(&rp->r_statelock); 13056 } 13057 13058 return (error); 13059 } 13060 13061 /* 13062 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A 13063 * resend nfs4frlock call is initiated by the recovery framework. 13064 * Acquires the lop and oop seqid synchronization. 
13065 */ 13066 static void 13067 nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp, 13068 COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp, 13069 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13070 LOCK4args **lock_argsp, LOCKU4args **locku_argsp) 13071 { 13072 mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp); 13073 int error; 13074 13075 NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug), 13076 (CE_NOTE, 13077 "nfs4frlock_setup_resend_lock_args: have lost lock to resend")); 13078 ASSERT(resend_rqstp != NULL); 13079 ASSERT(resend_rqstp->lr_op == OP_LOCK || 13080 resend_rqstp->lr_op == OP_LOCKU); 13081 13082 *oopp = resend_rqstp->lr_oop; 13083 if (resend_rqstp->lr_oop) { 13084 open_owner_hold(resend_rqstp->lr_oop); 13085 error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi); 13086 ASSERT(error == 0); /* recov thread always succeeds */ 13087 } 13088 13089 /* Must resend this lost lock/locku request. */ 13090 ASSERT(resend_rqstp->lr_lop != NULL); 13091 *lopp = resend_rqstp->lr_lop; 13092 lock_owner_hold(resend_rqstp->lr_lop); 13093 error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi); 13094 ASSERT(error == 0); /* recov thread always succeeds */ 13095 13096 *ospp = resend_rqstp->lr_osp; 13097 if (*ospp) 13098 open_stream_hold(resend_rqstp->lr_osp); 13099 13100 if (resend_rqstp->lr_op == OP_LOCK) { 13101 LOCK4args *lock_args; 13102 13103 argop->argop = OP_LOCK; 13104 *lock_argsp = lock_args = &argop->nfs_argop4_u.oplock; 13105 lock_args->locktype = resend_rqstp->lr_locktype; 13106 lock_args->reclaim = 13107 (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM); 13108 lock_args->offset = resend_rqstp->lr_flk->l_start; 13109 lock_args->length = resend_rqstp->lr_flk->l_len; 13110 if (lock_args->length == 0) 13111 lock_args->length = ~lock_args->length; 13112 nfs4_setup_lock_args(*lopp, *oopp, *ospp, 13113 mi2clientid(mi), &lock_args->locker); 13114 13115 switch (resend_rqstp->lr_ctype) { 13116 case 
NFS4_LCK_CTYPE_RESEND: 13117 argsp->ctag = TAG_LOCK_RESEND; 13118 break; 13119 case NFS4_LCK_CTYPE_REINSTATE: 13120 argsp->ctag = TAG_LOCK_REINSTATE; 13121 break; 13122 case NFS4_LCK_CTYPE_RECLAIM: 13123 argsp->ctag = TAG_LOCK_RECLAIM; 13124 break; 13125 default: 13126 argsp->ctag = TAG_LOCK_UNKNOWN; 13127 break; 13128 } 13129 } else { 13130 LOCKU4args *locku_args; 13131 nfs4_lock_owner_t *lop = resend_rqstp->lr_lop; 13132 13133 argop->argop = OP_LOCKU; 13134 *locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku; 13135 locku_args->locktype = READ_LT; 13136 locku_args->seqid = lop->lock_seqid + 1; 13137 mutex_enter(&lop->lo_lock); 13138 locku_args->lock_stateid = lop->lock_stateid; 13139 mutex_exit(&lop->lo_lock); 13140 locku_args->offset = resend_rqstp->lr_flk->l_start; 13141 locku_args->length = resend_rqstp->lr_flk->l_len; 13142 if (locku_args->length == 0) 13143 locku_args->length = ~locku_args->length; 13144 13145 switch (resend_rqstp->lr_ctype) { 13146 case NFS4_LCK_CTYPE_RESEND: 13147 argsp->ctag = TAG_LOCKU_RESEND; 13148 break; 13149 case NFS4_LCK_CTYPE_REINSTATE: 13150 argsp->ctag = TAG_LOCKU_REINSTATE; 13151 break; 13152 default: 13153 argsp->ctag = TAG_LOCK_UNKNOWN; 13154 break; 13155 } 13156 } 13157 } 13158 13159 /* 13160 * Setup the LOCKT4 arguments. 13161 */ 13162 static void 13163 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 13164 LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk, 13165 rnode4_t *rp) 13166 { 13167 LOCKT4args *lockt_args; 13168 13169 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone); 13170 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13171 argop->argop = OP_LOCKT; 13172 argsp->ctag = TAG_LOCKT; 13173 lockt_args = &argop->nfs_argop4_u.oplockt; 13174 13175 /* 13176 * The locktype will be READ_LT unless it's 13177 * a write lock. We do this because the Solaris 13178 * system call allows the combination of 13179 * F_UNLCK and F_GETLK* and so in that case the 13180 * unlock is mapped to a read. 
13181 */ 13182 if (flk->l_type == F_WRLCK) 13183 lockt_args->locktype = WRITE_LT; 13184 else 13185 lockt_args->locktype = READ_LT; 13186 13187 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp))); 13188 /* set the lock owner4 args */ 13189 nfs4_setlockowner_args(&lockt_args->owner, rp, 13190 ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id : 13191 flk->l_pid); 13192 lockt_args->offset = flk->l_start; 13193 lockt_args->length = flk->l_len; 13194 if (flk->l_len == 0) 13195 lockt_args->length = ~lockt_args->length; 13196 13197 *lockt_argsp = lockt_args; 13198 } 13199 13200 /* 13201 * If the client is holding a delegation, and the open stream to be used 13202 * with this lock request is a delegation open stream, then re-open the stream. 13203 * Sets the nfs4_error_t to all zeros unless the open stream has already 13204 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY 13205 * means the caller should retry (like a recovery retry). 13206 */ 13207 static void 13208 nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt) 13209 { 13210 open_delegation_type4 dt; 13211 bool_t reopen_needed, force; 13212 nfs4_open_stream_t *osp; 13213 open_claim_type4 oclaim; 13214 rnode4_t *rp = VTOR4(vp); 13215 mntinfo4_t *mi = VTOMI4(vp); 13216 13217 ASSERT(nfs_zone() == mi->mi_zone); 13218 13219 nfs4_error_zinit(ep); 13220 13221 mutex_enter(&rp->r_statev4_lock); 13222 dt = rp->r_deleg_type; 13223 mutex_exit(&rp->r_statev4_lock); 13224 13225 if (dt != OPEN_DELEGATE_NONE) { 13226 nfs4_open_owner_t *oop; 13227 13228 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 13229 if (!oop) { 13230 ep->stat = NFS4ERR_IO; 13231 return; 13232 } 13233 /* returns with 'os_sync_lock' held */ 13234 osp = find_open_stream(oop, rp); 13235 if (!osp) { 13236 open_owner_rele(oop); 13237 ep->stat = NFS4ERR_IO; 13238 return; 13239 } 13240 13241 if (osp->os_failed_reopen) { 13242 NFS4_DEBUG((nfs4_open_stream_debug || 13243 nfs4_client_lock_debug), (CE_NOTE, 13244 
"nfs4frlock_check_deleg: os_failed_reopen set " 13245 "for osp %p, cr %p, rp %s", (void *)osp, 13246 (void *)cr, rnode4info(rp))); 13247 mutex_exit(&osp->os_sync_lock); 13248 open_stream_rele(osp, rp); 13249 open_owner_rele(oop); 13250 ep->stat = NFS4ERR_IO; 13251 return; 13252 } 13253 13254 /* 13255 * Determine whether a reopen is needed. If this 13256 * is a delegation open stream, then send the open 13257 * to the server to give visibility to the open owner. 13258 * Even if it isn't a delegation open stream, we need 13259 * to check if the previous open CLAIM_DELEGATE_CUR 13260 * was sufficient. 13261 */ 13262 13263 reopen_needed = osp->os_delegation || 13264 ((lt == F_RDLCK && 13265 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) || 13266 (lt == F_WRLCK && 13267 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE))); 13268 13269 mutex_exit(&osp->os_sync_lock); 13270 open_owner_rele(oop); 13271 13272 if (reopen_needed) { 13273 /* 13274 * Always use CLAIM_PREVIOUS after server reboot. 13275 * The server will reject CLAIM_DELEGATE_CUR if 13276 * it is used during the grace period. 13277 */ 13278 mutex_enter(&mi->mi_lock); 13279 if (mi->mi_recovflags & MI4R_SRV_REBOOT) { 13280 oclaim = CLAIM_PREVIOUS; 13281 force = TRUE; 13282 } else { 13283 oclaim = CLAIM_DELEGATE_CUR; 13284 force = FALSE; 13285 } 13286 mutex_exit(&mi->mi_lock); 13287 13288 nfs4_reopen(vp, osp, ep, oclaim, force, FALSE); 13289 if (ep->error == EAGAIN) { 13290 nfs4_error_zinit(ep); 13291 ep->stat = NFS4ERR_DELAY; 13292 } 13293 } 13294 open_stream_rele(osp, rp); 13295 osp = NULL; 13296 } 13297 } 13298 13299 /* 13300 * Setup the LOCKU4 arguments. 13301 * Returns errors via the nfs4_error_t. 13302 * NFS4_OK no problems. *go_otwp is TRUE if call should go 13303 * over-the-wire. The caller must release the 13304 * reference on *lopp. 13305 * NFS4ERR_DELAY caller should retry (like recovery retry) 13306 * (other) unrecoverable error. 
13307 */ 13308 static void 13309 nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 13310 LOCKU4args **locku_argsp, flock64_t *flk, 13311 nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp, 13312 vnode_t *vp, int flag, u_offset_t offset, cred_t *cr, 13313 bool_t *skip_get_err, bool_t *go_otwp) 13314 { 13315 nfs4_lock_owner_t *lop = NULL; 13316 LOCKU4args *locku_args; 13317 pid_t pid; 13318 bool_t is_spec = FALSE; 13319 rnode4_t *rp = VTOR4(vp); 13320 13321 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13322 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13323 13324 nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK); 13325 if (ep->error || ep->stat) 13326 return; 13327 13328 argop->argop = OP_LOCKU; 13329 if (ctype == NFS4_LCK_CTYPE_REINSTATE) 13330 argsp->ctag = TAG_LOCKU_REINSTATE; 13331 else 13332 argsp->ctag = TAG_LOCKU; 13333 locku_args = &argop->nfs_argop4_u.oplocku; 13334 *locku_argsp = locku_args; 13335 13336 /* locktype should be set to any legal value */ 13337 locku_args->locktype = READ_LT; 13338 13339 pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id : 13340 flk->l_pid; 13341 13342 /* 13343 * Get the lock owner stateid. If no lock owner 13344 * exists, return success. 13345 */ 13346 lop = find_lock_owner(rp, pid, LOWN_ANY); 13347 *lopp = lop; 13348 if (lop && CLNT_ISSPECIAL(&lop->lock_stateid)) 13349 is_spec = TRUE; 13350 if (!lop || is_spec) { 13351 /* 13352 * No lock owner so no locks to unlock. 13353 * Return success. If there was a failed 13354 * reclaim earlier, the lock might still be 13355 * registered with the local locking code, 13356 * so notify it of the unlock. 13357 * 13358 * If the lockowner is using a special stateid, 13359 * then the original lock request (that created 13360 * this lockowner) was never successful, so we 13361 * have no lock to undo OTW. 
13362 */ 13363 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13364 "nfs4frlock_setup_locku_args: LOCKU: no lock owner " 13365 "(%ld) so return success", (long)pid)); 13366 13367 if (ctype == NFS4_LCK_CTYPE_NORM) 13368 flk->l_pid = curproc->p_pid; 13369 nfs4_register_lock_locally(vp, flk, flag, offset); 13370 /* 13371 * Release our hold and NULL out so final_cleanup 13372 * doesn't try to end a lock seqid sync we 13373 * never started. 13374 */ 13375 if (is_spec) { 13376 lock_owner_rele(lop); 13377 *lopp = NULL; 13378 } 13379 *skip_get_err = TRUE; 13380 *go_otwp = FALSE; 13381 return; 13382 } 13383 13384 ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp)); 13385 if (ep->error == EAGAIN) { 13386 lock_owner_rele(lop); 13387 *lopp = NULL; 13388 return; 13389 } 13390 13391 mutex_enter(&lop->lo_lock); 13392 locku_args->lock_stateid = lop->lock_stateid; 13393 mutex_exit(&lop->lo_lock); 13394 locku_args->seqid = lop->lock_seqid + 1; 13395 13396 /* leave the ref count on lop, rele after RPC call */ 13397 13398 locku_args->offset = flk->l_start; 13399 locku_args->length = flk->l_len; 13400 if (flk->l_len == 0) 13401 locku_args->length = ~locku_args->length; 13402 13403 *go_otwp = TRUE; 13404 } 13405 13406 /* 13407 * Setup the LOCK4 arguments. 13408 * 13409 * Returns errors via the nfs4_error_t. 
13410 * NFS4_OK no problems 13411 * NFS4ERR_DELAY caller should retry (like recovery retry) 13412 * (other) unrecoverable error 13413 */ 13414 static void 13415 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp, 13416 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13417 nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp, 13418 flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep) 13419 { 13420 LOCK4args *lock_args; 13421 nfs4_open_owner_t *oop = NULL; 13422 nfs4_open_stream_t *osp = NULL; 13423 nfs4_lock_owner_t *lop = NULL; 13424 pid_t pid; 13425 rnode4_t *rp = VTOR4(vp); 13426 13427 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13428 13429 nfs4frlock_check_deleg(vp, ep, cr, flk->l_type); 13430 if (ep->error || ep->stat != NFS4_OK) 13431 return; 13432 13433 argop->argop = OP_LOCK; 13434 if (ctype == NFS4_LCK_CTYPE_NORM) 13435 argsp->ctag = TAG_LOCK; 13436 else if (ctype == NFS4_LCK_CTYPE_RECLAIM) 13437 argsp->ctag = TAG_RELOCK; 13438 else 13439 argsp->ctag = TAG_LOCK_REINSTATE; 13440 lock_args = &argop->nfs_argop4_u.oplock; 13441 lock_args->locktype = flk_to_locktype(cmd, flk->l_type); 13442 lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0; 13443 /* 13444 * Get the lock owner. If no lock owner exists, 13445 * create a 'temporary' one and grab the open seqid 13446 * synchronization (which puts a hold on the open 13447 * owner and open stream). 13448 * This also grabs the lock seqid synchronization. 13449 */ 13450 pid = ctype == NFS4_LCK_CTYPE_NORM ? 
curproc->p_pid : flk->l_pid; 13451 ep->stat = 13452 nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop); 13453 13454 if (ep->stat != NFS4_OK) 13455 goto out; 13456 13457 nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)), 13458 &lock_args->locker); 13459 13460 lock_args->offset = flk->l_start; 13461 lock_args->length = flk->l_len; 13462 if (flk->l_len == 0) 13463 lock_args->length = ~lock_args->length; 13464 *lock_argsp = lock_args; 13465 out: 13466 *oopp = oop; 13467 *ospp = osp; 13468 *lopp = lop; 13469 } 13470 13471 /* 13472 * After we get the reply from the server, record the proper information 13473 * for possible resend lock requests. 13474 */ 13475 static void 13476 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error, 13477 nfs_lock_type4 locktype, nfs4_open_owner_t *oop, 13478 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13479 nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp) 13480 { 13481 bool_t unlock = (flk->l_type == F_UNLCK); 13482 13483 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13484 ASSERT(ctype == NFS4_LCK_CTYPE_NORM || 13485 ctype == NFS4_LCK_CTYPE_REINSTATE); 13486 13487 if (error != 0 && !unlock) { 13488 NFS4_DEBUG((nfs4_lost_rqst_debug || 13489 nfs4_client_lock_debug), (CE_NOTE, 13490 "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 " 13491 " for lop %p", (void *)lop)); 13492 ASSERT(lop != NULL); 13493 mutex_enter(&lop->lo_lock); 13494 lop->lo_pending_rqsts = 1; 13495 mutex_exit(&lop->lo_lock); 13496 } 13497 13498 lost_rqstp->lr_putfirst = FALSE; 13499 lost_rqstp->lr_op = 0; 13500 13501 /* 13502 * For lock/locku requests, we treat EINTR as ETIMEDOUT for 13503 * recovery purposes so that the lock request that was sent 13504 * can be saved and re-issued later. Ditto for EIO from a forced 13505 * unmount. 
This is done to have the client's local locking state 13506 * match the v4 server's state; that is, the request was 13507 * potentially received and accepted by the server but the client 13508 * thinks it was not. 13509 */ 13510 if (error == ETIMEDOUT || error == EINTR || 13511 NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 13512 NFS4_DEBUG((nfs4_lost_rqst_debug || 13513 nfs4_client_lock_debug), (CE_NOTE, 13514 "nfs4frlock_save_lost_rqst: got a lost %s lock for " 13515 "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK", 13516 (void *)lop, (void *)oop, (void *)osp)); 13517 if (unlock) 13518 lost_rqstp->lr_op = OP_LOCKU; 13519 else { 13520 lost_rqstp->lr_op = OP_LOCK; 13521 lost_rqstp->lr_locktype = locktype; 13522 } 13523 /* 13524 * Objects are held and rele'd via the recovery code. 13525 * See nfs4_save_lost_rqst. 13526 */ 13527 lost_rqstp->lr_vp = vp; 13528 lost_rqstp->lr_dvp = NULL; 13529 lost_rqstp->lr_oop = oop; 13530 lost_rqstp->lr_osp = osp; 13531 lost_rqstp->lr_lop = lop; 13532 lost_rqstp->lr_cr = cr; 13533 switch (ctype) { 13534 case NFS4_LCK_CTYPE_NORM: 13535 flk->l_pid = ttoproc(curthread)->p_pid; 13536 lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND; 13537 break; 13538 case NFS4_LCK_CTYPE_REINSTATE: 13539 lost_rqstp->lr_putfirst = TRUE; 13540 lost_rqstp->lr_ctype = ctype; 13541 break; 13542 default: 13543 break; 13544 } 13545 lost_rqstp->lr_flk = flk; 13546 } 13547 } 13548 13549 /* 13550 * Update lop's seqid. Also update the seqid stored in a resend request, 13551 * if any. (Some recovery errors increment the seqid, and we may have to 13552 * send the resend request again.) 
13553 */ 13554 13555 static void 13556 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args, 13557 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type) 13558 { 13559 if (lock_args) { 13560 if (lock_args->locker.new_lock_owner == TRUE) 13561 nfs4_get_and_set_next_open_seqid(oop, tag_type); 13562 else { 13563 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13564 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop); 13565 } 13566 } else if (locku_args) { 13567 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13568 nfs4_set_lock_seqid(lop->lock_seqid +1, lop); 13569 } 13570 } 13571 13572 /* 13573 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13574 * COMPOUND4 args/res for calls that need to retry. 13575 * Switches the *cred_otwp to base_cr. 13576 */ 13577 static void 13578 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint, 13579 nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop, 13580 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error, 13581 nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp, 13582 nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp) 13583 { 13584 nfs4_open_owner_t *oop = *oopp; 13585 nfs4_open_stream_t *osp = *ospp; 13586 nfs4_lock_owner_t *lop = *lopp; 13587 nfs_argop4 *argop = (*argspp)->array; 13588 13589 if (*did_start_fop) { 13590 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13591 needrecov); 13592 *did_start_fop = FALSE; 13593 } 13594 ASSERT((*argspp)->array_len == 2); 13595 if (argop[1].argop == OP_LOCK) 13596 nfs4args_lock_free(&argop[1]); 13597 else if (argop[1].argop == OP_LOCKT) 13598 nfs4args_lockt_free(&argop[1]); 13599 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13600 if (!error) 13601 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13602 *argspp = NULL; 13603 *respp = NULL; 13604 13605 if (lop) { 13606 nfs4_end_lock_seqid_sync(lop); 13607 lock_owner_rele(lop); 13608 *lopp = NULL; 13609 } 13610 13611 /* need to free up the 
reference on osp for lock args */ 13612 if (osp != NULL) { 13613 open_stream_rele(osp, VTOR4(vp)); 13614 *ospp = NULL; 13615 } 13616 13617 /* need to free up the reference on oop for lock args */ 13618 if (oop != NULL) { 13619 nfs4_end_open_seqid_sync(oop); 13620 open_owner_rele(oop); 13621 *oopp = NULL; 13622 } 13623 13624 crfree(*cred_otwp); 13625 *cred_otwp = base_cr; 13626 crhold(*cred_otwp); 13627 } 13628 13629 /* 13630 * Function to process the client's recovery for nfs4frlock. 13631 * Returns TRUE if we should retry the lock request; FALSE otherwise. 13632 * 13633 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13634 * COMPOUND4 args/res for calls that need to retry. 13635 * 13636 * Note: the rp's r_lkserlock is *not* dropped during this path. 13637 */ 13638 static bool_t 13639 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep, 13640 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13641 LOCK4args *lock_args, LOCKU4args *locku_args, 13642 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13643 nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp, 13644 nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint, 13645 bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk) 13646 { 13647 nfs4_open_owner_t *oop = *oopp; 13648 nfs4_open_stream_t *osp = *ospp; 13649 nfs4_lock_owner_t *lop = *lopp; 13650 13651 bool_t abort, retry; 13652 13653 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13654 ASSERT((*argspp) != NULL); 13655 ASSERT((*respp) != NULL); 13656 if (lock_args || locku_args) 13657 ASSERT(lop != NULL); 13658 13659 NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug), 13660 (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n")); 13661 13662 retry = TRUE; 13663 abort = FALSE; 13664 if (needrecov) { 13665 nfs4_bseqid_entry_t *bsep = NULL; 13666 nfs_opnum4 op; 13667 13668 op = lock_args ? OP_LOCK : locku_args ? 
OP_LOCKU : OP_LOCKT; 13669 13670 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) { 13671 seqid4 seqid; 13672 13673 if (lock_args) { 13674 if (lock_args->locker.new_lock_owner == TRUE) 13675 seqid = lock_args->locker.locker4_u. 13676 open_owner.open_seqid; 13677 else 13678 seqid = lock_args->locker.locker4_u. 13679 lock_owner.lock_seqid; 13680 } else if (locku_args) { 13681 seqid = locku_args->seqid; 13682 } else { 13683 seqid = 0; 13684 } 13685 13686 bsep = nfs4_create_bseqid_entry(oop, lop, vp, 13687 flk->l_pid, (*argspp)->ctag, seqid); 13688 } 13689 13690 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 13691 (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK || 13692 lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp : 13693 NULL, op, bsep, NULL, NULL); 13694 13695 if (bsep) 13696 kmem_free(bsep, sizeof (*bsep)); 13697 } 13698 13699 /* 13700 * Return that we do not want to retry the request for 3 cases: 13701 * 1. If we received EINTR or are bailing out because of a forced 13702 * unmount, we came into this code path just for the sake of 13703 * initiating recovery, we now need to return the error. 13704 * 2. If we have aborted recovery. 13705 * 3. We received NFS4ERR_BAD_SEQID. 
13706 */ 13707 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) || 13708 abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID)) 13709 retry = FALSE; 13710 13711 if (*did_start_fop == TRUE) { 13712 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13713 needrecov); 13714 *did_start_fop = FALSE; 13715 } 13716 13717 if (retry == TRUE) { 13718 nfs_argop4 *argop; 13719 13720 argop = (*argspp)->array; 13721 ASSERT((*argspp)->array_len == 2); 13722 13723 if (argop[1].argop == OP_LOCK) 13724 nfs4args_lock_free(&argop[1]); 13725 else if (argop[1].argop == OP_LOCKT) 13726 nfs4args_lockt_free(&argop[1]); 13727 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13728 if (!ep->error) 13729 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13730 *respp = NULL; 13731 *argspp = NULL; 13732 } 13733 13734 if (lop != NULL) { 13735 nfs4_end_lock_seqid_sync(lop); 13736 lock_owner_rele(lop); 13737 } 13738 13739 *lopp = NULL; 13740 13741 /* need to free up the reference on osp for lock args */ 13742 if (osp != NULL) { 13743 open_stream_rele(osp, rp); 13744 *ospp = NULL; 13745 } 13746 13747 /* need to free up the reference on oop for lock args */ 13748 if (oop != NULL) { 13749 nfs4_end_open_seqid_sync(oop); 13750 open_owner_rele(oop); 13751 *oopp = NULL; 13752 } 13753 13754 return (retry); 13755 } 13756 13757 /* 13758 * Handles the successful reply from the server for nfs4frlock. 
13759 */ 13760 static void 13761 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk, 13762 vnode_t *vp, int flag, u_offset_t offset, 13763 nfs4_lost_rqst_t *resend_rqstp) 13764 { 13765 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13766 if ((cmd == F_SETLK || cmd == F_SETLKW) && 13767 (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) { 13768 if (ctype == NFS4_LCK_CTYPE_NORM) { 13769 flk->l_pid = ttoproc(curthread)->p_pid; 13770 /* 13771 * We do not register lost locks locally in 13772 * the 'resend' case since the user/application 13773 * doesn't think we have the lock. 13774 */ 13775 ASSERT(!resend_rqstp); 13776 nfs4_register_lock_locally(vp, flk, flag, offset); 13777 } 13778 } 13779 } 13780 13781 /* 13782 * Handle the DENIED reply from the server for nfs4frlock. 13783 * Returns TRUE if we should retry the request; FALSE otherwise. 13784 * 13785 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13786 * COMPOUND4 args/res for calls that need to retry. Can also 13787 * drop and regrab the r_lkserlock. 13788 */ 13789 static bool_t 13790 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args, 13791 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp, 13792 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd, 13793 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint, 13794 nfs4_recov_state_t *recov_statep, int needrecov, 13795 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13796 clock_t *tick_delayp, short *whencep, int *errorp, 13797 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop, 13798 bool_t *skip_get_err) 13799 { 13800 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13801 13802 if (lock_args) { 13803 nfs4_open_owner_t *oop = *oopp; 13804 nfs4_open_stream_t *osp = *ospp; 13805 nfs4_lock_owner_t *lop = *lopp; 13806 int intr; 13807 13808 /* 13809 * Blocking lock needs to sleep and retry from the request. 
13810 * 13811 * Do not block and wait for 'resend' or 'reinstate' 13812 * lock requests, just return the error. 13813 * 13814 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW. 13815 */ 13816 if (cmd == F_SETLKW) { 13817 rnode4_t *rp = VTOR4(vp); 13818 nfs_argop4 *argop = (*argspp)->array; 13819 13820 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13821 13822 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 13823 recov_statep, needrecov); 13824 *did_start_fop = FALSE; 13825 ASSERT((*argspp)->array_len == 2); 13826 if (argop[1].argop == OP_LOCK) 13827 nfs4args_lock_free(&argop[1]); 13828 else if (argop[1].argop == OP_LOCKT) 13829 nfs4args_lockt_free(&argop[1]); 13830 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13831 if (*respp) 13832 xdr_free(xdr_COMPOUND4res_clnt, 13833 (caddr_t)*respp); 13834 *argspp = NULL; 13835 *respp = NULL; 13836 nfs4_end_lock_seqid_sync(lop); 13837 lock_owner_rele(lop); 13838 *lopp = NULL; 13839 if (osp != NULL) { 13840 open_stream_rele(osp, rp); 13841 *ospp = NULL; 13842 } 13843 if (oop != NULL) { 13844 nfs4_end_open_seqid_sync(oop); 13845 open_owner_rele(oop); 13846 *oopp = NULL; 13847 } 13848 13849 nfs_rw_exit(&rp->r_lkserlock); 13850 13851 intr = nfs4_block_and_wait(tick_delayp, rp); 13852 13853 if (intr) { 13854 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13855 RW_WRITER, FALSE); 13856 *errorp = EINTR; 13857 return (FALSE); 13858 } 13859 13860 (void) nfs_rw_enter_sig(&rp->r_lkserlock, 13861 RW_WRITER, FALSE); 13862 13863 /* 13864 * Make sure we are still safe to lock with 13865 * regards to mmapping. 
13866 */ 13867 if (!nfs4_safelock(vp, flk, cr)) { 13868 *errorp = EAGAIN; 13869 return (FALSE); 13870 } 13871 13872 return (TRUE); 13873 } 13874 if (ctype == NFS4_LCK_CTYPE_NORM) 13875 *errorp = EAGAIN; 13876 *skip_get_err = TRUE; 13877 flk->l_whence = 0; 13878 *whencep = 0; 13879 return (FALSE); 13880 } else if (lockt_args) { 13881 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13882 "nfs4frlock_results_denied: OP_LOCKT DENIED")); 13883 13884 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied, 13885 flk, lockt_args); 13886 13887 /* according to NLM code */ 13888 *errorp = 0; 13889 *whencep = 0; 13890 *skip_get_err = TRUE; 13891 return (FALSE); 13892 } 13893 return (FALSE); 13894 } 13895 13896 /* 13897 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock. 13898 */ 13899 static void 13900 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp) 13901 { 13902 switch (resp->status) { 13903 case NFS4ERR_ACCESS: 13904 case NFS4ERR_ADMIN_REVOKED: 13905 case NFS4ERR_BADHANDLE: 13906 case NFS4ERR_BAD_RANGE: 13907 case NFS4ERR_BAD_SEQID: 13908 case NFS4ERR_BAD_STATEID: 13909 case NFS4ERR_BADXDR: 13910 case NFS4ERR_DEADLOCK: 13911 case NFS4ERR_DELAY: 13912 case NFS4ERR_EXPIRED: 13913 case NFS4ERR_FHEXPIRED: 13914 case NFS4ERR_GRACE: 13915 case NFS4ERR_INVAL: 13916 case NFS4ERR_ISDIR: 13917 case NFS4ERR_LEASE_MOVED: 13918 case NFS4ERR_LOCK_NOTSUPP: 13919 case NFS4ERR_LOCK_RANGE: 13920 case NFS4ERR_MOVED: 13921 case NFS4ERR_NOFILEHANDLE: 13922 case NFS4ERR_NO_GRACE: 13923 case NFS4ERR_OLD_STATEID: 13924 case NFS4ERR_OPENMODE: 13925 case NFS4ERR_RECLAIM_BAD: 13926 case NFS4ERR_RECLAIM_CONFLICT: 13927 case NFS4ERR_RESOURCE: 13928 case NFS4ERR_SERVERFAULT: 13929 case NFS4ERR_STALE: 13930 case NFS4ERR_STALE_CLIENTID: 13931 case NFS4ERR_STALE_STATEID: 13932 return; 13933 default: 13934 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13935 "nfs4frlock_results_default: got unrecognizable " 13936 "res.status %d", resp->status)); 13937 *errorp = NFS4ERR_INVAL; 
13938 } 13939 } 13940 13941 /* 13942 * The lock request was successful, so update the client's state. 13943 */ 13944 static void 13945 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args, 13946 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop, 13947 vnode_t *vp, flock64_t *flk, cred_t *cr, 13948 nfs4_lost_rqst_t *resend_rqstp) 13949 { 13950 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13951 13952 if (lock_args) { 13953 LOCK4res *lock_res; 13954 13955 lock_res = &resop->nfs_resop4_u.oplock; 13956 /* update the stateid with server's response */ 13957 13958 if (lock_args->locker.new_lock_owner == TRUE) { 13959 mutex_enter(&lop->lo_lock); 13960 lop->lo_just_created = NFS4_PERM_CREATED; 13961 mutex_exit(&lop->lo_lock); 13962 } 13963 13964 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid); 13965 13966 /* 13967 * If the lock was the result of a resending a lost 13968 * request, we've synched up the stateid and seqid 13969 * with the server, but now the server might be out of sync 13970 * with what the application thinks it has for locks. 13971 * Clean that up here. It's unclear whether we should do 13972 * this even if the filesystem has been forcibly unmounted. 13973 * For most servers, it's probably wasted effort, but 13974 * RFC 7530 lets servers require that unlocks exactly match 13975 * the locks that are held. 
13976 */ 13977 if (resend_rqstp != NULL && 13978 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) { 13979 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop); 13980 } else { 13981 flk->l_whence = 0; 13982 } 13983 } else if (locku_args) { 13984 LOCKU4res *locku_res; 13985 13986 locku_res = &resop->nfs_resop4_u.oplocku; 13987 13988 /* Update the stateid with the server's response */ 13989 nfs4_set_lock_stateid(lop, locku_res->lock_stateid); 13990 } else if (lockt_args) { 13991 /* Switch the lock type to express success, see fcntl */ 13992 flk->l_type = F_UNLCK; 13993 flk->l_whence = 0; 13994 } 13995 } 13996 13997 /* 13998 * Do final cleanup before exiting nfs4frlock. 13999 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 14000 * COMPOUND4 args/res for calls that haven't already. 14001 */ 14002 static void 14003 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp, 14004 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint, 14005 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop, 14006 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 14007 short whence, u_offset_t offset, struct lm_sysid *ls, 14008 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args, 14009 bool_t did_start_fop, bool_t skip_get_err, 14010 cred_t *cred_otw, cred_t *cred) 14011 { 14012 mntinfo4_t *mi = VTOMI4(vp); 14013 rnode4_t *rp = VTOR4(vp); 14014 int error = *errorp; 14015 nfs_argop4 *argop; 14016 int do_flush_pages = 0; 14017 14018 ASSERT(nfs_zone() == mi->mi_zone); 14019 /* 14020 * The client recovery code wants the raw status information, 14021 * so don't map the NFS status code to an errno value for 14022 * non-normal call types. 
14023 */ 14024 if (ctype == NFS4_LCK_CTYPE_NORM) { 14025 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE) 14026 *errorp = geterrno4(resp->status); 14027 if (did_start_fop == TRUE) 14028 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep, 14029 needrecov); 14030 14031 /* 14032 * We've established a new lock on the server, so invalidate 14033 * the pages associated with the vnode to get the most up to 14034 * date pages from the server after acquiring the lock. We 14035 * want to be sure that the read operation gets the newest data. 14036 * N.B. 14037 * We used to do this in nfs4frlock_results_ok but that doesn't 14038 * work since VOP_PUTPAGE can call nfs4_commit which calls 14039 * nfs4_start_fop. We flush the pages below after calling 14040 * nfs4_end_fop above 14041 * The flush of the page cache must be done after 14042 * nfs4_end_open_seqid_sync() to avoid a 4-way hang. 14043 */ 14044 if (!error && resp && resp->status == NFS4_OK) 14045 do_flush_pages = 1; 14046 } 14047 if (argsp) { 14048 ASSERT(argsp->array_len == 2); 14049 argop = argsp->array; 14050 if (argop[1].argop == OP_LOCK) 14051 nfs4args_lock_free(&argop[1]); 14052 else if (argop[1].argop == OP_LOCKT) 14053 nfs4args_lockt_free(&argop[1]); 14054 kmem_free(argop, 2 * sizeof (nfs_argop4)); 14055 if (resp) 14056 xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 14057 } 14058 14059 /* free the reference on the lock owner */ 14060 if (lop != NULL) { 14061 nfs4_end_lock_seqid_sync(lop); 14062 lock_owner_rele(lop); 14063 } 14064 14065 /* need to free up the reference on osp for lock args */ 14066 if (osp != NULL) 14067 open_stream_rele(osp, rp); 14068 14069 /* need to free up the reference on oop for lock args */ 14070 if (oop != NULL) { 14071 nfs4_end_open_seqid_sync(oop); 14072 open_owner_rele(oop); 14073 } 14074 14075 if (do_flush_pages) 14076 nfs4_flush_pages(vp, cred); 14077 14078 (void) convoff(vp, flk, whence, offset); 14079 14080 lm_rel_sysid(ls); 14081 14082 /* 14083 * Record debug information 
in the event we get EINVAL. 14084 */ 14085 mutex_enter(&mi->mi_lock); 14086 if (*errorp == EINVAL && (lock_args || locku_args) && 14087 (!(mi->mi_flags & MI4_POSIX_LOCK))) { 14088 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) { 14089 zcmn_err(getzoneid(), CE_NOTE, 14090 "%s operation failed with " 14091 "EINVAL probably since the server, %s," 14092 " doesn't support POSIX style locking", 14093 lock_args ? "LOCK" : "LOCKU", 14094 mi->mi_curr_serv->sv_hostname); 14095 mi->mi_flags |= MI4_LOCK_DEBUG; 14096 } 14097 } 14098 mutex_exit(&mi->mi_lock); 14099 14100 if (cred_otw) 14101 crfree(cred_otw); 14102 } 14103 14104 /* 14105 * This calls the server and the local locking code. 14106 * 14107 * Client locks are registerred locally by oring the sysid with 14108 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid. 14109 * We need to distinguish between the two to avoid collision in case one 14110 * machine is used as both client and server. 14111 * 14112 * Blocking lock requests will continually retry to acquire the lock 14113 * forever. 14114 * 14115 * The ctype is defined as follows: 14116 * NFS4_LCK_CTYPE_NORM: normal lock request. 14117 * 14118 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client 14119 * recovery, get the pid from flk instead of curproc, and don't reregister 14120 * the lock locally. 14121 * 14122 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition 14123 * that we will use the information passed in via resend_rqstp to setup the 14124 * lock/locku request. This resend is the exact same request as the 'lost 14125 * lock', and is initiated by the recovery framework. A successful resend 14126 * request can initiate one or more reinstate requests. 14127 * 14128 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it 14129 * does not trigger additional reinstate requests. 
This lock call type is 14130 * set for setting the v4 server's locking state back to match what the 14131 * client's local locking state is in the event of a received 'lost lock'. 14132 * 14133 * Errors are returned via the nfs4_error_t parameter. 14134 */ 14135 void 14136 nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk, 14137 int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep, 14138 nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp) 14139 { 14140 COMPOUND4args_clnt args, *argsp = NULL; 14141 COMPOUND4res_clnt res, *resp = NULL; 14142 nfs_argop4 *argop; 14143 nfs_resop4 *resop; 14144 rnode4_t *rp; 14145 int doqueue = 1; 14146 clock_t tick_delay; /* delay in clock ticks */ 14147 struct lm_sysid *ls; 14148 LOCK4args *lock_args = NULL; 14149 LOCKU4args *locku_args = NULL; 14150 LOCKT4args *lockt_args = NULL; 14151 nfs4_open_owner_t *oop = NULL; 14152 nfs4_open_stream_t *osp = NULL; 14153 nfs4_lock_owner_t *lop = NULL; 14154 bool_t needrecov = FALSE; 14155 nfs4_recov_state_t recov_state; 14156 short whence; 14157 nfs4_op_hint_t op_hint; 14158 nfs4_lost_rqst_t lost_rqst; 14159 bool_t retry = FALSE; 14160 bool_t did_start_fop = FALSE; 14161 bool_t skip_get_err = FALSE; 14162 cred_t *cred_otw = NULL; 14163 bool_t recovonly; /* just queue request */ 14164 int frc_no_reclaim = 0; 14165 #ifdef DEBUG 14166 char *name; 14167 #endif 14168 14169 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14170 14171 #ifdef DEBUG 14172 name = fn_name(VTOSV(vp)->sv_name); 14173 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: " 14174 "%s: cmd %d, type %d, offset %llu, start %"PRIx64", " 14175 "length %"PRIu64", pid %d, sysid %d, call type %s, " 14176 "resend request %s", name, cmd, flk->l_type, offset, flk->l_start, 14177 flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : 14178 flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype), 14179 resend_rqstp ? 
"TRUE" : "FALSE")); 14180 kmem_free(name, MAXNAMELEN); 14181 #endif 14182 14183 nfs4_error_zinit(ep); 14184 ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset); 14185 if (ep->error) 14186 return; 14187 ep->error = nfs4frlock_get_sysid(&ls, vp, flk); 14188 if (ep->error) 14189 return; 14190 nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence, 14191 vp, cr, &cred_otw); 14192 14193 recov_retry: 14194 nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd, 14195 &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst); 14196 rp = VTOR4(vp); 14197 14198 ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state, 14199 &did_start_fop, &recovonly); 14200 14201 if (ep->error) 14202 goto out; 14203 14204 if (recovonly) { 14205 /* 14206 * Leave the request for the recovery system to deal with. 14207 */ 14208 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 14209 ASSERT(cmd != F_GETLK); 14210 ASSERT(flk->l_type == F_UNLCK); 14211 14212 nfs4_error_init(ep, EINTR); 14213 needrecov = TRUE; 14214 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14215 if (lop != NULL) { 14216 nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT, 14217 NULL, NULL, lop, flk, &lost_rqst, cr, vp); 14218 (void) nfs4_start_recovery(ep, 14219 VTOMI4(vp), vp, NULL, NULL, 14220 (lost_rqst.lr_op == OP_LOCK || 14221 lost_rqst.lr_op == OP_LOCKU) ? 14222 &lost_rqst : NULL, OP_LOCKU, NULL, NULL, NULL); 14223 lock_owner_rele(lop); 14224 lop = NULL; 14225 } 14226 flk->l_pid = curproc->p_pid; 14227 nfs4_register_lock_locally(vp, flk, flag, offset); 14228 goto out; 14229 } 14230 14231 /* putfh directory fh */ 14232 argop[0].argop = OP_CPUTFH; 14233 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 14234 14235 /* 14236 * Set up the over-the-wire arguments and get references to the 14237 * open owner, etc. 
14238 */ 14239 14240 if (ctype == NFS4_LCK_CTYPE_RESEND || 14241 ctype == NFS4_LCK_CTYPE_REINSTATE) { 14242 nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp, 14243 &argop[1], &lop, &oop, &osp, &lock_args, &locku_args); 14244 } else { 14245 bool_t go_otw = TRUE; 14246 14247 ASSERT(resend_rqstp == NULL); 14248 14249 switch (cmd) { 14250 case F_GETLK: 14251 nfs4frlock_setup_lockt_args(ctype, &argop[1], 14252 &lockt_args, argsp, flk, rp); 14253 break; 14254 case F_SETLKW: 14255 case F_SETLK: 14256 if (flk->l_type == F_UNLCK) 14257 nfs4frlock_setup_locku_args(ctype, 14258 &argop[1], &locku_args, flk, 14259 &lop, ep, argsp, 14260 vp, flag, offset, cr, 14261 &skip_get_err, &go_otw); 14262 else 14263 nfs4frlock_setup_lock_args(ctype, 14264 &lock_args, &oop, &osp, &lop, &argop[1], 14265 argsp, flk, cmd, vp, cr, ep); 14266 14267 if (ep->error) 14268 goto out; 14269 14270 switch (ep->stat) { 14271 case NFS4_OK: 14272 break; 14273 case NFS4ERR_DELAY: 14274 /* recov thread never gets this error */ 14275 ASSERT(resend_rqstp == NULL); 14276 ASSERT(did_start_fop); 14277 14278 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 14279 &recov_state, TRUE); 14280 did_start_fop = FALSE; 14281 if (argop[1].argop == OP_LOCK) 14282 nfs4args_lock_free(&argop[1]); 14283 else if (argop[1].argop == OP_LOCKT) 14284 nfs4args_lockt_free(&argop[1]); 14285 kmem_free(argop, 2 * sizeof (nfs_argop4)); 14286 argsp = NULL; 14287 goto recov_retry; 14288 default: 14289 ep->error = EIO; 14290 goto out; 14291 } 14292 break; 14293 default: 14294 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14295 "nfs4_frlock: invalid cmd %d", cmd)); 14296 ep->error = EINVAL; 14297 goto out; 14298 } 14299 14300 if (!go_otw) 14301 goto out; 14302 } 14303 14304 /* XXX should we use the local reclock as a cache ? */ 14305 /* 14306 * Unregister the lock with the local locking code before 14307 * contacting the server. 
This avoids a potential race where 14308 * another process gets notified that it has been granted a lock 14309 * before we can unregister ourselves locally. 14310 */ 14311 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) { 14312 if (ctype == NFS4_LCK_CTYPE_NORM) 14313 flk->l_pid = ttoproc(curthread)->p_pid; 14314 nfs4_register_lock_locally(vp, flk, flag, offset); 14315 } 14316 14317 /* 14318 * Send the server the lock request. Continually loop with a delay 14319 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE. 14320 */ 14321 resp = &res; 14322 14323 NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug), 14324 (CE_NOTE, 14325 "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first", 14326 rnode4info(rp))); 14327 14328 if (lock_args && frc_no_reclaim) { 14329 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM); 14330 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14331 "nfs4frlock: frc_no_reclaim: clearing reclaim")); 14332 lock_args->reclaim = FALSE; 14333 if (did_reclaimp) 14334 *did_reclaimp = 0; 14335 } 14336 14337 /* 14338 * Do the OTW call. 
14339 */ 14340 rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep); 14341 14342 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14343 "nfs4frlock: error %d, status %d", ep->error, resp->status)); 14344 14345 needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp); 14346 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14347 "nfs4frlock: needrecov %d", needrecov)); 14348 14349 if (ep->error == 0 && nfs4_need_to_bump_seqid(resp)) 14350 nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop, 14351 args.ctag); 14352 14353 /* 14354 * Check if one of these mutually exclusive error cases has 14355 * happened: 14356 * need to swap credentials due to access error 14357 * recovery is needed 14358 * different error (only known case is missing Kerberos ticket) 14359 */ 14360 14361 if ((ep->error == EACCES || 14362 (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) && 14363 cred_otw != cr) { 14364 nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov, 14365 &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp, 14366 cr, &cred_otw); 14367 goto recov_retry; 14368 } 14369 14370 if (needrecov) { 14371 /* 14372 * LOCKT requests don't need to recover from lost 14373 * requests since they don't create/modify state. 14374 */ 14375 if ((ep->error == EINTR || 14376 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) && 14377 lockt_args) 14378 goto out; 14379 /* 14380 * Do not attempt recovery for requests initiated by 14381 * the recovery framework. Let the framework redrive them. 14382 */ 14383 if (ctype != NFS4_LCK_CTYPE_NORM) 14384 goto out; 14385 else { 14386 ASSERT(resend_rqstp == NULL); 14387 } 14388 14389 nfs4frlock_save_lost_rqst(ctype, ep->error, 14390 flk_to_locktype(cmd, flk->l_type), 14391 oop, osp, lop, flk, &lost_rqst, cred_otw, vp); 14392 14393 retry = nfs4frlock_recovery(needrecov, ep, &argsp, 14394 &resp, lock_args, locku_args, &oop, &osp, &lop, 14395 rp, vp, &recov_state, op_hint, &did_start_fop, 14396 cmd != F_GETLK ? 
&lost_rqst : NULL, flk); 14397 14398 if (retry) { 14399 ASSERT(oop == NULL); 14400 ASSERT(osp == NULL); 14401 ASSERT(lop == NULL); 14402 goto recov_retry; 14403 } 14404 goto out; 14405 } 14406 14407 /* 14408 * Bail out if have reached this point with ep->error set. Can 14409 * happen if (ep->error == EACCES && !needrecov && cred_otw == cr). 14410 * This happens if Kerberos ticket has expired or has been 14411 * destroyed. 14412 */ 14413 if (ep->error != 0) 14414 goto out; 14415 14416 /* 14417 * Process the reply. 14418 */ 14419 switch (resp->status) { 14420 case NFS4_OK: 14421 resop = &resp->array[1]; 14422 nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset, 14423 resend_rqstp); 14424 /* 14425 * Have a successful lock operation, now update state. 14426 */ 14427 nfs4frlock_update_state(lock_args, locku_args, lockt_args, 14428 resop, lop, vp, flk, cr, resend_rqstp); 14429 break; 14430 14431 case NFS4ERR_DENIED: 14432 resop = &resp->array[1]; 14433 retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args, 14434 &oop, &osp, &lop, cmd, vp, flk, op_hint, 14435 &recov_state, needrecov, &argsp, &resp, 14436 &tick_delay, &whence, &ep->error, resop, cr, 14437 &did_start_fop, &skip_get_err); 14438 14439 if (retry) { 14440 ASSERT(oop == NULL); 14441 ASSERT(osp == NULL); 14442 ASSERT(lop == NULL); 14443 goto recov_retry; 14444 } 14445 break; 14446 /* 14447 * If the server won't let us reclaim, fall-back to trying to lock 14448 * the file from scratch. Code elsewhere will check the changeinfo 14449 * to ensure the file hasn't been changed. 
14450 */ 14451 case NFS4ERR_NO_GRACE: 14452 if (lock_args && lock_args->reclaim == TRUE) { 14453 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM); 14454 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14455 "nfs4frlock: reclaim: NFS4ERR_NO_GRACE")); 14456 frc_no_reclaim = 1; 14457 /* clean up before retrying */ 14458 needrecov = 0; 14459 (void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp, 14460 lock_args, locku_args, &oop, &osp, &lop, rp, vp, 14461 &recov_state, op_hint, &did_start_fop, NULL, flk); 14462 goto recov_retry; 14463 } 14464 /* FALLTHROUGH */ 14465 14466 default: 14467 nfs4frlock_results_default(resp, &ep->error); 14468 break; 14469 } 14470 out: 14471 /* 14472 * Process and cleanup from error. Make interrupted unlock 14473 * requests look successful, since they will be handled by the 14474 * client recovery code. 14475 */ 14476 nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state, 14477 needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error, 14478 lock_args, locku_args, did_start_fop, 14479 skip_get_err, cred_otw, cr); 14480 14481 if (ep->error == EINTR && flk->l_type == F_UNLCK && 14482 (cmd == F_SETLK || cmd == F_SETLKW)) 14483 ep->error = 0; 14484 } 14485 14486 /* 14487 * nfs4_safelock: 14488 * 14489 * Return non-zero if the given lock request can be handled without 14490 * violating the constraints on concurrent mapping and locking. 14491 */ 14492 14493 static int 14494 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr) 14495 { 14496 rnode4_t *rp = VTOR4(vp); 14497 struct vattr va; 14498 int error; 14499 14500 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14501 ASSERT(rp->r_mapcnt >= 0); 14502 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: " 14503 "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ? 14504 "write" : bfp->l_type == F_RDLCK ? 
"read" : "unlock", 14505 bfp->l_start, bfp->l_len, rp->r_mapcnt)); 14506 14507 if (rp->r_mapcnt == 0) 14508 return (1); /* always safe if not mapped */ 14509 14510 /* 14511 * If the file is already mapped and there are locks, then they 14512 * should be all safe locks. So adding or removing a lock is safe 14513 * as long as the new request is safe (i.e., whole-file, meaning 14514 * length and starting offset are both zero). 14515 */ 14516 14517 if (bfp->l_start != 0 || bfp->l_len != 0) { 14518 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14519 "cannot lock a memory mapped file unless locking the " 14520 "entire file: start %"PRIx64", len %"PRIx64, 14521 bfp->l_start, bfp->l_len)); 14522 return (0); 14523 } 14524 14525 /* mandatory locking and mapping don't mix */ 14526 va.va_mask = AT_MODE; 14527 error = VOP_GETATTR(vp, &va, 0, cr, NULL); 14528 if (error != 0) { 14529 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14530 "getattr error %d", error)); 14531 return (0); /* treat errors conservatively */ 14532 } 14533 if (MANDLOCK(vp, va.va_mode)) { 14534 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14535 "cannot mandatory lock and mmap a file")); 14536 return (0); 14537 } 14538 14539 return (1); 14540 } 14541 14542 14543 /* 14544 * Register the lock locally within Solaris. 14545 * As the client, we "or" the sysid with LM_SYSID_CLIENT when 14546 * recording locks locally. 14547 * 14548 * This should handle conflicts/cooperation with NFS v2/v3 since all locks 14549 * are registered locally. 
14550 */ 14551 void 14552 nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag, 14553 u_offset_t offset) 14554 { 14555 int oldsysid; 14556 int error; 14557 #ifdef DEBUG 14558 char *name; 14559 #endif 14560 14561 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14562 14563 #ifdef DEBUG 14564 name = fn_name(VTOSV(vp)->sv_name); 14565 NFS4_DEBUG(nfs4_client_lock_debug, 14566 (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, " 14567 "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d", 14568 name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid, 14569 flk->l_sysid)); 14570 kmem_free(name, MAXNAMELEN); 14571 #endif 14572 14573 /* register the lock with local locking */ 14574 oldsysid = flk->l_sysid; 14575 flk->l_sysid |= LM_SYSID_CLIENT; 14576 error = reclock(vp, flk, SETFLCK, flag, offset, NULL); 14577 #ifdef DEBUG 14578 if (error != 0) { 14579 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14580 "nfs4_register_lock_locally: could not register with" 14581 " local locking")); 14582 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14583 "error %d, vp 0x%p, pid %d, sysid 0x%x", 14584 error, (void *)vp, flk->l_pid, flk->l_sysid)); 14585 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14586 "type %d off 0x%" PRIx64 " len 0x%" PRIx64, 14587 flk->l_type, flk->l_start, flk->l_len)); 14588 (void) reclock(vp, flk, 0, flag, offset, NULL); 14589 NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT, 14590 "blocked by pid %d sysid 0x%x type %d " 14591 "off 0x%" PRIx64 " len 0x%" PRIx64, 14592 flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start, 14593 flk->l_len)); 14594 } 14595 #endif 14596 flk->l_sysid = oldsysid; 14597 } 14598 14599 /* 14600 * nfs4_lockrelease: 14601 * 14602 * Release any locks on the given vnode that are held by the current 14603 * process. Also removes the lock owner (if one exists) from the rnode's 14604 * list. 
14605 */ 14606 static int 14607 nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 14608 { 14609 flock64_t ld; 14610 int ret, error; 14611 rnode4_t *rp; 14612 nfs4_lock_owner_t *lop; 14613 nfs4_recov_state_t recov_state; 14614 mntinfo4_t *mi; 14615 bool_t possible_orphan = FALSE; 14616 bool_t recovonly; 14617 14618 ASSERT((uintptr_t)vp > KERNELBASE); 14619 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14620 14621 rp = VTOR4(vp); 14622 mi = VTOMI4(vp); 14623 14624 /* 14625 * If we have not locked anything then we can 14626 * just return since we have no work to do. 14627 */ 14628 if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) { 14629 return (0); 14630 } 14631 14632 /* 14633 * We need to comprehend that another thread may 14634 * kick off recovery and the lock_owner we have stashed 14635 * in lop might be invalid so we should NOT cache it 14636 * locally! 14637 */ 14638 recov_state.rs_flags = 0; 14639 recov_state.rs_num_retry_despite_err = 0; 14640 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14641 &recovonly); 14642 if (error) { 14643 mutex_enter(&rp->r_statelock); 14644 rp->r_flags |= R4LODANGLERS; 14645 mutex_exit(&rp->r_statelock); 14646 return (error); 14647 } 14648 14649 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14650 14651 /* 14652 * Check if the lock owner might have a lock (request was sent but 14653 * no response was received). Also check if there are any remote 14654 * locks on the file. (In theory we shouldn't have to make this 14655 * second check if there's no lock owner, but for now we'll be 14656 * conservative and do it anyway.) If either condition is true, 14657 * send an unlock for the entire file to the server. 14658 * 14659 * Note that no explicit synchronization is needed here. At worst, 14660 * flk_has_remote_locks() will return a false positive, in which case 14661 * the unlock call wastes time but doesn't harm correctness. 
14662 */ 14663 14664 if (lop) { 14665 mutex_enter(&lop->lo_lock); 14666 possible_orphan = lop->lo_pending_rqsts; 14667 mutex_exit(&lop->lo_lock); 14668 lock_owner_rele(lop); 14669 } 14670 14671 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14672 14673 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14674 "nfs4_lockrelease: possible orphan %d, remote locks %d, for " 14675 "lop %p.", possible_orphan, flk_has_remote_locks(vp), 14676 (void *)lop)); 14677 14678 if (possible_orphan || flk_has_remote_locks(vp)) { 14679 ld.l_type = F_UNLCK; /* set to unlock entire file */ 14680 ld.l_whence = 0; /* unlock from start of file */ 14681 ld.l_start = 0; 14682 ld.l_len = 0; /* do entire file */ 14683 14684 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, 14685 cr, NULL); 14686 14687 if (ret != 0) { 14688 /* 14689 * If VOP_FRLOCK fails, make sure we unregister 14690 * local locks before we continue. 14691 */ 14692 ld.l_pid = ttoproc(curthread)->p_pid; 14693 nfs4_register_lock_locally(vp, &ld, flag, offset); 14694 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14695 "nfs4_lockrelease: lock release error on vp" 14696 " %p: error %d.\n", (void *)vp, ret)); 14697 } 14698 } 14699 14700 recov_state.rs_flags = 0; 14701 recov_state.rs_num_retry_despite_err = 0; 14702 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14703 &recovonly); 14704 if (error) { 14705 mutex_enter(&rp->r_statelock); 14706 rp->r_flags |= R4LODANGLERS; 14707 mutex_exit(&rp->r_statelock); 14708 return (error); 14709 } 14710 14711 /* 14712 * So, here we're going to need to retrieve the lock-owner 14713 * again (in case recovery has done a switch-a-roo) and 14714 * remove it because we can. 
14715 */ 14716 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14717 14718 if (lop) { 14719 nfs4_rnode_remove_lock_owner(rp, lop); 14720 lock_owner_rele(lop); 14721 } 14722 14723 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14724 return (0); 14725 } 14726 14727 /* 14728 * Wait for 'tick_delay' clock ticks. 14729 * Implement exponential backoff until hit the lease_time of this nfs4_server. 14730 * NOTE: lock_lease_time is in seconds. 14731 * 14732 * XXX For future improvements, should implement a waiting queue scheme. 14733 */ 14734 static int 14735 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp) 14736 { 14737 long milliseconds_delay; 14738 time_t lock_lease_time; 14739 14740 /* wait tick_delay clock ticks or siginteruptus */ 14741 if (delay_sig(*tick_delay)) { 14742 return (EINTR); 14743 } 14744 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: " 14745 "reissue the lock request: blocked for %ld clock ticks: %ld " 14746 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000)); 14747 14748 /* get the lease time */ 14749 lock_lease_time = r2lease_time(rp); 14750 14751 /* drv_hztousec converts ticks to microseconds */ 14752 milliseconds_delay = drv_hztousec(*tick_delay) / 1000; 14753 if (milliseconds_delay < lock_lease_time * 1000) { 14754 *tick_delay = 2 * *tick_delay; 14755 if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000) 14756 *tick_delay = drv_usectohz(lock_lease_time*1000*1000); 14757 } 14758 return (0); 14759 } 14760 14761 14762 void 14763 nfs4_vnops_init(void) 14764 { 14765 } 14766 14767 void 14768 nfs4_vnops_fini(void) 14769 { 14770 } 14771 14772 /* 14773 * Return a reference to the directory (parent) vnode for a given vnode, 14774 * using the saved pathname information and the directory file handle. The 14775 * caller is responsible for disposing of the reference. 14776 * Returns zero or an errno value. 
14777 * 14778 * Caller should set need_start_op to FALSE if it is the recovery 14779 * thread, or if a start_fop has already been done. Otherwise, TRUE. 14780 */ 14781 int 14782 vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op) 14783 { 14784 svnode_t *svnp; 14785 vnode_t *dvp = NULL; 14786 servinfo4_t *svp; 14787 nfs4_fname_t *mfname; 14788 int error; 14789 14790 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14791 14792 if (vp->v_flag & VROOT) { 14793 nfs4_sharedfh_t *sfh; 14794 nfs_fh4 fh; 14795 mntinfo4_t *mi; 14796 14797 ASSERT(vp->v_type == VREG); 14798 14799 mi = VTOMI4(vp); 14800 svp = mi->mi_curr_serv; 14801 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14802 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len; 14803 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf; 14804 sfh = sfh4_get(&fh, VTOMI4(vp)); 14805 nfs_rw_exit(&svp->sv_lock); 14806 mfname = mi->mi_fname; 14807 fn_hold(mfname); 14808 dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0); 14809 sfh4_rele(&sfh); 14810 14811 if (dvp->v_type == VNON) 14812 dvp->v_type = VDIR; 14813 *dvpp = dvp; 14814 return (0); 14815 } 14816 14817 svnp = VTOSV(vp); 14818 14819 if (svnp == NULL) { 14820 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14821 "shadow node is NULL")); 14822 return (EINVAL); 14823 } 14824 14825 if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) { 14826 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14827 "shadow node name or dfh val == NULL")); 14828 return (EINVAL); 14829 } 14830 14831 error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp, 14832 (int)need_start_op); 14833 if (error != 0) { 14834 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14835 "nfs4_make_dotdot returned %d", error)); 14836 return (error); 14837 } 14838 if (!dvp) { 14839 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14840 "nfs4_make_dotdot returned a NULL dvp")); 14841 return (EIO); 14842 } 14843 if (dvp->v_type == VNON) 14844 dvp->v_type = VDIR; 14845 
ASSERT(dvp->v_type == VDIR); 14846 if (VTOR4(vp)->r_flags & R4ISXATTR) { 14847 mutex_enter(&dvp->v_lock); 14848 dvp->v_flag |= V_XATTRDIR; 14849 mutex_exit(&dvp->v_lock); 14850 } 14851 *dvpp = dvp; 14852 return (0); 14853 } 14854 14855 /* 14856 * Copy the (final) component name of vp to fnamep. maxlen is the maximum 14857 * length that fnamep can accept, including the trailing null. 14858 * Returns 0 if okay, returns an errno value if there was a problem. 14859 */ 14860 14861 int 14862 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen) 14863 { 14864 char *fn; 14865 int err = 0; 14866 servinfo4_t *svp; 14867 svnode_t *shvp; 14868 14869 /* 14870 * If the file being opened has VROOT set, then this is 14871 * a "file" mount. sv_name will not be interesting, so 14872 * go back to the servinfo4 to get the original mount 14873 * path and strip off all but the final edge. Otherwise 14874 * just return the name from the shadow vnode. 14875 */ 14876 14877 if (vp->v_flag & VROOT) { 14878 14879 svp = VTOMI4(vp)->mi_curr_serv; 14880 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14881 14882 fn = strrchr(svp->sv_path, '/'); 14883 if (fn == NULL) 14884 err = EINVAL; 14885 else 14886 fn++; 14887 } else { 14888 shvp = VTOSV(vp); 14889 fn = fn_name(shvp->sv_name); 14890 } 14891 14892 if (err == 0) 14893 if (strlen(fn) < maxlen) 14894 (void) strcpy(fnamep, fn); 14895 else 14896 err = ENAMETOOLONG; 14897 14898 if (vp->v_flag & VROOT) 14899 nfs_rw_exit(&svp->sv_lock); 14900 else 14901 kmem_free(fn, MAXNAMELEN); 14902 14903 return (err); 14904 } 14905 14906 /* 14907 * Bookkeeping for a close that doesn't need to go over the wire. 14908 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise 14909 * it is left at 1. 
14910 */ 14911 void 14912 nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp) 14913 { 14914 rnode4_t *rp; 14915 mntinfo4_t *mi; 14916 14917 mi = VTOMI4(vp); 14918 rp = VTOR4(vp); 14919 14920 NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: " 14921 "rp=%p osp=%p", (void *)rp, (void *)osp)); 14922 ASSERT(nfs_zone() == mi->mi_zone); 14923 ASSERT(mutex_owned(&osp->os_sync_lock)); 14924 ASSERT(*have_lockp); 14925 14926 if (!osp->os_valid || 14927 osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) { 14928 return; 14929 } 14930 14931 /* 14932 * This removes the reference obtained at OPEN; ie, 14933 * when the open stream structure was created. 14934 * 14935 * We don't have to worry about calling 'open_stream_rele' 14936 * since we our currently holding a reference to this 14937 * open stream which means the count can not go to 0 with 14938 * this decrement. 14939 */ 14940 ASSERT(osp->os_ref_count >= 2); 14941 osp->os_ref_count--; 14942 osp->os_valid = 0; 14943 mutex_exit(&osp->os_sync_lock); 14944 *have_lockp = 0; 14945 14946 nfs4_dec_state_ref_count(mi); 14947 } 14948 14949 /* 14950 * Close all remaining open streams on the rnode. These open streams 14951 * could be here because: 14952 * - The close attempted at either close or delmap failed 14953 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE 14954 * - Someone did mknod on a regular file but never opened it 14955 */ 14956 int 14957 nfs4close_all(vnode_t *vp, cred_t *cr) 14958 { 14959 nfs4_open_stream_t *osp; 14960 int error; 14961 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 14962 rnode4_t *rp; 14963 14964 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14965 14966 error = 0; 14967 rp = VTOR4(vp); 14968 14969 /* 14970 * At this point, all we know is that the last time 14971 * someone called vn_rele, the count was 1. Since then, 14972 * the vnode could have been re-activated. 
We want to 14973 * loop through the open streams and close each one, but 14974 * we have to be careful since once we release the rnode 14975 * hash bucket lock, someone else is free to come in and 14976 * re-activate the rnode and add new open streams. The 14977 * strategy is take the rnode hash bucket lock, verify that 14978 * the count is still 1, grab the open stream off the 14979 * head of the list and mark it invalid, then release the 14980 * rnode hash bucket lock and proceed with that open stream. 14981 * This is ok because nfs4close_one() will acquire the proper 14982 * open/create to close/destroy synchronization for open 14983 * streams, and will ensure that if someone has reopened 14984 * the open stream after we've dropped the hash bucket lock 14985 * then we'll just simply return without destroying the 14986 * open stream. 14987 * Repeat until the list is empty. 14988 */ 14989 14990 for (;;) { 14991 14992 /* make sure vnode hasn't been reactivated */ 14993 rw_enter(&rp->r_hashq->r_lock, RW_READER); 14994 mutex_enter(&vp->v_lock); 14995 if (vp->v_count > 1) { 14996 mutex_exit(&vp->v_lock); 14997 rw_exit(&rp->r_hashq->r_lock); 14998 break; 14999 } 15000 /* 15001 * Grabbing r_os_lock before releasing v_lock prevents 15002 * a window where the rnode/open stream could get 15003 * reactivated (and os_force_close set to 0) before we 15004 * had a chance to set os_force_close to 1. 
15005 */ 15006 mutex_enter(&rp->r_os_lock); 15007 mutex_exit(&vp->v_lock); 15008 15009 osp = list_head(&rp->r_open_streams); 15010 if (!osp) { 15011 /* nothing left to CLOSE OTW, so return */ 15012 mutex_exit(&rp->r_os_lock); 15013 rw_exit(&rp->r_hashq->r_lock); 15014 break; 15015 } 15016 15017 mutex_enter(&rp->r_statev4_lock); 15018 /* the file can't still be mem mapped */ 15019 ASSERT(rp->r_mapcnt == 0); 15020 if (rp->created_v4) 15021 rp->created_v4 = 0; 15022 mutex_exit(&rp->r_statev4_lock); 15023 15024 /* 15025 * Grab a ref on this open stream; nfs4close_one 15026 * will mark it as invalid 15027 */ 15028 mutex_enter(&osp->os_sync_lock); 15029 osp->os_ref_count++; 15030 osp->os_force_close = 1; 15031 mutex_exit(&osp->os_sync_lock); 15032 mutex_exit(&rp->r_os_lock); 15033 rw_exit(&rp->r_hashq->r_lock); 15034 15035 nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0); 15036 15037 /* Update error if it isn't already non-zero */ 15038 if (error == 0) { 15039 if (e.error) 15040 error = e.error; 15041 else if (e.stat) 15042 error = geterrno4(e.stat); 15043 } 15044 15045 #ifdef DEBUG 15046 nfs4close_all_cnt++; 15047 #endif 15048 /* Release the ref on osp acquired above. */ 15049 open_stream_rele(osp, rp); 15050 15051 /* Proceed to the next open stream, if any */ 15052 } 15053 return (error); 15054 } 15055 15056 /* 15057 * nfs4close_one - close one open stream for a file if needed. 15058 * 15059 * "close_type" indicates which close path this is: 15060 * CLOSE_NORM: close initiated via VOP_CLOSE. 15061 * CLOSE_DELMAP: close initiated via VOP_DELMAP. 15062 * CLOSE_FORCE: close initiated via VOP_INACTIVE. This path forces 15063 * the close and release of client state for this open stream 15064 * (unless someone else has the open stream open). 15065 * CLOSE_RESEND: indicates the request is a replay of an earlier request 15066 * (e.g., due to abort because of a signal). 15067 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN. 
15068 * 15069 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client 15070 * recovery. Instead, the caller is expected to deal with retries. 15071 * 15072 * The caller can either pass in the osp ('provided_osp') or not. 15073 * 15074 * 'access_bits' represents the access we are closing/downgrading. 15075 * 15076 * 'len', 'prot', and 'mmap_flags' are used for CLOSE_DELMAP. 'len' is the 15077 * number of bytes we are unmapping, 'maxprot' is the mmap protection, and 15078 * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED). 15079 * 15080 * Errors are returned via the nfs4_error_t. 15081 */ 15082 void 15083 nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr, 15084 int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep, 15085 nfs4_close_type_t close_type, size_t len, uint_t maxprot, 15086 uint_t mmap_flags) 15087 { 15088 nfs4_open_owner_t *oop; 15089 nfs4_open_stream_t *osp = NULL; 15090 int retry = 0; 15091 int num_retries = NFS4_NUM_RECOV_RETRIES; 15092 rnode4_t *rp; 15093 mntinfo4_t *mi; 15094 nfs4_recov_state_t recov_state; 15095 cred_t *cred_otw = NULL; 15096 bool_t recovonly = FALSE; 15097 int isrecov; 15098 int force_close; 15099 int close_failed = 0; 15100 int did_dec_count = 0; 15101 int did_start_op = 0; 15102 int did_force_recovlock = 0; 15103 int did_start_seqid_sync = 0; 15104 int have_sync_lock = 0; 15105 15106 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 15107 15108 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, " 15109 "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x", 15110 (void *)vp, (void *)provided_osp, (void *)lrp, close_type, 15111 len, maxprot, mmap_flags, access_bits)); 15112 15113 nfs4_error_zinit(ep); 15114 rp = VTOR4(vp); 15115 mi = VTOMI4(vp); 15116 isrecov = (close_type == CLOSE_RESEND || 15117 close_type == CLOSE_AFTER_RESEND); 15118 15119 /* 15120 * First get the open owner. 
15121 */ 15122 if (!provided_osp) { 15123 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 15124 } else { 15125 oop = provided_osp->os_open_owner; 15126 ASSERT(oop != NULL); 15127 open_owner_hold(oop); 15128 } 15129 15130 if (!oop) { 15131 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15132 "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, " 15133 "close type %d", (void *)rp, (void *)mi, (void *)cr, 15134 (void *)provided_osp, close_type)); 15135 ep->error = EIO; 15136 goto out; 15137 } 15138 15139 cred_otw = nfs4_get_otw_cred(cr, mi, oop); 15140 recov_retry: 15141 osp = NULL; 15142 close_failed = 0; 15143 force_close = (close_type == CLOSE_FORCE); 15144 retry = 0; 15145 did_start_op = 0; 15146 did_force_recovlock = 0; 15147 did_start_seqid_sync = 0; 15148 have_sync_lock = 0; 15149 recovonly = FALSE; 15150 recov_state.rs_flags = 0; 15151 recov_state.rs_num_retry_despite_err = 0; 15152 15153 /* 15154 * Second synchronize with recovery. 15155 */ 15156 if (!isrecov) { 15157 ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE, 15158 &recov_state, &recovonly); 15159 if (!ep->error) { 15160 did_start_op = 1; 15161 } else { 15162 close_failed = 1; 15163 /* 15164 * If we couldn't get start_fop, but have to 15165 * cleanup state, then at least acquire the 15166 * mi_recovlock so we can synchronize with 15167 * recovery. 15168 */ 15169 if (close_type == CLOSE_FORCE) { 15170 (void) nfs_rw_enter_sig(&mi->mi_recovlock, 15171 RW_READER, FALSE); 15172 did_force_recovlock = 1; 15173 } else 15174 goto out; 15175 } 15176 } 15177 15178 /* 15179 * We cannot attempt to get the open seqid sync if nfs4_start_fop 15180 * set 'recovonly' to TRUE since most likely this is due to 15181 * reovery being active (MI4_RECOV_ACTIV). If recovery is active, 15182 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us 15183 * to retry, causing us to loop until recovery finishes. 
Plus we 15184 * don't need protection over the open seqid since we're not going 15185 * OTW, hence don't need to use the seqid. 15186 */ 15187 if (recovonly == FALSE) { 15188 /* need to grab the open owner sync before 'os_sync_lock' */ 15189 ep->error = nfs4_start_open_seqid_sync(oop, mi); 15190 if (ep->error == EAGAIN) { 15191 ASSERT(!isrecov); 15192 if (did_start_op) 15193 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15194 &recov_state, TRUE); 15195 if (did_force_recovlock) 15196 nfs_rw_exit(&mi->mi_recovlock); 15197 goto recov_retry; 15198 } 15199 did_start_seqid_sync = 1; 15200 } 15201 15202 /* 15203 * Third get an open stream and acquire 'os_sync_lock' to 15204 * sychronize the opening/creating of an open stream with the 15205 * closing/destroying of an open stream. 15206 */ 15207 if (!provided_osp) { 15208 /* returns with 'os_sync_lock' held */ 15209 osp = find_open_stream(oop, rp); 15210 if (!osp) { 15211 ep->error = EIO; 15212 goto out; 15213 } 15214 } else { 15215 osp = provided_osp; 15216 open_stream_hold(osp); 15217 mutex_enter(&osp->os_sync_lock); 15218 } 15219 have_sync_lock = 1; 15220 15221 ASSERT(oop == osp->os_open_owner); 15222 15223 /* 15224 * Fourth, do any special pre-OTW CLOSE processing 15225 * based on the specific close type. 15226 */ 15227 if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) && 15228 !did_dec_count) { 15229 ASSERT(osp->os_open_ref_count > 0); 15230 osp->os_open_ref_count--; 15231 did_dec_count = 1; 15232 if (osp->os_open_ref_count == 0) 15233 osp->os_final_close = 1; 15234 } 15235 15236 if (close_type == CLOSE_FORCE) { 15237 /* see if somebody reopened the open stream. 
*/ 15238 if (!osp->os_force_close) { 15239 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, 15240 "nfs4close_one: skip CLOSE_FORCE as osp %p " 15241 "was reopened, vp %p", (void *)osp, (void *)vp)); 15242 ep->error = 0; 15243 ep->stat = NFS4_OK; 15244 goto out; 15245 } 15246 15247 if (!osp->os_final_close && !did_dec_count) { 15248 osp->os_open_ref_count--; 15249 did_dec_count = 1; 15250 } 15251 15252 /* 15253 * We can't depend on os_open_ref_count being 0 due to the 15254 * way executables are opened (VN_RELE to match a VOP_OPEN). 15255 */ 15256 #ifdef NOTYET 15257 ASSERT(osp->os_open_ref_count == 0); 15258 #endif 15259 if (osp->os_open_ref_count != 0) { 15260 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, 15261 "nfs4close_one: should panic here on an " 15262 "ASSERT(osp->os_open_ref_count == 0). Ignoring " 15263 "since this is probably the exec problem.")); 15264 15265 osp->os_open_ref_count = 0; 15266 } 15267 15268 /* 15269 * There is the possibility that nfs4close_one() 15270 * for close_type == CLOSE_DELMAP couldn't find the 15271 * open stream, thus couldn't decrement its os_mapcnt; 15272 * therefore we can't use this ASSERT yet. 
15273 */ 15274 #ifdef NOTYET 15275 ASSERT(osp->os_mapcnt == 0); 15276 #endif 15277 osp->os_mapcnt = 0; 15278 } 15279 15280 if (close_type == CLOSE_DELMAP && !did_dec_count) { 15281 ASSERT(osp->os_mapcnt >= btopr(len)); 15282 15283 if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 15284 osp->os_mmap_write -= btopr(len); 15285 if (maxprot & PROT_READ) 15286 osp->os_mmap_read -= btopr(len); 15287 if (maxprot & PROT_EXEC) 15288 osp->os_mmap_read -= btopr(len); 15289 /* mirror the PROT_NONE check in nfs4_addmap() */ 15290 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 15291 !(maxprot & PROT_EXEC)) 15292 osp->os_mmap_read -= btopr(len); 15293 osp->os_mapcnt -= btopr(len); 15294 did_dec_count = 1; 15295 } 15296 15297 if (recovonly) { 15298 nfs4_lost_rqst_t lost_rqst; 15299 15300 /* request should not already be in recovery queue */ 15301 ASSERT(lrp == NULL); 15302 nfs4_error_init(ep, EINTR); 15303 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop, 15304 osp, cred_otw, vp); 15305 mutex_exit(&osp->os_sync_lock); 15306 have_sync_lock = 0; 15307 (void) nfs4_start_recovery(ep, mi, vp, NULL, NULL, 15308 lost_rqst.lr_op == OP_CLOSE ? 15309 &lost_rqst : NULL, OP_CLOSE, NULL, NULL, NULL); 15310 close_failed = 1; 15311 force_close = 0; 15312 goto close_cleanup; 15313 } 15314 15315 /* 15316 * If a previous OTW call got NFS4ERR_BAD_SEQID, then 15317 * we stopped operating on the open owner's <old oo_name, old seqid> 15318 * space, which means we stopped operating on the open stream 15319 * too. So don't go OTW (as the seqid is likely bad, and the 15320 * stateid could be stale, potentially triggering a false 15321 * setclientid), and just clean up the client's internal state. 
15322 */ 15323 if (osp->os_orig_oo_name != oop->oo_name) { 15324 NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug, 15325 (CE_NOTE, "nfs4close_one: skip OTW close for osp %p " 15326 "oop %p due to bad seqid (orig oo_name %" PRIx64 " current " 15327 "oo_name %" PRIx64")", 15328 (void *)osp, (void *)oop, osp->os_orig_oo_name, 15329 oop->oo_name)); 15330 close_failed = 1; 15331 } 15332 15333 /* If the file failed recovery, just quit. */ 15334 mutex_enter(&rp->r_statelock); 15335 if (rp->r_flags & R4RECOVERR) { 15336 close_failed = 1; 15337 } 15338 mutex_exit(&rp->r_statelock); 15339 15340 /* 15341 * If the force close path failed to obtain start_fop 15342 * then skip the OTW close and just remove the state. 15343 */ 15344 if (close_failed) 15345 goto close_cleanup; 15346 15347 /* 15348 * Fifth, check to see if there are still mapped pages or other 15349 * opens using this open stream. If there are then we can't 15350 * close yet but we can see if an OPEN_DOWNGRADE is necessary. 15351 */ 15352 if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) { 15353 nfs4_lost_rqst_t new_lost_rqst; 15354 bool_t needrecov = FALSE; 15355 cred_t *odg_cred_otw = NULL; 15356 seqid4 open_dg_seqid = 0; 15357 15358 if (osp->os_delegation) { 15359 /* 15360 * If this open stream was never OPENed OTW then we 15361 * surely can't DOWNGRADE it (especially since the 15362 * osp->open_stateid is really a delegation stateid 15363 * when os_delegation is 1). 
15364 */ 15365 if (access_bits & FREAD) 15366 osp->os_share_acc_read--; 15367 if (access_bits & FWRITE) 15368 osp->os_share_acc_write--; 15369 osp->os_share_deny_none--; 15370 nfs4_error_zinit(ep); 15371 goto out; 15372 } 15373 nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr, 15374 lrp, ep, &odg_cred_otw, &open_dg_seqid); 15375 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp); 15376 if (needrecov && !isrecov) { 15377 bool_t abort; 15378 nfs4_bseqid_entry_t *bsep = NULL; 15379 15380 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) 15381 bsep = nfs4_create_bseqid_entry(oop, NULL, 15382 vp, 0, 15383 lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG, 15384 open_dg_seqid); 15385 15386 nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst, 15387 oop, osp, odg_cred_otw, vp, access_bits, 0); 15388 mutex_exit(&osp->os_sync_lock); 15389 have_sync_lock = 0; 15390 abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL, 15391 new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ? 15392 &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE, 15393 bsep, NULL, NULL); 15394 if (odg_cred_otw) 15395 crfree(odg_cred_otw); 15396 if (bsep) 15397 kmem_free(bsep, sizeof (*bsep)); 15398 15399 if (abort == TRUE) 15400 goto out; 15401 15402 if (did_start_seqid_sync) { 15403 nfs4_end_open_seqid_sync(oop); 15404 did_start_seqid_sync = 0; 15405 } 15406 open_stream_rele(osp, rp); 15407 15408 if (did_start_op) 15409 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15410 &recov_state, FALSE); 15411 if (did_force_recovlock) 15412 nfs_rw_exit(&mi->mi_recovlock); 15413 15414 goto recov_retry; 15415 } else { 15416 if (odg_cred_otw) 15417 crfree(odg_cred_otw); 15418 } 15419 goto out; 15420 } 15421 15422 /* 15423 * If this open stream was created as the results of an open 15424 * while holding a delegation, then just release it; no need 15425 * to do an OTW close. Otherwise do a "normal" OTW close. 
15426 */ 15427 if (osp->os_delegation) { 15428 nfs4close_notw(vp, osp, &have_sync_lock); 15429 nfs4_error_zinit(ep); 15430 goto out; 15431 } 15432 15433 /* 15434 * If this stream is not valid, we're done. 15435 */ 15436 if (!osp->os_valid) { 15437 nfs4_error_zinit(ep); 15438 goto out; 15439 } 15440 15441 /* 15442 * Last open or mmap ref has vanished, need to do an OTW close. 15443 * First check to see if a close is still necessary. 15444 */ 15445 if (osp->os_failed_reopen) { 15446 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15447 "don't close OTW osp %p since reopen failed.", 15448 (void *)osp)); 15449 /* 15450 * Reopen of the open stream failed, hence the 15451 * stateid of the open stream is invalid/stale, and 15452 * sending this OTW would incorrectly cause another 15453 * round of recovery. In this case, we need to set 15454 * the 'os_valid' bit to 0 so another thread doesn't 15455 * come in and re-open this open stream before 15456 * this "closing" thread cleans up state (decrementing 15457 * the nfs4_server_t's state_ref_count and decrementing 15458 * the os_ref_count). 15459 */ 15460 osp->os_valid = 0; 15461 /* 15462 * This removes the reference obtained at OPEN; ie, 15463 * when the open stream structure was created. 15464 * 15465 * We don't have to worry about calling 'open_stream_rele' 15466 * since we our currently holding a reference to this 15467 * open stream which means the count can not go to 0 with 15468 * this decrement. 15469 */ 15470 ASSERT(osp->os_ref_count >= 2); 15471 osp->os_ref_count--; 15472 nfs4_error_zinit(ep); 15473 close_failed = 0; 15474 goto close_cleanup; 15475 } 15476 15477 ASSERT(osp->os_ref_count > 1); 15478 15479 /* 15480 * Sixth, try the CLOSE OTW. 
15481 */ 15482 nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync, 15483 close_type, ep, &have_sync_lock); 15484 15485 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) { 15486 /* 15487 * Let the recovery thread be responsible for 15488 * removing the state for CLOSE. 15489 */ 15490 close_failed = 1; 15491 force_close = 0; 15492 retry = 0; 15493 } 15494 15495 /* See if we need to retry with a different cred */ 15496 if ((ep->error == EACCES || 15497 (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) && 15498 cred_otw != cr) { 15499 crfree(cred_otw); 15500 cred_otw = cr; 15501 crhold(cred_otw); 15502 retry = 1; 15503 } 15504 15505 if (ep->error || ep->stat) 15506 close_failed = 1; 15507 15508 if (retry && !isrecov && num_retries-- > 0) { 15509 if (have_sync_lock) { 15510 mutex_exit(&osp->os_sync_lock); 15511 have_sync_lock = 0; 15512 } 15513 if (did_start_seqid_sync) { 15514 nfs4_end_open_seqid_sync(oop); 15515 did_start_seqid_sync = 0; 15516 } 15517 open_stream_rele(osp, rp); 15518 15519 if (did_start_op) 15520 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15521 &recov_state, FALSE); 15522 if (did_force_recovlock) 15523 nfs_rw_exit(&mi->mi_recovlock); 15524 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15525 "nfs4close_one: need to retry the close " 15526 "operation")); 15527 goto recov_retry; 15528 } 15529 close_cleanup: 15530 /* 15531 * Seventh and lastly, process our results. 15532 */ 15533 if (close_failed && force_close) { 15534 /* 15535 * It's ok to drop and regrab the 'os_sync_lock' since 15536 * nfs4close_notw() will recheck to make sure the 15537 * "close"/removal of state should happen. 15538 */ 15539 if (!have_sync_lock) { 15540 mutex_enter(&osp->os_sync_lock); 15541 have_sync_lock = 1; 15542 } 15543 /* 15544 * This is last call, remove the ref on the open 15545 * stream created by open and clean everything up. 
15546 */ 15547 osp->os_pending_close = 0; 15548 nfs4close_notw(vp, osp, &have_sync_lock); 15549 nfs4_error_zinit(ep); 15550 } 15551 15552 if (!close_failed) { 15553 if (have_sync_lock) { 15554 osp->os_pending_close = 0; 15555 mutex_exit(&osp->os_sync_lock); 15556 have_sync_lock = 0; 15557 } else { 15558 mutex_enter(&osp->os_sync_lock); 15559 osp->os_pending_close = 0; 15560 mutex_exit(&osp->os_sync_lock); 15561 } 15562 if (did_start_op && recov_state.rs_sp != NULL) { 15563 mutex_enter(&recov_state.rs_sp->s_lock); 15564 nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi); 15565 mutex_exit(&recov_state.rs_sp->s_lock); 15566 } else { 15567 nfs4_dec_state_ref_count(mi); 15568 } 15569 nfs4_error_zinit(ep); 15570 } 15571 15572 out: 15573 if (have_sync_lock) 15574 mutex_exit(&osp->os_sync_lock); 15575 if (did_start_op) 15576 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state, 15577 recovonly ? TRUE : FALSE); 15578 if (did_force_recovlock) 15579 nfs_rw_exit(&mi->mi_recovlock); 15580 if (cred_otw) 15581 crfree(cred_otw); 15582 if (osp) 15583 open_stream_rele(osp, rp); 15584 if (oop) { 15585 if (did_start_seqid_sync) 15586 nfs4_end_open_seqid_sync(oop); 15587 open_owner_rele(oop); 15588 } 15589 } 15590 15591 /* 15592 * Convert information returned by the server in the LOCK4denied 15593 * structure to the form required by fcntl. 15594 */ 15595 static void 15596 denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args) 15597 { 15598 nfs4_lo_name_t *lo; 15599 15600 #ifdef DEBUG 15601 if (denied_to_flk_debug) { 15602 lockt_denied_debug = lockt_denied; 15603 debug_enter("lockt_denied"); 15604 } 15605 #endif 15606 15607 flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK; 15608 flk->l_whence = 0; /* aka SEEK_SET */ 15609 flk->l_start = lockt_denied->offset; 15610 flk->l_len = lockt_denied->length; 15611 15612 /* 15613 * If the blocking clientid matches our client id, then we can 15614 * interpret the lockowner (since we built it). 
If not, then 15615 * fabricate a sysid and pid. Note that the l_sysid field 15616 * in *flk already has the local sysid. 15617 */ 15618 15619 if (lockt_denied->owner.clientid == lockt_args->owner.clientid) { 15620 15621 if (lockt_denied->owner.owner_len == sizeof (*lo)) { 15622 lo = (nfs4_lo_name_t *) 15623 lockt_denied->owner.owner_val; 15624 15625 flk->l_pid = lo->ln_pid; 15626 } else { 15627 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 15628 "denied_to_flk: bad lock owner length\n")); 15629 15630 flk->l_pid = lo_to_pid(&lockt_denied->owner); 15631 } 15632 } else { 15633 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 15634 "denied_to_flk: foreign clientid\n")); 15635 15636 /* 15637 * Construct a new sysid which should be different from 15638 * sysids of other systems. 15639 */ 15640 15641 flk->l_sysid++; 15642 flk->l_pid = lo_to_pid(&lockt_denied->owner); 15643 } 15644 } 15645 15646 static pid_t 15647 lo_to_pid(lock_owner4 *lop) 15648 { 15649 pid_t pid = 0; 15650 uchar_t *cp; 15651 int i; 15652 15653 cp = (uchar_t *)&lop->clientid; 15654 15655 for (i = 0; i < sizeof (lop->clientid); i++) 15656 pid += (pid_t)*cp++; 15657 15658 cp = (uchar_t *)lop->owner_val; 15659 15660 for (i = 0; i < lop->owner_len; i++) 15661 pid += (pid_t)*cp++; 15662 15663 return (pid); 15664 } 15665 15666 /* 15667 * Given a lock pointer, returns the length of that lock. 15668 * "end" is the last locked offset the "l_len" covers from 15669 * the start of the lock. 15670 */ 15671 static off64_t 15672 lock_to_end(flock64_t *lock) 15673 { 15674 off64_t lock_end; 15675 15676 if (lock->l_len == 0) 15677 lock_end = (off64_t)MAXEND; 15678 else 15679 lock_end = lock->l_start + lock->l_len - 1; 15680 15681 return (lock_end); 15682 } 15683 15684 /* 15685 * Given the end of a lock, it will return you the length "l_len" for that lock. 
15686 */ 15687 static off64_t 15688 end_to_len(off64_t start, off64_t end) 15689 { 15690 off64_t lock_len; 15691 15692 ASSERT(end >= start); 15693 if (end == MAXEND) 15694 lock_len = 0; 15695 else 15696 lock_len = end - start + 1; 15697 15698 return (lock_len); 15699 } 15700 15701 /* 15702 * On given end for a lock it determines if it is the last locked offset 15703 * or not, if so keeps it as is, else adds one to return the length for 15704 * valid start. 15705 */ 15706 static off64_t 15707 start_check(off64_t x) 15708 { 15709 if (x == MAXEND) 15710 return (x); 15711 else 15712 return (x + 1); 15713 } 15714 15715 /* 15716 * See if these two locks overlap, and if so return 1; 15717 * otherwise, return 0. 15718 */ 15719 static int 15720 locks_intersect(flock64_t *llfp, flock64_t *curfp) 15721 { 15722 off64_t llfp_end, curfp_end; 15723 15724 llfp_end = lock_to_end(llfp); 15725 curfp_end = lock_to_end(curfp); 15726 15727 if (((llfp_end >= curfp->l_start) && 15728 (llfp->l_start <= curfp->l_start)) || 15729 ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start))) 15730 return (1); 15731 return (0); 15732 } 15733 15734 /* 15735 * Determine what the intersecting lock region is, and add that to the 15736 * 'nl_llpp' locklist in increasing order (by l_start). 
15737 */ 15738 static void 15739 nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp, 15740 locklist_t **nl_llpp, vnode_t *vp) 15741 { 15742 locklist_t *intersect_llp, *tmp_fllp, *cur_fllp; 15743 off64_t lost_flp_end, local_flp_end, len, start; 15744 15745 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:")); 15746 15747 if (!locks_intersect(lost_flp, local_flp)) 15748 return; 15749 15750 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: " 15751 "locks intersect")); 15752 15753 lost_flp_end = lock_to_end(lost_flp); 15754 local_flp_end = lock_to_end(local_flp); 15755 15756 /* Find the starting point of the intersecting region */ 15757 if (local_flp->l_start > lost_flp->l_start) 15758 start = local_flp->l_start; 15759 else 15760 start = lost_flp->l_start; 15761 15762 /* Find the lenght of the intersecting region */ 15763 if (lost_flp_end < local_flp_end) 15764 len = end_to_len(start, lost_flp_end); 15765 else 15766 len = end_to_len(start, local_flp_end); 15767 15768 /* 15769 * Prepare the flock structure for the intersection found and insert 15770 * it into the new list in increasing l_start order. This list contains 15771 * intersections of locks registered by the client with the local host 15772 * and the lost lock. 15773 * The lock type of this lock is the same as that of the local_flp. 
15774 */ 15775 intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP); 15776 intersect_llp->ll_flock.l_start = start; 15777 intersect_llp->ll_flock.l_len = len; 15778 intersect_llp->ll_flock.l_type = local_flp->l_type; 15779 intersect_llp->ll_flock.l_pid = local_flp->l_pid; 15780 intersect_llp->ll_flock.l_sysid = local_flp->l_sysid; 15781 intersect_llp->ll_flock.l_whence = 0; /* aka SEEK_SET */ 15782 intersect_llp->ll_vp = vp; 15783 15784 tmp_fllp = *nl_llpp; 15785 cur_fllp = NULL; 15786 while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start < 15787 intersect_llp->ll_flock.l_start) { 15788 cur_fllp = tmp_fllp; 15789 tmp_fllp = tmp_fllp->ll_next; 15790 } 15791 if (cur_fllp == NULL) { 15792 /* first on the list */ 15793 intersect_llp->ll_next = *nl_llpp; 15794 *nl_llpp = intersect_llp; 15795 } else { 15796 intersect_llp->ll_next = cur_fllp->ll_next; 15797 cur_fllp->ll_next = intersect_llp; 15798 } 15799 15800 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: " 15801 "created lock region: start %"PRIx64" end %"PRIx64" : %s\n", 15802 intersect_llp->ll_flock.l_start, 15803 intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len, 15804 intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE")); 15805 } 15806 15807 /* 15808 * Our local locking current state is potentially different than 15809 * what the NFSv4 server thinks we have due to a lost lock that was 15810 * resent and then received. We need to reset our "NFSv4" locking 15811 * state to match the current local locking state for this pid since 15812 * that is what the user/application sees as what the world is. 15813 * 15814 * We cannot afford to drop the open/lock seqid sync since then we can 15815 * get confused about what the current local locking state "is" versus 15816 * "was". 15817 * 15818 * If we are unable to fix up the locks, we send SIGLOST to the affected 15819 * process. 
This is not done if the filesystem has been forcibly 15820 * unmounted, in case the process has already exited and a new process 15821 * exists with the same pid. 15822 */ 15823 static void 15824 nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr, 15825 nfs4_lock_owner_t *lop) 15826 { 15827 locklist_t *locks, *llp, *ri_llp, *tmp_llp; 15828 mntinfo4_t *mi = VTOMI4(vp); 15829 const int cmd = F_SETLK; 15830 off64_t cur_start, llp_ll_flock_end, lost_flp_end; 15831 flock64_t ul_fl; 15832 15833 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15834 "nfs4_reinstitute_local_lock_state")); 15835 15836 /* 15837 * Find active locks for this vp from the local locking code. 15838 * Scan through this list and find out the locks that intersect with 15839 * the lost lock. Once we find the lock that intersects, add the 15840 * intersection area as a new lock to a new list "ri_llp". The lock 15841 * type of the intersection region lock added to ri_llp is the same 15842 * as that found in the active lock list, "list". The intersecting 15843 * region locks are added to ri_llp in increasing l_start order. 15844 */ 15845 ASSERT(nfs_zone() == mi->mi_zone); 15846 15847 locks = flk_active_locks_for_vp(vp); 15848 ri_llp = NULL; 15849 15850 for (llp = locks; llp != NULL; llp = llp->ll_next) { 15851 ASSERT(llp->ll_vp == vp); 15852 /* 15853 * Pick locks that belong to this pid/lockowner 15854 */ 15855 if (llp->ll_flock.l_pid != lost_flp->l_pid) 15856 continue; 15857 15858 nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp); 15859 } 15860 15861 /* 15862 * Now we have the list of intersections with the lost lock. These are 15863 * the locks that were/are active before the server replied to the 15864 * last/lost lock. Issue these locks to the server here. Playing these 15865 * locks to the server will re-establish our current local locking state 15866 * with the v4 server. 15867 * If we get an error, send SIGLOST to the application for that lock. 
15868 */ 15869 15870 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) { 15871 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15872 "nfs4_reinstitute_local_lock_state: need to issue " 15873 "flock: [%"PRIx64" - %"PRIx64"] : %s", 15874 llp->ll_flock.l_start, 15875 llp->ll_flock.l_start + llp->ll_flock.l_len, 15876 llp->ll_flock.l_type == F_RDLCK ? "READ" : 15877 llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID")); 15878 /* 15879 * No need to relock what we already have 15880 */ 15881 if (llp->ll_flock.l_type == lost_flp->l_type) 15882 continue; 15883 15884 push_reinstate(vp, cmd, &llp->ll_flock, cr, lop); 15885 } 15886 15887 /* 15888 * Now keeping the start of the lost lock as our reference parse the 15889 * newly created ri_llp locklist to find the ranges that we have locked 15890 * with the v4 server but not in the current local locking. We need 15891 * to unlock these ranges. 15892 * These ranges can also be reffered to as those ranges, where the lost 15893 * lock does not overlap with the locks in the ri_llp but are locked 15894 * since the server replied to the lost lock. 
15895 */ 15896 cur_start = lost_flp->l_start; 15897 lost_flp_end = lock_to_end(lost_flp); 15898 15899 ul_fl.l_type = F_UNLCK; 15900 ul_fl.l_whence = 0; /* aka SEEK_SET */ 15901 ul_fl.l_sysid = lost_flp->l_sysid; 15902 ul_fl.l_pid = lost_flp->l_pid; 15903 15904 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) { 15905 llp_ll_flock_end = lock_to_end(&llp->ll_flock); 15906 15907 if (llp->ll_flock.l_start <= cur_start) { 15908 cur_start = start_check(llp_ll_flock_end); 15909 continue; 15910 } 15911 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15912 "nfs4_reinstitute_local_lock_state: " 15913 "UNLOCK [%"PRIx64" - %"PRIx64"]", 15914 cur_start, llp->ll_flock.l_start)); 15915 15916 ul_fl.l_start = cur_start; 15917 ul_fl.l_len = end_to_len(cur_start, 15918 (llp->ll_flock.l_start - 1)); 15919 15920 push_reinstate(vp, cmd, &ul_fl, cr, lop); 15921 cur_start = start_check(llp_ll_flock_end); 15922 } 15923 15924 /* 15925 * In the case where the lost lock ends after all intersecting locks, 15926 * unlock the last part of the lost lock range. 15927 */ 15928 if (cur_start != start_check(lost_flp_end)) { 15929 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15930 "nfs4_reinstitute_local_lock_state: UNLOCK end of the " 15931 "lost lock region [%"PRIx64" - %"PRIx64"]", 15932 cur_start, lost_flp->l_start + lost_flp->l_len)); 15933 15934 ul_fl.l_start = cur_start; 15935 /* 15936 * Is it an to-EOF lock? 
if so unlock till the end 15937 */ 15938 if (lost_flp->l_len == 0) 15939 ul_fl.l_len = 0; 15940 else 15941 ul_fl.l_len = start_check(lost_flp_end) - cur_start; 15942 15943 push_reinstate(vp, cmd, &ul_fl, cr, lop); 15944 } 15945 15946 if (locks != NULL) 15947 flk_free_locklist(locks); 15948 15949 /* Free up our newly created locklist */ 15950 for (llp = ri_llp; llp != NULL; ) { 15951 tmp_llp = llp->ll_next; 15952 kmem_free(llp, sizeof (locklist_t)); 15953 llp = tmp_llp; 15954 } 15955 15956 /* 15957 * Now return back to the original calling nfs4frlock() 15958 * and let us naturally drop our seqid syncs. 15959 */ 15960 } 15961 15962 /* 15963 * Create a lost state record for the given lock reinstantiation request 15964 * and push it onto the lost state queue. 15965 */ 15966 static void 15967 push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr, 15968 nfs4_lock_owner_t *lop) 15969 { 15970 nfs4_lost_rqst_t req; 15971 nfs_lock_type4 locktype; 15972 nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS }; 15973 15974 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 15975 15976 locktype = flk_to_locktype(cmd, flk->l_type); 15977 nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype, 15978 NULL, NULL, lop, flk, &req, cr, vp); 15979 (void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 15980 (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ? 15981 &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK, 15982 NULL, NULL, NULL); 15983 } 15984