1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 24 */ 25 26 /* 27 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 28 * Use is subject to license terms. 29 */ 30 31 /* 32 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T. 33 * All Rights Reserved 34 */ 35 36 /* 37 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 38 */ 39 40 /* 41 * Copyright (c) 2014, STRATO AG. All rights reserved. 42 */ 43 44 #include <sys/param.h> 45 #include <sys/types.h> 46 #include <sys/systm.h> 47 #include <sys/cred.h> 48 #include <sys/time.h> 49 #include <sys/vnode.h> 50 #include <sys/vfs.h> 51 #include <sys/vfs_opreg.h> 52 #include <sys/file.h> 53 #include <sys/filio.h> 54 #include <sys/uio.h> 55 #include <sys/buf.h> 56 #include <sys/mman.h> 57 #include <sys/pathname.h> 58 #include <sys/dirent.h> 59 #include <sys/debug.h> 60 #include <sys/vmsystm.h> 61 #include <sys/fcntl.h> 62 #include <sys/flock.h> 63 #include <sys/swap.h> 64 #include <sys/errno.h> 65 #include <sys/strsubr.h> 66 #include <sys/sysmacros.h> 67 #include <sys/kmem.h> 68 #include <sys/cmn_err.h> 69 #include <sys/pathconf.h> 70 #include <sys/utsname.h> 71 #include <sys/dnlc.h> 72 #include <sys/acl.h> 73 #include <sys/systeminfo.h> 74 #include <sys/policy.h> 75 #include <sys/sdt.h> 76 #include <sys/list.h> 77 #include <sys/stat.h> 78 #include <sys/zone.h> 79 80 #include <rpc/types.h> 81 #include <rpc/auth.h> 82 #include <rpc/clnt.h> 83 84 #include <nfs/nfs.h> 85 #include <nfs/nfs_clnt.h> 86 #include <nfs/nfs_acl.h> 87 #include <nfs/lm.h> 88 #include <nfs/nfs4.h> 89 #include <nfs/nfs4_kprot.h> 90 #include <nfs/rnode4.h> 91 #include <nfs/nfs4_clnt.h> 92 93 #include <vm/hat.h> 94 #include <vm/as.h> 95 #include <vm/page.h> 96 #include <vm/pvn.h> 97 #include <vm/seg.h> 98 #include <vm/seg_map.h> 99 #include <vm/seg_kpm.h> 100 #include <vm/seg_vn.h> 101 102 #include <fs/fs_subr.h> 103 104 #include <sys/ddi.h> 105 #include <sys/int_fmtio.h> 106 #include <sys/fs/autofs.h> 107 108 typedef struct { 109 nfs4_ga_res_t *di_garp; 110 cred_t *di_cred; 111 hrtime_t di_time_call; 112 } dirattr_info_t; 113 114 typedef enum nfs4_acl_op { 115 NFS4_ACL_GET, 116 NFS4_ACL_SET 117 } nfs4_acl_op_t; 118 119 static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *mi); 120 static int nfs4frlock_get_sysid(struct lm_sysid **, vnode_t *, flock64_t *); 121 122 static void nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *, 123 char *, dirattr_info_t *); 124 125 static void nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *, 126 nfs4_open_stream_t *, int *, int *, nfs4_close_type_t, 127 nfs4_error_t *, int *); 128 static int nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int, 129 cred_t *); 130 static int nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *, 131 stable_how4 *); 132 static int nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *, 133 cred_t *, bool_t, struct uio *); 134 static int nfs4setattr(vnode_t *, struct vattr *, int, cred_t *, 135 vsecattr_t *); 136 static int nfs4openattr(vnode_t *, vnode_t **, int, cred_t *); 137 static int nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int); 138 static int nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *); 139 static int nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *); 140 static int nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *); 141 static int nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl, 142 int, vnode_t **, cred_t *); 143 static int nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **, 144 cred_t *, int, int, enum createmode4, int); 145 static int nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *, 146 caller_context_t *); 147 static int nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *, 148 vnode_t *, char *, cred_t *, nfsstat4 *); 149 static int nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *, 150 vnode_t *, char *, cred_t *, nfsstat4 *); 151 static int do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 152 static void nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 153 static int nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t); 154 static int nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *, 155 page_t *[], size_t, struct seg *, caddr_t, 156 enum seg_rw, cred_t *); 157 static void nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *, 158 cred_t *); 159 static int nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t, 160 int, cred_t *); 161 static int nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t, 162 int, cred_t *); 163 static int nfs4_commit(vnode_t *, offset4, count4, cred_t *); 164 static void nfs4_set_mod(vnode_t *); 165 static void nfs4_get_commit(vnode_t *); 166 static void nfs4_get_commit_range(vnode_t *, u_offset_t, size_t); 167 static int nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *); 168 static int nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int); 169 static int nfs4_sync_commit(vnode_t *, page_t *, offset3, count3, 170 cred_t *); 171 static void do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3, 172 cred_t *); 173 static int nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *, 174 hrtime_t, vnode_t *, cred_t *); 175 static int nfs4_open_non_reg_file(vnode_t **, int, cred_t *); 176 static int nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *); 177 static int nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *); 178 static int nfs4_block_and_wait(clock_t *); 179 static cred_t *state_to_cred(nfs4_open_stream_t *); 180 static void denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *); 181 static pid_t lo_to_pid(lock_owner4 *); 182 static void nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *, 183 cred_t *, nfs4_lock_owner_t *); 184 static void push_reinstate(vnode_t *, int, flock64_t *, cred_t *, 185 nfs4_lock_owner_t *); 186 static int open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **); 187 static void nfs4_delmap_callback(struct as *, void *, uint_t); 188 static void nfs4_free_delmapcall(nfs4_delmapcall_t *); 189 static nfs4_delmapcall_t *nfs4_init_delmapcall(); 190 static int nfs4_find_and_delete_delmapcall(rnode4_t *, int *); 191 static int nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t); 192 static int nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *, 193 uid_t, gid_t, int); 194 195 /* 196 * Routines that implement the setting of v4 args for the misc. ops 197 */ 198 static void nfs4args_lock_free(nfs_argop4 *); 199 static void nfs4args_lockt_free(nfs_argop4 *); 200 static void nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *, 201 int, rnode4_t *, cred_t *, bitmap4, int *, 202 nfs4_stateid_types_t *); 203 static void nfs4args_setattr_free(nfs_argop4 *); 204 static int nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4, 205 bitmap4); 206 static void nfs4args_verify_free(nfs_argop4 *); 207 static void nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *, 208 WRITE4args **, nfs4_stateid_types_t *); 209 210 /* 211 * These are the vnode ops functions that implement the vnode interface to 212 * the networked file system. See more comments below at nfs4_vnodeops. 213 */ 214 static int nfs4_open(vnode_t **, int, cred_t *, caller_context_t *); 215 static int nfs4_close(vnode_t *, int, int, offset_t, cred_t *, 216 caller_context_t *); 217 static int nfs4_read(vnode_t *, struct uio *, int, cred_t *, 218 caller_context_t *); 219 static int nfs4_write(vnode_t *, struct uio *, int, cred_t *, 220 caller_context_t *); 221 static int nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *, 222 caller_context_t *); 223 static int nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *, 224 caller_context_t *); 225 static int nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *); 226 static int nfs4_readlink(vnode_t *, struct uio *, cred_t *, 227 caller_context_t *); 228 static int nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *); 229 static int nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl, 230 int, vnode_t **, cred_t *, int, caller_context_t *, 231 vsecattr_t *); 232 static int nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *, 233 int); 234 static int nfs4_link(vnode_t *, vnode_t *, char *, cred_t *, 235 caller_context_t *, int); 236 static int nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *, 237 caller_context_t *, int); 238 static int nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **, 239 cred_t *, caller_context_t *, int, vsecattr_t *); 240 static int nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *, 241 caller_context_t *, int); 242 static int nfs4_symlink(vnode_t *, char *, struct vattr *, char *, 243 cred_t *, caller_context_t *, int); 244 static int nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *, 245 caller_context_t *, int); 246 static int nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *); 247 static int nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *, 248 page_t *[], size_t, struct seg *, caddr_t, 249 enum seg_rw, cred_t *, caller_context_t *); 250 static int nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *, 251 caller_context_t *); 252 static int nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t, 253 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); 254 static int nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, 255 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); 256 static int nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *); 257 static int nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t, 258 struct flk_callback *, cred_t *, caller_context_t *); 259 static int nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t, 260 cred_t *, caller_context_t *); 261 static int nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, 262 uint_t, uint_t, uint_t, cred_t *, caller_context_t *); 263 static int nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int, 264 cred_t *, caller_context_t *); 265 static void nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *, 266 caller_context_t *); 267 static int nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 268 caller_context_t *); 269 /* 270 * These vnode ops are required to be called from outside this source file, 271 * e.g. by ephemeral mount stub vnode ops, and so may not be declared 272 * as static. 273 */ 274 int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *, 275 caller_context_t *); 276 void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *); 277 int nfs4_lookup(vnode_t *, char *, vnode_t **, 278 struct pathname *, int, vnode_t *, cred_t *, 279 caller_context_t *, int *, pathname_t *); 280 int nfs4_fid(vnode_t *, fid_t *, caller_context_t *); 281 int nfs4_rwlock(vnode_t *, int, caller_context_t *); 282 void nfs4_rwunlock(vnode_t *, int, caller_context_t *); 283 int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *); 284 int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *, 285 caller_context_t *); 286 int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 287 caller_context_t *); 288 int nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *, 289 caller_context_t *); 290 291 /* 292 * Used for nfs4_commit_vp() to indicate if we should 293 * wait on pending writes. 294 */ 295 #define NFS4_WRITE_NOWAIT 0 296 #define NFS4_WRITE_WAIT 1 297 298 /* 299 * Error flags used to pass information about certain special errors 300 * which need to be handled specially. 301 */ 302 #define NFS_EOF -98 303 #define NFS_VERF_MISMATCH -97 304 305 /* 306 * Flags used to differentiate between which operation drove the 307 * potential CLOSE OTW. (see nfs4_close_otw_if_necessary) 308 */ 309 #define NFS4_CLOSE_OP 0x1 310 #define NFS4_DELMAP_OP 0x2 311 #define NFS4_INACTIVE_OP 0x3 312 313 #define ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO)) 314 315 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */ 316 #define ALIGN64(x, ptr, sz) \ 317 x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1); \ 318 if (x) { \ 319 x = sizeof (uint64_t) - (x); \ 320 sz -= (x); \ 321 ptr += (x); \ 322 } 323 324 #ifdef DEBUG 325 int nfs4_client_attr_debug = 0; 326 int nfs4_client_state_debug = 0; 327 int nfs4_client_shadow_debug = 0; 328 int nfs4_client_lock_debug = 0; 329 int nfs4_seqid_sync = 0; 330 int nfs4_client_map_debug = 0; 331 static int nfs4_pageio_debug = 0; 332 int nfs4_client_inactive_debug = 0; 333 int nfs4_client_recov_debug = 0; 334 int nfs4_client_failover_debug = 0; 335 int nfs4_client_call_debug = 0; 336 int nfs4_client_lookup_debug = 0; 337 int nfs4_client_zone_debug = 0; 338 int nfs4_lost_rqst_debug = 0; 339 int nfs4_rdattrerr_debug = 0; 340 int nfs4_open_stream_debug = 0; 341 342 int nfs4read_error_inject; 343 344 static int nfs4_create_misses = 0; 345 346 static int nfs4_readdir_cache_shorts = 0; 347 static int nfs4_readdir_readahead = 0; 348 349 static int nfs4_bio_do_stop = 0; 350 351 static int nfs4_lostpage = 0; /* number of times we lost original page */ 352 353 int nfs4_mmap_debug = 0; 354 355 static int nfs4_pathconf_cache_hits = 0; 356 static int nfs4_pathconf_cache_misses = 0; 357 358 int nfs4close_all_cnt; 359 int nfs4close_one_debug = 0; 360 int nfs4close_notw_debug = 0; 361 362 int denied_to_flk_debug = 0; 363 void *lockt_denied_debug; 364 365 #endif 366 367 /* 368 * In milliseconds. Should be less than half of the lease time or better, 369 * less than one second. 370 */ 371 int nfs4_base_wait_time = 20; 372 int nfs4_max_base_wait_time = 1 * 1000; /* 1 sec */ 373 374 /* 375 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT 376 * or NFS4ERR_RESOURCE. 377 */ 378 static int confirm_retry_sec = 30; 379 380 static int nfs4_lookup_neg_cache = 1; 381 382 /* 383 * number of pages to read ahead 384 * optimized for 100 base-T. 385 */ 386 static int nfs4_nra = 4; 387 388 static int nfs4_do_symlink_cache = 1; 389 390 static int nfs4_pathconf_disable_cache = 0; 391 392 /* 393 * These are the vnode ops routines which implement the vnode interface to 394 * the networked file system. These routines just take their parameters, 395 * make them look networkish by putting the right info into interface structs, 396 * and then calling the appropriate remote routine(s) to do the work. 397 * 398 * Note on directory name lookup cacheing: If we detect a stale fhandle, 399 * we purge the directory cache relative to that vnode. This way, the 400 * user won't get burned by the cache repeatedly. See <nfs/rnode4.h> for 401 * more details on rnode locking. 402 */ 403 404 struct vnodeops *nfs4_vnodeops; 405 406 const fs_operation_def_t nfs4_vnodeops_template[] = { 407 VOPNAME_OPEN, { .vop_open = nfs4_open }, 408 VOPNAME_CLOSE, { .vop_close = nfs4_close }, 409 VOPNAME_READ, { .vop_read = nfs4_read }, 410 VOPNAME_WRITE, { .vop_write = nfs4_write }, 411 VOPNAME_IOCTL, { .vop_ioctl = nfs4_ioctl }, 412 VOPNAME_GETATTR, { .vop_getattr = nfs4_getattr }, 413 VOPNAME_SETATTR, { .vop_setattr = nfs4_setattr }, 414 VOPNAME_ACCESS, { .vop_access = nfs4_access }, 415 VOPNAME_LOOKUP, { .vop_lookup = nfs4_lookup }, 416 VOPNAME_CREATE, { .vop_create = nfs4_create }, 417 VOPNAME_REMOVE, { .vop_remove = nfs4_remove }, 418 VOPNAME_LINK, { .vop_link = nfs4_link }, 419 VOPNAME_RENAME, { .vop_rename = nfs4_rename }, 420 VOPNAME_MKDIR, { .vop_mkdir = nfs4_mkdir }, 421 VOPNAME_RMDIR, { .vop_rmdir = nfs4_rmdir }, 422 VOPNAME_READDIR, { .vop_readdir = nfs4_readdir }, 423 VOPNAME_SYMLINK, { .vop_symlink = nfs4_symlink }, 424 VOPNAME_READLINK, { .vop_readlink = nfs4_readlink }, 425 VOPNAME_FSYNC, { .vop_fsync = nfs4_fsync }, 426 VOPNAME_INACTIVE, { .vop_inactive = nfs4_inactive }, 427 VOPNAME_FID, { .vop_fid = nfs4_fid }, 428 VOPNAME_RWLOCK, { .vop_rwlock = nfs4_rwlock }, 429 VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs4_rwunlock }, 430 VOPNAME_SEEK, { .vop_seek = nfs4_seek }, 431 VOPNAME_FRLOCK, { .vop_frlock = nfs4_frlock }, 432 VOPNAME_SPACE, { .vop_space = nfs4_space }, 433 VOPNAME_REALVP, { .vop_realvp = nfs4_realvp }, 434 VOPNAME_GETPAGE, { .vop_getpage = nfs4_getpage }, 435 VOPNAME_PUTPAGE, { .vop_putpage = nfs4_putpage }, 436 VOPNAME_MAP, { .vop_map = nfs4_map }, 437 VOPNAME_ADDMAP, { .vop_addmap = nfs4_addmap }, 438 VOPNAME_DELMAP, { .vop_delmap = nfs4_delmap }, 439 /* no separate nfs4_dump */ 440 VOPNAME_DUMP, { .vop_dump = nfs_dump }, 441 VOPNAME_PATHCONF, { .vop_pathconf = nfs4_pathconf }, 442 VOPNAME_PAGEIO, { .vop_pageio = nfs4_pageio }, 443 VOPNAME_DISPOSE, { .vop_dispose = nfs4_dispose }, 444 VOPNAME_SETSECATTR, { .vop_setsecattr = nfs4_setsecattr }, 445 VOPNAME_GETSECATTR, { .vop_getsecattr = nfs4_getsecattr }, 446 VOPNAME_SHRLOCK, { .vop_shrlock = nfs4_shrlock }, 447 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 448 NULL, NULL 449 }; 450 451 /* 452 * The following are subroutines and definitions to set args or get res 453 * for the different nfsv4 ops 454 */ 455 456 void 457 nfs4args_lookup_free(nfs_argop4 *argop, int arglen) 458 { 459 int i; 460 461 for (i = 0; i < arglen; i++) { 462 if (argop[i].argop == OP_LOOKUP) { 463 kmem_free( 464 argop[i].nfs_argop4_u.oplookup. 465 objname.utf8string_val, 466 argop[i].nfs_argop4_u.oplookup. 467 objname.utf8string_len); 468 } 469 } 470 } 471 472 static void 473 nfs4args_lock_free(nfs_argop4 *argop) 474 { 475 locker4 *locker = &argop->nfs_argop4_u.oplock.locker; 476 477 if (locker->new_lock_owner == TRUE) { 478 open_to_lock_owner4 *open_owner; 479 480 open_owner = &locker->locker4_u.open_owner; 481 if (open_owner->lock_owner.owner_val != NULL) { 482 kmem_free(open_owner->lock_owner.owner_val, 483 open_owner->lock_owner.owner_len); 484 } 485 } 486 } 487 488 static void 489 nfs4args_lockt_free(nfs_argop4 *argop) 490 { 491 lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner; 492 493 if (lowner->owner_val != NULL) { 494 kmem_free(lowner->owner_val, lowner->owner_len); 495 } 496 } 497 498 static void 499 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags, 500 rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error, 501 nfs4_stateid_types_t *sid_types) 502 { 503 fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes; 504 mntinfo4_t *mi; 505 506 argop->argop = OP_SETATTR; 507 /* 508 * The stateid is set to 0 if client is not modifying the size 509 * and otherwise to whatever nfs4_get_stateid() returns. 510 * 511 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no 512 * state struct could be found for the process/file pair. We may 513 * want to change this in the future (by OPENing the file). See 514 * bug # 4474852. 515 */ 516 if (vap->va_mask & AT_SIZE) { 517 518 ASSERT(rp != NULL); 519 mi = VTOMI4(RTOV4(rp)); 520 521 argop->nfs_argop4_u.opsetattr.stateid = 522 nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 523 OP_SETATTR, sid_types, FALSE); 524 } else { 525 bzero(&argop->nfs_argop4_u.opsetattr.stateid, 526 sizeof (stateid4)); 527 } 528 529 *error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp); 530 if (*error) 531 bzero(attr, sizeof (*attr)); 532 } 533 534 static void 535 nfs4args_setattr_free(nfs_argop4 *argop) 536 { 537 nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes); 538 } 539 540 static int 541 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op, 542 bitmap4 supp) 543 { 544 fattr4 *attr; 545 int error = 0; 546 547 argop->argop = op; 548 switch (op) { 549 case OP_VERIFY: 550 attr = &argop->nfs_argop4_u.opverify.obj_attributes; 551 break; 552 case OP_NVERIFY: 553 attr = &argop->nfs_argop4_u.opnverify.obj_attributes; 554 break; 555 default: 556 return (EINVAL); 557 } 558 if (!error) 559 error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp); 560 if (error) 561 bzero(attr, sizeof (*attr)); 562 return (error); 563 } 564 565 static void 566 nfs4args_verify_free(nfs_argop4 *argop) 567 { 568 switch (argop->argop) { 569 case OP_VERIFY: 570 nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes); 571 break; 572 case OP_NVERIFY: 573 nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes); 574 break; 575 default: 576 break; 577 } 578 } 579 580 static void 581 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr, 582 WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp) 583 { 584 WRITE4args *wargs = &argop->nfs_argop4_u.opwrite; 585 mntinfo4_t *mi = VTOMI4(RTOV4(rp)); 586 587 argop->argop = OP_WRITE; 588 wargs->stable = stable; 589 wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id, 590 mi, OP_WRITE, sid_tp); 591 wargs->mblk = NULL; 592 *wargs_pp = wargs; 593 } 594 595 void 596 nfs4args_copen_free(OPEN4cargs *open_args) 597 { 598 if (open_args->owner.owner_val) { 599 kmem_free(open_args->owner.owner_val, 600 open_args->owner.owner_len); 601 } 602 if ((open_args->opentype == OPEN4_CREATE) && 603 (open_args->mode != EXCLUSIVE4)) { 604 nfs4_fattr4_free(&open_args->createhow4_u.createattrs); 605 } 606 } 607 608 /* 609 * XXX: This is referenced in modstubs.s 610 */ 611 struct vnodeops * 612 nfs4_getvnodeops(void) 613 { 614 return (nfs4_vnodeops); 615 } 616 617 /* 618 * The OPEN operation opens a regular file. 619 */ 620 /*ARGSUSED3*/ 621 static int 622 nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 623 { 624 vnode_t *dvp = NULL; 625 rnode4_t *rp, *drp; 626 int error; 627 int just_been_created; 628 char fn[MAXNAMELEN]; 629 630 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: ")); 631 if (nfs_zone() != VTOMI4(*vpp)->mi_zone) 632 return (EIO); 633 rp = VTOR4(*vpp); 634 635 /* 636 * Check to see if opening something besides a regular file; 637 * if so skip the OTW call 638 */ 639 if ((*vpp)->v_type != VREG) { 640 error = nfs4_open_non_reg_file(vpp, flag, cr); 641 return (error); 642 } 643 644 /* 645 * XXX - would like a check right here to know if the file is 646 * executable or not, so as to skip OTW 647 */ 648 649 if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0) 650 return (error); 651 652 drp = VTOR4(dvp); 653 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 654 return (EINTR); 655 656 if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) { 657 nfs_rw_exit(&drp->r_rwlock); 658 return (error); 659 } 660 661 /* 662 * See if this file has just been CREATEd. 663 * If so, clear the flag and update the dnlc, which was previously 664 * skipped in nfs4_create. 665 * XXX need better serilization on this. 666 * XXX move this into the nf4open_otw call, after we have 667 * XXX acquired the open owner seqid sync. 668 */ 669 mutex_enter(&rp->r_statev4_lock); 670 if (rp->created_v4) { 671 rp->created_v4 = 0; 672 mutex_exit(&rp->r_statev4_lock); 673 674 dnlc_update(dvp, fn, *vpp); 675 /* This is needed so we don't bump the open ref count */ 676 just_been_created = 1; 677 } else { 678 mutex_exit(&rp->r_statev4_lock); 679 just_been_created = 0; 680 } 681 682 /* 683 * If caller specified O_TRUNC/FTRUNC, then be sure to set 684 * FWRITE (to drive successful setattr(size=0) after open) 685 */ 686 if (flag & FTRUNC) 687 flag |= FWRITE; 688 689 error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0, 690 just_been_created); 691 692 if (!error && !((*vpp)->v_flag & VROOT)) 693 dnlc_update(dvp, fn, *vpp); 694 695 nfs_rw_exit(&drp->r_rwlock); 696 697 /* release the hold from vtodv */ 698 VN_RELE(dvp); 699 700 /* exchange the shadow for the master vnode, if needed */ 701 702 if (error == 0 && IS_SHADOW(*vpp, rp)) 703 sv_exchange(vpp); 704 705 return (error); 706 } 707 708 /* 709 * See if there's a "lost open" request to be saved and recovered. 710 */ 711 static void 712 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 713 nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp, 714 vnode_t *dvp, OPEN4cargs *open_args) 715 { 716 vfs_t *vfsp; 717 char *srccfp; 718 719 vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp); 720 721 if (error != ETIMEDOUT && error != EINTR && 722 !NFS4_FRC_UNMT_ERR(error, vfsp)) { 723 lost_rqstp->lr_op = 0; 724 return; 725 } 726 727 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 728 "nfs4open_save_lost_rqst: error %d", error)); 729 730 lost_rqstp->lr_op = OP_OPEN; 731 732 /* 733 * The vp (if it is not NULL) and dvp are held and rele'd via 734 * the recovery code. See nfs4_save_lost_rqst. 735 */ 736 lost_rqstp->lr_vp = vp; 737 lost_rqstp->lr_dvp = dvp; 738 lost_rqstp->lr_oop = oop; 739 lost_rqstp->lr_osp = NULL; 740 lost_rqstp->lr_lop = NULL; 741 lost_rqstp->lr_cr = cr; 742 lost_rqstp->lr_flk = NULL; 743 lost_rqstp->lr_oacc = open_args->share_access; 744 lost_rqstp->lr_odeny = open_args->share_deny; 745 lost_rqstp->lr_oclaim = open_args->claim; 746 if (open_args->claim == CLAIM_DELEGATE_CUR) { 747 lost_rqstp->lr_ostateid = 748 open_args->open_claim4_u.delegate_cur_info.delegate_stateid; 749 srccfp = open_args->open_claim4_u.delegate_cur_info.cfile; 750 } else { 751 srccfp = open_args->open_claim4_u.cfile; 752 } 753 lost_rqstp->lr_ofile.utf8string_len = 0; 754 lost_rqstp->lr_ofile.utf8string_val = NULL; 755 (void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile); 756 lost_rqstp->lr_putfirst = FALSE; 757 } 758 759 struct nfs4_excl_time { 760 uint32 seconds; 761 uint32 nseconds; 762 }; 763 764 /* 765 * The OPEN operation creates and/or opens a regular file 766 * 767 * ARGSUSED 768 */ 769 static int 770 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va, 771 vnode_t **vpp, cred_t *cr, int create_flag, int open_flag, 772 enum createmode4 createmode, int file_just_been_created) 773 { 774 rnode4_t *rp; 775 rnode4_t *drp = VTOR4(dvp); 776 vnode_t *vp = NULL; 777 vnode_t *vpi = *vpp; 778 bool_t needrecov = FALSE; 779 780 int doqueue = 1; 781 782 COMPOUND4args_clnt args; 783 COMPOUND4res_clnt res; 784 nfs_argop4 *argop; 785 nfs_resop4 *resop; 786 int argoplist_size; 787 int idx_open, idx_fattr; 788 789 GETFH4res *gf_res = NULL; 790 OPEN4res *op_res = NULL; 791 nfs4_ga_res_t *garp; 792 fattr4 *attr = NULL; 793 struct nfs4_excl_time verf; 794 bool_t did_excl_setup = FALSE; 795 int created_osp; 796 797 OPEN4cargs *open_args; 798 nfs4_open_owner_t *oop = NULL; 799 nfs4_open_stream_t *osp = NULL; 800 seqid4 seqid = 0; 801 bool_t retry_open = FALSE; 802 nfs4_recov_state_t recov_state; 803 nfs4_lost_rqst_t lost_rqst; 804 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 805 hrtime_t t; 806 int acc = 0; 807 cred_t *cred_otw = NULL; /* cred used to do the RPC call */ 808 cred_t *ncr = NULL; 809 810 nfs4_sharedfh_t *otw_sfh; 811 nfs4_sharedfh_t *orig_sfh; 812 int fh_differs = 0; 813 int numops, setgid_flag; 814 int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1; 815 nfs4_rcsync_t *rcsync; 816 817 /* 818 * Make sure we properly deal with setting the right gid on 819 * a newly created file to reflect the parent's setgid bit 820 */ 821 setgid_flag = 0; 822 if (create_flag && in_va) { 823 824 /* 825 * If there is grpid mount flag used or 826 * the parent's directory has the setgid bit set 827 * _and_ the client was able to get a valid mapping 828 * for the parent dir's owner_group, we want to 829 * append NVERIFY(owner_group == dva.va_gid) and 830 * SETATTR to the CREATE compound. 831 */ 832 mutex_enter(&drp->r_statelock); 833 if ((VTOMI4(dvp)->mi_flags & MI4_GRPID || 834 drp->r_attr.va_mode & VSGID) && 835 drp->r_attr.va_gid != GID_NOBODY) { 836 in_va->va_mask |= AT_GID; 837 in_va->va_gid = drp->r_attr.va_gid; 838 setgid_flag = 1; 839 } 840 mutex_exit(&drp->r_statelock); 841 } 842 843 /* 844 * Normal/non-create compound: 845 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) 846 * 847 * Open(create) compound no setgid: 848 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) + 849 * RESTOREFH + GETATTR 850 * 851 * Open(create) setgid: 852 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) + 853 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH + 854 * NVERIFY(grp) + SETATTR 855 */ 856 if (setgid_flag) { 857 numops = 10; 858 idx_open = 1; 859 idx_fattr = 3; 860 } else if (create_flag) { 861 numops = 7; 862 idx_open = 2; 863 idx_fattr = 4; 864 } else { 865 numops = 4; 866 idx_open = 1; 867 idx_fattr = 3; 868 } 869 870 args.array_len = numops; 871 argoplist_size = numops * sizeof (nfs_argop4); 872 argop = kmem_alloc(argoplist_size, KM_SLEEP); 873 874 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: " 875 "open %s open flag 0x%x cred %p", file_name, open_flag, 876 (void *)cr)); 877 878 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 879 if (create_flag) { 880 /* 881 * We are to create a file. Initialize the passed in vnode 882 * pointer. 883 */ 884 vpi = NULL; 885 } else { 886 /* 887 * Check to see if the client owns a read delegation and is 888 * trying to open for write. If so, then return the delegation 889 * to avoid the server doing a cb_recall and returning DELAY. 890 * NB - we don't use the statev4_lock here because we'd have 891 * to drop the lock anyway and the result would be stale. 892 */ 893 if ((open_flag & FWRITE) && 894 VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ) 895 (void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN); 896 897 /* 898 * If the file has a delegation, then do an access check up 899 * front. This avoids having to an access check later after 900 * we've already done start_op, which could deadlock. 901 */ 902 if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) { 903 if (open_flag & FREAD && 904 nfs4_access(vpi, VREAD, 0, cr, NULL) == 0) 905 acc |= VREAD; 906 if (open_flag & FWRITE && 907 nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0) 908 acc |= VWRITE; 909 } 910 } 911 912 drp = VTOR4(dvp); 913 914 recov_state.rs_flags = 0; 915 recov_state.rs_num_retry_despite_err = 0; 916 cred_otw = cr; 917 918 recov_retry: 919 fh_differs = 0; 920 nfs4_error_zinit(&e); 921 922 e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state); 923 if (e.error) { 924 if (ncr != NULL) 925 crfree(ncr); 926 kmem_free(argop, argoplist_size); 927 return (e.error); 928 } 929 930 args.ctag = TAG_OPEN; 931 args.array_len = numops; 932 args.array = argop; 933 934 /* putfh directory fh */ 935 argop[0].argop = OP_CPUTFH; 936 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 937 938 /* OPEN: either op 1 or op 2 depending upon create/setgid flags */ 939 argop[idx_open].argop = OP_COPEN; 940 open_args = &argop[idx_open].nfs_argop4_u.opcopen; 941 open_args->claim = CLAIM_NULL; 942 943 /* name of file */ 944 open_args->open_claim4_u.cfile = file_name; 945 open_args->owner.owner_len = 0; 946 open_args->owner.owner_val = NULL; 947 948 if (create_flag) { 949 /* CREATE a file */ 950 open_args->opentype = OPEN4_CREATE; 951 open_args->mode = createmode; 952 if (createmode == EXCLUSIVE4) { 953 if (did_excl_setup == FALSE) { 954 verf.seconds = zone_get_hostid(NULL); 955 if (verf.seconds != 0) 956 verf.nseconds = newnum(); 957 else { 958 timestruc_t now; 959 960 gethrestime(&now); 961 verf.seconds = now.tv_sec; 962 verf.nseconds = now.tv_nsec; 963 } 964 /* 965 * Since the server will use this value for the 966 * mtime, make sure that it can't overflow. Zero 967 * out the MSB. The actual value does not matter 968 * here, only its uniqeness. 969 */ 970 verf.seconds &= INT32_MAX; 971 did_excl_setup = TRUE; 972 } 973 974 /* Now copy over verifier to OPEN4args. */ 975 open_args->createhow4_u.createverf = *(uint64_t *)&verf; 976 } else { 977 int v_error; 978 bitmap4 supp_attrs; 979 servinfo4_t *svp; 980 981 attr = &open_args->createhow4_u.createattrs; 982 983 svp = drp->r_server; 984 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 985 supp_attrs = svp->sv_supp_attrs; 986 nfs_rw_exit(&svp->sv_lock); 987 988 /* GUARDED4 or UNCHECKED4 */ 989 v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN, 990 supp_attrs); 991 if (v_error) { 992 bzero(attr, sizeof (*attr)); 993 nfs4args_copen_free(open_args); 994 nfs4_end_op(VTOMI4(dvp), dvp, vpi, 995 &recov_state, FALSE); 996 if (ncr != NULL) 997 crfree(ncr); 998 kmem_free(argop, argoplist_size); 999 return (v_error); 1000 } 1001 } 1002 } else { 1003 /* NO CREATE */ 1004 open_args->opentype = OPEN4_NOCREATE; 1005 } 1006 1007 if (recov_state.rs_sp != NULL) { 1008 mutex_enter(&recov_state.rs_sp->s_lock); 1009 open_args->owner.clientid = recov_state.rs_sp->clientid; 1010 mutex_exit(&recov_state.rs_sp->s_lock); 1011 } else { 1012 /* XXX should we just fail here? */ 1013 open_args->owner.clientid = 0; 1014 } 1015 1016 /* 1017 * This increments oop's ref count or creates a temporary 'just_created' 1018 * open owner that will become valid when this OPEN/OPEN_CONFIRM call 1019 * completes. 1020 */ 1021 mutex_enter(&VTOMI4(dvp)->mi_lock); 1022 1023 /* See if a permanent or just created open owner exists */ 1024 oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp)); 1025 if (!oop) { 1026 /* 1027 * This open owner does not exist so create a temporary 1028 * just created one. 1029 */ 1030 oop = create_open_owner(cr, VTOMI4(dvp)); 1031 ASSERT(oop != NULL); 1032 } 1033 mutex_exit(&VTOMI4(dvp)->mi_lock); 1034 1035 /* this length never changes, do alloc before seqid sync */ 1036 open_args->owner.owner_len = sizeof (oop->oo_name); 1037 open_args->owner.owner_val = 1038 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1039 1040 e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp)); 1041 if (e.error == EAGAIN) { 1042 open_owner_rele(oop); 1043 nfs4args_copen_free(open_args); 1044 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 1045 if (ncr != NULL) { 1046 crfree(ncr); 1047 ncr = NULL; 1048 } 1049 goto recov_retry; 1050 } 1051 1052 /* Check to see if we need to do the OTW call */ 1053 if (!create_flag) { 1054 if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi, 1055 file_just_been_created, &e.error, acc, &recov_state)) { 1056 1057 /* 1058 * The OTW open is not necessary. Either 1059 * the open can succeed without it (eg. 1060 * delegation, error == 0) or the open 1061 * must fail due to an access failure 1062 * (error != 0). In either case, tidy 1063 * up and return. 1064 */ 1065 1066 nfs4_end_open_seqid_sync(oop); 1067 open_owner_rele(oop); 1068 nfs4args_copen_free(open_args); 1069 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE); 1070 if (ncr != NULL) 1071 crfree(ncr); 1072 kmem_free(argop, argoplist_size); 1073 return (e.error); 1074 } 1075 } 1076 1077 bcopy(&oop->oo_name, open_args->owner.owner_val, 1078 open_args->owner.owner_len); 1079 1080 seqid = nfs4_get_open_seqid(oop) + 1; 1081 open_args->seqid = seqid; 1082 open_args->share_access = 0; 1083 if (open_flag & FREAD) 1084 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1085 if (open_flag & FWRITE) 1086 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1087 open_args->share_deny = OPEN4_SHARE_DENY_NONE; 1088 1089 1090 1091 /* 1092 * getfh w/sanity check for idx_open/idx_fattr 1093 */ 1094 ASSERT((idx_open + 1) == (idx_fattr - 1)); 1095 argop[idx_open + 1].argop = OP_GETFH; 1096 1097 /* getattr */ 1098 argop[idx_fattr].argop = OP_GETATTR; 1099 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1100 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1101 1102 if (setgid_flag) { 1103 vattr_t _v; 1104 servinfo4_t *svp; 1105 bitmap4 supp_attrs; 1106 1107 svp = drp->r_server; 1108 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1109 supp_attrs = svp->sv_supp_attrs; 1110 nfs_rw_exit(&svp->sv_lock); 1111 1112 /* 1113 * For setgid case, we need to: 1114 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 1115 */ 1116 argop[4].argop = OP_SAVEFH; 1117 1118 argop[5].argop = OP_CPUTFH; 1119 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 1120 1121 argop[6].argop = OP_GETATTR; 1122 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1123 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1124 1125 argop[7].argop = OP_RESTOREFH; 1126 1127 /* 1128 * nverify 1129 */ 1130 _v.va_mask = AT_GID; 1131 _v.va_gid = in_va->va_gid; 1132 if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 1133 supp_attrs))) { 1134 1135 /* 1136 * setattr 1137 * 1138 * We _know_ we're not messing with AT_SIZE or 1139 * AT_XTIME, so no need for stateid or flags. 1140 * Also we specify NULL rp since we're only 1141 * interested in setting owner_group attributes. 1142 */ 1143 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, 1144 supp_attrs, &e.error, 0); 1145 if (e.error) 1146 nfs4args_verify_free(&argop[8]); 1147 } 1148 1149 if (e.error) { 1150 /* 1151 * XXX - Revisit the last argument to nfs4_end_op() 1152 * once 5020486 is fixed. 1153 */ 1154 nfs4_end_open_seqid_sync(oop); 1155 open_owner_rele(oop); 1156 nfs4args_copen_free(open_args); 1157 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 1158 if (ncr != NULL) 1159 crfree(ncr); 1160 kmem_free(argop, argoplist_size); 1161 return (e.error); 1162 } 1163 } else if (create_flag) { 1164 argop[1].argop = OP_SAVEFH; 1165 1166 argop[5].argop = OP_RESTOREFH; 1167 1168 argop[6].argop = OP_GETATTR; 1169 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1170 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1171 } 1172 1173 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 1174 "nfs4open_otw: %s call, nm %s, rp %s", 1175 needrecov ? "recov" : "first", file_name, 1176 rnode4info(VTOR4(dvp)))); 1177 1178 t = gethrtime(); 1179 1180 rcsync = nfs4_recall_sync_start(VTOMI4(dvp)); 1181 1182 rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e); 1183 1184 if (!e.error && nfs4_need_to_bump_seqid(&res)) 1185 nfs4_set_open_seqid(seqid, oop, args.ctag); 1186 1187 needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp); 1188 1189 if (e.error || needrecov) { 1190 bool_t abort = FALSE; 1191 1192 if (needrecov) { 1193 nfs4_bseqid_entry_t *bsep = NULL; 1194 1195 nfs4open_save_lost_rqst(e.error, &lost_rqst, oop, 1196 cred_otw, vpi, dvp, open_args); 1197 1198 if (!e.error && res.status == NFS4ERR_BAD_SEQID) { 1199 bsep = nfs4_create_bseqid_entry(oop, NULL, 1200 vpi, 0, args.ctag, open_args->seqid); 1201 num_bseqid_retry--; 1202 } 1203 1204 abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi, 1205 NULL, lost_rqst.lr_op == OP_OPEN ? 1206 &lost_rqst : NULL, OP_OPEN, bsep, NULL, NULL); 1207 1208 if (bsep) 1209 kmem_free(bsep, sizeof (*bsep)); 1210 /* give up if we keep getting BAD_SEQID */ 1211 if (num_bseqid_retry == 0) 1212 abort = TRUE; 1213 if (abort == TRUE && e.error == 0) 1214 e.error = geterrno4(res.status); 1215 } 1216 nfs4_recall_sync_end(VTOMI4(dvp), rcsync); 1217 nfs4_end_open_seqid_sync(oop); 1218 open_owner_rele(oop); 1219 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1220 nfs4args_copen_free(open_args); 1221 if (setgid_flag) { 1222 nfs4args_verify_free(&argop[8]); 1223 nfs4args_setattr_free(&argop[9]); 1224 } 1225 if (!e.error) 1226 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1227 if (ncr != NULL) { 1228 crfree(ncr); 1229 ncr = NULL; 1230 } 1231 if (!needrecov || abort == TRUE || e.error == EINTR || 1232 NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) { 1233 kmem_free(argop, argoplist_size); 1234 return (e.error); 1235 } 1236 goto recov_retry; 1237 } 1238 1239 /* 1240 * Will check and update lease after checking the rflag for 1241 * OPEN_CONFIRM in the successful OPEN call. 1242 */ 1243 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 1244 1245 /* 1246 * XXX what if we're crossing mount points from server1:/drp 1247 * to server2:/drp/rp. 1248 */ 1249 1250 nfs4_recall_sync_end(VTOMI4(dvp), rcsync); 1251 1252 /* Signal our end of use of the open seqid */ 1253 nfs4_end_open_seqid_sync(oop); 1254 1255 /* 1256 * This will destroy the open owner if it was just created, 1257 * and no one else has put a reference on it. 1258 */ 1259 open_owner_rele(oop); 1260 if (create_flag && (createmode != EXCLUSIVE4) && 1261 res.status == NFS4ERR_BADOWNER) 1262 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1263 1264 e.error = geterrno4(res.status); 1265 nfs4args_copen_free(open_args); 1266 if (setgid_flag) { 1267 nfs4args_verify_free(&argop[8]); 1268 nfs4args_setattr_free(&argop[9]); 1269 } 1270 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1271 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1272 /* 1273 * If the reply is NFS4ERR_ACCESS, it may be because 1274 * we are root (no root net access). If the real uid 1275 * is not root, then retry with the real uid instead. 1276 */ 1277 if (ncr != NULL) { 1278 crfree(ncr); 1279 ncr = NULL; 1280 } 1281 if (res.status == NFS4ERR_ACCESS && 1282 (ncr = crnetadjust(cred_otw)) != NULL) { 1283 cred_otw = ncr; 1284 goto recov_retry; 1285 } 1286 kmem_free(argop, argoplist_size); 1287 return (e.error); 1288 } 1289 1290 resop = &res.array[idx_open]; /* open res */ 1291 op_res = &resop->nfs_resop4_u.opopen; 1292 1293 #ifdef DEBUG 1294 /* 1295 * verify attrset bitmap 1296 */ 1297 if (create_flag && 1298 (createmode == UNCHECKED4 || createmode == GUARDED4)) { 1299 /* make sure attrset returned is what we asked for */ 1300 /* XXX Ignore this 'error' for now */ 1301 if (attr->attrmask != op_res->attrset) 1302 /* EMPTY */; 1303 } 1304 #endif 1305 1306 if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) { 1307 mutex_enter(&VTOMI4(dvp)->mi_lock); 1308 VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK; 1309 mutex_exit(&VTOMI4(dvp)->mi_lock); 1310 } 1311 1312 resop = &res.array[idx_open + 1]; /* getfh res */ 1313 gf_res = &resop->nfs_resop4_u.opgetfh; 1314 1315 otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp)); 1316 1317 /* 1318 * The open stateid has been updated on the server but not 1319 * on the client yet. There is a path: makenfs4node->nfs4_attr_cache-> 1320 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW 1321 * WRITE call. That, however, will use the old stateid, so go ahead 1322 * and upate the open stateid now, before any call to makenfs4node. 1323 */ 1324 if (vpi) { 1325 nfs4_open_stream_t *tmp_osp; 1326 rnode4_t *tmp_rp = VTOR4(vpi); 1327 1328 tmp_osp = find_open_stream(oop, tmp_rp); 1329 if (tmp_osp) { 1330 tmp_osp->open_stateid = op_res->stateid; 1331 mutex_exit(&tmp_osp->os_sync_lock); 1332 open_stream_rele(tmp_osp, tmp_rp); 1333 } 1334 1335 /* 1336 * We must determine if the file handle given by the otw open 1337 * is the same as the file handle which was passed in with 1338 * *vpp. This case can be reached if the file we are trying 1339 * to open has been removed and another file has been created 1340 * having the same file name. The passed in vnode is released 1341 * later. 1342 */ 1343 orig_sfh = VTOR4(vpi)->r_fh; 1344 fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh); 1345 } 1346 1347 garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res; 1348 1349 if (create_flag || fh_differs) { 1350 int rnode_err = 0; 1351 1352 vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr, 1353 dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh)); 1354 1355 if (e.error) 1356 PURGE_ATTRCACHE4(vp); 1357 /* 1358 * For the newly created vp case, make sure the rnode 1359 * isn't bad before using it. 1360 */ 1361 mutex_enter(&(VTOR4(vp))->r_statelock); 1362 if (VTOR4(vp)->r_flags & R4RECOVERR) 1363 rnode_err = EIO; 1364 mutex_exit(&(VTOR4(vp))->r_statelock); 1365 1366 if (rnode_err) { 1367 nfs4_recall_sync_end(VTOMI4(dvp), rcsync); 1368 nfs4_end_open_seqid_sync(oop); 1369 nfs4args_copen_free(open_args); 1370 if (setgid_flag) { 1371 nfs4args_verify_free(&argop[8]); 1372 nfs4args_setattr_free(&argop[9]); 1373 } 1374 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1375 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1376 needrecov); 1377 open_owner_rele(oop); 1378 VN_RELE(vp); 1379 if (ncr != NULL) 1380 crfree(ncr); 1381 sfh4_rele(&otw_sfh); 1382 kmem_free(argop, argoplist_size); 1383 return (EIO); 1384 } 1385 } else { 1386 vp = vpi; 1387 } 1388 sfh4_rele(&otw_sfh); 1389 1390 /* 1391 * It seems odd to get a full set of attrs and then not update 1392 * the object's attrcache in the non-create case. Create case uses 1393 * the attrs since makenfs4node checks to see if the attrs need to 1394 * be updated (and then updates them). The non-create case should 1395 * update attrs also. 1396 */ 1397 if (! create_flag && ! fh_differs && !e.error) { 1398 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 1399 } 1400 1401 nfs4_error_zinit(&e); 1402 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 1403 /* This does not do recovery for vp explicitly. */ 1404 nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE, 1405 &retry_open, oop, FALSE, &e, &num_bseqid_retry); 1406 1407 if (e.error || e.stat) { 1408 nfs4_recall_sync_end(VTOMI4(dvp), rcsync); 1409 nfs4_end_open_seqid_sync(oop); 1410 nfs4args_copen_free(open_args); 1411 if (setgid_flag) { 1412 nfs4args_verify_free(&argop[8]); 1413 nfs4args_setattr_free(&argop[9]); 1414 } 1415 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1416 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1417 needrecov); 1418 open_owner_rele(oop); 1419 if (create_flag || fh_differs) { 1420 /* rele the makenfs4node */ 1421 VN_RELE(vp); 1422 } 1423 if (ncr != NULL) { 1424 crfree(ncr); 1425 ncr = NULL; 1426 } 1427 if (retry_open == TRUE) { 1428 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1429 "nfs4open_otw: retry the open since OPEN " 1430 "CONFIRM failed with error %d stat %d", 1431 e.error, e.stat)); 1432 if (create_flag && createmode == GUARDED4) { 1433 NFS4_DEBUG(nfs4_client_recov_debug, 1434 (CE_NOTE, "nfs4open_otw: switch " 1435 "createmode from GUARDED4 to " 1436 "UNCHECKED4")); 1437 createmode = UNCHECKED4; 1438 } 1439 goto recov_retry; 1440 } 1441 if (!e.error) { 1442 if (create_flag && (createmode != EXCLUSIVE4) && 1443 e.stat == NFS4ERR_BADOWNER) 1444 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1445 1446 e.error = geterrno4(e.stat); 1447 } 1448 kmem_free(argop, argoplist_size); 1449 return (e.error); 1450 } 1451 } 1452 1453 rp = VTOR4(vp); 1454 1455 mutex_enter(&rp->r_statev4_lock); 1456 if (create_flag) 1457 rp->created_v4 = 1; 1458 mutex_exit(&rp->r_statev4_lock); 1459 1460 mutex_enter(&oop->oo_lock); 1461 /* Doesn't matter if 'oo_just_created' already was set as this */ 1462 oop->oo_just_created = NFS4_PERM_CREATED; 1463 if (oop->oo_cred_otw) 1464 crfree(oop->oo_cred_otw); 1465 oop->oo_cred_otw = cred_otw; 1466 crhold(oop->oo_cred_otw); 1467 mutex_exit(&oop->oo_lock); 1468 1469 /* returns with 'os_sync_lock' held */ 1470 osp = find_or_create_open_stream(oop, rp, &created_osp); 1471 if (!osp) { 1472 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1473 "nfs4open_otw: failed to create an open stream")); 1474 NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: " 1475 "signal our end of use of the open seqid")); 1476 1477 nfs4_recall_sync_end(VTOMI4(dvp), rcsync); 1478 nfs4_end_open_seqid_sync(oop); 1479 open_owner_rele(oop); 1480 nfs4args_copen_free(open_args); 1481 if (setgid_flag) { 1482 nfs4args_verify_free(&argop[8]); 1483 nfs4args_setattr_free(&argop[9]); 1484 } 1485 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1486 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1487 if (create_flag || fh_differs) 1488 VN_RELE(vp); 1489 if (ncr != NULL) 1490 crfree(ncr); 1491 1492 kmem_free(argop, argoplist_size); 1493 return (EINVAL); 1494 1495 } 1496 1497 osp->open_stateid = op_res->stateid; 1498 1499 if (open_flag & FREAD) 1500 osp->os_share_acc_read++; 1501 if (open_flag & FWRITE) 1502 osp->os_share_acc_write++; 1503 osp->os_share_deny_none++; 1504 1505 /* 1506 * Need to reset this bitfield for the possible case where we were 1507 * going to OTW CLOSE the file, got a non-recoverable error, and before 1508 * we could retry the CLOSE, OPENed the file again. 1509 */ 1510 ASSERT(osp->os_open_owner->oo_seqid_inuse); 1511 osp->os_final_close = 0; 1512 osp->os_force_close = 0; 1513 #ifdef DEBUG 1514 if (osp->os_failed_reopen) 1515 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:" 1516 " clearing os_failed_reopen for osp %p, cr %p, rp %s", 1517 (void *)osp, (void *)cr, rnode4info(rp))); 1518 #endif 1519 osp->os_failed_reopen = 0; 1520 1521 mutex_exit(&osp->os_sync_lock); 1522 1523 nfs4_end_open_seqid_sync(oop); 1524 1525 if (created_osp && recov_state.rs_sp != NULL) { 1526 mutex_enter(&recov_state.rs_sp->s_lock); 1527 nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp)); 1528 mutex_exit(&recov_state.rs_sp->s_lock); 1529 } 1530 1531 /* get rid of our reference to find oop */ 1532 open_owner_rele(oop); 1533 1534 open_stream_rele(osp, rp); 1535 1536 /* accept delegation, if any */ 1537 nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw); 1538 1539 nfs4_recall_sync_end(VTOMI4(dvp), rcsync); 1540 1541 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1542 1543 if (createmode == EXCLUSIVE4 && 1544 (in_va->va_mask & ~(AT_GID | AT_SIZE))) { 1545 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:" 1546 " EXCLUSIVE4: sending a SETATTR")); 1547 /* 1548 * If doing an exclusive create, then generate 1549 * a SETATTR to set the initial attributes. 1550 * Try to set the mtime and the atime to the 1551 * server's current time. It is somewhat 1552 * expected that these fields will be used to 1553 * store the exclusive create cookie. If not, 1554 * server implementors will need to know that 1555 * a SETATTR will follow an exclusive create 1556 * and the cookie should be destroyed if 1557 * appropriate. 1558 * 1559 * The AT_GID and AT_SIZE bits are turned off 1560 * so that the SETATTR request will not attempt 1561 * to process these. The gid will be set 1562 * separately if appropriate. The size is turned 1563 * off because it is assumed that a new file will 1564 * be created empty and if the file wasn't empty, 1565 * then the exclusive create will have failed 1566 * because the file must have existed already. 1567 * Therefore, no truncate operation is needed. 1568 */ 1569 in_va->va_mask &= ~(AT_GID | AT_SIZE); 1570 in_va->va_mask |= (AT_MTIME | AT_ATIME); 1571 1572 e.error = nfs4setattr(vp, in_va, 0, cr, NULL); 1573 if (e.error) { 1574 nfs4_error_t err; 1575 1576 /* 1577 * Couldn't correct the attributes of 1578 * the newly created file and the 1579 * attributes are wrong. Remove the 1580 * file and return an error to the 1581 * application. 1582 */ 1583 /* XXX will this take care of client state ? */ 1584 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1585 "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:" 1586 " remove file", e.error)); 1587 1588 /* 1589 * The file is currently open so try to close it first. 1590 * 1591 * If we do not close the file explicitly here then the 1592 * VN_RELE() would do an (implicit and asynchronous) 1593 * close for us. But such async close could race with 1594 * the nfs4_remove() below. If the async close is 1595 * slower than nfs4_remove() then nfs4_remove() 1596 * wouldn't remove the file but rename it to .nfsXXXX 1597 * instead. 1598 */ 1599 nfs4close_one(vp, NULL, cr, open_flag, NULL, &err, 1600 CLOSE_NORM, 0, 0, 0); 1601 VN_RELE(vp); 1602 (void) nfs4_remove(dvp, file_name, cr, NULL, 0); 1603 1604 /* 1605 * Since we've reled the vnode and removed 1606 * the file we now need to return the error. 1607 * At this point we don't want to update the 1608 * dircaches, call nfs4_waitfor_purge_complete 1609 * or set vpp to vp so we need to skip these 1610 * as well. 1611 */ 1612 goto skip_update_dircaches; 1613 } 1614 } 1615 1616 /* 1617 * If we created or found the correct vnode, due to create_flag or 1618 * fh_differs being set, then update directory cache attribute, readdir 1619 * and dnlc caches. 1620 */ 1621 if (create_flag || fh_differs) { 1622 dirattr_info_t dinfo, *dinfop; 1623 1624 /* 1625 * Make sure getattr succeeded before using results. 1626 * note: op 7 is getattr(dir) for both flavors of 1627 * open(create). 1628 */ 1629 if (create_flag && res.status == NFS4_OK) { 1630 dinfo.di_time_call = t; 1631 dinfo.di_cred = cr; 1632 dinfo.di_garp = 1633 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 1634 dinfop = &dinfo; 1635 } else { 1636 dinfop = NULL; 1637 } 1638 1639 nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name, 1640 dinfop); 1641 } 1642 1643 /* 1644 * If the page cache for this file was flushed from actions 1645 * above, it was done asynchronously and if that is true, 1646 * there is a need to wait here for it to complete. This must 1647 * be done outside of start_fop/end_fop. 1648 */ 1649 (void) nfs4_waitfor_purge_complete(vp); 1650 1651 /* 1652 * It is implicit that we are in the open case (create_flag == 0) since 1653 * fh_differs can only be set to a non-zero value in the open case. 1654 */ 1655 if (fh_differs != 0 && vpi != NULL) 1656 VN_RELE(vpi); 1657 1658 /* 1659 * Be sure to set *vpp to the correct value before returning. 1660 */ 1661 *vpp = vp; 1662 1663 skip_update_dircaches: 1664 1665 nfs4args_copen_free(open_args); 1666 if (setgid_flag) { 1667 nfs4args_verify_free(&argop[8]); 1668 nfs4args_setattr_free(&argop[9]); 1669 } 1670 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1671 1672 if (ncr) 1673 crfree(ncr); 1674 kmem_free(argop, argoplist_size); 1675 return (e.error); 1676 } 1677 1678 /* 1679 * Reopen an open instance. cf. nfs4open_otw(). 1680 * 1681 * Errors are returned by the nfs4_error_t parameter. 1682 * - ep->error contains an errno value or zero. 1683 * - if it is zero, ep->stat is set to an NFS status code, if any. 1684 * If the file could not be reopened, but the caller should continue, the 1685 * file is marked dead and no error values are returned. If the caller 1686 * should stop recovering open files and start over, either the ep->error 1687 * value or ep->stat will indicate an error (either something that requires 1688 * recovery or EAGAIN). Note that some recovery (e.g., expired volatile 1689 * filehandles) may be handled silently by this routine. 1690 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state 1691 * will be started, so the caller should not do it. 1692 * 1693 * Gotos: 1694 * - kill_file : reopen failed in such a fashion to constitute marking the 1695 * file dead and setting the open stream's 'os_failed_reopen' as 1. This 1696 * is for cases where recovery is not possible. 1697 * - failed_reopen : same as above, except that the file has already been 1698 * marked dead, so no need to do it again. 1699 * - bailout : reopen failed but we are able to recover and retry the reopen - 1700 * either within this function immediately or via the calling function. 1701 */ 1702 1703 void 1704 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep, 1705 open_claim_type4 claim, bool_t frc_use_claim_previous, 1706 bool_t is_recov) 1707 { 1708 COMPOUND4args_clnt args; 1709 COMPOUND4res_clnt res; 1710 nfs_argop4 argop[4]; 1711 nfs_resop4 *resop; 1712 OPEN4res *op_res = NULL; 1713 OPEN4cargs *open_args; 1714 GETFH4res *gf_res; 1715 rnode4_t *rp = VTOR4(vp); 1716 int doqueue = 1; 1717 cred_t *cr = NULL, *cred_otw = NULL; 1718 nfs4_open_owner_t *oop = NULL; 1719 seqid4 seqid; 1720 nfs4_ga_res_t *garp; 1721 char fn[MAXNAMELEN]; 1722 nfs4_recov_state_t recov = {NULL, 0}; 1723 nfs4_lost_rqst_t lost_rqst; 1724 mntinfo4_t *mi = VTOMI4(vp); 1725 bool_t abort; 1726 char *failed_msg = ""; 1727 int fh_different; 1728 hrtime_t t; 1729 nfs4_bseqid_entry_t *bsep = NULL; 1730 nfs4_rcsync_t *rcsync = NULL; 1731 1732 ASSERT(nfs4_consistent_type(vp)); 1733 ASSERT(nfs_zone() == mi->mi_zone); 1734 1735 nfs4_error_zinit(ep); 1736 1737 /* this is the cred used to find the open owner */ 1738 cr = state_to_cred(osp); 1739 if (cr == NULL) { 1740 failed_msg = "Couldn't reopen: no cred"; 1741 goto kill_file; 1742 } 1743 /* use this cred for OTW operations */ 1744 cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner); 1745 1746 top: 1747 nfs4_error_zinit(ep); 1748 1749 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) { 1750 /* File system has been unmounted, quit */ 1751 ep->error = EIO; 1752 failed_msg = "Couldn't reopen: file system has been unmounted"; 1753 goto kill_file; 1754 } 1755 1756 oop = osp->os_open_owner; 1757 1758 ASSERT(oop != NULL); 1759 if (oop == NULL) { /* be defensive in non-DEBUG */ 1760 failed_msg = "can't reopen: no open owner"; 1761 goto kill_file; 1762 } 1763 open_owner_hold(oop); 1764 1765 ep->error = nfs4_start_open_seqid_sync(oop, mi); 1766 if (ep->error) { 1767 open_owner_rele(oop); 1768 oop = NULL; 1769 goto bailout; 1770 } 1771 1772 /* 1773 * If the rnode has a delegation and the delegation has been 1774 * recovered and the server didn't request a recall and the caller 1775 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during 1776 * recovery) and the rnode hasn't been marked dead, then install 1777 * the delegation stateid in the open stream. Otherwise, proceed 1778 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN. 1779 */ 1780 mutex_enter(&rp->r_statev4_lock); 1781 if (rp->r_deleg_type != OPEN_DELEGATE_NONE && 1782 !rp->r_deleg_return_pending && 1783 (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) && 1784 !rp->r_deleg_needs_recall && 1785 claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous && 1786 !(rp->r_flags & R4RECOVERR)) { 1787 mutex_enter(&osp->os_sync_lock); 1788 osp->os_delegation = 1; 1789 osp->open_stateid = rp->r_deleg_stateid; 1790 mutex_exit(&osp->os_sync_lock); 1791 mutex_exit(&rp->r_statev4_lock); 1792 goto bailout; 1793 } 1794 mutex_exit(&rp->r_statev4_lock); 1795 1796 /* 1797 * If the file failed recovery, just quit. This failure need not 1798 * affect other reopens, so don't return an error. 1799 */ 1800 mutex_enter(&rp->r_statelock); 1801 if (rp->r_flags & R4RECOVERR) { 1802 mutex_exit(&rp->r_statelock); 1803 ep->error = 0; 1804 goto failed_reopen; 1805 } 1806 mutex_exit(&rp->r_statelock); 1807 1808 /* 1809 * argop is empty here 1810 * 1811 * PUTFH, OPEN, GETATTR 1812 */ 1813 args.ctag = TAG_REOPEN; 1814 args.array_len = 4; 1815 args.array = argop; 1816 1817 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 1818 "nfs4_reopen: file is type %d, id %s", 1819 vp->v_type, rnode4info(VTOR4(vp)))); 1820 1821 argop[0].argop = OP_CPUTFH; 1822 1823 if (claim != CLAIM_PREVIOUS) { 1824 /* 1825 * if this is a file mount then 1826 * use the mntinfo parentfh 1827 */ 1828 argop[0].nfs_argop4_u.opcputfh.sfh = 1829 (vp->v_flag & VROOT) ? mi->mi_srvparentfh : 1830 VTOSV(vp)->sv_dfh; 1831 } else { 1832 /* putfh fh to reopen */ 1833 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 1834 } 1835 1836 argop[1].argop = OP_COPEN; 1837 open_args = &argop[1].nfs_argop4_u.opcopen; 1838 open_args->claim = claim; 1839 1840 if (claim == CLAIM_NULL) { 1841 1842 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1843 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1844 "failed for vp 0x%p for CLAIM_NULL with %m", 1845 (void *)vp); 1846 failed_msg = "Couldn't reopen: vtoname failed for " 1847 "CLAIM_NULL"; 1848 /* nothing allocated yet */ 1849 goto kill_file; 1850 } 1851 1852 open_args->open_claim4_u.cfile = fn; 1853 } else if (claim == CLAIM_PREVIOUS) { 1854 1855 /* 1856 * We have two cases to deal with here: 1857 * 1) We're being called to reopen files in order to satisfy 1858 * a lock operation request which requires us to explicitly 1859 * reopen files which were opened under a delegation. If 1860 * we're in recovery, we *must* use CLAIM_PREVIOUS. In 1861 * that case, frc_use_claim_previous is TRUE and we must 1862 * use the rnode's current delegation type (r_deleg_type). 1863 * 2) We're reopening files during some form of recovery. 1864 * In this case, frc_use_claim_previous is FALSE and we 1865 * use the delegation type appropriate for recovery 1866 * (r_deleg_needs_recovery). 1867 */ 1868 mutex_enter(&rp->r_statev4_lock); 1869 open_args->open_claim4_u.delegate_type = 1870 frc_use_claim_previous ? 1871 rp->r_deleg_type : 1872 rp->r_deleg_needs_recovery; 1873 mutex_exit(&rp->r_statev4_lock); 1874 1875 } else if (claim == CLAIM_DELEGATE_CUR) { 1876 1877 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1878 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1879 "failed for vp 0x%p for CLAIM_DELEGATE_CUR " 1880 "with %m", (void *)vp); 1881 failed_msg = "Couldn't reopen: vtoname failed for " 1882 "CLAIM_DELEGATE_CUR"; 1883 /* nothing allocated yet */ 1884 goto kill_file; 1885 } 1886 1887 mutex_enter(&rp->r_statev4_lock); 1888 open_args->open_claim4_u.delegate_cur_info.delegate_stateid = 1889 rp->r_deleg_stateid; 1890 mutex_exit(&rp->r_statev4_lock); 1891 1892 open_args->open_claim4_u.delegate_cur_info.cfile = fn; 1893 } 1894 open_args->opentype = OPEN4_NOCREATE; 1895 open_args->owner.clientid = mi2clientid(mi); 1896 open_args->owner.owner_len = sizeof (oop->oo_name); 1897 open_args->owner.owner_val = 1898 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1899 bcopy(&oop->oo_name, open_args->owner.owner_val, 1900 open_args->owner.owner_len); 1901 open_args->share_access = 0; 1902 open_args->share_deny = 0; 1903 1904 mutex_enter(&osp->os_sync_lock); 1905 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp " 1906 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: " 1907 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ", 1908 (void *)osp, (void *)rp, osp->os_share_acc_read, 1909 osp->os_share_acc_write, osp->os_open_ref_count, 1910 osp->os_mmap_read, osp->os_mmap_write, claim)); 1911 1912 if (osp->os_share_acc_read || osp->os_mmap_read) 1913 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1914 if (osp->os_share_acc_write || osp->os_mmap_write) 1915 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1916 if (osp->os_share_deny_read) 1917 open_args->share_deny |= OPEN4_SHARE_DENY_READ; 1918 if (osp->os_share_deny_write) 1919 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE; 1920 mutex_exit(&osp->os_sync_lock); 1921 1922 seqid = nfs4_get_open_seqid(oop) + 1; 1923 open_args->seqid = seqid; 1924 1925 /* Construct the getfh part of the compound */ 1926 argop[2].argop = OP_GETFH; 1927 1928 /* Construct the getattr part of the compound */ 1929 argop[3].argop = OP_GETATTR; 1930 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1931 argop[3].nfs_argop4_u.opgetattr.mi = mi; 1932 1933 t = gethrtime(); 1934 1935 rcsync = nfs4_recall_sync_start(mi); 1936 1937 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 1938 1939 if (ep->error) { 1940 if (!is_recov && !frc_use_claim_previous && 1941 (ep->error == EINTR || ep->error == ETIMEDOUT || 1942 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) { 1943 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop, 1944 cred_otw, vp, NULL, open_args); 1945 abort = nfs4_start_recovery(ep, 1946 VTOMI4(vp), vp, NULL, NULL, 1947 lost_rqst.lr_op == OP_OPEN ? 1948 &lost_rqst : NULL, OP_OPEN, NULL, NULL, NULL); 1949 nfs4args_copen_free(open_args); 1950 goto bailout; 1951 } 1952 1953 nfs4args_copen_free(open_args); 1954 1955 if (ep->error == EACCES && cred_otw != cr) { 1956 crfree(cred_otw); 1957 cred_otw = cr; 1958 crhold(cred_otw); 1959 nfs4_recall_sync_end(mi, rcsync); 1960 rcsync = NULL; 1961 nfs4_end_open_seqid_sync(oop); 1962 open_owner_rele(oop); 1963 oop = NULL; 1964 goto top; 1965 } 1966 if (ep->error == ETIMEDOUT) 1967 goto bailout; 1968 failed_msg = "Couldn't reopen: rpc error"; 1969 goto kill_file; 1970 } 1971 1972 if (nfs4_need_to_bump_seqid(&res)) 1973 nfs4_set_open_seqid(seqid, oop, args.ctag); 1974 1975 switch (res.status) { 1976 case NFS4_OK: 1977 if (recov.rs_flags & NFS4_RS_DELAY_MSG) { 1978 mutex_enter(&rp->r_statelock); 1979 rp->r_delay_interval = 0; 1980 mutex_exit(&rp->r_statelock); 1981 } 1982 break; 1983 case NFS4ERR_BAD_SEQID: 1984 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0, 1985 args.ctag, open_args->seqid); 1986 1987 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, 1988 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst : 1989 NULL, OP_OPEN, bsep, NULL, NULL); 1990 1991 nfs4args_copen_free(open_args); 1992 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1993 nfs4_end_open_seqid_sync(oop); 1994 open_owner_rele(oop); 1995 oop = NULL; 1996 kmem_free(bsep, sizeof (*bsep)); 1997 1998 goto kill_file; 1999 case NFS4ERR_NO_GRACE: 2000 nfs4args_copen_free(open_args); 2001 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2002 nfs4_recall_sync_end(mi, rcsync); 2003 rcsync = NULL; 2004 nfs4_end_open_seqid_sync(oop); 2005 open_owner_rele(oop); 2006 oop = NULL; 2007 if (claim == CLAIM_PREVIOUS) { 2008 /* 2009 * Retry as a plain open. We don't need to worry about 2010 * checking the changeinfo: it is acceptable for a 2011 * client to re-open a file and continue processing 2012 * (in the absence of locks). 2013 */ 2014 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2015 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; " 2016 "will retry as CLAIM_NULL")); 2017 claim = CLAIM_NULL; 2018 nfs4_mi_kstat_inc_no_grace(mi); 2019 goto top; 2020 } 2021 failed_msg = 2022 "Couldn't reopen: tried reclaim outside grace period. "; 2023 goto kill_file; 2024 case NFS4ERR_GRACE: 2025 nfs4_set_grace_wait(mi); 2026 nfs4args_copen_free(open_args); 2027 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2028 nfs4_recall_sync_end(mi, rcsync); 2029 rcsync = NULL; 2030 nfs4_end_open_seqid_sync(oop); 2031 open_owner_rele(oop); 2032 oop = NULL; 2033 ep->error = nfs4_wait_for_grace(mi, &recov); 2034 if (ep->error != 0) 2035 goto bailout; 2036 goto top; 2037 case NFS4ERR_DELAY: 2038 nfs4_set_delay_wait(vp); 2039 nfs4args_copen_free(open_args); 2040 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2041 nfs4_recall_sync_end(mi, rcsync); 2042 rcsync = NULL; 2043 nfs4_end_open_seqid_sync(oop); 2044 open_owner_rele(oop); 2045 oop = NULL; 2046 ep->error = nfs4_wait_for_delay(vp, &recov); 2047 nfs4_mi_kstat_inc_delay(mi); 2048 if (ep->error != 0) 2049 goto bailout; 2050 goto top; 2051 case NFS4ERR_FHEXPIRED: 2052 /* recover filehandle and retry */ 2053 abort = nfs4_start_recovery(ep, 2054 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL, NULL, NULL); 2055 nfs4args_copen_free(open_args); 2056 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2057 nfs4_recall_sync_end(mi, rcsync); 2058 rcsync = NULL; 2059 nfs4_end_open_seqid_sync(oop); 2060 open_owner_rele(oop); 2061 oop = NULL; 2062 if (abort == FALSE) 2063 goto top; 2064 failed_msg = "Couldn't reopen: recovery aborted"; 2065 goto kill_file; 2066 case NFS4ERR_RESOURCE: 2067 case NFS4ERR_STALE_CLIENTID: 2068 case NFS4ERR_WRONGSEC: 2069 case NFS4ERR_EXPIRED: 2070 /* 2071 * Do not mark the file dead and let the calling 2072 * function initiate recovery. 2073 */ 2074 nfs4args_copen_free(open_args); 2075 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2076 nfs4_recall_sync_end(mi, rcsync); 2077 rcsync = NULL; 2078 nfs4_end_open_seqid_sync(oop); 2079 open_owner_rele(oop); 2080 oop = NULL; 2081 goto bailout; 2082 case NFS4ERR_ACCESS: 2083 if (cred_otw != cr) { 2084 crfree(cred_otw); 2085 cred_otw = cr; 2086 crhold(cred_otw); 2087 nfs4args_copen_free(open_args); 2088 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2089 nfs4_recall_sync_end(mi, rcsync); 2090 rcsync = NULL; 2091 nfs4_end_open_seqid_sync(oop); 2092 open_owner_rele(oop); 2093 oop = NULL; 2094 goto top; 2095 } 2096 /* fall through */ 2097 default: 2098 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 2099 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s", 2100 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv, 2101 rnode4info(VTOR4(vp)))); 2102 failed_msg = "Couldn't reopen: NFSv4 error"; 2103 nfs4args_copen_free(open_args); 2104 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2105 goto kill_file; 2106 } 2107 2108 resop = &res.array[1]; /* open res */ 2109 op_res = &resop->nfs_resop4_u.opopen; 2110 2111 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res; 2112 2113 /* 2114 * Check if the path we reopened really is the same 2115 * file. We could end up in a situation where the file 2116 * was removed and a new file created with the same name. 2117 */ 2118 resop = &res.array[2]; 2119 gf_res = &resop->nfs_resop4_u.opgetfh; 2120 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 2121 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0); 2122 if (fh_different) { 2123 if (mi->mi_fh_expire_type == FH4_PERSISTENT || 2124 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) { 2125 /* Oops, we don't have the same file */ 2126 if (mi->mi_fh_expire_type == FH4_PERSISTENT) 2127 failed_msg = "Couldn't reopen: Persistent " 2128 "file handle changed"; 2129 else 2130 failed_msg = "Couldn't reopen: Volatile " 2131 "(no expire on open) file handle changed"; 2132 2133 nfs4args_copen_free(open_args); 2134 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2135 nfs_rw_exit(&mi->mi_fh_lock); 2136 goto kill_file; 2137 2138 } else { 2139 /* 2140 * We have volatile file handles that don't compare. 2141 * If the fids are the same then we assume that the 2142 * file handle expired but the rnode still refers to 2143 * the same file object. 2144 * 2145 * First check that we have fids or not. 2146 * If we don't we have a dumb server so we will 2147 * just assume every thing is ok for now. 2148 */ 2149 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID && 2150 rp->r_attr.va_mask & AT_NODEID && 2151 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) { 2152 /* 2153 * We have fids, but they don't 2154 * compare. So kill the file. 2155 */ 2156 failed_msg = 2157 "Couldn't reopen: file handle changed" 2158 " due to mismatched fids"; 2159 nfs4args_copen_free(open_args); 2160 (void) xdr_free(xdr_COMPOUND4res_clnt, 2161 (caddr_t)&res); 2162 nfs_rw_exit(&mi->mi_fh_lock); 2163 goto kill_file; 2164 } else { 2165 /* 2166 * We have volatile file handles that refers 2167 * to the same file (at least they have the 2168 * same fid) or we don't have fids so we 2169 * can't tell. :(. We'll be a kind and accepting 2170 * client so we'll update the rnode's file 2171 * handle with the otw handle. 2172 * 2173 * We need to drop mi->mi_fh_lock since 2174 * sh4_update acquires it. Since there is 2175 * only one recovery thread there is no 2176 * race. 2177 */ 2178 nfs_rw_exit(&mi->mi_fh_lock); 2179 sfh4_update(rp->r_fh, &gf_res->object); 2180 } 2181 } 2182 } else { 2183 nfs_rw_exit(&mi->mi_fh_lock); 2184 } 2185 2186 ASSERT(nfs4_consistent_type(vp)); 2187 2188 /* 2189 * If the server wanted an OPEN_CONFIRM but that fails, just start 2190 * over. Presumably if there is a persistent error it will show up 2191 * when we resend the OPEN. 2192 */ 2193 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 2194 bool_t retry_open = FALSE; 2195 2196 nfs4open_confirm(vp, &seqid, &op_res->stateid, 2197 cred_otw, is_recov, &retry_open, 2198 oop, FALSE, ep, NULL); 2199 if (ep->error || ep->stat) { 2200 nfs4args_copen_free(open_args); 2201 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2202 nfs4_recall_sync_end(mi, rcsync); 2203 rcsync = NULL; 2204 nfs4_end_open_seqid_sync(oop); 2205 open_owner_rele(oop); 2206 oop = NULL; 2207 goto top; 2208 } 2209 } 2210 2211 mutex_enter(&osp->os_sync_lock); 2212 osp->open_stateid = op_res->stateid; 2213 osp->os_delegation = 0; 2214 /* 2215 * Need to reset this bitfield for the possible case where we were 2216 * going to OTW CLOSE the file, got a non-recoverable error, and before 2217 * we could retry the CLOSE, OPENed the file again. 2218 */ 2219 ASSERT(osp->os_open_owner->oo_seqid_inuse); 2220 osp->os_final_close = 0; 2221 osp->os_force_close = 0; 2222 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS) 2223 osp->os_dc_openacc = open_args->share_access; 2224 mutex_exit(&osp->os_sync_lock); 2225 2226 nfs4_end_open_seqid_sync(oop); 2227 2228 /* accept delegation, if any */ 2229 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw); 2230 2231 nfs4_recall_sync_end(mi, rcsync); 2232 2233 nfs4args_copen_free(open_args); 2234 2235 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 2236 2237 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2238 2239 ASSERT(nfs4_consistent_type(vp)); 2240 2241 open_owner_rele(oop); 2242 crfree(cr); 2243 crfree(cred_otw); 2244 return; 2245 2246 kill_file: 2247 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat); 2248 failed_reopen: 2249 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 2250 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s", 2251 (void *)osp, (void *)cr, rnode4info(rp))); 2252 mutex_enter(&osp->os_sync_lock); 2253 osp->os_failed_reopen = 1; 2254 mutex_exit(&osp->os_sync_lock); 2255 bailout: 2256 if (rcsync != NULL) 2257 nfs4_recall_sync_end(mi, rcsync); 2258 if (oop != NULL) { 2259 nfs4_end_open_seqid_sync(oop); 2260 open_owner_rele(oop); 2261 } 2262 if (cr != NULL) 2263 crfree(cr); 2264 if (cred_otw != NULL) 2265 crfree(cred_otw); 2266 } 2267 2268 /* for . and .. OPENs */ 2269 /* ARGSUSED */ 2270 static int 2271 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr) 2272 { 2273 rnode4_t *rp; 2274 nfs4_ga_res_t gar; 2275 2276 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone); 2277 2278 /* 2279 * If close-to-open consistency checking is turned off or 2280 * if there is no cached data, we can avoid 2281 * the over the wire getattr. Otherwise, force a 2282 * call to the server to get fresh attributes and to 2283 * check caches. This is required for close-to-open 2284 * consistency. 2285 */ 2286 rp = VTOR4(*vpp); 2287 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO || 2288 (rp->r_dir == NULL && !nfs4_has_pages(*vpp))) 2289 return (0); 2290 2291 return (nfs4_getattr_otw(*vpp, &gar, cr, 0)); 2292 } 2293 2294 /* 2295 * CLOSE a file 2296 */ 2297 /* ARGSUSED */ 2298 static int 2299 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 2300 caller_context_t *ct) 2301 { 2302 rnode4_t *rp; 2303 int error = 0; 2304 int r_error = 0; 2305 int n4error = 0; 2306 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 2307 2308 /* 2309 * Remove client state for this (lockowner, file) pair. 2310 * Issue otw v4 call to have the server do the same. 2311 */ 2312 2313 rp = VTOR4(vp); 2314 2315 /* 2316 * zone_enter(2) prevents processes from changing zones with NFS files 2317 * open; if we happen to get here from the wrong zone we can't do 2318 * anything over the wire. 2319 */ 2320 if (VTOMI4(vp)->mi_zone != nfs_zone()) { 2321 /* 2322 * We could attempt to clean up locks, except we're sure 2323 * that the current process didn't acquire any locks on 2324 * the file: any attempt to lock a file belong to another zone 2325 * will fail, and one can't lock an NFS file and then change 2326 * zones, as that fails too. 2327 * 2328 * Returning an error here is the sane thing to do. A 2329 * subsequent call to VN_RELE() which translates to a 2330 * nfs4_inactive() will clean up state: if the zone of the 2331 * vnode's origin is still alive and kicking, the inactive 2332 * thread will handle the request (from the correct zone), and 2333 * everything (minus the OTW close call) should be OK. If the 2334 * zone is going away nfs4_async_inactive() will throw away 2335 * delegations, open streams and cached pages inline. 2336 */ 2337 return (EIO); 2338 } 2339 2340 /* 2341 * If we are using local locking for this filesystem, then 2342 * release all of the SYSV style record locks. Otherwise, 2343 * we are doing network locking and we need to release all 2344 * of the network locks. All of the locks held by this 2345 * process on this file are released no matter what the 2346 * incoming reference count is. 2347 */ 2348 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) { 2349 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 2350 cleanshares(vp, ttoproc(curthread)->p_pid); 2351 } else 2352 e.error = nfs4_lockrelease(vp, flag, offset, cr); 2353 2354 if (e.error) { 2355 struct lm_sysid *lmsid; 2356 lmsid = nfs4_find_sysid(VTOMI4(vp)); 2357 if (lmsid == NULL) { 2358 DTRACE_PROBE2(unknown__sysid, int, e.error, 2359 vnode_t *, vp); 2360 } else { 2361 cleanlocks(vp, ttoproc(curthread)->p_pid, 2362 (lm_sysidt(lmsid) | LM_SYSID_CLIENT)); 2363 2364 lm_rel_sysid(lmsid); 2365 } 2366 return (e.error); 2367 } 2368 2369 if (count > 1) 2370 return (0); 2371 2372 /* 2373 * If the file has been `unlinked', then purge the 2374 * DNLC so that this vnode will get reycled quicker 2375 * and the .nfs* file on the server will get removed. 2376 */ 2377 if (rp->r_unldvp != NULL) 2378 dnlc_purge_vp(vp); 2379 2380 /* 2381 * If the file was open for write and there are pages, 2382 * do a synchronous flush and commit of all of the 2383 * dirty and uncommitted pages. 2384 */ 2385 ASSERT(!e.error); 2386 if ((flag & FWRITE) && nfs4_has_pages(vp)) 2387 error = nfs4_putpage_commit(vp, 0, 0, cr); 2388 2389 mutex_enter(&rp->r_statelock); 2390 r_error = rp->r_error; 2391 rp->r_error = 0; 2392 mutex_exit(&rp->r_statelock); 2393 2394 /* 2395 * If this file type is one for which no explicit 'open' was 2396 * done, then bail now (ie. no need for protocol 'close'). If 2397 * there was an error w/the vm subsystem, return _that_ error, 2398 * otherwise, return any errors that may've been reported via 2399 * the rnode. 2400 */ 2401 if (vp->v_type != VREG) 2402 return (error ? error : r_error); 2403 2404 /* 2405 * The sync putpage commit may have failed above, but since 2406 * we're working w/a regular file, we need to do the protocol 2407 * 'close' (nfs4close_one will figure out if an otw close is 2408 * needed or not). Report any errors _after_ doing the protocol 2409 * 'close'. 2410 */ 2411 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0); 2412 n4error = e.error ? e.error : geterrno4(e.stat); 2413 2414 /* 2415 * Error reporting prio (Hi -> Lo) 2416 * 2417 * i) nfs4_putpage_commit (error) 2418 * ii) rnode's (r_error) 2419 * iii) nfs4close_one (n4error) 2420 */ 2421 return (error ? error : (r_error ? r_error : n4error)); 2422 } 2423 2424 /* 2425 * Initialize *lost_rqstp. 2426 */ 2427 2428 static void 2429 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 2430 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr, 2431 vnode_t *vp) 2432 { 2433 if (error != ETIMEDOUT && error != EINTR && 2434 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 2435 lost_rqstp->lr_op = 0; 2436 return; 2437 } 2438 2439 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 2440 "nfs4close_save_lost_rqst: error %d", error)); 2441 2442 lost_rqstp->lr_op = OP_CLOSE; 2443 /* 2444 * The vp is held and rele'd via the recovery code. 2445 * See nfs4_save_lost_rqst. 2446 */ 2447 lost_rqstp->lr_vp = vp; 2448 lost_rqstp->lr_dvp = NULL; 2449 lost_rqstp->lr_oop = oop; 2450 lost_rqstp->lr_osp = osp; 2451 ASSERT(osp != NULL); 2452 ASSERT(mutex_owned(&osp->os_sync_lock)); 2453 osp->os_pending_close = 1; 2454 lost_rqstp->lr_lop = NULL; 2455 lost_rqstp->lr_cr = cr; 2456 lost_rqstp->lr_flk = NULL; 2457 lost_rqstp->lr_putfirst = FALSE; 2458 } 2459 2460 /* 2461 * Assumes you already have the open seqid sync grabbed as well as the 2462 * 'os_sync_lock'. Note: this will release the open seqid sync and 2463 * 'os_sync_lock' if client recovery starts. Calling functions have to 2464 * be prepared to handle this. 2465 * 2466 * 'recov' is returned as 1 if the CLOSE operation detected client recovery 2467 * was needed and was started, and that the calling function should retry 2468 * this function; otherwise it is returned as 0. 2469 * 2470 * Errors are returned via the nfs4_error_t parameter. 2471 */ 2472 static void 2473 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop, 2474 nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp, 2475 nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp) 2476 { 2477 COMPOUND4args_clnt args; 2478 COMPOUND4res_clnt res; 2479 CLOSE4args *close_args; 2480 nfs_resop4 *resop; 2481 nfs_argop4 argop[3]; 2482 int doqueue = 1; 2483 mntinfo4_t *mi; 2484 seqid4 seqid; 2485 vnode_t *vp; 2486 bool_t needrecov = FALSE; 2487 nfs4_lost_rqst_t lost_rqst; 2488 hrtime_t t; 2489 2490 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone); 2491 2492 ASSERT(MUTEX_HELD(&osp->os_sync_lock)); 2493 2494 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw")); 2495 2496 /* Only set this to 1 if recovery is started */ 2497 *recov = 0; 2498 2499 /* do the OTW call to close the file */ 2500 2501 if (close_type == CLOSE_RESEND) 2502 args.ctag = TAG_CLOSE_LOST; 2503 else if (close_type == CLOSE_AFTER_RESEND) 2504 args.ctag = TAG_CLOSE_UNDO; 2505 else 2506 args.ctag = TAG_CLOSE; 2507 2508 args.array_len = 3; 2509 args.array = argop; 2510 2511 vp = RTOV4(rp); 2512 2513 mi = VTOMI4(vp); 2514 2515 /* putfh target fh */ 2516 argop[0].argop = OP_CPUTFH; 2517 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 2518 2519 argop[1].argop = OP_GETATTR; 2520 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 2521 argop[1].nfs_argop4_u.opgetattr.mi = mi; 2522 2523 argop[2].argop = OP_CLOSE; 2524 close_args = &argop[2].nfs_argop4_u.opclose; 2525 2526 seqid = nfs4_get_open_seqid(oop) + 1; 2527 2528 close_args->seqid = seqid; 2529 close_args->open_stateid = osp->open_stateid; 2530 2531 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 2532 "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first", 2533 rnode4info(rp))); 2534 2535 t = gethrtime(); 2536 2537 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 2538 2539 if (!ep->error && nfs4_need_to_bump_seqid(&res)) { 2540 nfs4_set_open_seqid(seqid, oop, args.ctag); 2541 } 2542 2543 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp); 2544 if (ep->error && !needrecov) { 2545 /* 2546 * if there was an error and no recovery is to be done 2547 * then then set up the file to flush its cache if 2548 * needed for the next caller. 2549 */ 2550 mutex_enter(&rp->r_statelock); 2551 PURGE_ATTRCACHE4_LOCKED(rp); 2552 rp->r_flags &= ~R4WRITEMODIFIED; 2553 mutex_exit(&rp->r_statelock); 2554 return; 2555 } 2556 2557 if (needrecov) { 2558 bool_t abort; 2559 nfs4_bseqid_entry_t *bsep = NULL; 2560 2561 if (close_type != CLOSE_RESEND) 2562 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop, 2563 osp, cred_otw, vp); 2564 2565 if (!ep->error && res.status == NFS4ERR_BAD_SEQID) 2566 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 2567 0, args.ctag, close_args->seqid); 2568 2569 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 2570 "nfs4close_otw: initiating recovery. error %d " 2571 "res.status %d", ep->error, res.status)); 2572 2573 /* 2574 * Drop the 'os_sync_lock' here so we don't hit 2575 * a potential recursive mutex_enter via an 2576 * 'open_stream_hold()'. 2577 */ 2578 mutex_exit(&osp->os_sync_lock); 2579 *have_sync_lockp = 0; 2580 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 2581 (close_type != CLOSE_RESEND && 2582 lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL, 2583 OP_CLOSE, bsep, NULL, NULL); 2584 2585 /* drop open seq sync, and let the calling function regrab it */ 2586 nfs4_end_open_seqid_sync(oop); 2587 *did_start_seqid_syncp = 0; 2588 2589 if (bsep) 2590 kmem_free(bsep, sizeof (*bsep)); 2591 /* 2592 * For signals, the caller wants to quit, so don't say to 2593 * retry. For forced unmount, if it's a user thread, it 2594 * wants to quit. If it's a recovery thread, the retry 2595 * will happen higher-up on the call stack. Either way, 2596 * don't say to retry. 2597 */ 2598 if (abort == FALSE && ep->error != EINTR && 2599 !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) && 2600 close_type != CLOSE_RESEND && 2601 close_type != CLOSE_AFTER_RESEND) 2602 *recov = 1; 2603 else 2604 *recov = 0; 2605 2606 if (!ep->error) 2607 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2608 return; 2609 } 2610 2611 if (res.status) { 2612 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2613 return; 2614 } 2615 2616 mutex_enter(&rp->r_statev4_lock); 2617 rp->created_v4 = 0; 2618 mutex_exit(&rp->r_statev4_lock); 2619 2620 resop = &res.array[2]; 2621 osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid; 2622 osp->os_valid = 0; 2623 2624 /* 2625 * This removes the reference obtained at OPEN; ie, when the 2626 * open stream structure was created. 2627 * 2628 * We don't have to worry about calling 'open_stream_rele' 2629 * since we our currently holding a reference to the open 2630 * stream which means the count cannot go to 0 with this 2631 * decrement. 2632 */ 2633 ASSERT(osp->os_ref_count >= 2); 2634 osp->os_ref_count--; 2635 2636 if (ep->error == 0) { 2637 mutex_exit(&osp->os_sync_lock); 2638 *have_sync_lockp = 0; 2639 2640 nfs4_attr_cache(vp, 2641 &res.array[1].nfs_resop4_u.opgetattr.ga_res, 2642 t, cred_otw, TRUE, NULL); 2643 } 2644 2645 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:" 2646 " returning %d", ep->error)); 2647 2648 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2649 } 2650 2651 /* ARGSUSED */ 2652 static int 2653 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2654 caller_context_t *ct) 2655 { 2656 rnode4_t *rp; 2657 u_offset_t off; 2658 offset_t diff; 2659 uint_t on; 2660 uint_t n; 2661 caddr_t base; 2662 uint_t flags; 2663 int error; 2664 mntinfo4_t *mi; 2665 2666 rp = VTOR4(vp); 2667 2668 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 2669 2670 if (IS_SHADOW(vp, rp)) 2671 vp = RTOV4(rp); 2672 2673 if (vp->v_type != VREG) 2674 return (EISDIR); 2675 2676 mi = VTOMI4(vp); 2677 2678 if (nfs_zone() != mi->mi_zone) 2679 return (EIO); 2680 2681 if (uiop->uio_resid == 0) 2682 return (0); 2683 2684 if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0) 2685 return (EINVAL); 2686 2687 mutex_enter(&rp->r_statelock); 2688 if (rp->r_flags & R4RECOVERRP) 2689 error = (rp->r_error ? rp->r_error : EIO); 2690 else 2691 error = 0; 2692 mutex_exit(&rp->r_statelock); 2693 if (error) 2694 return (error); 2695 2696 /* 2697 * Bypass VM if caching has been disabled (e.g., locking) or if 2698 * using client-side direct I/O and the file is not mmap'd and 2699 * there are no cached pages. 2700 */ 2701 if ((vp->v_flag & VNOCACHE) || 2702 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2703 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) { 2704 size_t resid = 0; 2705 2706 return (nfs4read(vp, NULL, uiop->uio_loffset, 2707 uiop->uio_resid, &resid, cr, FALSE, uiop)); 2708 } 2709 2710 error = 0; 2711 2712 do { 2713 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2714 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2715 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2716 2717 if (error = nfs4_validate_caches(vp, cr)) 2718 break; 2719 2720 mutex_enter(&rp->r_statelock); 2721 while (rp->r_flags & R4INCACHEPURGE) { 2722 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 2723 mutex_exit(&rp->r_statelock); 2724 return (EINTR); 2725 } 2726 } 2727 diff = rp->r_size - uiop->uio_loffset; 2728 mutex_exit(&rp->r_statelock); 2729 if (diff <= 0) 2730 break; 2731 if (diff < n) 2732 n = (uint_t)diff; 2733 2734 if (vpm_enable) { 2735 /* 2736 * Copy data. 2737 */ 2738 error = vpm_data_copy(vp, off + on, n, uiop, 2739 1, NULL, 0, S_READ); 2740 } else { 2741 base = segmap_getmapflt(segkmap, vp, off + on, n, 1, 2742 S_READ); 2743 2744 error = uiomove(base + on, n, UIO_READ, uiop); 2745 } 2746 2747 if (!error) { 2748 /* 2749 * If read a whole block or read to eof, 2750 * won't need this buffer again soon. 2751 */ 2752 mutex_enter(&rp->r_statelock); 2753 if (n + on == MAXBSIZE || 2754 uiop->uio_loffset == rp->r_size) 2755 flags = SM_DONTNEED; 2756 else 2757 flags = 0; 2758 mutex_exit(&rp->r_statelock); 2759 if (vpm_enable) { 2760 error = vpm_sync_pages(vp, off, n, flags); 2761 } else { 2762 error = segmap_release(segkmap, base, flags); 2763 } 2764 } else { 2765 if (vpm_enable) { 2766 (void) vpm_sync_pages(vp, off, n, 0); 2767 } else { 2768 (void) segmap_release(segkmap, base, 0); 2769 } 2770 } 2771 } while (!error && uiop->uio_resid > 0); 2772 2773 return (error); 2774 } 2775 2776 /* ARGSUSED */ 2777 static int 2778 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2779 caller_context_t *ct) 2780 { 2781 rlim64_t limit = uiop->uio_llimit; 2782 rnode4_t *rp; 2783 u_offset_t off; 2784 caddr_t base; 2785 uint_t flags; 2786 int remainder; 2787 size_t n; 2788 int on; 2789 int error; 2790 int resid; 2791 u_offset_t offset; 2792 mntinfo4_t *mi; 2793 uint_t bsize; 2794 2795 rp = VTOR4(vp); 2796 2797 if (IS_SHADOW(vp, rp)) 2798 vp = RTOV4(rp); 2799 2800 if (vp->v_type != VREG) 2801 return (EISDIR); 2802 2803 mi = VTOMI4(vp); 2804 2805 if (nfs_zone() != mi->mi_zone) 2806 return (EIO); 2807 2808 if (uiop->uio_resid == 0) 2809 return (0); 2810 2811 mutex_enter(&rp->r_statelock); 2812 if (rp->r_flags & R4RECOVERRP) 2813 error = (rp->r_error ? rp->r_error : EIO); 2814 else 2815 error = 0; 2816 mutex_exit(&rp->r_statelock); 2817 if (error) 2818 return (error); 2819 2820 if (ioflag & FAPPEND) { 2821 struct vattr va; 2822 2823 /* 2824 * Must serialize if appending. 2825 */ 2826 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) { 2827 nfs_rw_exit(&rp->r_rwlock); 2828 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, 2829 INTR4(vp))) 2830 return (EINTR); 2831 } 2832 2833 va.va_mask = AT_SIZE; 2834 error = nfs4getattr(vp, &va, cr); 2835 if (error) 2836 return (error); 2837 uiop->uio_loffset = va.va_size; 2838 } 2839 2840 offset = uiop->uio_loffset + uiop->uio_resid; 2841 2842 if (uiop->uio_loffset < (offset_t)0 || offset < 0) 2843 return (EINVAL); 2844 2845 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 2846 limit = MAXOFFSET_T; 2847 2848 /* 2849 * Check to make sure that the process will not exceed 2850 * its limit on file size. It is okay to write up to 2851 * the limit, but not beyond. Thus, the write which 2852 * reaches the limit will be short and the next write 2853 * will return an error. 2854 */ 2855 remainder = 0; 2856 if (offset > uiop->uio_llimit) { 2857 remainder = offset - uiop->uio_llimit; 2858 uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset; 2859 if (uiop->uio_resid <= 0) { 2860 proc_t *p = ttoproc(curthread); 2861 2862 uiop->uio_resid += remainder; 2863 mutex_enter(&p->p_lock); 2864 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], 2865 p->p_rctls, p, RCA_UNSAFE_SIGINFO); 2866 mutex_exit(&p->p_lock); 2867 return (EFBIG); 2868 } 2869 } 2870 2871 /* update the change attribute, if we have a write delegation */ 2872 2873 mutex_enter(&rp->r_statev4_lock); 2874 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) 2875 rp->r_deleg_change++; 2876 2877 mutex_exit(&rp->r_statev4_lock); 2878 2879 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, INTR4(vp))) 2880 return (EINTR); 2881 2882 /* 2883 * Bypass VM if caching has been disabled (e.g., locking) or if 2884 * using client-side direct I/O and the file is not mmap'd and 2885 * there are no cached pages. 2886 */ 2887 if ((vp->v_flag & VNOCACHE) || 2888 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2889 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) { 2890 size_t bufsize; 2891 int count; 2892 u_offset_t org_offset; 2893 stable_how4 stab_comm; 2894 nfs4_fwrite: 2895 if (rp->r_flags & R4STALE) { 2896 resid = uiop->uio_resid; 2897 offset = uiop->uio_loffset; 2898 error = rp->r_error; 2899 /* 2900 * A close may have cleared r_error, if so, 2901 * propagate ESTALE error return properly 2902 */ 2903 if (error == 0) 2904 error = ESTALE; 2905 goto bottom; 2906 } 2907 2908 bufsize = MIN(uiop->uio_resid, mi->mi_stsize); 2909 base = kmem_alloc(bufsize, KM_SLEEP); 2910 do { 2911 if (ioflag & FDSYNC) 2912 stab_comm = DATA_SYNC4; 2913 else 2914 stab_comm = FILE_SYNC4; 2915 resid = uiop->uio_resid; 2916 offset = uiop->uio_loffset; 2917 count = MIN(uiop->uio_resid, bufsize); 2918 org_offset = uiop->uio_loffset; 2919 error = uiomove(base, count, UIO_WRITE, uiop); 2920 if (!error) { 2921 error = nfs4write(vp, base, org_offset, 2922 count, cr, &stab_comm); 2923 if (!error) { 2924 mutex_enter(&rp->r_statelock); 2925 if (rp->r_size < uiop->uio_loffset) 2926 rp->r_size = uiop->uio_loffset; 2927 mutex_exit(&rp->r_statelock); 2928 } 2929 } 2930 } while (!error && uiop->uio_resid > 0); 2931 kmem_free(base, bufsize); 2932 goto bottom; 2933 } 2934 2935 bsize = vp->v_vfsp->vfs_bsize; 2936 2937 do { 2938 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2939 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2940 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2941 2942 resid = uiop->uio_resid; 2943 offset = uiop->uio_loffset; 2944 2945 if (rp->r_flags & R4STALE) { 2946 error = rp->r_error; 2947 /* 2948 * A close may have cleared r_error, if so, 2949 * propagate ESTALE error return properly 2950 */ 2951 if (error == 0) 2952 error = ESTALE; 2953 break; 2954 } 2955 2956 /* 2957 * Don't create dirty pages faster than they 2958 * can be cleaned so that the system doesn't 2959 * get imbalanced. If the async queue is 2960 * maxed out, then wait for it to drain before 2961 * creating more dirty pages. Also, wait for 2962 * any threads doing pagewalks in the vop_getattr 2963 * entry points so that they don't block for 2964 * long periods. 2965 */ 2966 mutex_enter(&rp->r_statelock); 2967 while ((mi->mi_max_threads != 0 && 2968 rp->r_awcount > 2 * mi->mi_max_threads) || 2969 rp->r_gcount > 0) { 2970 if (INTR4(vp)) { 2971 klwp_t *lwp = ttolwp(curthread); 2972 2973 if (lwp != NULL) 2974 lwp->lwp_nostop++; 2975 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 2976 mutex_exit(&rp->r_statelock); 2977 if (lwp != NULL) 2978 lwp->lwp_nostop--; 2979 error = EINTR; 2980 goto bottom; 2981 } 2982 if (lwp != NULL) 2983 lwp->lwp_nostop--; 2984 } else 2985 cv_wait(&rp->r_cv, &rp->r_statelock); 2986 } 2987 mutex_exit(&rp->r_statelock); 2988 2989 /* 2990 * Touch the page and fault it in if it is not in core 2991 * before segmap_getmapflt or vpm_data_copy can lock it. 2992 * This is to avoid the deadlock if the buffer is mapped 2993 * to the same file through mmap which we want to write. 2994 */ 2995 uio_prefaultpages((long)n, uiop); 2996 2997 if (vpm_enable) { 2998 /* 2999 * It will use kpm mappings, so no need to 3000 * pass an address. 3001 */ 3002 error = writerp4(rp, NULL, n, uiop, 0); 3003 } else { 3004 if (segmap_kpm) { 3005 int pon = uiop->uio_loffset & PAGEOFFSET; 3006 size_t pn = MIN(PAGESIZE - pon, 3007 uiop->uio_resid); 3008 int pagecreate; 3009 3010 mutex_enter(&rp->r_statelock); 3011 pagecreate = (pon == 0) && (pn == PAGESIZE || 3012 uiop->uio_loffset + pn >= rp->r_size); 3013 mutex_exit(&rp->r_statelock); 3014 3015 base = segmap_getmapflt(segkmap, vp, off + on, 3016 pn, !pagecreate, S_WRITE); 3017 3018 error = writerp4(rp, base + pon, n, uiop, 3019 pagecreate); 3020 3021 } else { 3022 base = segmap_getmapflt(segkmap, vp, off + on, 3023 n, 0, S_READ); 3024 error = writerp4(rp, base + on, n, uiop, 0); 3025 } 3026 } 3027 3028 if (!error) { 3029 if (mi->mi_flags & MI4_NOAC) 3030 flags = SM_WRITE; 3031 else if ((uiop->uio_loffset % bsize) == 0 || 3032 IS_SWAPVP(vp)) { 3033 /* 3034 * Have written a whole block. 3035 * Start an asynchronous write 3036 * and mark the buffer to 3037 * indicate that it won't be 3038 * needed again soon. 3039 */ 3040 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 3041 } else 3042 flags = 0; 3043 if ((ioflag & (FSYNC|FDSYNC)) || 3044 (rp->r_flags & R4OUTOFSPACE)) { 3045 flags &= ~SM_ASYNC; 3046 flags |= SM_WRITE; 3047 } 3048 if (vpm_enable) { 3049 error = vpm_sync_pages(vp, off, n, flags); 3050 } else { 3051 error = segmap_release(segkmap, base, flags); 3052 } 3053 } else { 3054 if (vpm_enable) { 3055 (void) vpm_sync_pages(vp, off, n, 0); 3056 } else { 3057 (void) segmap_release(segkmap, base, 0); 3058 } 3059 /* 3060 * In the event that we got an access error while 3061 * faulting in a page for a write-only file just 3062 * force a write. 3063 */ 3064 if (error == EACCES) 3065 goto nfs4_fwrite; 3066 } 3067 } while (!error && uiop->uio_resid > 0); 3068 3069 bottom: 3070 if (error) { 3071 uiop->uio_resid = resid + remainder; 3072 uiop->uio_loffset = offset; 3073 } else { 3074 uiop->uio_resid += remainder; 3075 3076 mutex_enter(&rp->r_statev4_lock); 3077 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 3078 gethrestime(&rp->r_attr.va_mtime); 3079 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3080 } 3081 mutex_exit(&rp->r_statev4_lock); 3082 } 3083 3084 nfs_rw_exit(&rp->r_lkserlock); 3085 3086 return (error); 3087 } 3088 3089 /* 3090 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED} 3091 */ 3092 static int 3093 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 3094 int flags, cred_t *cr) 3095 { 3096 struct buf *bp; 3097 int error; 3098 page_t *savepp; 3099 uchar_t fsdata; 3100 stable_how4 stab_comm; 3101 3102 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3103 bp = pageio_setup(pp, len, vp, flags); 3104 ASSERT(bp != NULL); 3105 3106 /* 3107 * pageio_setup should have set b_addr to 0. This 3108 * is correct since we want to do I/O on a page 3109 * boundary. bp_mapin will use this addr to calculate 3110 * an offset, and then set b_addr to the kernel virtual 3111 * address it allocated for us. 3112 */ 3113 ASSERT(bp->b_un.b_addr == 0); 3114 3115 bp->b_edev = 0; 3116 bp->b_dev = 0; 3117 bp->b_lblkno = lbtodb(off); 3118 bp->b_file = vp; 3119 bp->b_offset = (offset_t)off; 3120 bp_mapin(bp); 3121 3122 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) && 3123 freemem > desfree) 3124 stab_comm = UNSTABLE4; 3125 else 3126 stab_comm = FILE_SYNC4; 3127 3128 error = nfs4_bio(bp, &stab_comm, cr, FALSE); 3129 3130 bp_mapout(bp); 3131 pageio_done(bp); 3132 3133 if (stab_comm == UNSTABLE4) 3134 fsdata = C_DELAYCOMMIT; 3135 else 3136 fsdata = C_NOCOMMIT; 3137 3138 savepp = pp; 3139 do { 3140 pp->p_fsdata = fsdata; 3141 } while ((pp = pp->p_next) != savepp); 3142 3143 return (error); 3144 } 3145 3146 /* 3147 */ 3148 static int 3149 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr) 3150 { 3151 nfs4_open_owner_t *oop; 3152 nfs4_open_stream_t *osp; 3153 rnode4_t *rp = VTOR4(vp); 3154 mntinfo4_t *mi = VTOMI4(vp); 3155 int reopen_needed; 3156 3157 ASSERT(nfs_zone() == mi->mi_zone); 3158 3159 3160 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 3161 if (!oop) 3162 return (EIO); 3163 3164 /* returns with 'os_sync_lock' held */ 3165 osp = find_open_stream(oop, rp); 3166 if (!osp) { 3167 open_owner_rele(oop); 3168 return (EIO); 3169 } 3170 3171 if (osp->os_failed_reopen) { 3172 mutex_exit(&osp->os_sync_lock); 3173 open_stream_rele(osp, rp); 3174 open_owner_rele(oop); 3175 return (EIO); 3176 } 3177 3178 /* 3179 * Determine whether a reopen is needed. If this 3180 * is a delegation open stream, then the os_delegation bit 3181 * should be set. 3182 */ 3183 3184 reopen_needed = osp->os_delegation; 3185 3186 mutex_exit(&osp->os_sync_lock); 3187 open_owner_rele(oop); 3188 3189 if (reopen_needed) { 3190 nfs4_error_zinit(ep); 3191 nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE); 3192 mutex_enter(&osp->os_sync_lock); 3193 if (ep->error || ep->stat || osp->os_failed_reopen) { 3194 mutex_exit(&osp->os_sync_lock); 3195 open_stream_rele(osp, rp); 3196 return (EIO); 3197 } 3198 mutex_exit(&osp->os_sync_lock); 3199 } 3200 open_stream_rele(osp, rp); 3201 3202 return (0); 3203 } 3204 3205 /* 3206 * Write to file. Writes to remote server in largest size 3207 * chunks that the server can handle. Write is synchronous. 3208 */ 3209 static int 3210 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr, 3211 stable_how4 *stab_comm) 3212 { 3213 mntinfo4_t *mi; 3214 COMPOUND4args_clnt args; 3215 COMPOUND4res_clnt res; 3216 WRITE4args *wargs; 3217 WRITE4res *wres; 3218 nfs_argop4 argop[2]; 3219 nfs_resop4 *resop; 3220 int tsize; 3221 stable_how4 stable; 3222 rnode4_t *rp; 3223 int doqueue = 1; 3224 bool_t needrecov; 3225 nfs4_recov_state_t recov_state; 3226 nfs4_stateid_types_t sid_types; 3227 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3228 int recov; 3229 3230 rp = VTOR4(vp); 3231 mi = VTOMI4(vp); 3232 3233 ASSERT(nfs_zone() == mi->mi_zone); 3234 3235 stable = *stab_comm; 3236 *stab_comm = FILE_SYNC4; 3237 3238 needrecov = FALSE; 3239 recov_state.rs_flags = 0; 3240 recov_state.rs_num_retry_despite_err = 0; 3241 nfs4_init_stateid_types(&sid_types); 3242 3243 /* Is curthread the recovery thread? */ 3244 mutex_enter(&mi->mi_lock); 3245 recov = (mi->mi_recovthread == curthread); 3246 mutex_exit(&mi->mi_lock); 3247 3248 recov_retry: 3249 args.ctag = TAG_WRITE; 3250 args.array_len = 2; 3251 args.array = argop; 3252 3253 if (!recov) { 3254 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3255 &recov_state, NULL); 3256 if (e.error) 3257 return (e.error); 3258 } 3259 3260 /* 0. putfh target fh */ 3261 argop[0].argop = OP_CPUTFH; 3262 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3263 3264 /* 1. write */ 3265 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types); 3266 3267 do { 3268 3269 wargs->offset = (offset4)offset; 3270 wargs->data_val = base; 3271 3272 if (mi->mi_io_kstats) { 3273 mutex_enter(&mi->mi_lock); 3274 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3275 mutex_exit(&mi->mi_lock); 3276 } 3277 3278 if ((vp->v_flag & VNOCACHE) || 3279 (rp->r_flags & R4DIRECTIO) || 3280 (mi->mi_flags & MI4_DIRECTIO)) 3281 tsize = MIN(mi->mi_stsize, count); 3282 else 3283 tsize = MIN(mi->mi_curwrite, count); 3284 wargs->data_len = (uint_t)tsize; 3285 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3286 3287 if (mi->mi_io_kstats) { 3288 mutex_enter(&mi->mi_lock); 3289 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3290 mutex_exit(&mi->mi_lock); 3291 } 3292 3293 if (!recov) { 3294 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3295 if (e.error && !needrecov) { 3296 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3297 &recov_state, needrecov); 3298 return (e.error); 3299 } 3300 } else { 3301 if (e.error) 3302 return (e.error); 3303 } 3304 3305 /* 3306 * Do handling of OLD_STATEID outside 3307 * of the normal recovery framework. 3308 * 3309 * If write receives a BAD stateid error while using a 3310 * delegation stateid, retry using the open stateid (if it 3311 * exists). If it doesn't have an open stateid, reopen the 3312 * file first, then retry. 3313 */ 3314 if (!e.error && res.status == NFS4ERR_OLD_STATEID && 3315 sid_types.cur_sid_type != SPEC_SID) { 3316 nfs4_save_stateid(&wargs->stateid, &sid_types); 3317 if (!recov) 3318 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3319 &recov_state, needrecov); 3320 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3321 goto recov_retry; 3322 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3323 sid_types.cur_sid_type == DEL_SID) { 3324 nfs4_save_stateid(&wargs->stateid, &sid_types); 3325 mutex_enter(&rp->r_statev4_lock); 3326 rp->r_deleg_return_pending = TRUE; 3327 mutex_exit(&rp->r_statev4_lock); 3328 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3329 if (!recov) 3330 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3331 &recov_state, needrecov); 3332 (void) xdr_free(xdr_COMPOUND4res_clnt, 3333 (caddr_t)&res); 3334 return (EIO); 3335 } 3336 if (!recov) 3337 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3338 &recov_state, needrecov); 3339 /* hold needed for nfs4delegreturn_thread */ 3340 VN_HOLD(vp); 3341 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3342 NFS4_DR_DISCARD), FALSE); 3343 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3344 goto recov_retry; 3345 } 3346 3347 if (needrecov) { 3348 bool_t abort; 3349 3350 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3351 "nfs4write: client got error %d, res.status %d" 3352 ", so start recovery", e.error, res.status)); 3353 3354 abort = nfs4_start_recovery(&e, 3355 VTOMI4(vp), vp, NULL, &wargs->stateid, 3356 NULL, OP_WRITE, NULL, NULL, NULL); 3357 if (!e.error) { 3358 e.error = geterrno4(res.status); 3359 (void) xdr_free(xdr_COMPOUND4res_clnt, 3360 (caddr_t)&res); 3361 } 3362 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3363 &recov_state, needrecov); 3364 if (abort == FALSE) 3365 goto recov_retry; 3366 return (e.error); 3367 } 3368 3369 if (res.status) { 3370 e.error = geterrno4(res.status); 3371 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3372 if (!recov) 3373 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3374 &recov_state, needrecov); 3375 return (e.error); 3376 } 3377 3378 resop = &res.array[1]; /* write res */ 3379 wres = &resop->nfs_resop4_u.opwrite; 3380 3381 if ((int)wres->count > tsize) { 3382 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3383 3384 zcmn_err(getzoneid(), CE_WARN, 3385 "nfs4write: server wrote %u, requested was %u", 3386 (int)wres->count, tsize); 3387 if (!recov) 3388 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3389 &recov_state, needrecov); 3390 return (EIO); 3391 } 3392 if (wres->committed == UNSTABLE4) { 3393 *stab_comm = UNSTABLE4; 3394 if (wargs->stable == DATA_SYNC4 || 3395 wargs->stable == FILE_SYNC4) { 3396 (void) xdr_free(xdr_COMPOUND4res_clnt, 3397 (caddr_t)&res); 3398 zcmn_err(getzoneid(), CE_WARN, 3399 "nfs4write: server %s did not commit " 3400 "to stable storage", 3401 rp->r_server->sv_hostname); 3402 if (!recov) 3403 nfs4_end_fop(VTOMI4(vp), vp, NULL, 3404 OH_WRITE, &recov_state, needrecov); 3405 return (EIO); 3406 } 3407 } 3408 3409 tsize = (int)wres->count; 3410 count -= tsize; 3411 base += tsize; 3412 offset += tsize; 3413 if (mi->mi_io_kstats) { 3414 mutex_enter(&mi->mi_lock); 3415 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++; 3416 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten += 3417 tsize; 3418 mutex_exit(&mi->mi_lock); 3419 } 3420 lwp_stat_update(LWP_STAT_OUBLK, 1); 3421 mutex_enter(&rp->r_statelock); 3422 if (rp->r_flags & R4HAVEVERF) { 3423 if (rp->r_writeverf != wres->writeverf) { 3424 nfs4_set_mod(vp); 3425 rp->r_writeverf = wres->writeverf; 3426 } 3427 } else { 3428 rp->r_writeverf = wres->writeverf; 3429 rp->r_flags |= R4HAVEVERF; 3430 } 3431 PURGE_ATTRCACHE4_LOCKED(rp); 3432 rp->r_flags |= R4WRITEMODIFIED; 3433 gethrestime(&rp->r_attr.va_mtime); 3434 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3435 mutex_exit(&rp->r_statelock); 3436 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3437 } while (count); 3438 3439 if (!recov) 3440 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state, 3441 needrecov); 3442 3443 return (e.error); 3444 } 3445 3446 /* 3447 * Read from a file. Reads data in largest chunks our interface can handle. 3448 */ 3449 static int 3450 nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count, 3451 size_t *residp, cred_t *cr, bool_t async, struct uio *uiop) 3452 { 3453 mntinfo4_t *mi; 3454 COMPOUND4args_clnt args; 3455 COMPOUND4res_clnt res; 3456 READ4args *rargs; 3457 nfs_argop4 argop[2]; 3458 int tsize; 3459 int doqueue; 3460 rnode4_t *rp; 3461 int data_len; 3462 bool_t is_eof; 3463 bool_t needrecov = FALSE; 3464 nfs4_recov_state_t recov_state; 3465 nfs4_stateid_types_t sid_types; 3466 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3467 3468 rp = VTOR4(vp); 3469 mi = VTOMI4(vp); 3470 doqueue = 1; 3471 3472 ASSERT(nfs_zone() == mi->mi_zone); 3473 3474 args.ctag = async ? TAG_READAHEAD : TAG_READ; 3475 3476 args.array_len = 2; 3477 args.array = argop; 3478 3479 nfs4_init_stateid_types(&sid_types); 3480 3481 recov_state.rs_flags = 0; 3482 recov_state.rs_num_retry_despite_err = 0; 3483 3484 recov_retry: 3485 e.error = nfs4_start_fop(mi, vp, NULL, OH_READ, 3486 &recov_state, NULL); 3487 if (e.error) 3488 return (e.error); 3489 3490 /* putfh target fh */ 3491 argop[0].argop = OP_CPUTFH; 3492 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3493 3494 /* read */ 3495 argop[1].argop = OP_READ; 3496 rargs = &argop[1].nfs_argop4_u.opread; 3497 rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 3498 OP_READ, &sid_types, async); 3499 3500 do { 3501 if (mi->mi_io_kstats) { 3502 mutex_enter(&mi->mi_lock); 3503 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3504 mutex_exit(&mi->mi_lock); 3505 } 3506 3507 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 3508 "nfs4read: %s call, rp %s", 3509 needrecov ? "recov" : "first", 3510 rnode4info(rp))); 3511 3512 if ((vp->v_flag & VNOCACHE) || 3513 (rp->r_flags & R4DIRECTIO) || 3514 (mi->mi_flags & MI4_DIRECTIO)) 3515 tsize = MIN(mi->mi_tsize, count); 3516 else 3517 tsize = MIN(mi->mi_curread, count); 3518 3519 rargs->offset = (offset4)offset; 3520 rargs->count = (count4)tsize; 3521 rargs->res_data_val_alt = NULL; 3522 rargs->res_mblk = NULL; 3523 rargs->res_uiop = NULL; 3524 rargs->res_maxsize = 0; 3525 rargs->wlist = NULL; 3526 3527 if (uiop) 3528 rargs->res_uiop = uiop; 3529 else 3530 rargs->res_data_val_alt = base; 3531 rargs->res_maxsize = tsize; 3532 3533 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3534 #ifdef DEBUG 3535 if (nfs4read_error_inject) { 3536 res.status = nfs4read_error_inject; 3537 nfs4read_error_inject = 0; 3538 } 3539 #endif 3540 3541 if (mi->mi_io_kstats) { 3542 mutex_enter(&mi->mi_lock); 3543 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3544 mutex_exit(&mi->mi_lock); 3545 } 3546 3547 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3548 if (e.error != 0 && !needrecov) { 3549 nfs4_end_fop(mi, vp, NULL, OH_READ, 3550 &recov_state, needrecov); 3551 return (e.error); 3552 } 3553 3554 /* 3555 * Do proper retry for OLD and BAD stateid errors outside 3556 * of the normal recovery framework. There are two differences 3557 * between async and sync reads. The first is that we allow 3558 * retry on BAD_STATEID for async reads, but not sync reads. 3559 * The second is that we mark the file dead for a failed 3560 * attempt with a special stateid for sync reads, but just 3561 * return EIO for async reads. 3562 * 3563 * If a sync read receives a BAD stateid error while using a 3564 * delegation stateid, retry using the open stateid (if it 3565 * exists). If it doesn't have an open stateid, reopen the 3566 * file first, then retry. 3567 */ 3568 if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID || 3569 res.status == NFS4ERR_BAD_STATEID) && async) { 3570 nfs4_end_fop(mi, vp, NULL, OH_READ, 3571 &recov_state, needrecov); 3572 if (sid_types.cur_sid_type == SPEC_SID) { 3573 (void) xdr_free(xdr_COMPOUND4res_clnt, 3574 (caddr_t)&res); 3575 return (EIO); 3576 } 3577 nfs4_save_stateid(&rargs->stateid, &sid_types); 3578 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3579 goto recov_retry; 3580 } else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3581 !async && sid_types.cur_sid_type != SPEC_SID) { 3582 nfs4_save_stateid(&rargs->stateid, &sid_types); 3583 nfs4_end_fop(mi, vp, NULL, OH_READ, 3584 &recov_state, needrecov); 3585 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3586 goto recov_retry; 3587 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3588 sid_types.cur_sid_type == DEL_SID) { 3589 nfs4_save_stateid(&rargs->stateid, &sid_types); 3590 mutex_enter(&rp->r_statev4_lock); 3591 rp->r_deleg_return_pending = TRUE; 3592 mutex_exit(&rp->r_statev4_lock); 3593 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3594 nfs4_end_fop(mi, vp, NULL, OH_READ, 3595 &recov_state, needrecov); 3596 (void) xdr_free(xdr_COMPOUND4res_clnt, 3597 (caddr_t)&res); 3598 return (EIO); 3599 } 3600 nfs4_end_fop(mi, vp, NULL, OH_READ, 3601 &recov_state, needrecov); 3602 /* hold needed for nfs4delegreturn_thread */ 3603 VN_HOLD(vp); 3604 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3605 NFS4_DR_DISCARD), FALSE); 3606 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3607 goto recov_retry; 3608 } 3609 if (needrecov) { 3610 bool_t abort; 3611 3612 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3613 "nfs4read: initiating recovery\n")); 3614 abort = nfs4_start_recovery(&e, 3615 mi, vp, NULL, &rargs->stateid, 3616 NULL, OP_READ, NULL, NULL, NULL); 3617 nfs4_end_fop(mi, vp, NULL, OH_READ, 3618 &recov_state, needrecov); 3619 /* 3620 * Do not retry if we got OLD_STATEID using a special 3621 * stateid. This avoids looping with a broken server. 3622 */ 3623 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3624 sid_types.cur_sid_type == SPEC_SID) 3625 abort = TRUE; 3626 3627 if (abort == FALSE) { 3628 /* 3629 * Need to retry all possible stateids in 3630 * case the recovery error wasn't stateid 3631 * related or the stateids have become 3632 * stale (server reboot). 3633 */ 3634 nfs4_init_stateid_types(&sid_types); 3635 (void) xdr_free(xdr_COMPOUND4res_clnt, 3636 (caddr_t)&res); 3637 goto recov_retry; 3638 } 3639 3640 if (!e.error) { 3641 e.error = geterrno4(res.status); 3642 (void) xdr_free(xdr_COMPOUND4res_clnt, 3643 (caddr_t)&res); 3644 } 3645 return (e.error); 3646 } 3647 3648 if (res.status) { 3649 e.error = geterrno4(res.status); 3650 nfs4_end_fop(mi, vp, NULL, OH_READ, 3651 &recov_state, needrecov); 3652 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3653 return (e.error); 3654 } 3655 3656 data_len = res.array[1].nfs_resop4_u.opread.data_len; 3657 count -= data_len; 3658 if (base) 3659 base += data_len; 3660 offset += data_len; 3661 if (mi->mi_io_kstats) { 3662 mutex_enter(&mi->mi_lock); 3663 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 3664 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len; 3665 mutex_exit(&mi->mi_lock); 3666 } 3667 lwp_stat_update(LWP_STAT_INBLK, 1); 3668 is_eof = res.array[1].nfs_resop4_u.opread.eof; 3669 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3670 3671 } while (count && !is_eof); 3672 3673 *residp = count; 3674 3675 nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov); 3676 3677 return (e.error); 3678 } 3679 3680 /* ARGSUSED */ 3681 static int 3682 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp, 3683 caller_context_t *ct) 3684 { 3685 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3686 return (EIO); 3687 switch (cmd) { 3688 case _FIODIRECTIO: 3689 return (nfs4_directio(vp, (int)arg, cr)); 3690 default: 3691 return (ENOTTY); 3692 } 3693 } 3694 3695 /* ARGSUSED */ 3696 int 3697 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3698 caller_context_t *ct) 3699 { 3700 int error; 3701 rnode4_t *rp = VTOR4(vp); 3702 3703 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3704 return (EIO); 3705 /* 3706 * If it has been specified that the return value will 3707 * just be used as a hint, and we are only being asked 3708 * for size, fsid or rdevid, then return the client's 3709 * notion of these values without checking to make sure 3710 * that the attribute cache is up to date. 3711 * The whole point is to avoid an over the wire GETATTR 3712 * call. 3713 */ 3714 if (flags & ATTR_HINT) { 3715 if (!(vap->va_mask & ~(AT_SIZE | AT_FSID | AT_RDEV))) { 3716 mutex_enter(&rp->r_statelock); 3717 if (vap->va_mask & AT_SIZE) 3718 vap->va_size = rp->r_size; 3719 if (vap->va_mask & AT_FSID) 3720 vap->va_fsid = rp->r_attr.va_fsid; 3721 if (vap->va_mask & AT_RDEV) 3722 vap->va_rdev = rp->r_attr.va_rdev; 3723 mutex_exit(&rp->r_statelock); 3724 return (0); 3725 } 3726 } 3727 3728 /* 3729 * Only need to flush pages if asking for the mtime 3730 * and if there any dirty pages or any outstanding 3731 * asynchronous (write) requests for this file. 3732 */ 3733 if (vap->va_mask & AT_MTIME) { 3734 rp = VTOR4(vp); 3735 if (nfs4_has_pages(vp)) { 3736 mutex_enter(&rp->r_statev4_lock); 3737 if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) { 3738 mutex_exit(&rp->r_statev4_lock); 3739 if (rp->r_flags & R4DIRTY || 3740 rp->r_awcount > 0) { 3741 mutex_enter(&rp->r_statelock); 3742 rp->r_gcount++; 3743 mutex_exit(&rp->r_statelock); 3744 error = 3745 nfs4_putpage(vp, (u_offset_t)0, 3746 0, 0, cr, NULL); 3747 mutex_enter(&rp->r_statelock); 3748 if (error && (error == ENOSPC || 3749 error == EDQUOT)) { 3750 if (!rp->r_error) 3751 rp->r_error = error; 3752 } 3753 if (--rp->r_gcount == 0) 3754 cv_broadcast(&rp->r_cv); 3755 mutex_exit(&rp->r_statelock); 3756 } 3757 } else { 3758 mutex_exit(&rp->r_statev4_lock); 3759 } 3760 } 3761 } 3762 return (nfs4getattr(vp, vap, cr)); 3763 } 3764 3765 int 3766 nfs4_compare_modes(mode_t from_server, mode_t on_client) 3767 { 3768 /* 3769 * If these are the only two bits cleared 3770 * on the server then return 0 (OK) else 3771 * return 1 (BAD). 3772 */ 3773 on_client &= ~(S_ISUID|S_ISGID); 3774 if (on_client == from_server) 3775 return (0); 3776 else 3777 return (1); 3778 } 3779 3780 /*ARGSUSED4*/ 3781 static int 3782 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3783 caller_context_t *ct) 3784 { 3785 int error; 3786 3787 if (vap->va_mask & AT_NOSET) 3788 return (EINVAL); 3789 3790 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3791 return (EIO); 3792 3793 /* 3794 * Don't call secpolicy_vnode_setattr, the client cannot 3795 * use its cached attributes to make security decisions 3796 * as the server may be faking mode bits or mapping uid/gid. 3797 * Always just let the server to the checking. 3798 * If we provide the ability to remove basic priviledges 3799 * to setattr (e.g. basic without chmod) then we will 3800 * need to add a check here before calling the server. 3801 */ 3802 error = nfs4setattr(vp, vap, flags, cr, NULL); 3803 3804 if (error == 0 && (vap->va_mask & AT_SIZE) && vap->va_size == 0) 3805 vnevent_truncate(vp, ct); 3806 3807 return (error); 3808 } 3809 3810 /* 3811 * To replace the "guarded" version 3 setattr, we use two types of compound 3812 * setattr requests: 3813 * 1. The "normal" setattr, used when the size of the file isn't being 3814 * changed - { Putfh <fh>; Setattr; Getattr }/ 3815 * 2. If the size is changed, precede Setattr with: Getattr; Verify 3816 * with only ctime as the argument. If the server ctime differs from 3817 * what is cached on the client, the verify will fail, but we would 3818 * already have the ctime from the preceding getattr, so just set it 3819 * and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify; 3820 * Setattr; Getattr }. 3821 * 3822 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in 3823 * this setattr and NULL if they are not. 3824 */ 3825 static int 3826 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3827 vsecattr_t *vsap) 3828 { 3829 COMPOUND4args_clnt args; 3830 COMPOUND4res_clnt res, *resp = NULL; 3831 nfs4_ga_res_t *garp = NULL; 3832 int numops = 3; /* { Putfh; Setattr; Getattr } */ 3833 nfs_argop4 argop[5]; 3834 int verify_argop = -1; 3835 int setattr_argop = 1; 3836 nfs_resop4 *resop; 3837 vattr_t va; 3838 rnode4_t *rp; 3839 int doqueue = 1; 3840 uint_t mask = vap->va_mask; 3841 mode_t omode; 3842 vsecattr_t *vsp; 3843 timestruc_t ctime; 3844 bool_t needrecov = FALSE; 3845 nfs4_recov_state_t recov_state; 3846 nfs4_stateid_types_t sid_types; 3847 stateid4 stateid; 3848 hrtime_t t; 3849 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3850 servinfo4_t *svp; 3851 bitmap4 supp_attrs; 3852 3853 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3854 rp = VTOR4(vp); 3855 nfs4_init_stateid_types(&sid_types); 3856 3857 /* 3858 * Only need to flush pages if there are any pages and 3859 * if the file is marked as dirty in some fashion. The 3860 * file must be flushed so that we can accurately 3861 * determine the size of the file and the cached data 3862 * after the SETATTR returns. A file is considered to 3863 * be dirty if it is either marked with R4DIRTY, has 3864 * outstanding i/o's active, or is mmap'd. In this 3865 * last case, we can't tell whether there are dirty 3866 * pages, so we flush just to be sure. 3867 */ 3868 if (nfs4_has_pages(vp) && 3869 ((rp->r_flags & R4DIRTY) || 3870 rp->r_count > 0 || 3871 rp->r_mapcnt > 0)) { 3872 ASSERT(vp->v_type != VCHR); 3873 e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL); 3874 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 3875 mutex_enter(&rp->r_statelock); 3876 if (!rp->r_error) 3877 rp->r_error = e.error; 3878 mutex_exit(&rp->r_statelock); 3879 } 3880 } 3881 3882 if (mask & AT_SIZE) { 3883 /* 3884 * Verification setattr compound for non-deleg AT_SIZE: 3885 * { Putfh; Getattr; Verify; Setattr; Getattr } 3886 * Set ctime local here (outside the do_again label) 3887 * so that subsequent retries (after failed VERIFY) 3888 * will use ctime from GETATTR results (from failed 3889 * verify compound) as VERIFY arg. 3890 * If file has delegation, then VERIFY(time_metadata) 3891 * is of little added value, so don't bother. 3892 */ 3893 mutex_enter(&rp->r_statev4_lock); 3894 if (rp->r_deleg_type == OPEN_DELEGATE_NONE || 3895 rp->r_deleg_return_pending) { 3896 numops = 5; 3897 ctime = rp->r_attr.va_ctime; 3898 } 3899 mutex_exit(&rp->r_statev4_lock); 3900 } 3901 3902 recov_state.rs_flags = 0; 3903 recov_state.rs_num_retry_despite_err = 0; 3904 3905 args.ctag = TAG_SETATTR; 3906 do_again: 3907 recov_retry: 3908 setattr_argop = numops - 2; 3909 3910 args.array = argop; 3911 args.array_len = numops; 3912 3913 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 3914 if (e.error) 3915 return (e.error); 3916 3917 3918 /* putfh target fh */ 3919 argop[0].argop = OP_CPUTFH; 3920 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3921 3922 if (numops == 5) { 3923 /* 3924 * We only care about the ctime, but need to get mtime 3925 * and size for proper cache update. 3926 */ 3927 /* getattr */ 3928 argop[1].argop = OP_GETATTR; 3929 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3930 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3931 3932 /* verify - set later in loop */ 3933 verify_argop = 2; 3934 } 3935 3936 /* setattr */ 3937 svp = rp->r_server; 3938 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3939 supp_attrs = svp->sv_supp_attrs; 3940 nfs_rw_exit(&svp->sv_lock); 3941 3942 nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr, 3943 supp_attrs, &e.error, &sid_types); 3944 stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid; 3945 if (e.error) { 3946 /* req time field(s) overflow - return immediately */ 3947 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 3948 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 3949 opsetattr.obj_attributes); 3950 return (e.error); 3951 } 3952 omode = rp->r_attr.va_mode; 3953 3954 /* getattr */ 3955 argop[numops-1].argop = OP_GETATTR; 3956 argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3957 /* 3958 * If we are setting the ACL (indicated only by vsap != NULL), request 3959 * the ACL in this getattr. The ACL returned from this getattr will be 3960 * used in updating the ACL cache. 3961 */ 3962 if (vsap != NULL) 3963 argop[numops-1].nfs_argop4_u.opgetattr.attr_request |= 3964 FATTR4_ACL_MASK; 3965 argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3966 3967 /* 3968 * setattr iterates if the object size is set and the cached ctime 3969 * does not match the file ctime. In that case, verify the ctime first. 3970 */ 3971 3972 do { 3973 if (verify_argop != -1) { 3974 /* 3975 * Verify that the ctime match before doing setattr. 3976 */ 3977 va.va_mask = AT_CTIME; 3978 va.va_ctime = ctime; 3979 svp = rp->r_server; 3980 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3981 supp_attrs = svp->sv_supp_attrs; 3982 nfs_rw_exit(&svp->sv_lock); 3983 e.error = nfs4args_verify(&argop[verify_argop], &va, 3984 OP_VERIFY, supp_attrs); 3985 if (e.error) { 3986 /* req time field(s) overflow - return */ 3987 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3988 needrecov); 3989 break; 3990 } 3991 } 3992 3993 doqueue = 1; 3994 3995 t = gethrtime(); 3996 3997 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 3998 3999 /* 4000 * Purge the access cache and ACL cache if changing either the 4001 * owner of the file, the group owner, or the mode. These may 4002 * change the access permissions of the file, so purge old 4003 * information and start over again. 4004 */ 4005 if (mask & (AT_UID | AT_GID | AT_MODE)) { 4006 (void) nfs4_access_purge_rp(rp); 4007 if (rp->r_secattr != NULL) { 4008 mutex_enter(&rp->r_statelock); 4009 vsp = rp->r_secattr; 4010 rp->r_secattr = NULL; 4011 mutex_exit(&rp->r_statelock); 4012 if (vsp != NULL) 4013 nfs4_acl_free_cache(vsp); 4014 } 4015 } 4016 4017 /* 4018 * If res.array_len == numops, then everything succeeded, 4019 * except for possibly the final getattr. If only the 4020 * last getattr failed, give up, and don't try recovery. 4021 */ 4022 if (res.array_len == numops) { 4023 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 4024 needrecov); 4025 if (! e.error) 4026 resp = &res; 4027 break; 4028 } 4029 4030 /* 4031 * if either rpc call failed or completely succeeded - done 4032 */ 4033 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4034 if (e.error) { 4035 PURGE_ATTRCACHE4(vp); 4036 if (!needrecov) { 4037 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 4038 needrecov); 4039 break; 4040 } 4041 } 4042 4043 /* 4044 * Do proper retry for OLD_STATEID outside of the normal 4045 * recovery framework. 4046 */ 4047 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 4048 sid_types.cur_sid_type != SPEC_SID && 4049 sid_types.cur_sid_type != NO_SID) { 4050 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 4051 needrecov); 4052 nfs4_save_stateid(&stateid, &sid_types); 4053 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4054 opsetattr.obj_attributes); 4055 if (verify_argop != -1) { 4056 nfs4args_verify_free(&argop[verify_argop]); 4057 verify_argop = -1; 4058 } 4059 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4060 goto recov_retry; 4061 } 4062 4063 if (needrecov) { 4064 bool_t abort; 4065 4066 abort = nfs4_start_recovery(&e, 4067 VTOMI4(vp), vp, NULL, NULL, NULL, 4068 OP_SETATTR, NULL, NULL, NULL); 4069 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 4070 needrecov); 4071 /* 4072 * Do not retry if we failed with OLD_STATEID using 4073 * a special stateid. This is done to avoid looping 4074 * with a broken server. 4075 */ 4076 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 4077 (sid_types.cur_sid_type == SPEC_SID || 4078 sid_types.cur_sid_type == NO_SID)) 4079 abort = TRUE; 4080 if (!e.error) { 4081 if (res.status == NFS4ERR_BADOWNER) 4082 nfs4_log_badowner(VTOMI4(vp), 4083 OP_SETATTR); 4084 4085 e.error = geterrno4(res.status); 4086 (void) xdr_free(xdr_COMPOUND4res_clnt, 4087 (caddr_t)&res); 4088 } 4089 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4090 opsetattr.obj_attributes); 4091 if (verify_argop != -1) { 4092 nfs4args_verify_free(&argop[verify_argop]); 4093 verify_argop = -1; 4094 } 4095 if (abort == FALSE) { 4096 /* 4097 * Need to retry all possible stateids in 4098 * case the recovery error wasn't stateid 4099 * related or the stateids have become 4100 * stale (server reboot). 4101 */ 4102 nfs4_init_stateid_types(&sid_types); 4103 goto recov_retry; 4104 } 4105 return (e.error); 4106 } 4107 4108 /* 4109 * Need to call nfs4_end_op before nfs4getattr to 4110 * avoid potential nfs4_start_op deadlock. See RFE 4111 * 4777612. Calls to nfs4_invalidate_pages() and 4112 * nfs4_purge_stale_fh() might also generate over the 4113 * wire calls which my cause nfs4_start_op() deadlock. 4114 */ 4115 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 4116 4117 /* 4118 * Check to update lease. 4119 */ 4120 resp = &res; 4121 if (res.status == NFS4_OK) { 4122 break; 4123 } 4124 4125 /* 4126 * Check if verify failed to see if try again 4127 */ 4128 if ((verify_argop == -1) || (res.array_len != 3)) { 4129 /* 4130 * can't continue... 4131 */ 4132 if (res.status == NFS4ERR_BADOWNER) 4133 nfs4_log_badowner(VTOMI4(vp), OP_SETATTR); 4134 4135 e.error = geterrno4(res.status); 4136 } else { 4137 /* 4138 * When the verify request fails, the client ctime is 4139 * not in sync with the server. This is the same as 4140 * the version 3 "not synchronized" error, and we 4141 * handle it in a similar manner (XXX do we need to???). 4142 * Use the ctime returned in the first getattr for 4143 * the input to the next verify. 4144 * If we couldn't get the attributes, then we give up 4145 * because we can't complete the operation as required. 4146 */ 4147 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res; 4148 } 4149 if (e.error) { 4150 PURGE_ATTRCACHE4(vp); 4151 nfs4_purge_stale_fh(e.error, vp, cr); 4152 } else { 4153 /* 4154 * retry with a new verify value 4155 */ 4156 ctime = garp->n4g_va.va_ctime; 4157 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4158 resp = NULL; 4159 } 4160 if (!e.error) { 4161 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4162 opsetattr.obj_attributes); 4163 if (verify_argop != -1) { 4164 nfs4args_verify_free(&argop[verify_argop]); 4165 verify_argop = -1; 4166 } 4167 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4168 goto do_again; 4169 } 4170 } while (!e.error); 4171 4172 if (e.error) { 4173 /* 4174 * If we are here, rfs4call has an irrecoverable error - return 4175 */ 4176 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4177 opsetattr.obj_attributes); 4178 if (verify_argop != -1) { 4179 nfs4args_verify_free(&argop[verify_argop]); 4180 verify_argop = -1; 4181 } 4182 if (resp) 4183 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4184 return (e.error); 4185 } 4186 4187 4188 4189 /* 4190 * If changing the size of the file, invalidate 4191 * any local cached data which is no longer part 4192 * of the file. We also possibly invalidate the 4193 * last page in the file. We could use 4194 * pvn_vpzero(), but this would mark the page as 4195 * modified and require it to be written back to 4196 * the server for no particularly good reason. 4197 * This way, if we access it, then we bring it 4198 * back in. A read should be cheaper than a 4199 * write. 4200 */ 4201 if (mask & AT_SIZE) { 4202 nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr); 4203 } 4204 4205 /* either no error or one of the postop getattr failed */ 4206 4207 /* 4208 * XXX Perform a simplified version of wcc checking. Instead of 4209 * have another getattr to get pre-op, just purge cache if 4210 * any of the ops prior to and including the getattr failed. 4211 * If the getattr succeeded then update the attrcache accordingly. 4212 */ 4213 4214 garp = NULL; 4215 if (res.status == NFS4_OK) { 4216 /* 4217 * Last getattr 4218 */ 4219 resop = &res.array[numops - 1]; 4220 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4221 } 4222 /* 4223 * In certain cases, nfs4_update_attrcache() will purge the attrcache, 4224 * rather than filling it. See the function itself for details. 4225 */ 4226 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4227 if (garp != NULL) { 4228 if (garp->n4g_resbmap & FATTR4_ACL_MASK) { 4229 nfs4_acl_fill_cache(rp, &garp->n4g_vsa); 4230 vs_ace4_destroy(&garp->n4g_vsa); 4231 } else { 4232 if (vsap != NULL) { 4233 /* 4234 * The ACL was supposed to be set and to be 4235 * returned in the last getattr of this 4236 * compound, but for some reason the getattr 4237 * result doesn't contain the ACL. In this 4238 * case, purge the ACL cache. 4239 */ 4240 if (rp->r_secattr != NULL) { 4241 mutex_enter(&rp->r_statelock); 4242 vsp = rp->r_secattr; 4243 rp->r_secattr = NULL; 4244 mutex_exit(&rp->r_statelock); 4245 if (vsp != NULL) 4246 nfs4_acl_free_cache(vsp); 4247 } 4248 } 4249 } 4250 } 4251 4252 if (res.status == NFS4_OK && (mask & AT_SIZE)) { 4253 /* 4254 * Set the size, rather than relying on getting it updated 4255 * via a GETATTR. With delegations the client tries to 4256 * suppress GETATTR calls. 4257 */ 4258 mutex_enter(&rp->r_statelock); 4259 rp->r_size = vap->va_size; 4260 mutex_exit(&rp->r_statelock); 4261 } 4262 4263 /* 4264 * Can free up request args and res 4265 */ 4266 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4267 opsetattr.obj_attributes); 4268 if (verify_argop != -1) { 4269 nfs4args_verify_free(&argop[verify_argop]); 4270 verify_argop = -1; 4271 } 4272 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4273 4274 /* 4275 * Some servers will change the mode to clear the setuid 4276 * and setgid bits when changing the uid or gid. The 4277 * client needs to compensate appropriately. 4278 */ 4279 if (mask & (AT_UID | AT_GID)) { 4280 int terror, do_setattr; 4281 4282 do_setattr = 0; 4283 va.va_mask = AT_MODE; 4284 terror = nfs4getattr(vp, &va, cr); 4285 if (!terror && 4286 (((mask & AT_MODE) && va.va_mode != vap->va_mode) || 4287 (!(mask & AT_MODE) && va.va_mode != omode))) { 4288 va.va_mask = AT_MODE; 4289 if (mask & AT_MODE) { 4290 /* 4291 * We asked the mode to be changed and what 4292 * we just got from the server in getattr is 4293 * not what we wanted it to be, so set it now. 4294 */ 4295 va.va_mode = vap->va_mode; 4296 do_setattr = 1; 4297 } else { 4298 /* 4299 * We did not ask the mode to be changed, 4300 * Check to see that the server just cleared 4301 * I_SUID and I_GUID from it. If not then 4302 * set mode to omode with UID/GID cleared. 4303 */ 4304 if (nfs4_compare_modes(va.va_mode, omode)) { 4305 omode &= ~(S_ISUID|S_ISGID); 4306 va.va_mode = omode; 4307 do_setattr = 1; 4308 } 4309 } 4310 4311 if (do_setattr) 4312 (void) nfs4setattr(vp, &va, 0, cr, NULL); 4313 } 4314 } 4315 4316 return (e.error); 4317 } 4318 4319 /* ARGSUSED */ 4320 static int 4321 nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) 4322 { 4323 COMPOUND4args_clnt args; 4324 COMPOUND4res_clnt res; 4325 int doqueue; 4326 uint32_t acc, resacc, argacc; 4327 rnode4_t *rp; 4328 cred_t *cred, *ncr, *ncrfree = NULL; 4329 nfs4_access_type_t cacc; 4330 int num_ops; 4331 nfs_argop4 argop[3]; 4332 nfs_resop4 *resop; 4333 bool_t needrecov = FALSE, do_getattr; 4334 nfs4_recov_state_t recov_state; 4335 int rpc_error; 4336 hrtime_t t; 4337 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4338 mntinfo4_t *mi = VTOMI4(vp); 4339 4340 if (nfs_zone() != mi->mi_zone) 4341 return (EIO); 4342 4343 acc = 0; 4344 if (mode & VREAD) 4345 acc |= ACCESS4_READ; 4346 if (mode & VWRITE) { 4347 if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type)) 4348 return (EROFS); 4349 if (vp->v_type == VDIR) 4350 acc |= ACCESS4_DELETE; 4351 acc |= ACCESS4_MODIFY | ACCESS4_EXTEND; 4352 } 4353 if (mode & VEXEC) { 4354 if (vp->v_type == VDIR) 4355 acc |= ACCESS4_LOOKUP; 4356 else 4357 acc |= ACCESS4_EXECUTE; 4358 } 4359 4360 if (VTOR4(vp)->r_acache != NULL) { 4361 e.error = nfs4_validate_caches(vp, cr); 4362 if (e.error) 4363 return (e.error); 4364 } 4365 4366 rp = VTOR4(vp); 4367 if (vp->v_type == VDIR) 4368 argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY | 4369 ACCESS4_EXTEND | ACCESS4_LOOKUP; 4370 else 4371 argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND | 4372 ACCESS4_EXECUTE; 4373 recov_state.rs_flags = 0; 4374 recov_state.rs_num_retry_despite_err = 0; 4375 4376 cred = cr; 4377 /* 4378 * ncr and ncrfree both initially 4379 * point to the memory area returned 4380 * by crnetadjust(); 4381 * ncrfree not NULL when exiting means 4382 * that we need to release it 4383 */ 4384 ncr = crnetadjust(cred); 4385 ncrfree = ncr; 4386 4387 tryagain: 4388 cacc = nfs4_access_check(rp, acc, cred); 4389 if (cacc == NFS4_ACCESS_ALLOWED) { 4390 if (ncrfree != NULL) 4391 crfree(ncrfree); 4392 return (0); 4393 } 4394 if (cacc == NFS4_ACCESS_DENIED) { 4395 /* 4396 * If the cred can be adjusted, try again 4397 * with the new cred. 4398 */ 4399 if (ncr != NULL) { 4400 cred = ncr; 4401 ncr = NULL; 4402 goto tryagain; 4403 } 4404 if (ncrfree != NULL) 4405 crfree(ncrfree); 4406 return (EACCES); 4407 } 4408 4409 recov_retry: 4410 /* 4411 * Don't take with r_statev4_lock here. r_deleg_type could 4412 * change as soon as lock is released. Since it is an int, 4413 * there is no atomicity issue. 4414 */ 4415 do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE); 4416 num_ops = do_getattr ? 3 : 2; 4417 4418 args.ctag = TAG_ACCESS; 4419 4420 args.array_len = num_ops; 4421 args.array = argop; 4422 4423 if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS, 4424 &recov_state, NULL)) { 4425 if (ncrfree != NULL) 4426 crfree(ncrfree); 4427 return (e.error); 4428 } 4429 4430 /* putfh target fh */ 4431 argop[0].argop = OP_CPUTFH; 4432 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4433 4434 /* access */ 4435 argop[1].argop = OP_ACCESS; 4436 argop[1].nfs_argop4_u.opaccess.access = argacc; 4437 4438 /* getattr */ 4439 if (do_getattr) { 4440 argop[2].argop = OP_GETATTR; 4441 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4442 argop[2].nfs_argop4_u.opgetattr.mi = mi; 4443 } 4444 4445 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4446 "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first", 4447 rnode4info(VTOR4(vp)))); 4448 4449 doqueue = 1; 4450 t = gethrtime(); 4451 rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e); 4452 rpc_error = e.error; 4453 4454 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4455 if (needrecov) { 4456 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4457 "nfs4_access: initiating recovery\n")); 4458 4459 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4460 NULL, OP_ACCESS, NULL, NULL, NULL) == FALSE) { 4461 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS, 4462 &recov_state, needrecov); 4463 if (!e.error) 4464 (void) xdr_free(xdr_COMPOUND4res_clnt, 4465 (caddr_t)&res); 4466 goto recov_retry; 4467 } 4468 } 4469 nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov); 4470 4471 if (e.error) 4472 goto out; 4473 4474 if (res.status) { 4475 e.error = geterrno4(res.status); 4476 /* 4477 * This might generate over the wire calls throught 4478 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4479 * here to avoid a deadlock. 4480 */ 4481 nfs4_purge_stale_fh(e.error, vp, cr); 4482 goto out; 4483 } 4484 resop = &res.array[1]; /* access res */ 4485 4486 resacc = resop->nfs_resop4_u.opaccess.access; 4487 4488 if (do_getattr) { 4489 resop++; /* getattr res */ 4490 nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res, 4491 t, cr, FALSE, NULL); 4492 } 4493 4494 if (!e.error) { 4495 nfs4_access_cache(rp, argacc, resacc, cred); 4496 /* 4497 * we just cached results with cred; if cred is the 4498 * adjusted credentials from crnetadjust, we do not want 4499 * to release them before exiting: hence setting ncrfree 4500 * to NULL 4501 */ 4502 if (cred != cr) 4503 ncrfree = NULL; 4504 /* XXX check the supported bits too? */ 4505 if ((acc & resacc) != acc) { 4506 /* 4507 * The following code implements the semantic 4508 * that a setuid root program has *at least* the 4509 * permissions of the user that is running the 4510 * program. See rfs3call() for more portions 4511 * of the implementation of this functionality. 4512 */ 4513 /* XXX-LP */ 4514 if (ncr != NULL) { 4515 (void) xdr_free(xdr_COMPOUND4res_clnt, 4516 (caddr_t)&res); 4517 cred = ncr; 4518 ncr = NULL; 4519 goto tryagain; 4520 } 4521 e.error = EACCES; 4522 } 4523 } 4524 4525 out: 4526 if (!rpc_error) 4527 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4528 4529 if (ncrfree != NULL) 4530 crfree(ncrfree); 4531 4532 return (e.error); 4533 } 4534 4535 /* ARGSUSED */ 4536 static int 4537 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct) 4538 { 4539 COMPOUND4args_clnt args; 4540 COMPOUND4res_clnt res; 4541 int doqueue; 4542 rnode4_t *rp; 4543 nfs_argop4 argop[3]; 4544 nfs_resop4 *resop; 4545 READLINK4res *lr_res; 4546 nfs4_ga_res_t *garp; 4547 uint_t len; 4548 char *linkdata; 4549 bool_t needrecov = FALSE; 4550 nfs4_recov_state_t recov_state; 4551 hrtime_t t; 4552 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4553 4554 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4555 return (EIO); 4556 /* 4557 * Can't readlink anything other than a symbolic link. 4558 */ 4559 if (vp->v_type != VLNK) 4560 return (EINVAL); 4561 4562 rp = VTOR4(vp); 4563 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) { 4564 e.error = nfs4_validate_caches(vp, cr); 4565 if (e.error) 4566 return (e.error); 4567 mutex_enter(&rp->r_statelock); 4568 if (rp->r_symlink.contents != NULL) { 4569 e.error = uiomove(rp->r_symlink.contents, 4570 rp->r_symlink.len, UIO_READ, uiop); 4571 mutex_exit(&rp->r_statelock); 4572 return (e.error); 4573 } 4574 mutex_exit(&rp->r_statelock); 4575 } 4576 recov_state.rs_flags = 0; 4577 recov_state.rs_num_retry_despite_err = 0; 4578 4579 recov_retry: 4580 args.array_len = 3; 4581 args.array = argop; 4582 args.ctag = TAG_READLINK; 4583 4584 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 4585 if (e.error) { 4586 return (e.error); 4587 } 4588 4589 /* 0. putfh symlink fh */ 4590 argop[0].argop = OP_CPUTFH; 4591 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4592 4593 /* 1. readlink */ 4594 argop[1].argop = OP_READLINK; 4595 4596 /* 2. getattr */ 4597 argop[2].argop = OP_GETATTR; 4598 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4599 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 4600 4601 doqueue = 1; 4602 4603 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4604 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first", 4605 rnode4info(VTOR4(vp)))); 4606 4607 t = gethrtime(); 4608 4609 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 4610 4611 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4612 if (needrecov) { 4613 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4614 "nfs4_readlink: initiating recovery\n")); 4615 4616 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4617 NULL, OP_READLINK, NULL, NULL, NULL) == FALSE) { 4618 if (!e.error) 4619 (void) xdr_free(xdr_COMPOUND4res_clnt, 4620 (caddr_t)&res); 4621 4622 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 4623 needrecov); 4624 goto recov_retry; 4625 } 4626 } 4627 4628 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 4629 4630 if (e.error) 4631 return (e.error); 4632 4633 /* 4634 * There is an path in the code below which calls 4635 * nfs4_purge_stale_fh(), which may generate otw calls through 4636 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4637 * here to avoid nfs4_start_op() deadlock. 4638 */ 4639 4640 if (res.status && (res.array_len < args.array_len)) { 4641 /* 4642 * either Putfh or Link failed 4643 */ 4644 e.error = geterrno4(res.status); 4645 nfs4_purge_stale_fh(e.error, vp, cr); 4646 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4647 return (e.error); 4648 } 4649 4650 resop = &res.array[1]; /* readlink res */ 4651 lr_res = &resop->nfs_resop4_u.opreadlink; 4652 4653 /* 4654 * treat symlink names as data 4655 */ 4656 linkdata = utf8_to_str((utf8string *)&lr_res->link, &len, NULL); 4657 if (linkdata != NULL) { 4658 int uio_len = len - 1; 4659 /* len includes null byte, which we won't uiomove */ 4660 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop); 4661 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 4662 mutex_enter(&rp->r_statelock); 4663 if (rp->r_symlink.contents == NULL) { 4664 rp->r_symlink.contents = linkdata; 4665 rp->r_symlink.len = uio_len; 4666 rp->r_symlink.size = len; 4667 mutex_exit(&rp->r_statelock); 4668 } else { 4669 mutex_exit(&rp->r_statelock); 4670 kmem_free(linkdata, len); 4671 } 4672 } else { 4673 kmem_free(linkdata, len); 4674 } 4675 } 4676 if (res.status == NFS4_OK) { 4677 resop++; /* getattr res */ 4678 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4679 } 4680 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4681 4682 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4683 4684 /* 4685 * The over the wire error for attempting to readlink something 4686 * other than a symbolic link is ENXIO. However, we need to 4687 * return EINVAL instead of ENXIO, so we map it here. 4688 */ 4689 return (e.error == ENXIO ? EINVAL : e.error); 4690 } 4691 4692 /* 4693 * Flush local dirty pages to stable storage on the server. 4694 * 4695 * If FNODSYNC is specified, then there is nothing to do because 4696 * metadata changes are not cached on the client before being 4697 * sent to the server. 4698 */ 4699 /* ARGSUSED */ 4700 static int 4701 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 4702 { 4703 int error; 4704 4705 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp)) 4706 return (0); 4707 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4708 return (EIO); 4709 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr); 4710 if (!error) 4711 error = VTOR4(vp)->r_error; 4712 return (error); 4713 } 4714 4715 /* 4716 * Weirdness: if the file was removed or the target of a rename 4717 * operation while it was open, it got renamed instead. Here we 4718 * remove the renamed file. 4719 */ 4720 /* ARGSUSED */ 4721 void 4722 nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4723 { 4724 rnode4_t *rp; 4725 4726 ASSERT(vp != DNLC_NO_VNODE); 4727 4728 rp = VTOR4(vp); 4729 4730 if (IS_SHADOW(vp, rp)) { 4731 sv_inactive(vp); 4732 return; 4733 } 4734 4735 /* 4736 * If this is coming from the wrong zone, we let someone in the right 4737 * zone take care of it asynchronously. We can get here due to 4738 * VN_RELE() being called from pageout() or fsflush(). This call may 4739 * potentially turn into an expensive no-op if, for instance, v_count 4740 * gets incremented in the meantime, but it's still correct. 4741 */ 4742 if (nfs_zone() != VTOMI4(vp)->mi_zone) { 4743 nfs4_async_inactive(vp, cr); 4744 return; 4745 } 4746 4747 /* 4748 * Some of the cleanup steps might require over-the-wire 4749 * operations. Since VOP_INACTIVE can get called as a result of 4750 * other over-the-wire operations (e.g., an attribute cache update 4751 * can lead to a DNLC purge), doing those steps now would lead to a 4752 * nested call to the recovery framework, which can deadlock. So 4753 * do any over-the-wire cleanups asynchronously, in a separate 4754 * thread. 4755 */ 4756 4757 mutex_enter(&rp->r_os_lock); 4758 mutex_enter(&rp->r_statelock); 4759 mutex_enter(&rp->r_statev4_lock); 4760 4761 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) { 4762 mutex_exit(&rp->r_statev4_lock); 4763 mutex_exit(&rp->r_statelock); 4764 mutex_exit(&rp->r_os_lock); 4765 nfs4_async_inactive(vp, cr); 4766 return; 4767 } 4768 4769 if (rp->r_deleg_type == OPEN_DELEGATE_READ || 4770 rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 4771 mutex_exit(&rp->r_statev4_lock); 4772 mutex_exit(&rp->r_statelock); 4773 mutex_exit(&rp->r_os_lock); 4774 nfs4_async_inactive(vp, cr); 4775 return; 4776 } 4777 4778 if (rp->r_unldvp != NULL) { 4779 mutex_exit(&rp->r_statev4_lock); 4780 mutex_exit(&rp->r_statelock); 4781 mutex_exit(&rp->r_os_lock); 4782 nfs4_async_inactive(vp, cr); 4783 return; 4784 } 4785 mutex_exit(&rp->r_statev4_lock); 4786 mutex_exit(&rp->r_statelock); 4787 mutex_exit(&rp->r_os_lock); 4788 4789 rp4_addfree(rp, cr); 4790 } 4791 4792 /* 4793 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up 4794 * various bits of state. The caller must not refer to vp after this call. 4795 */ 4796 4797 void 4798 nfs4_inactive_otw(vnode_t *vp, cred_t *cr) 4799 { 4800 rnode4_t *rp = VTOR4(vp); 4801 nfs4_recov_state_t recov_state; 4802 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4803 vnode_t *unldvp; 4804 char *unlname; 4805 cred_t *unlcred; 4806 COMPOUND4args_clnt args; 4807 COMPOUND4res_clnt res, *resp; 4808 nfs_argop4 argop[2]; 4809 int doqueue; 4810 #ifdef DEBUG 4811 char *name; 4812 #endif 4813 4814 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 4815 ASSERT(!IS_SHADOW(vp, rp)); 4816 4817 #ifdef DEBUG 4818 name = fn_name(VTOSV(vp)->sv_name); 4819 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: " 4820 "release vnode %s", name)); 4821 kmem_free(name, MAXNAMELEN); 4822 #endif 4823 4824 if (vp->v_type == VREG) { 4825 bool_t recov_failed = FALSE; 4826 4827 e.error = nfs4close_all(vp, cr); 4828 if (e.error) { 4829 /* Check to see if recovery failed */ 4830 mutex_enter(&(VTOMI4(vp)->mi_lock)); 4831 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL) 4832 recov_failed = TRUE; 4833 mutex_exit(&(VTOMI4(vp)->mi_lock)); 4834 if (!recov_failed) { 4835 mutex_enter(&rp->r_statelock); 4836 if (rp->r_flags & R4RECOVERR) 4837 recov_failed = TRUE; 4838 mutex_exit(&rp->r_statelock); 4839 } 4840 if (recov_failed) { 4841 NFS4_DEBUG(nfs4_client_recov_debug, 4842 (CE_NOTE, "nfs4_inactive_otw: " 4843 "close failed (recovery failure)")); 4844 } 4845 } 4846 } 4847 4848 redo: 4849 if (rp->r_unldvp == NULL) { 4850 rp4_addfree(rp, cr); 4851 return; 4852 } 4853 4854 /* 4855 * Save the vnode pointer for the directory where the 4856 * unlinked-open file got renamed, then set it to NULL 4857 * to prevent another thread from getting here before 4858 * we're done with the remove. While we have the 4859 * statelock, make local copies of the pertinent rnode 4860 * fields. If we weren't to do this in an atomic way, the 4861 * the unl* fields could become inconsistent with respect 4862 * to each other due to a race condition between this 4863 * code and nfs_remove(). See bug report 1034328. 4864 */ 4865 mutex_enter(&rp->r_statelock); 4866 if (rp->r_unldvp == NULL) { 4867 mutex_exit(&rp->r_statelock); 4868 rp4_addfree(rp, cr); 4869 return; 4870 } 4871 4872 unldvp = rp->r_unldvp; 4873 rp->r_unldvp = NULL; 4874 unlname = rp->r_unlname; 4875 rp->r_unlname = NULL; 4876 unlcred = rp->r_unlcred; 4877 rp->r_unlcred = NULL; 4878 mutex_exit(&rp->r_statelock); 4879 4880 /* 4881 * If there are any dirty pages left, then flush 4882 * them. This is unfortunate because they just 4883 * may get thrown away during the remove operation, 4884 * but we have to do this for correctness. 4885 */ 4886 if (nfs4_has_pages(vp) && 4887 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 4888 ASSERT(vp->v_type != VCHR); 4889 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL); 4890 if (e.error) { 4891 mutex_enter(&rp->r_statelock); 4892 if (!rp->r_error) 4893 rp->r_error = e.error; 4894 mutex_exit(&rp->r_statelock); 4895 } 4896 } 4897 4898 recov_state.rs_flags = 0; 4899 recov_state.rs_num_retry_despite_err = 0; 4900 recov_retry_remove: 4901 /* 4902 * Do the remove operation on the renamed file 4903 */ 4904 args.ctag = TAG_INACTIVE; 4905 4906 /* 4907 * Remove ops: putfh dir; remove 4908 */ 4909 args.array_len = 2; 4910 args.array = argop; 4911 4912 e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state); 4913 if (e.error) { 4914 kmem_free(unlname, MAXNAMELEN); 4915 crfree(unlcred); 4916 VN_RELE(unldvp); 4917 /* 4918 * Try again; this time around r_unldvp will be NULL, so we'll 4919 * just call rp4_addfree() and return. 4920 */ 4921 goto redo; 4922 } 4923 4924 /* putfh directory */ 4925 argop[0].argop = OP_CPUTFH; 4926 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh; 4927 4928 /* remove */ 4929 argop[1].argop = OP_CREMOVE; 4930 argop[1].nfs_argop4_u.opcremove.ctarget = unlname; 4931 4932 doqueue = 1; 4933 resp = &res; 4934 4935 #if 0 /* notyet */ 4936 /* 4937 * Can't do this yet. We may be being called from 4938 * dnlc_purge_XXX while that routine is holding a 4939 * mutex lock to the nc_rele list. The calls to 4940 * nfs3_cache_wcc_data may result in calls to 4941 * dnlc_purge_XXX. This will result in a deadlock. 4942 */ 4943 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4944 if (e.error) { 4945 PURGE_ATTRCACHE4(unldvp); 4946 resp = NULL; 4947 } else if (res.status) { 4948 e.error = geterrno4(res.status); 4949 PURGE_ATTRCACHE4(unldvp); 4950 /* 4951 * This code is inactive right now 4952 * but if made active there should 4953 * be a nfs4_end_op() call before 4954 * nfs4_purge_stale_fh to avoid start_op() 4955 * deadlock. See BugId: 4948726 4956 */ 4957 nfs4_purge_stale_fh(error, unldvp, cr); 4958 } else { 4959 nfs_resop4 *resop; 4960 REMOVE4res *rm_res; 4961 4962 resop = &res.array[1]; 4963 rm_res = &resop->nfs_resop4_u.opremove; 4964 /* 4965 * Update directory cache attribute, 4966 * readdir and dnlc caches. 4967 */ 4968 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL); 4969 } 4970 #else 4971 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4972 4973 PURGE_ATTRCACHE4(unldvp); 4974 #endif 4975 4976 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) { 4977 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL, 4978 NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) { 4979 if (!e.error) 4980 (void) xdr_free(xdr_COMPOUND4res_clnt, 4981 (caddr_t)&res); 4982 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, 4983 &recov_state, TRUE); 4984 goto recov_retry_remove; 4985 } 4986 } 4987 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE); 4988 4989 /* 4990 * Release stuff held for the remove 4991 */ 4992 VN_RELE(unldvp); 4993 if (!e.error && resp) 4994 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4995 4996 kmem_free(unlname, MAXNAMELEN); 4997 crfree(unlcred); 4998 goto redo; 4999 } 5000 5001 /* 5002 * Remote file system operations having to do with directory manipulation. 5003 */ 5004 /* ARGSUSED3 */ 5005 int 5006 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 5007 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, 5008 int *direntflags, pathname_t *realpnp) 5009 { 5010 int error; 5011 vnode_t *vp, *avp = NULL; 5012 rnode4_t *drp; 5013 5014 *vpp = NULL; 5015 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 5016 return (EPERM); 5017 /* 5018 * if LOOKUP_XATTR, must replace dvp (object) with 5019 * object's attrdir before continuing with lookup 5020 */ 5021 if (flags & LOOKUP_XATTR) { 5022 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr); 5023 if (error) 5024 return (error); 5025 5026 dvp = avp; 5027 5028 /* 5029 * If lookup is for "", just return dvp now. The attrdir 5030 * has already been activated (from nfs4lookup_xattr), and 5031 * the caller will RELE the original dvp -- not 5032 * the attrdir. So, set vpp and return. 5033 * Currently, when the LOOKUP_XATTR flag is 5034 * passed to VOP_LOOKUP, the name is always empty, and 5035 * shortcircuiting here avoids 3 unneeded lock/unlock 5036 * pairs. 5037 * 5038 * If a non-empty name was provided, then it is the 5039 * attribute name, and it will be looked up below. 5040 */ 5041 if (*nm == '\0') { 5042 *vpp = dvp; 5043 return (0); 5044 } 5045 5046 /* 5047 * The vfs layer never sends a name when asking for the 5048 * attrdir, so we should never get here (unless of course 5049 * name is passed at some time in future -- at which time 5050 * we'll blow up here). 5051 */ 5052 ASSERT(0); 5053 } 5054 5055 drp = VTOR4(dvp); 5056 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 5057 return (EINTR); 5058 5059 error = nfs4lookup(dvp, nm, vpp, cr, 0); 5060 nfs_rw_exit(&drp->r_rwlock); 5061 5062 /* 5063 * If vnode is a device, create special vnode. 5064 */ 5065 if (!error && ISVDEV((*vpp)->v_type)) { 5066 vp = *vpp; 5067 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 5068 VN_RELE(vp); 5069 } 5070 5071 return (error); 5072 } 5073 5074 /* ARGSUSED */ 5075 static int 5076 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr) 5077 { 5078 int error; 5079 rnode4_t *drp; 5080 int cflag = ((flags & CREATE_XATTR_DIR) != 0); 5081 mntinfo4_t *mi; 5082 5083 mi = VTOMI4(dvp); 5084 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) && 5085 !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS)) 5086 return (EINVAL); 5087 5088 drp = VTOR4(dvp); 5089 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 5090 return (EINTR); 5091 5092 mutex_enter(&drp->r_statelock); 5093 /* 5094 * If the server doesn't support xattrs just return EINVAL 5095 */ 5096 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) { 5097 mutex_exit(&drp->r_statelock); 5098 nfs_rw_exit(&drp->r_rwlock); 5099 return (EINVAL); 5100 } 5101 5102 /* 5103 * If there is a cached xattr directory entry, 5104 * use it as long as the attributes are valid. If the 5105 * attributes are not valid, take the simple approach and 5106 * free the cached value and re-fetch a new value. 5107 * 5108 * We don't negative entry cache for now, if we did we 5109 * would need to check if the file has changed on every 5110 * lookup. But xattrs don't exist very often and failing 5111 * an openattr is not much more expensive than and NVERIFY or GETATTR 5112 * so do an openattr over the wire for now. 5113 */ 5114 if (drp->r_xattr_dir != NULL) { 5115 if (ATTRCACHE4_VALID(dvp)) { 5116 VN_HOLD(drp->r_xattr_dir); 5117 *vpp = drp->r_xattr_dir; 5118 mutex_exit(&drp->r_statelock); 5119 nfs_rw_exit(&drp->r_rwlock); 5120 return (0); 5121 } 5122 VN_RELE(drp->r_xattr_dir); 5123 drp->r_xattr_dir = NULL; 5124 } 5125 mutex_exit(&drp->r_statelock); 5126 5127 error = nfs4openattr(dvp, vpp, cflag, cr); 5128 5129 nfs_rw_exit(&drp->r_rwlock); 5130 5131 return (error); 5132 } 5133 5134 static int 5135 nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc) 5136 { 5137 int error; 5138 rnode4_t *drp; 5139 5140 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5141 5142 /* 5143 * If lookup is for "", just return dvp. Don't need 5144 * to send it over the wire, look it up in the dnlc, 5145 * or perform any access checks. 5146 */ 5147 if (*nm == '\0') { 5148 VN_HOLD(dvp); 5149 *vpp = dvp; 5150 return (0); 5151 } 5152 5153 /* 5154 * Can't do lookups in non-directories. 5155 */ 5156 if (dvp->v_type != VDIR) 5157 return (ENOTDIR); 5158 5159 /* 5160 * If lookup is for ".", just return dvp. Don't need 5161 * to send it over the wire or look it up in the dnlc, 5162 * just need to check access. 5163 */ 5164 if (nm[0] == '.' && nm[1] == '\0') { 5165 error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5166 if (error) 5167 return (error); 5168 VN_HOLD(dvp); 5169 *vpp = dvp; 5170 return (0); 5171 } 5172 5173 drp = VTOR4(dvp); 5174 if (!(drp->r_flags & R4LOOKUP)) { 5175 mutex_enter(&drp->r_statelock); 5176 drp->r_flags |= R4LOOKUP; 5177 mutex_exit(&drp->r_statelock); 5178 } 5179 5180 *vpp = NULL; 5181 /* 5182 * Lookup this name in the DNLC. If there is no entry 5183 * lookup over the wire. 5184 */ 5185 if (!skipdnlc) 5186 *vpp = dnlc_lookup(dvp, nm); 5187 if (*vpp == NULL) { 5188 /* 5189 * We need to go over the wire to lookup the name. 5190 */ 5191 return (nfs4lookupnew_otw(dvp, nm, vpp, cr)); 5192 } 5193 5194 /* 5195 * We hit on the dnlc 5196 */ 5197 if (*vpp != DNLC_NO_VNODE || 5198 (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) { 5199 /* 5200 * But our attrs may not be valid. 5201 */ 5202 if (ATTRCACHE4_VALID(dvp)) { 5203 error = nfs4_waitfor_purge_complete(dvp); 5204 if (error) { 5205 VN_RELE(*vpp); 5206 *vpp = NULL; 5207 return (error); 5208 } 5209 5210 /* 5211 * If after the purge completes, check to make sure 5212 * our attrs are still valid. 5213 */ 5214 if (ATTRCACHE4_VALID(dvp)) { 5215 /* 5216 * If we waited for a purge we may have 5217 * lost our vnode so look it up again. 5218 */ 5219 VN_RELE(*vpp); 5220 *vpp = dnlc_lookup(dvp, nm); 5221 if (*vpp == NULL) 5222 return (nfs4lookupnew_otw(dvp, 5223 nm, vpp, cr)); 5224 5225 /* 5226 * The access cache should almost always hit 5227 */ 5228 error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5229 5230 if (error) { 5231 VN_RELE(*vpp); 5232 *vpp = NULL; 5233 return (error); 5234 } 5235 if (*vpp == DNLC_NO_VNODE) { 5236 VN_RELE(*vpp); 5237 *vpp = NULL; 5238 return (ENOENT); 5239 } 5240 return (0); 5241 } 5242 } 5243 } 5244 5245 ASSERT(*vpp != NULL); 5246 5247 /* 5248 * We may have gotten here we have one of the following cases: 5249 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we 5250 * need to validate them. 5251 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always 5252 * must validate. 5253 * 5254 * Go to the server and check if the directory has changed, if 5255 * it hasn't we are done and can use the dnlc entry. 5256 */ 5257 return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr)); 5258 } 5259 5260 /* 5261 * Go to the server and check if the directory has changed, if 5262 * it hasn't we are done and can use the dnlc entry. If it 5263 * has changed we get a new copy of its attributes and check 5264 * the access for VEXEC, then relookup the filename and 5265 * get its filehandle and attributes. 5266 * 5267 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR 5268 * if the NVERIFY failed we must 5269 * purge the caches 5270 * cache new attributes (will set r_time_attr_inval) 5271 * cache new access 5272 * recheck VEXEC access 5273 * add name to dnlc, possibly negative 5274 * if LOOKUP succeeded 5275 * cache new attributes 5276 * else 5277 * set a new r_time_attr_inval for dvp 5278 * check to make sure we have access 5279 * 5280 * The vpp returned is the vnode passed in if the directory is valid, 5281 * a new vnode if successful lookup, or NULL on error. 5282 */ 5283 static int 5284 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5285 { 5286 COMPOUND4args_clnt args; 5287 COMPOUND4res_clnt res; 5288 fattr4 *ver_fattr; 5289 fattr4_change dchange; 5290 int32_t *ptr; 5291 int argoplist_size = 7 * sizeof (nfs_argop4); 5292 nfs_argop4 *argop; 5293 int doqueue; 5294 mntinfo4_t *mi; 5295 nfs4_recov_state_t recov_state; 5296 hrtime_t t; 5297 int isdotdot; 5298 vnode_t *nvp; 5299 nfs_fh4 *fhp; 5300 nfs4_sharedfh_t *sfhp; 5301 nfs4_access_type_t cacc; 5302 rnode4_t *nrp; 5303 rnode4_t *drp = VTOR4(dvp); 5304 nfs4_ga_res_t *garp = NULL; 5305 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5306 5307 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5308 ASSERT(nm != NULL); 5309 ASSERT(nm[0] != '\0'); 5310 ASSERT(dvp->v_type == VDIR); 5311 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5312 ASSERT(*vpp != NULL); 5313 5314 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') { 5315 isdotdot = 1; 5316 args.ctag = TAG_LOOKUP_VPARENT; 5317 } else { 5318 /* 5319 * If dvp were a stub, it should have triggered and caused 5320 * a mount for us to get this far. 5321 */ 5322 ASSERT(!RP_ISSTUB(VTOR4(dvp))); 5323 5324 isdotdot = 0; 5325 args.ctag = TAG_LOOKUP_VALID; 5326 } 5327 5328 mi = VTOMI4(dvp); 5329 recov_state.rs_flags = 0; 5330 recov_state.rs_num_retry_despite_err = 0; 5331 5332 nvp = NULL; 5333 5334 /* Save the original mount point security information */ 5335 (void) save_mnt_secinfo(mi->mi_curr_serv); 5336 5337 recov_retry: 5338 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5339 &recov_state, NULL); 5340 if (e.error) { 5341 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5342 VN_RELE(*vpp); 5343 *vpp = NULL; 5344 return (e.error); 5345 } 5346 5347 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5348 5349 /* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */ 5350 args.array_len = 7; 5351 args.array = argop; 5352 5353 /* 0. putfh file */ 5354 argop[0].argop = OP_CPUTFH; 5355 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5356 5357 /* 1. nverify the change info */ 5358 argop[1].argop = OP_NVERIFY; 5359 ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes; 5360 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5361 ver_fattr->attrlist4 = (char *)&dchange; 5362 ptr = (int32_t *)&dchange; 5363 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5364 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5365 5366 /* 2. getattr directory */ 5367 argop[2].argop = OP_GETATTR; 5368 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5369 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5370 5371 /* 3. access directory */ 5372 argop[3].argop = OP_ACCESS; 5373 argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5374 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5375 5376 /* 4. lookup name */ 5377 if (isdotdot) { 5378 argop[4].argop = OP_LOOKUPP; 5379 } else { 5380 argop[4].argop = OP_CLOOKUP; 5381 argop[4].nfs_argop4_u.opclookup.cname = nm; 5382 } 5383 5384 /* 5. resulting file handle */ 5385 argop[5].argop = OP_GETFH; 5386 5387 /* 6. resulting file attributes */ 5388 argop[6].argop = OP_GETATTR; 5389 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5390 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5391 5392 doqueue = 1; 5393 t = gethrtime(); 5394 5395 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5396 5397 if (!isdotdot && res.status == NFS4ERR_MOVED) { 5398 e.error = nfs4_setup_referral(dvp, nm, vpp, cr); 5399 if (e.error != 0 && *vpp != NULL) 5400 VN_RELE(*vpp); 5401 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5402 &recov_state, FALSE); 5403 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5404 kmem_free(argop, argoplist_size); 5405 return (e.error); 5406 } 5407 5408 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5409 /* 5410 * For WRONGSEC of a non-dotdot case, send secinfo directly 5411 * from this thread, do not go thru the recovery thread since 5412 * we need the nm information. 5413 * 5414 * Not doing dotdot case because there is no specification 5415 * for (PUTFH, SECINFO "..") yet. 5416 */ 5417 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5418 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) 5419 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5420 &recov_state, FALSE); 5421 else 5422 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5423 &recov_state, TRUE); 5424 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5425 kmem_free(argop, argoplist_size); 5426 if (!e.error) 5427 goto recov_retry; 5428 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5429 VN_RELE(*vpp); 5430 *vpp = NULL; 5431 return (e.error); 5432 } 5433 5434 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5435 OP_LOOKUP, NULL, NULL, NULL) == FALSE) { 5436 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5437 &recov_state, TRUE); 5438 5439 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5440 kmem_free(argop, argoplist_size); 5441 goto recov_retry; 5442 } 5443 } 5444 5445 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5446 5447 if (e.error || res.array_len == 0) { 5448 /* 5449 * If e.error isn't set, then reply has no ops (or we couldn't 5450 * be here). The only legal way to reply without an op array 5451 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5452 * be in the reply for all other status values. 5453 * 5454 * For valid replies without an ops array, return ENOTSUP 5455 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5456 * return EIO -- don't trust status. 5457 */ 5458 if (e.error == 0) 5459 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5460 ENOTSUP : EIO; 5461 VN_RELE(*vpp); 5462 *vpp = NULL; 5463 kmem_free(argop, argoplist_size); 5464 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5465 return (e.error); 5466 } 5467 5468 if (res.status != NFS4ERR_SAME) { 5469 e.error = geterrno4(res.status); 5470 5471 /* 5472 * The NVERIFY "failed" so the directory has changed 5473 * First make sure PUTFH succeeded and NVERIFY "failed" 5474 * cleanly. 5475 */ 5476 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5477 (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) { 5478 nfs4_purge_stale_fh(e.error, dvp, cr); 5479 VN_RELE(*vpp); 5480 *vpp = NULL; 5481 goto exit; 5482 } 5483 5484 /* 5485 * We know the NVERIFY "failed" so we must: 5486 * purge the caches (access and indirectly dnlc if needed) 5487 */ 5488 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5489 5490 if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5491 nfs4_purge_stale_fh(e.error, dvp, cr); 5492 VN_RELE(*vpp); 5493 *vpp = NULL; 5494 goto exit; 5495 } 5496 5497 /* 5498 * Install new cached attributes for the directory 5499 */ 5500 nfs4_attr_cache(dvp, 5501 &res.array[2].nfs_resop4_u.opgetattr.ga_res, 5502 t, cr, FALSE, NULL); 5503 5504 if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) { 5505 nfs4_purge_stale_fh(e.error, dvp, cr); 5506 VN_RELE(*vpp); 5507 *vpp = NULL; 5508 e.error = geterrno4(res.status); 5509 goto exit; 5510 } 5511 5512 /* 5513 * Now we know the directory is valid, 5514 * cache new directory access 5515 */ 5516 nfs4_access_cache(drp, 5517 args.array[3].nfs_argop4_u.opaccess.access, 5518 res.array[3].nfs_resop4_u.opaccess.access, cr); 5519 5520 /* 5521 * recheck VEXEC access 5522 */ 5523 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr); 5524 if (cacc != NFS4_ACCESS_ALLOWED) { 5525 /* 5526 * Directory permissions might have been revoked 5527 */ 5528 if (cacc == NFS4_ACCESS_DENIED) { 5529 e.error = EACCES; 5530 VN_RELE(*vpp); 5531 *vpp = NULL; 5532 goto exit; 5533 } 5534 5535 /* 5536 * Somehow we must not have asked for enough 5537 * so try a singleton ACCESS, should never happen. 5538 */ 5539 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5540 if (e.error) { 5541 VN_RELE(*vpp); 5542 *vpp = NULL; 5543 goto exit; 5544 } 5545 } 5546 5547 e.error = geterrno4(res.status); 5548 if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) { 5549 /* 5550 * The lookup failed, probably no entry 5551 */ 5552 if (e.error == ENOENT && nfs4_lookup_neg_cache) { 5553 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5554 } else { 5555 /* 5556 * Might be some other error, so remove 5557 * the dnlc entry to make sure we start all 5558 * over again, next time. 5559 */ 5560 dnlc_remove(dvp, nm); 5561 } 5562 VN_RELE(*vpp); 5563 *vpp = NULL; 5564 goto exit; 5565 } 5566 5567 if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5568 /* 5569 * The file exists but we can't get its fh for 5570 * some unknown reason. Remove it from the dnlc 5571 * and error out to be safe. 5572 */ 5573 dnlc_remove(dvp, nm); 5574 VN_RELE(*vpp); 5575 *vpp = NULL; 5576 goto exit; 5577 } 5578 fhp = &res.array[5].nfs_resop4_u.opgetfh.object; 5579 if (fhp->nfs_fh4_len == 0) { 5580 /* 5581 * The file exists but a bogus fh 5582 * some unknown reason. Remove it from the dnlc 5583 * and error out to be safe. 5584 */ 5585 e.error = ENOENT; 5586 dnlc_remove(dvp, nm); 5587 VN_RELE(*vpp); 5588 *vpp = NULL; 5589 goto exit; 5590 } 5591 sfhp = sfh4_get(fhp, mi); 5592 5593 if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK) 5594 garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 5595 5596 /* 5597 * Make the new rnode 5598 */ 5599 if (isdotdot) { 5600 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 5601 if (e.error) { 5602 sfh4_rele(&sfhp); 5603 VN_RELE(*vpp); 5604 *vpp = NULL; 5605 goto exit; 5606 } 5607 /* 5608 * XXX if nfs4_make_dotdot uses an existing rnode 5609 * XXX it doesn't update the attributes. 5610 * XXX for now just save them again to save an OTW 5611 */ 5612 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 5613 } else { 5614 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 5615 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 5616 /* 5617 * If v_type == VNON, then garp was NULL because 5618 * the last op in the compound failed and makenfs4node 5619 * could not find the vnode for sfhp. It created 5620 * a new vnode, so we have nothing to purge here. 5621 */ 5622 if (nvp->v_type == VNON) { 5623 vattr_t vattr; 5624 5625 vattr.va_mask = AT_TYPE; 5626 /* 5627 * N.B. We've already called nfs4_end_fop above. 5628 */ 5629 e.error = nfs4getattr(nvp, &vattr, cr); 5630 if (e.error) { 5631 sfh4_rele(&sfhp); 5632 VN_RELE(*vpp); 5633 *vpp = NULL; 5634 VN_RELE(nvp); 5635 goto exit; 5636 } 5637 nvp->v_type = vattr.va_type; 5638 } 5639 } 5640 sfh4_rele(&sfhp); 5641 5642 nrp = VTOR4(nvp); 5643 mutex_enter(&nrp->r_statev4_lock); 5644 if (!nrp->created_v4) { 5645 mutex_exit(&nrp->r_statev4_lock); 5646 dnlc_update(dvp, nm, nvp); 5647 } else 5648 mutex_exit(&nrp->r_statev4_lock); 5649 5650 VN_RELE(*vpp); 5651 *vpp = nvp; 5652 } else { 5653 hrtime_t now; 5654 hrtime_t delta = 0; 5655 5656 e.error = 0; 5657 5658 /* 5659 * Because the NVERIFY "succeeded" we know that the 5660 * directory attributes are still valid 5661 * so update r_time_attr_inval 5662 */ 5663 now = gethrtime(); 5664 mutex_enter(&drp->r_statelock); 5665 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5666 delta = now - drp->r_time_attr_saved; 5667 if (delta < mi->mi_acdirmin) 5668 delta = mi->mi_acdirmin; 5669 else if (delta > mi->mi_acdirmax) 5670 delta = mi->mi_acdirmax; 5671 } 5672 drp->r_time_attr_inval = now + delta; 5673 mutex_exit(&drp->r_statelock); 5674 dnlc_update(dvp, nm, *vpp); 5675 5676 /* 5677 * Even though we have a valid directory attr cache 5678 * and dnlc entry, we may not have access. 5679 * This should almost always hit the cache. 5680 */ 5681 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5682 if (e.error) { 5683 VN_RELE(*vpp); 5684 *vpp = NULL; 5685 } 5686 5687 if (*vpp == DNLC_NO_VNODE) { 5688 VN_RELE(*vpp); 5689 *vpp = NULL; 5690 e.error = ENOENT; 5691 } 5692 } 5693 5694 exit: 5695 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5696 kmem_free(argop, argoplist_size); 5697 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5698 return (e.error); 5699 } 5700 5701 /* 5702 * We need to go over the wire to lookup the name, but 5703 * while we are there verify the directory has not 5704 * changed but if it has, get new attributes and check access 5705 * 5706 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH 5707 * NVERIFY GETATTR ACCESS 5708 * 5709 * With the results: 5710 * if the NVERIFY failed we must purge the caches, add new attributes, 5711 * and cache new access. 5712 * set a new r_time_attr_inval 5713 * add name to dnlc, possibly negative 5714 * if LOOKUP succeeded 5715 * cache new attributes 5716 */ 5717 static int 5718 nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5719 { 5720 COMPOUND4args_clnt args; 5721 COMPOUND4res_clnt res; 5722 fattr4 *ver_fattr; 5723 fattr4_change dchange; 5724 int32_t *ptr; 5725 nfs4_ga_res_t *garp = NULL; 5726 int argoplist_size = 9 * sizeof (nfs_argop4); 5727 nfs_argop4 *argop; 5728 int doqueue; 5729 mntinfo4_t *mi; 5730 nfs4_recov_state_t recov_state; 5731 hrtime_t t; 5732 int isdotdot; 5733 vnode_t *nvp; 5734 nfs_fh4 *fhp; 5735 nfs4_sharedfh_t *sfhp; 5736 nfs4_access_type_t cacc; 5737 rnode4_t *nrp; 5738 rnode4_t *drp = VTOR4(dvp); 5739 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5740 5741 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5742 ASSERT(nm != NULL); 5743 ASSERT(nm[0] != '\0'); 5744 ASSERT(dvp->v_type == VDIR); 5745 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5746 ASSERT(*vpp == NULL); 5747 5748 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') { 5749 isdotdot = 1; 5750 args.ctag = TAG_LOOKUP_PARENT; 5751 } else { 5752 /* 5753 * If dvp were a stub, it should have triggered and caused 5754 * a mount for us to get this far. 5755 */ 5756 ASSERT(!RP_ISSTUB(VTOR4(dvp))); 5757 5758 isdotdot = 0; 5759 args.ctag = TAG_LOOKUP; 5760 } 5761 5762 mi = VTOMI4(dvp); 5763 recov_state.rs_flags = 0; 5764 recov_state.rs_num_retry_despite_err = 0; 5765 5766 nvp = NULL; 5767 5768 /* Save the original mount point security information */ 5769 (void) save_mnt_secinfo(mi->mi_curr_serv); 5770 5771 recov_retry: 5772 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5773 &recov_state, NULL); 5774 if (e.error) { 5775 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5776 return (e.error); 5777 } 5778 5779 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5780 5781 /* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */ 5782 args.array_len = 9; 5783 args.array = argop; 5784 5785 /* 0. putfh file */ 5786 argop[0].argop = OP_CPUTFH; 5787 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5788 5789 /* 1. savefh for the nverify */ 5790 argop[1].argop = OP_SAVEFH; 5791 5792 /* 2. lookup name */ 5793 if (isdotdot) { 5794 argop[2].argop = OP_LOOKUPP; 5795 } else { 5796 argop[2].argop = OP_CLOOKUP; 5797 argop[2].nfs_argop4_u.opclookup.cname = nm; 5798 } 5799 5800 /* 3. resulting file handle */ 5801 argop[3].argop = OP_GETFH; 5802 5803 /* 4. resulting file attributes */ 5804 argop[4].argop = OP_GETATTR; 5805 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5806 argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5807 5808 /* 5. restorefh back the directory for the nverify */ 5809 argop[5].argop = OP_RESTOREFH; 5810 5811 /* 6. nverify the change info */ 5812 argop[6].argop = OP_NVERIFY; 5813 ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes; 5814 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5815 ver_fattr->attrlist4 = (char *)&dchange; 5816 ptr = (int32_t *)&dchange; 5817 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5818 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5819 5820 /* 7. getattr directory */ 5821 argop[7].argop = OP_GETATTR; 5822 argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5823 argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5824 5825 /* 8. access directory */ 5826 argop[8].argop = OP_ACCESS; 5827 argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5828 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5829 5830 doqueue = 1; 5831 t = gethrtime(); 5832 5833 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5834 5835 if (!isdotdot && res.status == NFS4ERR_MOVED) { 5836 e.error = nfs4_setup_referral(dvp, nm, vpp, cr); 5837 if (e.error != 0 && *vpp != NULL) 5838 VN_RELE(*vpp); 5839 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5840 &recov_state, FALSE); 5841 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5842 kmem_free(argop, argoplist_size); 5843 return (e.error); 5844 } 5845 5846 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5847 /* 5848 * For WRONGSEC of a non-dotdot case, send secinfo directly 5849 * from this thread, do not go thru the recovery thread since 5850 * we need the nm information. 5851 * 5852 * Not doing dotdot case because there is no specification 5853 * for (PUTFH, SECINFO "..") yet. 5854 */ 5855 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5856 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) 5857 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5858 &recov_state, FALSE); 5859 else 5860 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5861 &recov_state, TRUE); 5862 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5863 kmem_free(argop, argoplist_size); 5864 if (!e.error) 5865 goto recov_retry; 5866 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5867 return (e.error); 5868 } 5869 5870 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5871 OP_LOOKUP, NULL, NULL, NULL) == FALSE) { 5872 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5873 &recov_state, TRUE); 5874 5875 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5876 kmem_free(argop, argoplist_size); 5877 goto recov_retry; 5878 } 5879 } 5880 5881 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5882 5883 if (e.error || res.array_len == 0) { 5884 /* 5885 * If e.error isn't set, then reply has no ops (or we couldn't 5886 * be here). The only legal way to reply without an op array 5887 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5888 * be in the reply for all other status values. 5889 * 5890 * For valid replies without an ops array, return ENOTSUP 5891 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5892 * return EIO -- don't trust status. 5893 */ 5894 if (e.error == 0) 5895 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5896 ENOTSUP : EIO; 5897 5898 kmem_free(argop, argoplist_size); 5899 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5900 return (e.error); 5901 } 5902 5903 e.error = geterrno4(res.status); 5904 5905 /* 5906 * The PUTFH and SAVEFH may have failed. 5907 */ 5908 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5909 (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) { 5910 nfs4_purge_stale_fh(e.error, dvp, cr); 5911 goto exit; 5912 } 5913 5914 /* 5915 * Check if the file exists, if it does delay entering 5916 * into the dnlc until after we update the directory 5917 * attributes so we don't cause it to get purged immediately. 5918 */ 5919 if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) { 5920 /* 5921 * The lookup failed, probably no entry 5922 */ 5923 if (e.error == ENOENT && nfs4_lookup_neg_cache) 5924 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5925 goto exit; 5926 } 5927 5928 if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5929 /* 5930 * The file exists but we can't get its fh for 5931 * some unknown reason. Error out to be safe. 5932 */ 5933 goto exit; 5934 } 5935 5936 fhp = &res.array[3].nfs_resop4_u.opgetfh.object; 5937 if (fhp->nfs_fh4_len == 0) { 5938 /* 5939 * The file exists but a bogus fh 5940 * some unknown reason. Error out to be safe. 5941 */ 5942 e.error = EIO; 5943 goto exit; 5944 } 5945 sfhp = sfh4_get(fhp, mi); 5946 5947 if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5948 sfh4_rele(&sfhp); 5949 goto exit; 5950 } 5951 garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 5952 5953 /* 5954 * The RESTOREFH may have failed 5955 */ 5956 if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) { 5957 sfh4_rele(&sfhp); 5958 e.error = EIO; 5959 goto exit; 5960 } 5961 5962 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) { 5963 /* 5964 * First make sure the NVERIFY failed as we expected, 5965 * if it didn't then be conservative and error out 5966 * as we can't trust the directory. 5967 */ 5968 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) { 5969 sfh4_rele(&sfhp); 5970 e.error = EIO; 5971 goto exit; 5972 } 5973 5974 /* 5975 * We know the NVERIFY "failed" so the directory has changed, 5976 * so we must: 5977 * purge the caches (access and indirectly dnlc if needed) 5978 */ 5979 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5980 5981 if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5982 sfh4_rele(&sfhp); 5983 goto exit; 5984 } 5985 nfs4_attr_cache(dvp, 5986 &res.array[7].nfs_resop4_u.opgetattr.ga_res, 5987 t, cr, FALSE, NULL); 5988 5989 if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) { 5990 nfs4_purge_stale_fh(e.error, dvp, cr); 5991 sfh4_rele(&sfhp); 5992 e.error = geterrno4(res.status); 5993 goto exit; 5994 } 5995 5996 /* 5997 * Now we know the directory is valid, 5998 * cache new directory access 5999 */ 6000 nfs4_access_cache(drp, 6001 args.array[8].nfs_argop4_u.opaccess.access, 6002 res.array[8].nfs_resop4_u.opaccess.access, cr); 6003 6004 /* 6005 * recheck VEXEC access 6006 */ 6007 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr); 6008 if (cacc != NFS4_ACCESS_ALLOWED) { 6009 /* 6010 * Directory permissions might have been revoked 6011 */ 6012 if (cacc == NFS4_ACCESS_DENIED) { 6013 sfh4_rele(&sfhp); 6014 e.error = EACCES; 6015 goto exit; 6016 } 6017 6018 /* 6019 * Somehow we must not have asked for enough 6020 * so try a singleton ACCESS should never happen 6021 */ 6022 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 6023 if (e.error) { 6024 sfh4_rele(&sfhp); 6025 goto exit; 6026 } 6027 } 6028 6029 e.error = geterrno4(res.status); 6030 } else { 6031 hrtime_t now; 6032 hrtime_t delta = 0; 6033 6034 e.error = 0; 6035 6036 /* 6037 * Because the NVERIFY "succeeded" we know that the 6038 * directory attributes are still valid 6039 * so update r_time_attr_inval 6040 */ 6041 now = gethrtime(); 6042 mutex_enter(&drp->r_statelock); 6043 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 6044 delta = now - drp->r_time_attr_saved; 6045 if (delta < mi->mi_acdirmin) 6046 delta = mi->mi_acdirmin; 6047 else if (delta > mi->mi_acdirmax) 6048 delta = mi->mi_acdirmax; 6049 } 6050 drp->r_time_attr_inval = now + delta; 6051 mutex_exit(&drp->r_statelock); 6052 6053 /* 6054 * Even though we have a valid directory attr cache, 6055 * we may not have access. 6056 * This should almost always hit the cache. 6057 */ 6058 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 6059 if (e.error) { 6060 sfh4_rele(&sfhp); 6061 goto exit; 6062 } 6063 } 6064 6065 /* 6066 * Now we have successfully completed the lookup, if the 6067 * directory has changed we now have the valid attributes. 6068 * We also know we have directory access. 6069 * Create the new rnode and insert it in the dnlc. 6070 */ 6071 if (isdotdot) { 6072 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 6073 if (e.error) { 6074 sfh4_rele(&sfhp); 6075 goto exit; 6076 } 6077 /* 6078 * XXX if nfs4_make_dotdot uses an existing rnode 6079 * XXX it doesn't update the attributes. 6080 * XXX for now just save them again to save an OTW 6081 */ 6082 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 6083 } else { 6084 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 6085 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 6086 } 6087 sfh4_rele(&sfhp); 6088 6089 nrp = VTOR4(nvp); 6090 mutex_enter(&nrp->r_statev4_lock); 6091 if (!nrp->created_v4) { 6092 mutex_exit(&nrp->r_statev4_lock); 6093 dnlc_update(dvp, nm, nvp); 6094 } else 6095 mutex_exit(&nrp->r_statev4_lock); 6096 6097 *vpp = nvp; 6098 6099 exit: 6100 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6101 kmem_free(argop, argoplist_size); 6102 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 6103 return (e.error); 6104 } 6105 6106 #ifdef DEBUG 6107 void 6108 nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt) 6109 { 6110 uint_t i, len; 6111 zoneid_t zoneid = getzoneid(); 6112 char *s; 6113 6114 zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where); 6115 for (i = 0; i < argcnt; i++) { 6116 nfs_argop4 *op = &argbase[i]; 6117 switch (op->argop) { 6118 case OP_CPUTFH: 6119 case OP_PUTFH: 6120 zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i); 6121 break; 6122 case OP_PUTROOTFH: 6123 zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i); 6124 break; 6125 case OP_CLOOKUP: 6126 s = op->nfs_argop4_u.opclookup.cname; 6127 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s); 6128 break; 6129 case OP_LOOKUP: 6130 s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname, 6131 &len, NULL); 6132 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s); 6133 kmem_free(s, len); 6134 break; 6135 case OP_LOOKUPP: 6136 zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i); 6137 break; 6138 case OP_GETFH: 6139 zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i); 6140 break; 6141 case OP_GETATTR: 6142 zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i); 6143 break; 6144 case OP_OPENATTR: 6145 zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i); 6146 break; 6147 default: 6148 zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", i, 6149 op->argop); 6150 break; 6151 } 6152 } 6153 } 6154 #endif 6155 6156 /* 6157 * nfs4lookup_setup - constructs a multi-lookup compound request. 6158 * 6159 * Given the path "nm1/nm2/.../nmn", the following compound requests 6160 * may be created: 6161 * 6162 * Note: Getfh is not be needed because filehandle attr is mandatory, but it 6163 * is faster, for now. 6164 * 6165 * l4_getattrs indicates the type of compound requested. 6166 * 6167 * LKP4_NO_ATTRIBUTE - no attributes (used by secinfo): 6168 * 6169 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} } 6170 * 6171 * total number of ops is n + 1. 6172 * 6173 * LKP4_LAST_NAMED_ATTR - multi-component path for a named 6174 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR 6175 * before the last component, and only get attributes 6176 * for the last component. Note that the second-to-last 6177 * pathname component is XATTR_RPATH, which does NOT go 6178 * over-the-wire as a lookup. 6179 * 6180 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2}; 6181 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr } 6182 * 6183 * and total number of ops is n + 5. 6184 * 6185 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named 6186 * attribute directory: create lookups plus an OPENATTR 6187 * replacing the last lookup. Note that the last pathname 6188 * component is XATTR_RPATH, which does NOT go over-the-wire 6189 * as a lookup. 6190 * 6191 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr; 6192 * Openattr; Getfh; Getattr } 6193 * 6194 * and total number of ops is n + 5. 6195 * 6196 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate 6197 * nodes too. 6198 * 6199 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr; 6200 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr } 6201 * 6202 * and total number of ops is 3*n + 1. 6203 * 6204 * All cases: returns the index in the arg array of the final LOOKUP op, or 6205 * -1 if no LOOKUPs were used. 6206 */ 6207 int 6208 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh) 6209 { 6210 enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs; 6211 nfs_argop4 *argbase, *argop; 6212 int arglen, argcnt; 6213 int n = 1; /* number of components */ 6214 int nga = 1; /* number of Getattr's in request */ 6215 char c = '\0', *s, *p; 6216 int lookup_idx = -1; 6217 int argoplist_size; 6218 6219 /* set lookuparg response result to 0 */ 6220 lookupargp->resp->status = NFS4_OK; 6221 6222 /* skip leading "/" or "." e.g. ".//./" if there is */ 6223 for (; ; nm++) { 6224 if (*nm != '/' && *nm != '.') 6225 break; 6226 6227 /* ".." is counted as 1 component */ 6228 if (*nm == '.' && *(nm + 1) != '/') 6229 break; 6230 } 6231 6232 /* 6233 * Find n = number of components - nm must be null terminated 6234 * Skip "." components. 6235 */ 6236 if (*nm != '\0') 6237 for (n = 1, s = nm; *s != '\0'; s++) { 6238 if ((*s == '/') && (*(s + 1) != '/') && 6239 (*(s + 1) != '\0') && 6240 !(*(s + 1) == '.' && (*(s + 2) == '/' || 6241 *(s + 2) == '\0'))) 6242 n++; 6243 } 6244 else 6245 n = 0; 6246 6247 /* 6248 * nga is number of components that need Getfh+Getattr 6249 */ 6250 switch (l4_getattrs) { 6251 case LKP4_NO_ATTRIBUTES: 6252 nga = 0; 6253 break; 6254 case LKP4_ALL_ATTRIBUTES: 6255 nga = n; 6256 /* 6257 * Always have at least 1 getfh, getattr pair 6258 */ 6259 if (nga == 0) 6260 nga++; 6261 break; 6262 case LKP4_LAST_ATTRDIR: 6263 case LKP4_LAST_NAMED_ATTR: 6264 nga = n+1; 6265 break; 6266 } 6267 6268 /* 6269 * If change to use the filehandle attr instead of getfh 6270 * the following line can be deleted. 6271 */ 6272 nga *= 2; 6273 6274 /* 6275 * calculate number of ops in request as 6276 * header + trailer + lookups + getattrs 6277 */ 6278 arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga; 6279 6280 argoplist_size = arglen * sizeof (nfs_argop4); 6281 argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP); 6282 lookupargp->argsp->array = argop; 6283 6284 argcnt = lookupargp->header_len; 6285 argop += argcnt; 6286 6287 /* 6288 * loop and create a lookup op and possibly getattr/getfh for 6289 * each component. Skip "." components. 6290 */ 6291 for (s = nm; *s != '\0'; s = p) { 6292 /* 6293 * Set up a pathname struct for each component if needed 6294 */ 6295 while (*s == '/') 6296 s++; 6297 if (*s == '\0') 6298 break; 6299 6300 for (p = s; (*p != '/') && (*p != '\0'); p++) 6301 ; 6302 c = *p; 6303 *p = '\0'; 6304 6305 if (s[0] == '.' && s[1] == '\0') { 6306 *p = c; 6307 continue; 6308 } 6309 if (l4_getattrs == LKP4_LAST_ATTRDIR && 6310 strcmp(s, XATTR_RPATH) == 0) { 6311 /* getfh XXX may not be needed in future */ 6312 argop->argop = OP_GETFH; 6313 argop++; 6314 argcnt++; 6315 6316 /* getattr */ 6317 argop->argop = OP_GETATTR; 6318 argop->nfs_argop4_u.opgetattr.attr_request = 6319 lookupargp->ga_bits; 6320 argop->nfs_argop4_u.opgetattr.mi = 6321 lookupargp->mi; 6322 argop++; 6323 argcnt++; 6324 6325 /* openattr */ 6326 argop->argop = OP_OPENATTR; 6327 } else if (l4_getattrs == LKP4_LAST_NAMED_ATTR && 6328 strcmp(s, XATTR_RPATH) == 0) { 6329 /* openattr */ 6330 argop->argop = OP_OPENATTR; 6331 argop++; 6332 argcnt++; 6333 6334 /* getfh XXX may not be needed in future */ 6335 argop->argop = OP_GETFH; 6336 argop++; 6337 argcnt++; 6338 6339 /* getattr */ 6340 argop->argop = OP_GETATTR; 6341 argop->nfs_argop4_u.opgetattr.attr_request = 6342 lookupargp->ga_bits; 6343 argop->nfs_argop4_u.opgetattr.mi = 6344 lookupargp->mi; 6345 argop++; 6346 argcnt++; 6347 *p = c; 6348 continue; 6349 } else if (s[0] == '.' && s[1] == '.' && s[2] == '\0') { 6350 /* lookupp */ 6351 argop->argop = OP_LOOKUPP; 6352 } else { 6353 /* lookup */ 6354 argop->argop = OP_LOOKUP; 6355 (void) str_to_utf8(s, 6356 &argop->nfs_argop4_u.oplookup.objname); 6357 } 6358 lookup_idx = argcnt; 6359 argop++; 6360 argcnt++; 6361 6362 *p = c; 6363 6364 if (l4_getattrs == LKP4_ALL_ATTRIBUTES) { 6365 /* getfh XXX may not be needed in future */ 6366 argop->argop = OP_GETFH; 6367 argop++; 6368 argcnt++; 6369 6370 /* getattr */ 6371 argop->argop = OP_GETATTR; 6372 argop->nfs_argop4_u.opgetattr.attr_request = 6373 lookupargp->ga_bits; 6374 argop->nfs_argop4_u.opgetattr.mi = 6375 lookupargp->mi; 6376 argop++; 6377 argcnt++; 6378 } 6379 } 6380 6381 if ((l4_getattrs != LKP4_NO_ATTRIBUTES) && 6382 ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) { 6383 if (needgetfh) { 6384 /* stick in a post-lookup getfh */ 6385 argop->argop = OP_GETFH; 6386 argcnt++; 6387 argop++; 6388 } 6389 /* post-lookup getattr */ 6390 argop->argop = OP_GETATTR; 6391 argop->nfs_argop4_u.opgetattr.attr_request = 6392 lookupargp->ga_bits; 6393 argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi; 6394 argcnt++; 6395 } 6396 argcnt += lookupargp->trailer_len; /* actual op count */ 6397 lookupargp->argsp->array_len = argcnt; 6398 lookupargp->arglen = arglen; 6399 6400 #ifdef DEBUG 6401 if (nfs4_client_lookup_debug) 6402 nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt); 6403 #endif 6404 6405 return (lookup_idx); 6406 } 6407 6408 static int 6409 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr) 6410 { 6411 COMPOUND4args_clnt args; 6412 COMPOUND4res_clnt res; 6413 GETFH4res *gf_res = NULL; 6414 nfs_argop4 argop[4]; 6415 nfs_resop4 *resop = NULL; 6416 nfs4_sharedfh_t *sfhp; 6417 hrtime_t t; 6418 nfs4_error_t e; 6419 6420 rnode4_t *drp; 6421 int doqueue = 1; 6422 vnode_t *vp; 6423 int needrecov = 0; 6424 nfs4_recov_state_t recov_state; 6425 6426 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 6427 6428 *avp = NULL; 6429 recov_state.rs_flags = 0; 6430 recov_state.rs_num_retry_despite_err = 0; 6431 6432 recov_retry: 6433 /* COMPOUND: putfh, openattr, getfh, getattr */ 6434 args.array_len = 4; 6435 args.array = argop; 6436 args.ctag = TAG_OPENATTR; 6437 6438 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 6439 if (e.error) 6440 return (e.error); 6441 6442 drp = VTOR4(dvp); 6443 6444 /* putfh */ 6445 argop[0].argop = OP_CPUTFH; 6446 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6447 6448 /* openattr */ 6449 argop[1].argop = OP_OPENATTR; 6450 argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE); 6451 6452 /* getfh */ 6453 argop[2].argop = OP_GETFH; 6454 6455 /* getattr */ 6456 argop[3].argop = OP_GETATTR; 6457 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6458 argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 6459 6460 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 6461 "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first", 6462 rnode4info(drp))); 6463 6464 t = gethrtime(); 6465 6466 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 6467 6468 needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp); 6469 if (needrecov) { 6470 bool_t abort; 6471 6472 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 6473 "nfs4openattr: initiating recovery\n")); 6474 6475 abort = nfs4_start_recovery(&e, 6476 VTOMI4(dvp), dvp, NULL, NULL, NULL, 6477 OP_OPENATTR, NULL, NULL, NULL); 6478 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6479 if (!e.error) { 6480 e.error = geterrno4(res.status); 6481 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6482 } 6483 if (abort == FALSE) 6484 goto recov_retry; 6485 return (e.error); 6486 } 6487 6488 if (e.error) { 6489 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6490 return (e.error); 6491 } 6492 6493 if (res.status) { 6494 /* 6495 * If OTW errro is NOTSUPP, then it should be 6496 * translated to EINVAL. All Solaris file system 6497 * implementations return EINVAL to the syscall layer 6498 * when the attrdir cannot be created due to an 6499 * implementation restriction or noxattr mount option. 6500 */ 6501 if (res.status == NFS4ERR_NOTSUPP) { 6502 mutex_enter(&drp->r_statelock); 6503 if (drp->r_xattr_dir) 6504 VN_RELE(drp->r_xattr_dir); 6505 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP); 6506 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP; 6507 mutex_exit(&drp->r_statelock); 6508 6509 e.error = EINVAL; 6510 } else { 6511 e.error = geterrno4(res.status); 6512 } 6513 6514 if (e.error) { 6515 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6516 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 6517 needrecov); 6518 return (e.error); 6519 } 6520 } 6521 6522 resop = &res.array[0]; /* putfh res */ 6523 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK); 6524 6525 resop = &res.array[1]; /* openattr res */ 6526 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK); 6527 6528 resop = &res.array[2]; /* getfh res */ 6529 gf_res = &resop->nfs_resop4_u.opgetfh; 6530 if (gf_res->object.nfs_fh4_len == 0) { 6531 *avp = NULL; 6532 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6533 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6534 return (ENOENT); 6535 } 6536 6537 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp)); 6538 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res, 6539 dvp->v_vfsp, t, cr, dvp, 6540 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp)); 6541 sfh4_rele(&sfhp); 6542 6543 if (e.error) 6544 PURGE_ATTRCACHE4(vp); 6545 6546 mutex_enter(&vp->v_lock); 6547 vp->v_flag |= V_XATTRDIR; 6548 mutex_exit(&vp->v_lock); 6549 6550 *avp = vp; 6551 6552 mutex_enter(&drp->r_statelock); 6553 if (drp->r_xattr_dir) 6554 VN_RELE(drp->r_xattr_dir); 6555 VN_HOLD(vp); 6556 drp->r_xattr_dir = vp; 6557 6558 /* 6559 * Invalidate pathconf4 cache because r_xattr_dir is no longer 6560 * NULL. xattrs could be created at any time, and we have no 6561 * way to update pc4_xattr_exists in the base object if/when 6562 * it happens. 6563 */ 6564 drp->r_pathconf.pc4_xattr_valid = 0; 6565 6566 mutex_exit(&drp->r_statelock); 6567 6568 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6569 6570 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6571 6572 return (0); 6573 } 6574 6575 /* ARGSUSED */ 6576 static int 6577 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 6578 int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct, 6579 vsecattr_t *vsecp) 6580 { 6581 int error; 6582 vnode_t *vp = NULL; 6583 rnode4_t *rp; 6584 struct vattr vattr; 6585 rnode4_t *drp; 6586 vnode_t *tempvp; 6587 enum createmode4 createmode; 6588 bool_t must_trunc = FALSE; 6589 int truncating = 0; 6590 6591 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 6592 return (EPERM); 6593 if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) { 6594 return (EINVAL); 6595 } 6596 6597 /* . and .. have special meaning in the protocol, reject them. */ 6598 6599 if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0'))) 6600 return (EISDIR); 6601 6602 drp = VTOR4(dvp); 6603 6604 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 6605 return (EINTR); 6606 6607 top: 6608 /* 6609 * We make a copy of the attributes because the caller does not 6610 * expect us to change what va points to. 6611 */ 6612 vattr = *va; 6613 6614 /* 6615 * If the pathname is "", then dvp is the root vnode of 6616 * a remote file mounted over a local directory. 6617 * All that needs to be done is access 6618 * checking and truncation. Note that we avoid doing 6619 * open w/ create because the parent directory might 6620 * be in pseudo-fs and the open would fail. 6621 */ 6622 if (*nm == '\0') { 6623 error = 0; 6624 VN_HOLD(dvp); 6625 vp = dvp; 6626 must_trunc = TRUE; 6627 } else { 6628 /* 6629 * We need to go over the wire, just to be sure whether the 6630 * file exists or not. Using the DNLC can be dangerous in 6631 * this case when making a decision regarding existence. 6632 */ 6633 error = nfs4lookup(dvp, nm, &vp, cr, 1); 6634 } 6635 6636 if (exclusive) 6637 createmode = EXCLUSIVE4; 6638 else 6639 createmode = GUARDED4; 6640 6641 /* 6642 * error would be set if the file does not exist on the 6643 * server, so lets go create it. 6644 */ 6645 if (error) { 6646 goto create_otw; 6647 } 6648 6649 /* 6650 * File does exist on the server 6651 */ 6652 if (exclusive == EXCL) 6653 error = EEXIST; 6654 else if (vp->v_type == VDIR && (mode & VWRITE)) 6655 error = EISDIR; 6656 else { 6657 /* 6658 * If vnode is a device, create special vnode. 6659 */ 6660 if (ISVDEV(vp->v_type)) { 6661 tempvp = vp; 6662 vp = specvp(vp, vp->v_rdev, vp->v_type, cr); 6663 VN_RELE(tempvp); 6664 } 6665 if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) { 6666 if ((vattr.va_mask & AT_SIZE) && 6667 vp->v_type == VREG) { 6668 rp = VTOR4(vp); 6669 /* 6670 * Check here for large file handled 6671 * by LF-unaware process (as 6672 * ufs_create() does) 6673 */ 6674 if (!(flags & FOFFMAX)) { 6675 mutex_enter(&rp->r_statelock); 6676 if (rp->r_size > MAXOFF32_T) 6677 error = EOVERFLOW; 6678 mutex_exit(&rp->r_statelock); 6679 } 6680 6681 /* if error is set then we need to return */ 6682 if (error) { 6683 nfs_rw_exit(&drp->r_rwlock); 6684 VN_RELE(vp); 6685 return (error); 6686 } 6687 6688 if (must_trunc) { 6689 vattr.va_mask = AT_SIZE; 6690 error = nfs4setattr(vp, &vattr, 0, cr, 6691 NULL); 6692 } else { 6693 /* 6694 * we know we have a regular file that already 6695 * exists and we may end up truncating the file 6696 * as a result of the open_otw, so flush out 6697 * any dirty pages for this file first. 6698 */ 6699 if (nfs4_has_pages(vp) && 6700 ((rp->r_flags & R4DIRTY) || 6701 rp->r_count > 0 || 6702 rp->r_mapcnt > 0)) { 6703 error = nfs4_putpage(vp, 6704 (offset_t)0, 0, 0, cr, ct); 6705 if (error && (error == ENOSPC || 6706 error == EDQUOT)) { 6707 mutex_enter( 6708 &rp->r_statelock); 6709 if (!rp->r_error) 6710 rp->r_error = 6711 error; 6712 mutex_exit( 6713 &rp->r_statelock); 6714 } 6715 } 6716 vattr.va_mask = (AT_SIZE | 6717 AT_TYPE | AT_MODE); 6718 vattr.va_type = VREG; 6719 createmode = UNCHECKED4; 6720 truncating = 1; 6721 goto create_otw; 6722 } 6723 } 6724 } 6725 } 6726 nfs_rw_exit(&drp->r_rwlock); 6727 if (error) { 6728 VN_RELE(vp); 6729 } else { 6730 vnode_t *tvp; 6731 rnode4_t *trp; 6732 tvp = vp; 6733 if (vp->v_type == VREG) { 6734 trp = VTOR4(vp); 6735 if (IS_SHADOW(vp, trp)) 6736 tvp = RTOV4(trp); 6737 } 6738 6739 if (must_trunc) { 6740 /* 6741 * existing file got truncated, notify. 6742 */ 6743 vnevent_create(tvp, ct); 6744 } 6745 6746 *vpp = vp; 6747 } 6748 return (error); 6749 6750 create_otw: 6751 dnlc_remove(dvp, nm); 6752 6753 ASSERT(vattr.va_mask & AT_TYPE); 6754 6755 /* 6756 * If not a regular file let nfs4mknod() handle it. 6757 */ 6758 if (vattr.va_type != VREG) { 6759 error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr); 6760 nfs_rw_exit(&drp->r_rwlock); 6761 return (error); 6762 } 6763 6764 /* 6765 * It _is_ a regular file. 6766 */ 6767 ASSERT(vattr.va_mask & AT_MODE); 6768 if (MANDMODE(vattr.va_mode)) { 6769 nfs_rw_exit(&drp->r_rwlock); 6770 return (EACCES); 6771 } 6772 6773 /* 6774 * If this happens to be a mknod of a regular file, then flags will 6775 * have neither FREAD or FWRITE. However, we must set at least one 6776 * for the call to nfs4open_otw. If it's open(O_CREAT) driving 6777 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been 6778 * set (based on openmode specified by app). 6779 */ 6780 if ((flags & (FREAD|FWRITE)) == 0) 6781 flags |= (FREAD|FWRITE); 6782 6783 error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0); 6784 6785 if (vp != NULL) { 6786 /* if create was successful, throw away the file's pages */ 6787 if (!error && (vattr.va_mask & AT_SIZE)) 6788 nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK), 6789 cr); 6790 /* release the lookup hold */ 6791 VN_RELE(vp); 6792 vp = NULL; 6793 } 6794 6795 /* 6796 * validate that we opened a regular file. This handles a misbehaving 6797 * server that returns an incorrect FH. 6798 */ 6799 if ((error == 0) && *vpp && (*vpp)->v_type != VREG) { 6800 error = EISDIR; 6801 VN_RELE(*vpp); 6802 } 6803 6804 /* 6805 * If this is not an exclusive create, then the CREATE 6806 * request will be made with the GUARDED mode set. This 6807 * means that the server will return EEXIST if the file 6808 * exists. The file could exist because of a retransmitted 6809 * request. In this case, we recover by starting over and 6810 * checking to see whether the file exists. This second 6811 * time through it should and a CREATE request will not be 6812 * sent. 6813 * 6814 * This handles the problem of a dangling CREATE request 6815 * which contains attributes which indicate that the file 6816 * should be truncated. This retransmitted request could 6817 * possibly truncate valid data in the file if not caught 6818 * by the duplicate request mechanism on the server or if 6819 * not caught by other means. The scenario is: 6820 * 6821 * Client transmits CREATE request with size = 0 6822 * Client times out, retransmits request. 6823 * Response to the first request arrives from the server 6824 * and the client proceeds on. 6825 * Client writes data to the file. 6826 * The server now processes retransmitted CREATE request 6827 * and truncates file. 6828 * 6829 * The use of the GUARDED CREATE request prevents this from 6830 * happening because the retransmitted CREATE would fail 6831 * with EEXIST and would not truncate the file. 6832 */ 6833 if (error == EEXIST && exclusive == NONEXCL) { 6834 #ifdef DEBUG 6835 nfs4_create_misses++; 6836 #endif 6837 goto top; 6838 } 6839 nfs_rw_exit(&drp->r_rwlock); 6840 if (truncating && !error && *vpp) { 6841 vnode_t *tvp; 6842 rnode4_t *trp; 6843 /* 6844 * existing file got truncated, notify. 6845 */ 6846 tvp = *vpp; 6847 trp = VTOR4(tvp); 6848 if (IS_SHADOW(tvp, trp)) 6849 tvp = RTOV4(trp); 6850 vnevent_create(tvp, ct); 6851 } 6852 return (error); 6853 } 6854 6855 /* 6856 * Create compound (for mkdir, mknod, symlink): 6857 * { Putfh <dfh>; Create; Getfh; Getattr } 6858 * It's okay if setattr failed to set gid - this is not considered 6859 * an error, but purge attrs in that case. 6860 */ 6861 static int 6862 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va, 6863 vnode_t **vpp, cred_t *cr, nfs_ftype4 type) 6864 { 6865 int need_end_op = FALSE; 6866 COMPOUND4args_clnt args; 6867 COMPOUND4res_clnt res, *resp = NULL; 6868 nfs_argop4 *argop; 6869 nfs_resop4 *resop; 6870 int doqueue; 6871 mntinfo4_t *mi; 6872 rnode4_t *drp = VTOR4(dvp); 6873 change_info4 *cinfo; 6874 GETFH4res *gf_res; 6875 struct vattr vattr; 6876 vnode_t *vp; 6877 fattr4 *crattr; 6878 bool_t needrecov = FALSE; 6879 nfs4_recov_state_t recov_state; 6880 nfs4_sharedfh_t *sfhp = NULL; 6881 hrtime_t t; 6882 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 6883 int numops, argoplist_size, setgid_flag, idx_create, idx_fattr; 6884 dirattr_info_t dinfo, *dinfop; 6885 servinfo4_t *svp; 6886 bitmap4 supp_attrs; 6887 6888 ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK || 6889 type == NF4CHR || type == NF4SOCK || type == NF4FIFO); 6890 6891 mi = VTOMI4(dvp); 6892 6893 /* 6894 * Make sure we properly deal with setting the right gid 6895 * on a new directory to reflect the parent's setgid bit 6896 */ 6897 setgid_flag = 0; 6898 if (type == NF4DIR) { 6899 struct vattr dva; 6900 6901 va->va_mode &= ~VSGID; 6902 dva.va_mask = AT_MODE | AT_GID; 6903 if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) { 6904 6905 /* 6906 * If the parent's directory has the setgid bit set 6907 * _and_ the client was able to get a valid mapping 6908 * for the parent dir's owner_group, we want to 6909 * append NVERIFY(owner_group == dva.va_gid) and 6910 * SETTATTR to the CREATE compound. 6911 */ 6912 if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) { 6913 setgid_flag = 1; 6914 va->va_mode |= VSGID; 6915 if (dva.va_gid != GID_NOBODY) { 6916 va->va_mask |= AT_GID; 6917 va->va_gid = dva.va_gid; 6918 } 6919 } 6920 } 6921 } 6922 6923 /* 6924 * Create ops: 6925 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new) 6926 * 5:restorefh(dir) 6:getattr(dir) 6927 * 6928 * if (setgid) 6929 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new) 6930 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 6931 * 8:nverify 9:setattr 6932 */ 6933 if (setgid_flag) { 6934 numops = 10; 6935 idx_create = 1; 6936 idx_fattr = 3; 6937 } else { 6938 numops = 7; 6939 idx_create = 2; 6940 idx_fattr = 4; 6941 } 6942 6943 ASSERT(nfs_zone() == mi->mi_zone); 6944 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) { 6945 return (EINTR); 6946 } 6947 recov_state.rs_flags = 0; 6948 recov_state.rs_num_retry_despite_err = 0; 6949 6950 argoplist_size = numops * sizeof (nfs_argop4); 6951 argop = kmem_alloc(argoplist_size, KM_SLEEP); 6952 6953 recov_retry: 6954 if (type == NF4LNK) 6955 args.ctag = TAG_SYMLINK; 6956 else if (type == NF4DIR) 6957 args.ctag = TAG_MKDIR; 6958 else 6959 args.ctag = TAG_MKNOD; 6960 6961 args.array_len = numops; 6962 args.array = argop; 6963 6964 if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) { 6965 nfs_rw_exit(&drp->r_rwlock); 6966 kmem_free(argop, argoplist_size); 6967 return (e.error); 6968 } 6969 need_end_op = TRUE; 6970 6971 6972 /* 0: putfh directory */ 6973 argop[0].argop = OP_CPUTFH; 6974 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6975 6976 /* 1/2: Create object */ 6977 argop[idx_create].argop = OP_CCREATE; 6978 argop[idx_create].nfs_argop4_u.opccreate.cname = nm; 6979 argop[idx_create].nfs_argop4_u.opccreate.type = type; 6980 if (type == NF4LNK) { 6981 /* 6982 * symlink, treat name as data 6983 */ 6984 ASSERT(data != NULL); 6985 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata = 6986 (char *)data; 6987 } 6988 if (type == NF4BLK || type == NF4CHR) { 6989 ASSERT(data != NULL); 6990 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata = 6991 *((specdata4 *)data); 6992 } 6993 6994 crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs; 6995 6996 svp = drp->r_server; 6997 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 6998 supp_attrs = svp->sv_supp_attrs; 6999 nfs_rw_exit(&svp->sv_lock); 7000 7001 if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) { 7002 nfs_rw_exit(&drp->r_rwlock); 7003 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 7004 e.error = EINVAL; 7005 kmem_free(argop, argoplist_size); 7006 return (e.error); 7007 } 7008 7009 /* 2/3: getfh fh of created object */ 7010 ASSERT(idx_create + 1 == idx_fattr - 1); 7011 argop[idx_create + 1].argop = OP_GETFH; 7012 7013 /* 3/4: getattr of new object */ 7014 argop[idx_fattr].argop = OP_GETATTR; 7015 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7016 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi; 7017 7018 if (setgid_flag) { 7019 vattr_t _v; 7020 7021 argop[4].argop = OP_SAVEFH; 7022 7023 argop[5].argop = OP_CPUTFH; 7024 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 7025 7026 argop[6].argop = OP_GETATTR; 7027 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7028 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7029 7030 argop[7].argop = OP_RESTOREFH; 7031 7032 /* 7033 * nverify 7034 * 7035 * XXX - Revisit the last argument to nfs4_end_op() 7036 * once 5020486 is fixed. 7037 */ 7038 _v.va_mask = AT_GID; 7039 _v.va_gid = va->va_gid; 7040 if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 7041 supp_attrs)) { 7042 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 7043 nfs_rw_exit(&drp->r_rwlock); 7044 nfs4_fattr4_free(crattr); 7045 kmem_free(argop, argoplist_size); 7046 return (e.error); 7047 } 7048 7049 /* 7050 * setattr 7051 * 7052 * We _know_ we're not messing with AT_SIZE or AT_XTIME, 7053 * so no need for stateid or flags. Also we specify NULL 7054 * rp since we're only interested in setting owner_group 7055 * attributes. 7056 */ 7057 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs, 7058 &e.error, 0); 7059 7060 if (e.error) { 7061 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 7062 nfs_rw_exit(&drp->r_rwlock); 7063 nfs4_fattr4_free(crattr); 7064 nfs4args_verify_free(&argop[8]); 7065 kmem_free(argop, argoplist_size); 7066 return (e.error); 7067 } 7068 } else { 7069 argop[1].argop = OP_SAVEFH; 7070 7071 argop[5].argop = OP_RESTOREFH; 7072 7073 argop[6].argop = OP_GETATTR; 7074 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7075 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7076 } 7077 7078 dnlc_remove(dvp, nm); 7079 7080 doqueue = 1; 7081 t = gethrtime(); 7082 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7083 7084 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7085 if (e.error) { 7086 PURGE_ATTRCACHE4(dvp); 7087 if (!needrecov) 7088 goto out; 7089 } 7090 7091 if (needrecov) { 7092 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 7093 OP_CREATE, NULL, NULL, NULL) == FALSE) { 7094 nfs4_end_op(mi, dvp, NULL, &recov_state, 7095 needrecov); 7096 need_end_op = FALSE; 7097 nfs4_fattr4_free(crattr); 7098 if (setgid_flag) { 7099 nfs4args_verify_free(&argop[8]); 7100 nfs4args_setattr_free(&argop[9]); 7101 } 7102 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 7103 goto recov_retry; 7104 } 7105 } 7106 7107 resp = &res; 7108 7109 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 7110 7111 if (res.status == NFS4ERR_BADOWNER) 7112 nfs4_log_badowner(mi, OP_CREATE); 7113 7114 e.error = geterrno4(res.status); 7115 7116 /* 7117 * This check is left over from when create was implemented 7118 * using a setattr op (instead of createattrs). If the 7119 * putfh/create/getfh failed, the error was returned. If 7120 * setattr/getattr failed, we keep going. 7121 * 7122 * It might be better to get rid of the GETFH also, and just 7123 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory. 7124 * Then if any of the operations failed, we could return the 7125 * error now, and remove much of the error code below. 7126 */ 7127 if (res.array_len <= idx_fattr) { 7128 /* 7129 * Either Putfh, Create or Getfh failed. 7130 */ 7131 PURGE_ATTRCACHE4(dvp); 7132 /* 7133 * nfs4_purge_stale_fh() may generate otw calls through 7134 * nfs4_invalidate_pages. Hence the need to call 7135 * nfs4_end_op() here to avoid nfs4_start_op() deadlock. 7136 */ 7137 nfs4_end_op(mi, dvp, NULL, &recov_state, 7138 needrecov); 7139 need_end_op = FALSE; 7140 nfs4_purge_stale_fh(e.error, dvp, cr); 7141 goto out; 7142 } 7143 } 7144 7145 resop = &res.array[idx_create]; /* create res */ 7146 cinfo = &resop->nfs_resop4_u.opcreate.cinfo; 7147 7148 resop = &res.array[idx_create + 1]; /* getfh res */ 7149 gf_res = &resop->nfs_resop4_u.opgetfh; 7150 7151 sfhp = sfh4_get(&gf_res->object, mi); 7152 if (e.error) { 7153 *vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp, 7154 fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 7155 if (vp->v_type == VNON) { 7156 vattr.va_mask = AT_TYPE; 7157 /* 7158 * Need to call nfs4_end_op before nfs4getattr to avoid 7159 * potential nfs4_start_op deadlock. See RFE 4777612. 7160 */ 7161 nfs4_end_op(mi, dvp, NULL, &recov_state, 7162 needrecov); 7163 need_end_op = FALSE; 7164 e.error = nfs4getattr(vp, &vattr, cr); 7165 if (e.error) { 7166 VN_RELE(vp); 7167 *vpp = NULL; 7168 goto out; 7169 } 7170 vp->v_type = vattr.va_type; 7171 } 7172 e.error = 0; 7173 } else { 7174 *vpp = vp = makenfs4node(sfhp, 7175 &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res, 7176 dvp->v_vfsp, t, cr, 7177 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 7178 } 7179 7180 /* 7181 * If compound succeeded, then update dir attrs 7182 */ 7183 if (res.status == NFS4_OK) { 7184 dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 7185 dinfo.di_cred = cr; 7186 dinfo.di_time_call = t; 7187 dinfop = &dinfo; 7188 } else 7189 dinfop = NULL; 7190 7191 /* Update directory cache attribute, readdir and dnlc caches */ 7192 nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop); 7193 7194 out: 7195 if (sfhp != NULL) 7196 sfh4_rele(&sfhp); 7197 nfs_rw_exit(&drp->r_rwlock); 7198 nfs4_fattr4_free(crattr); 7199 if (setgid_flag) { 7200 nfs4args_verify_free(&argop[8]); 7201 nfs4args_setattr_free(&argop[9]); 7202 } 7203 if (resp) 7204 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7205 if (need_end_op) 7206 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 7207 7208 kmem_free(argop, argoplist_size); 7209 return (e.error); 7210 } 7211 7212 /* ARGSUSED */ 7213 static int 7214 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 7215 int mode, vnode_t **vpp, cred_t *cr) 7216 { 7217 int error; 7218 vnode_t *vp; 7219 nfs_ftype4 type; 7220 specdata4 spec, *specp = NULL; 7221 7222 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 7223 7224 switch (va->va_type) { 7225 case VCHR: 7226 case VBLK: 7227 type = (va->va_type == VCHR) ? NF4CHR : NF4BLK; 7228 spec.specdata1 = getmajor(va->va_rdev); 7229 spec.specdata2 = getminor(va->va_rdev); 7230 specp = &spec; 7231 break; 7232 7233 case VFIFO: 7234 type = NF4FIFO; 7235 break; 7236 case VSOCK: 7237 type = NF4SOCK; 7238 break; 7239 7240 default: 7241 return (EINVAL); 7242 } 7243 7244 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type); 7245 if (error) { 7246 return (error); 7247 } 7248 7249 /* 7250 * This might not be needed any more; special case to deal 7251 * with problematic v2/v3 servers. Since create was unable 7252 * to set group correctly, not sure what hope setattr has. 7253 */ 7254 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) { 7255 va->va_mask = AT_GID; 7256 (void) nfs4setattr(vp, va, 0, cr, NULL); 7257 } 7258 7259 /* 7260 * If vnode is a device create special vnode 7261 */ 7262 if (ISVDEV(vp->v_type)) { 7263 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 7264 VN_RELE(vp); 7265 } else { 7266 *vpp = vp; 7267 } 7268 return (error); 7269 } 7270 7271 /* 7272 * Remove requires that the current fh be the target directory. 7273 * After the operation, the current fh is unchanged. 7274 * The compound op structure is: 7275 * PUTFH(targetdir), REMOVE 7276 * 7277 * Weirdness: if the vnode to be removed is open 7278 * we rename it instead of removing it and nfs_inactive 7279 * will remove the new name. 7280 */ 7281 /* ARGSUSED */ 7282 static int 7283 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags) 7284 { 7285 COMPOUND4args_clnt args; 7286 COMPOUND4res_clnt res, *resp = NULL; 7287 REMOVE4res *rm_res; 7288 nfs_argop4 argop[3]; 7289 nfs_resop4 *resop; 7290 vnode_t *vp; 7291 char *tmpname; 7292 int doqueue; 7293 mntinfo4_t *mi; 7294 rnode4_t *rp; 7295 rnode4_t *drp; 7296 int needrecov = 0; 7297 nfs4_recov_state_t recov_state; 7298 int isopen; 7299 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7300 dirattr_info_t dinfo; 7301 7302 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 7303 return (EPERM); 7304 drp = VTOR4(dvp); 7305 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 7306 return (EINTR); 7307 7308 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 7309 if (e.error) { 7310 nfs_rw_exit(&drp->r_rwlock); 7311 return (e.error); 7312 } 7313 7314 if (vp->v_type == VDIR) { 7315 VN_RELE(vp); 7316 nfs_rw_exit(&drp->r_rwlock); 7317 return (EISDIR); 7318 } 7319 7320 /* 7321 * First just remove the entry from the name cache, as it 7322 * is most likely the only entry for this vp. 7323 */ 7324 dnlc_remove(dvp, nm); 7325 7326 rp = VTOR4(vp); 7327 7328 /* 7329 * For regular file types, check to see if the file is open by looking 7330 * at the open streams. 7331 * For all other types, check the reference count on the vnode. Since 7332 * they are not opened OTW they never have an open stream. 7333 * 7334 * If the file is open, rename it to .nfsXXXX. 7335 */ 7336 if (vp->v_type != VREG) { 7337 /* 7338 * If the file has a v_count > 1 then there may be more than one 7339 * entry in the name cache due multiple links or an open file, 7340 * but we don't have the real reference count so flush all 7341 * possible entries. 7342 */ 7343 if (vp->v_count > 1) 7344 dnlc_purge_vp(vp); 7345 7346 /* 7347 * Now we have the real reference count. 7348 */ 7349 isopen = vp->v_count > 1; 7350 } else { 7351 mutex_enter(&rp->r_os_lock); 7352 isopen = list_head(&rp->r_open_streams) != NULL; 7353 mutex_exit(&rp->r_os_lock); 7354 } 7355 7356 mutex_enter(&rp->r_statelock); 7357 if (isopen && 7358 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) { 7359 mutex_exit(&rp->r_statelock); 7360 tmpname = newname(); 7361 e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct); 7362 if (e.error) 7363 kmem_free(tmpname, MAXNAMELEN); 7364 else { 7365 mutex_enter(&rp->r_statelock); 7366 if (rp->r_unldvp == NULL) { 7367 VN_HOLD(dvp); 7368 rp->r_unldvp = dvp; 7369 if (rp->r_unlcred != NULL) 7370 crfree(rp->r_unlcred); 7371 crhold(cr); 7372 rp->r_unlcred = cr; 7373 rp->r_unlname = tmpname; 7374 } else { 7375 kmem_free(rp->r_unlname, MAXNAMELEN); 7376 rp->r_unlname = tmpname; 7377 } 7378 mutex_exit(&rp->r_statelock); 7379 } 7380 VN_RELE(vp); 7381 nfs_rw_exit(&drp->r_rwlock); 7382 return (e.error); 7383 } 7384 /* 7385 * Actually remove the file/dir 7386 */ 7387 mutex_exit(&rp->r_statelock); 7388 7389 /* 7390 * We need to flush any dirty pages which happen to 7391 * be hanging around before removing the file. 7392 * This shouldn't happen very often since in NFSv4 7393 * we should be close to open consistent. 7394 */ 7395 if (nfs4_has_pages(vp) && 7396 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 7397 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct); 7398 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 7399 mutex_enter(&rp->r_statelock); 7400 if (!rp->r_error) 7401 rp->r_error = e.error; 7402 mutex_exit(&rp->r_statelock); 7403 } 7404 } 7405 7406 mi = VTOMI4(dvp); 7407 7408 (void) nfs4delegreturn(rp, NFS4_DR_REOPEN); 7409 recov_state.rs_flags = 0; 7410 recov_state.rs_num_retry_despite_err = 0; 7411 7412 recov_retry: 7413 /* 7414 * Remove ops: putfh dir; remove 7415 */ 7416 args.ctag = TAG_REMOVE; 7417 args.array_len = 3; 7418 args.array = argop; 7419 7420 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 7421 if (e.error) { 7422 nfs_rw_exit(&drp->r_rwlock); 7423 VN_RELE(vp); 7424 return (e.error); 7425 } 7426 7427 /* putfh directory */ 7428 argop[0].argop = OP_CPUTFH; 7429 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 7430 7431 /* remove */ 7432 argop[1].argop = OP_CREMOVE; 7433 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 7434 7435 /* getattr dir */ 7436 argop[2].argop = OP_GETATTR; 7437 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7438 argop[2].nfs_argop4_u.opgetattr.mi = mi; 7439 7440 doqueue = 1; 7441 dinfo.di_time_call = gethrtime(); 7442 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7443 7444 PURGE_ATTRCACHE4(vp); 7445 7446 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7447 if (e.error) 7448 PURGE_ATTRCACHE4(dvp); 7449 7450 if (needrecov) { 7451 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, 7452 NULL, NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) { 7453 if (!e.error) 7454 (void) xdr_free(xdr_COMPOUND4res_clnt, 7455 (caddr_t)&res); 7456 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 7457 needrecov); 7458 goto recov_retry; 7459 } 7460 } 7461 7462 /* 7463 * Matching nfs4_end_op() for start_op() above. 7464 * There is a path in the code below which calls 7465 * nfs4_purge_stale_fh(), which may generate otw calls through 7466 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 7467 * here to avoid nfs4_start_op() deadlock. 7468 */ 7469 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 7470 7471 if (!e.error) { 7472 resp = &res; 7473 7474 if (res.status) { 7475 e.error = geterrno4(res.status); 7476 PURGE_ATTRCACHE4(dvp); 7477 nfs4_purge_stale_fh(e.error, dvp, cr); 7478 } else { 7479 resop = &res.array[1]; /* remove res */ 7480 rm_res = &resop->nfs_resop4_u.opremove; 7481 7482 dinfo.di_garp = 7483 &res.array[2].nfs_resop4_u.opgetattr.ga_res; 7484 dinfo.di_cred = cr; 7485 7486 /* Update directory attr, readdir and dnlc caches */ 7487 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 7488 &dinfo); 7489 } 7490 } 7491 nfs_rw_exit(&drp->r_rwlock); 7492 if (resp) 7493 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7494 7495 if (e.error == 0) { 7496 vnode_t *tvp; 7497 rnode4_t *trp; 7498 trp = VTOR4(vp); 7499 tvp = vp; 7500 if (IS_SHADOW(vp, trp)) 7501 tvp = RTOV4(trp); 7502 vnevent_remove(tvp, dvp, nm, ct); 7503 } 7504 VN_RELE(vp); 7505 return (e.error); 7506 } 7507 7508 /* 7509 * Link requires that the current fh be the target directory and the 7510 * saved fh be the source fh. After the operation, the current fh is unchanged. 7511 * Thus the compound op structure is: 7512 * PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, RESTOREFH, 7513 * GETATTR(file) 7514 */ 7515 /* ARGSUSED */ 7516 static int 7517 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr, 7518 caller_context_t *ct, int flags) 7519 { 7520 COMPOUND4args_clnt args; 7521 COMPOUND4res_clnt res, *resp = NULL; 7522 LINK4res *ln_res; 7523 int argoplist_size = 7 * sizeof (nfs_argop4); 7524 nfs_argop4 *argop; 7525 nfs_resop4 *resop; 7526 vnode_t *realvp, *nvp; 7527 int doqueue; 7528 mntinfo4_t *mi; 7529 rnode4_t *tdrp; 7530 bool_t needrecov = FALSE; 7531 nfs4_recov_state_t recov_state; 7532 hrtime_t t; 7533 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7534 dirattr_info_t dinfo; 7535 7536 ASSERT(*tnm != '\0'); 7537 ASSERT(tdvp->v_type == VDIR); 7538 ASSERT(nfs4_consistent_type(tdvp)); 7539 ASSERT(nfs4_consistent_type(svp)); 7540 7541 if (nfs_zone() != VTOMI4(tdvp)->mi_zone) 7542 return (EPERM); 7543 if (VOP_REALVP(svp, &realvp, ct) == 0) { 7544 svp = realvp; 7545 ASSERT(nfs4_consistent_type(svp)); 7546 } 7547 7548 tdrp = VTOR4(tdvp); 7549 mi = VTOMI4(svp); 7550 7551 if (!(mi->mi_flags & MI4_LINK)) { 7552 return (EOPNOTSUPP); 7553 } 7554 recov_state.rs_flags = 0; 7555 recov_state.rs_num_retry_despite_err = 0; 7556 7557 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp))) 7558 return (EINTR); 7559 7560 recov_retry: 7561 argop = kmem_alloc(argoplist_size, KM_SLEEP); 7562 7563 args.ctag = TAG_LINK; 7564 7565 /* 7566 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir); 7567 * restorefh; getattr(fl) 7568 */ 7569 args.array_len = 7; 7570 args.array = argop; 7571 7572 e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state); 7573 if (e.error) { 7574 kmem_free(argop, argoplist_size); 7575 nfs_rw_exit(&tdrp->r_rwlock); 7576 return (e.error); 7577 } 7578 7579 /* 0. putfh file */ 7580 argop[0].argop = OP_CPUTFH; 7581 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh; 7582 7583 /* 1. save current fh to free up the space for the dir */ 7584 argop[1].argop = OP_SAVEFH; 7585 7586 /* 2. putfh targetdir */ 7587 argop[2].argop = OP_CPUTFH; 7588 argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh; 7589 7590 /* 3. link: current_fh is targetdir, saved_fh is source */ 7591 argop[3].argop = OP_CLINK; 7592 argop[3].nfs_argop4_u.opclink.cnewname = tnm; 7593 7594 /* 4. Get attributes of dir */ 7595 argop[4].argop = OP_GETATTR; 7596 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7597 argop[4].nfs_argop4_u.opgetattr.mi = mi; 7598 7599 /* 5. If link was successful, restore current vp to file */ 7600 argop[5].argop = OP_RESTOREFH; 7601 7602 /* 6. Get attributes of linked object */ 7603 argop[6].argop = OP_GETATTR; 7604 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7605 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7606 7607 dnlc_remove(tdvp, tnm); 7608 7609 doqueue = 1; 7610 t = gethrtime(); 7611 7612 rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e); 7613 7614 needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp); 7615 if (e.error != 0 && !needrecov) { 7616 PURGE_ATTRCACHE4(tdvp); 7617 PURGE_ATTRCACHE4(svp); 7618 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7619 goto out; 7620 } 7621 7622 if (needrecov) { 7623 bool_t abort; 7624 7625 abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp, 7626 NULL, NULL, OP_LINK, NULL, NULL, NULL); 7627 if (abort == FALSE) { 7628 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, 7629 needrecov); 7630 kmem_free(argop, argoplist_size); 7631 if (!e.error) 7632 (void) xdr_free(xdr_COMPOUND4res_clnt, 7633 (caddr_t)&res); 7634 goto recov_retry; 7635 } else { 7636 if (e.error != 0) { 7637 PURGE_ATTRCACHE4(tdvp); 7638 PURGE_ATTRCACHE4(svp); 7639 nfs4_end_op(VTOMI4(svp), svp, tdvp, 7640 &recov_state, needrecov); 7641 goto out; 7642 } 7643 /* fall through for res.status case */ 7644 } 7645 } 7646 7647 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7648 7649 resp = &res; 7650 if (res.status) { 7651 /* If link succeeded, then don't return error */ 7652 e.error = geterrno4(res.status); 7653 if (res.array_len <= 4) { 7654 /* 7655 * Either Putfh, Savefh, Putfh dir, or Link failed 7656 */ 7657 PURGE_ATTRCACHE4(svp); 7658 PURGE_ATTRCACHE4(tdvp); 7659 if (e.error == EOPNOTSUPP) { 7660 mutex_enter(&mi->mi_lock); 7661 mi->mi_flags &= ~MI4_LINK; 7662 mutex_exit(&mi->mi_lock); 7663 } 7664 /* Remap EISDIR to EPERM for non-root user for SVVS */ 7665 /* XXX-LP */ 7666 if (e.error == EISDIR && crgetuid(cr) != 0) 7667 e.error = EPERM; 7668 goto out; 7669 } 7670 } 7671 7672 /* either no error or one of the postop getattr failed */ 7673 7674 /* 7675 * XXX - if LINK succeeded, but no attrs were returned for link 7676 * file, purge its cache. 7677 * 7678 * XXX Perform a simplified version of wcc checking. Instead of 7679 * have another getattr to get pre-op, just purge cache if 7680 * any of the ops prior to and including the getattr failed. 7681 * If the getattr succeeded then update the attrcache accordingly. 7682 */ 7683 7684 /* 7685 * update cache with link file postattrs. 7686 * Note: at this point resop points to link res. 7687 */ 7688 resop = &res.array[3]; /* link res */ 7689 ln_res = &resop->nfs_resop4_u.oplink; 7690 if (res.status == NFS4_OK) 7691 e.error = nfs4_update_attrcache(res.status, 7692 &res.array[6].nfs_resop4_u.opgetattr.ga_res, 7693 t, svp, cr); 7694 7695 /* 7696 * Call makenfs4node to create the new shadow vp for tnm. 7697 * We pass NULL attrs because we just cached attrs for 7698 * the src object. All we're trying to accomplish is to 7699 * to create the new shadow vnode. 7700 */ 7701 nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr, 7702 tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm, VTOR4(svp)->r_fh)); 7703 7704 /* Update target cache attribute, readdir and dnlc caches */ 7705 dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 7706 dinfo.di_time_call = t; 7707 dinfo.di_cred = cr; 7708 7709 nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo); 7710 ASSERT(nfs4_consistent_type(tdvp)); 7711 ASSERT(nfs4_consistent_type(svp)); 7712 ASSERT(nfs4_consistent_type(nvp)); 7713 VN_RELE(nvp); 7714 7715 if (!e.error) { 7716 vnode_t *tvp; 7717 rnode4_t *trp; 7718 /* 7719 * Notify the source file of this link operation. 7720 */ 7721 trp = VTOR4(svp); 7722 tvp = svp; 7723 if (IS_SHADOW(svp, trp)) 7724 tvp = RTOV4(trp); 7725 vnevent_link(tvp, ct); 7726 } 7727 out: 7728 kmem_free(argop, argoplist_size); 7729 if (resp) 7730 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7731 7732 nfs_rw_exit(&tdrp->r_rwlock); 7733 7734 return (e.error); 7735 } 7736 7737 /* ARGSUSED */ 7738 static int 7739 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 7740 caller_context_t *ct, int flags) 7741 { 7742 vnode_t *realvp; 7743 7744 if (nfs_zone() != VTOMI4(odvp)->mi_zone) 7745 return (EPERM); 7746 if (VOP_REALVP(ndvp, &realvp, ct) == 0) 7747 ndvp = realvp; 7748 7749 return (nfs4rename(odvp, onm, ndvp, nnm, cr, ct)); 7750 } 7751 7752 /* 7753 * nfs4rename does the real work of renaming in NFS Version 4. 7754 * 7755 * A file handle is considered volatile for renaming purposes if either 7756 * of the volatile bits are turned on. However, the compound may differ 7757 * based on the likelihood of the filehandle to change during rename. 7758 */ 7759 static int 7760 nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 7761 caller_context_t *ct) 7762 { 7763 int error; 7764 mntinfo4_t *mi; 7765 vnode_t *nvp = NULL; 7766 vnode_t *ovp = NULL; 7767 char *tmpname = NULL; 7768 rnode4_t *rp; 7769 rnode4_t *odrp; 7770 rnode4_t *ndrp; 7771 int did_link = 0; 7772 int do_link = 1; 7773 nfsstat4 stat = NFS4_OK; 7774 7775 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 7776 ASSERT(nfs4_consistent_type(odvp)); 7777 ASSERT(nfs4_consistent_type(ndvp)); 7778 7779 if (onm[0] == '.' && (onm[1] == '\0' || 7780 (onm[1] == '.' && onm[2] == '\0'))) 7781 return (EINVAL); 7782 7783 if (nnm[0] == '.' && (nnm[1] == '\0' || 7784 (nnm[1] == '.' && nnm[2] == '\0'))) 7785 return (EINVAL); 7786 7787 odrp = VTOR4(odvp); 7788 ndrp = VTOR4(ndvp); 7789 if ((intptr_t)odrp < (intptr_t)ndrp) { 7790 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) 7791 return (EINTR); 7792 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) { 7793 nfs_rw_exit(&odrp->r_rwlock); 7794 return (EINTR); 7795 } 7796 } else { 7797 if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) 7798 return (EINTR); 7799 if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) { 7800 nfs_rw_exit(&ndrp->r_rwlock); 7801 return (EINTR); 7802 } 7803 } 7804 7805 /* 7806 * Lookup the target file. If it exists, it needs to be 7807 * checked to see whether it is a mount point and whether 7808 * it is active (open). 7809 */ 7810 error = nfs4lookup(ndvp, nnm, &nvp, cr, 0); 7811 if (!error) { 7812 int isactive; 7813 7814 ASSERT(nfs4_consistent_type(nvp)); 7815 /* 7816 * If this file has been mounted on, then just 7817 * return busy because renaming to it would remove 7818 * the mounted file system from the name space. 7819 */ 7820 if (vn_ismntpt(nvp)) { 7821 VN_RELE(nvp); 7822 nfs_rw_exit(&odrp->r_rwlock); 7823 nfs_rw_exit(&ndrp->r_rwlock); 7824 return (EBUSY); 7825 } 7826 7827 /* 7828 * First just remove the entry from the name cache, as it 7829 * is most likely the only entry for this vp. 7830 */ 7831 dnlc_remove(ndvp, nnm); 7832 7833 rp = VTOR4(nvp); 7834 7835 if (nvp->v_type != VREG) { 7836 /* 7837 * Purge the name cache of all references to this vnode 7838 * so that we can check the reference count to infer 7839 * whether it is active or not. 7840 */ 7841 if (nvp->v_count > 1) 7842 dnlc_purge_vp(nvp); 7843 7844 isactive = nvp->v_count > 1; 7845 } else { 7846 mutex_enter(&rp->r_os_lock); 7847 isactive = list_head(&rp->r_open_streams) != NULL; 7848 mutex_exit(&rp->r_os_lock); 7849 } 7850 7851 /* 7852 * If the vnode is active and is not a directory, 7853 * arrange to rename it to a 7854 * temporary file so that it will continue to be 7855 * accessible. This implements the "unlink-open-file" 7856 * semantics for the target of a rename operation. 7857 * Before doing this though, make sure that the 7858 * source and target files are not already the same. 7859 */ 7860 if (isactive && nvp->v_type != VDIR) { 7861 /* 7862 * Lookup the source name. 7863 */ 7864 error = nfs4lookup(odvp, onm, &ovp, cr, 0); 7865 7866 /* 7867 * The source name *should* already exist. 7868 */ 7869 if (error) { 7870 VN_RELE(nvp); 7871 nfs_rw_exit(&odrp->r_rwlock); 7872 nfs_rw_exit(&ndrp->r_rwlock); 7873 return (error); 7874 } 7875 7876 ASSERT(nfs4_consistent_type(ovp)); 7877 7878 /* 7879 * Compare the two vnodes. If they are the same, 7880 * just release all held vnodes and return success. 7881 */ 7882 if (VN_CMP(ovp, nvp)) { 7883 VN_RELE(ovp); 7884 VN_RELE(nvp); 7885 nfs_rw_exit(&odrp->r_rwlock); 7886 nfs_rw_exit(&ndrp->r_rwlock); 7887 return (0); 7888 } 7889 7890 /* 7891 * Can't mix and match directories and non- 7892 * directories in rename operations. We already 7893 * know that the target is not a directory. If 7894 * the source is a directory, return an error. 7895 */ 7896 if (ovp->v_type == VDIR) { 7897 VN_RELE(ovp); 7898 VN_RELE(nvp); 7899 nfs_rw_exit(&odrp->r_rwlock); 7900 nfs_rw_exit(&ndrp->r_rwlock); 7901 return (ENOTDIR); 7902 } 7903 link_call: 7904 /* 7905 * The target file exists, is not the same as 7906 * the source file, and is active. We first 7907 * try to Link it to a temporary filename to 7908 * avoid having the server removing the file 7909 * completely (which could cause data loss to 7910 * the user's POV in the event the Rename fails 7911 * -- see bug 1165874). 7912 */ 7913 /* 7914 * The do_link and did_link booleans are 7915 * introduced in the event we get NFS4ERR_FILE_OPEN 7916 * returned for the Rename. Some servers can 7917 * not Rename over an Open file, so they return 7918 * this error. The client needs to Remove the 7919 * newly created Link and do two Renames, just 7920 * as if the server didn't support LINK. 7921 */ 7922 tmpname = newname(); 7923 error = 0; 7924 7925 if (do_link) { 7926 error = nfs4_link(ndvp, nvp, tmpname, cr, 7927 NULL, 0); 7928 } 7929 if (error == EOPNOTSUPP || !do_link) { 7930 error = nfs4_rename(ndvp, nnm, ndvp, tmpname, 7931 cr, NULL, 0); 7932 did_link = 0; 7933 } else { 7934 did_link = 1; 7935 } 7936 if (error) { 7937 kmem_free(tmpname, MAXNAMELEN); 7938 VN_RELE(ovp); 7939 VN_RELE(nvp); 7940 nfs_rw_exit(&odrp->r_rwlock); 7941 nfs_rw_exit(&ndrp->r_rwlock); 7942 return (error); 7943 } 7944 7945 mutex_enter(&rp->r_statelock); 7946 if (rp->r_unldvp == NULL) { 7947 VN_HOLD(ndvp); 7948 rp->r_unldvp = ndvp; 7949 if (rp->r_unlcred != NULL) 7950 crfree(rp->r_unlcred); 7951 crhold(cr); 7952 rp->r_unlcred = cr; 7953 rp->r_unlname = tmpname; 7954 } else { 7955 if (rp->r_unlname) 7956 kmem_free(rp->r_unlname, MAXNAMELEN); 7957 rp->r_unlname = tmpname; 7958 } 7959 mutex_exit(&rp->r_statelock); 7960 } 7961 7962 (void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN); 7963 7964 ASSERT(nfs4_consistent_type(nvp)); 7965 } 7966 7967 if (ovp == NULL) { 7968 /* 7969 * When renaming directories to be a subdirectory of a 7970 * different parent, the dnlc entry for ".." will no 7971 * longer be valid, so it must be removed. 7972 * 7973 * We do a lookup here to determine whether we are renaming 7974 * a directory and we need to check if we are renaming 7975 * an unlinked file. This might have already been done 7976 * in previous code, so we check ovp == NULL to avoid 7977 * doing it twice. 7978 */ 7979 error = nfs4lookup(odvp, onm, &ovp, cr, 0); 7980 /* 7981 * The source name *should* already exist. 7982 */ 7983 if (error) { 7984 nfs_rw_exit(&odrp->r_rwlock); 7985 nfs_rw_exit(&ndrp->r_rwlock); 7986 if (nvp) { 7987 VN_RELE(nvp); 7988 } 7989 return (error); 7990 } 7991 ASSERT(ovp != NULL); 7992 ASSERT(nfs4_consistent_type(ovp)); 7993 } 7994 7995 /* 7996 * Is the object being renamed a dir, and if so, is 7997 * it being renamed to a child of itself? The underlying 7998 * fs should ultimately return EINVAL for this case; 7999 * however, buggy beta non-Solaris NFSv4 servers at 8000 * interop testing events have allowed this behavior, 8001 * and it caused our client to panic due to a recursive 8002 * mutex_enter in fn_move. 8003 * 8004 * The tedious locking in fn_move could be changed to 8005 * deal with this case, and the client could avoid the 8006 * panic; however, the client would just confuse itself 8007 * later and misbehave. A better way to handle the broken 8008 * server is to detect this condition and return EINVAL 8009 * without ever sending the the bogus rename to the server. 8010 * We know the rename is invalid -- just fail it now. 8011 */ 8012 if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) { 8013 VN_RELE(ovp); 8014 nfs_rw_exit(&odrp->r_rwlock); 8015 nfs_rw_exit(&ndrp->r_rwlock); 8016 if (nvp) { 8017 VN_RELE(nvp); 8018 } 8019 return (EINVAL); 8020 } 8021 8022 (void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN); 8023 8024 /* 8025 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is 8026 * possible for the filehandle to change due to the rename. 8027 * If neither of these bits is set, but FH4_VOL_MIGRATION is set, 8028 * the fh will not change because of the rename, but we still need 8029 * to update its rnode entry with the new name for 8030 * an eventual fh change due to migration. The FH4_NOEXPIRE_ON_OPEN 8031 * has no effect on these for now, but for future improvements, 8032 * we might want to use it too to simplify handling of files 8033 * that are open with that flag on. (XXX) 8034 */ 8035 mi = VTOMI4(odvp); 8036 if (NFS4_VOLATILE_FH(mi)) 8037 error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr, 8038 &stat); 8039 else 8040 error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr, 8041 &stat); 8042 8043 ASSERT(nfs4_consistent_type(odvp)); 8044 ASSERT(nfs4_consistent_type(ndvp)); 8045 ASSERT(nfs4_consistent_type(ovp)); 8046 8047 if (stat == NFS4ERR_FILE_OPEN && did_link) { 8048 do_link = 0; 8049 /* 8050 * Before the 'link_call' code, we did a nfs4_lookup 8051 * that puts a VN_HOLD on nvp. After the nfs4_link 8052 * call we call VN_RELE to match that hold. We need 8053 * to place an additional VN_HOLD here since we will 8054 * be hitting that VN_RELE again. 8055 */ 8056 VN_HOLD(nvp); 8057 8058 (void) nfs4_remove(ndvp, tmpname, cr, NULL, 0); 8059 8060 /* Undo the unlinked file naming stuff we just did */ 8061 mutex_enter(&rp->r_statelock); 8062 if (rp->r_unldvp) { 8063 VN_RELE(ndvp); 8064 rp->r_unldvp = NULL; 8065 if (rp->r_unlcred != NULL) 8066 crfree(rp->r_unlcred); 8067 rp->r_unlcred = NULL; 8068 /* rp->r_unlanme points to tmpname */ 8069 if (rp->r_unlname) 8070 kmem_free(rp->r_unlname, MAXNAMELEN); 8071 rp->r_unlname = NULL; 8072 } 8073 mutex_exit(&rp->r_statelock); 8074 8075 if (nvp) { 8076 VN_RELE(nvp); 8077 } 8078 goto link_call; 8079 } 8080 8081 if (error) { 8082 VN_RELE(ovp); 8083 nfs_rw_exit(&odrp->r_rwlock); 8084 nfs_rw_exit(&ndrp->r_rwlock); 8085 if (nvp) { 8086 VN_RELE(nvp); 8087 } 8088 return (error); 8089 } 8090 8091 /* 8092 * when renaming directories to be a subdirectory of a 8093 * different parent, the dnlc entry for ".." will no 8094 * longer be valid, so it must be removed 8095 */ 8096 rp = VTOR4(ovp); 8097 if (ndvp != odvp) { 8098 if (ovp->v_type == VDIR) { 8099 dnlc_remove(ovp, ".."); 8100 if (rp->r_dir != NULL) 8101 nfs4_purge_rddir_cache(ovp); 8102 } 8103 } 8104 8105 /* 8106 * If we are renaming the unlinked file, update the 8107 * r_unldvp and r_unlname as needed. 8108 */ 8109 mutex_enter(&rp->r_statelock); 8110 if (rp->r_unldvp != NULL) { 8111 if (strcmp(rp->r_unlname, onm) == 0) { 8112 (void) strncpy(rp->r_unlname, nnm, MAXNAMELEN); 8113 rp->r_unlname[MAXNAMELEN - 1] = '\0'; 8114 if (ndvp != rp->r_unldvp) { 8115 VN_RELE(rp->r_unldvp); 8116 rp->r_unldvp = ndvp; 8117 VN_HOLD(ndvp); 8118 } 8119 } 8120 } 8121 mutex_exit(&rp->r_statelock); 8122 8123 /* 8124 * Notify the rename vnevents to source vnode, and to the target 8125 * vnode if it already existed. 8126 */ 8127 if (error == 0) { 8128 vnode_t *tvp; 8129 rnode4_t *trp; 8130 /* 8131 * Notify the vnode. Each links is represented by 8132 * a different vnode, in nfsv4. 8133 */ 8134 if (nvp) { 8135 trp = VTOR4(nvp); 8136 tvp = nvp; 8137 if (IS_SHADOW(nvp, trp)) 8138 tvp = RTOV4(trp); 8139 vnevent_rename_dest(tvp, ndvp, nnm, ct); 8140 } 8141 8142 /* 8143 * if the source and destination directory are not the 8144 * same notify the destination directory. 8145 */ 8146 if (VTOR4(odvp) != VTOR4(ndvp)) { 8147 trp = VTOR4(ndvp); 8148 tvp = ndvp; 8149 if (IS_SHADOW(ndvp, trp)) 8150 tvp = RTOV4(trp); 8151 vnevent_rename_dest_dir(tvp, ct); 8152 } 8153 8154 trp = VTOR4(ovp); 8155 tvp = ovp; 8156 if (IS_SHADOW(ovp, trp)) 8157 tvp = RTOV4(trp); 8158 vnevent_rename_src(tvp, odvp, onm, ct); 8159 } 8160 8161 if (nvp) { 8162 VN_RELE(nvp); 8163 } 8164 VN_RELE(ovp); 8165 8166 nfs_rw_exit(&odrp->r_rwlock); 8167 nfs_rw_exit(&ndrp->r_rwlock); 8168 8169 return (error); 8170 } 8171 8172 /* 8173 * When the parent directory has changed, sv_dfh must be updated 8174 */ 8175 static void 8176 update_parentdir_sfh(vnode_t *vp, vnode_t *ndvp) 8177 { 8178 svnode_t *sv = VTOSV(vp); 8179 nfs4_sharedfh_t *old_dfh = sv->sv_dfh; 8180 nfs4_sharedfh_t *new_dfh = VTOR4(ndvp)->r_fh; 8181 8182 sfh4_hold(new_dfh); 8183 sv->sv_dfh = new_dfh; 8184 sfh4_rele(&old_dfh); 8185 } 8186 8187 /* 8188 * nfs4rename_persistent does the otw portion of renaming in NFS Version 4, 8189 * when it is known that the filehandle is persistent through rename. 8190 * 8191 * Rename requires that the current fh be the target directory and the 8192 * saved fh be the source directory. After the operation, the current fh 8193 * is unchanged. 8194 * The compound op structure for persistent fh rename is: 8195 * PUTFH(sourcdir), SAVEFH, PUTFH(targetdir), RENAME 8196 * Rather than bother with the directory postop args, we'll simply 8197 * update that a change occurred in the cache, so no post-op getattrs. 8198 */ 8199 static int 8200 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp, 8201 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 8202 { 8203 COMPOUND4args_clnt args; 8204 COMPOUND4res_clnt res, *resp = NULL; 8205 nfs_argop4 *argop; 8206 nfs_resop4 *resop; 8207 int doqueue, argoplist_size; 8208 mntinfo4_t *mi; 8209 rnode4_t *odrp = VTOR4(odvp); 8210 rnode4_t *ndrp = VTOR4(ndvp); 8211 RENAME4res *rn_res; 8212 bool_t needrecov; 8213 nfs4_recov_state_t recov_state; 8214 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8215 dirattr_info_t dinfo, *dinfop; 8216 8217 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 8218 8219 recov_state.rs_flags = 0; 8220 recov_state.rs_num_retry_despite_err = 0; 8221 8222 /* 8223 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir 8224 * 8225 * If source/target are different dirs, then append putfh(src); getattr 8226 */ 8227 args.array_len = (odvp == ndvp) ? 5 : 7; 8228 argoplist_size = args.array_len * sizeof (nfs_argop4); 8229 args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP); 8230 8231 recov_retry: 8232 *statp = NFS4_OK; 8233 8234 /* No need to Lookup the file, persistent fh */ 8235 args.ctag = TAG_RENAME; 8236 8237 mi = VTOMI4(odvp); 8238 e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state); 8239 if (e.error) { 8240 kmem_free(argop, argoplist_size); 8241 return (e.error); 8242 } 8243 8244 /* 0: putfh source directory */ 8245 argop[0].argop = OP_CPUTFH; 8246 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8247 8248 /* 1: Save source fh to free up current for target */ 8249 argop[1].argop = OP_SAVEFH; 8250 8251 /* 2: putfh targetdir */ 8252 argop[2].argop = OP_CPUTFH; 8253 argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8254 8255 /* 3: current_fh is targetdir, saved_fh is sourcedir */ 8256 argop[3].argop = OP_CRENAME; 8257 argop[3].nfs_argop4_u.opcrename.coldname = onm; 8258 argop[3].nfs_argop4_u.opcrename.cnewname = nnm; 8259 8260 /* 4: getattr (targetdir) */ 8261 argop[4].argop = OP_GETATTR; 8262 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8263 argop[4].nfs_argop4_u.opgetattr.mi = mi; 8264 8265 if (ndvp != odvp) { 8266 8267 /* 5: putfh (sourcedir) */ 8268 argop[5].argop = OP_CPUTFH; 8269 argop[5].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8270 8271 /* 6: getattr (sourcedir) */ 8272 argop[6].argop = OP_GETATTR; 8273 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8274 argop[6].nfs_argop4_u.opgetattr.mi = mi; 8275 } 8276 8277 dnlc_remove(odvp, onm); 8278 dnlc_remove(ndvp, nnm); 8279 8280 doqueue = 1; 8281 dinfo.di_time_call = gethrtime(); 8282 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8283 8284 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8285 if (e.error) { 8286 PURGE_ATTRCACHE4(odvp); 8287 PURGE_ATTRCACHE4(ndvp); 8288 } else { 8289 *statp = res.status; 8290 } 8291 8292 if (needrecov) { 8293 if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 8294 OP_RENAME, NULL, NULL, NULL) == FALSE) { 8295 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 8296 if (!e.error) 8297 (void) xdr_free(xdr_COMPOUND4res_clnt, 8298 (caddr_t)&res); 8299 goto recov_retry; 8300 } 8301 } 8302 8303 if (!e.error) { 8304 resp = &res; 8305 /* 8306 * as long as OP_RENAME 8307 */ 8308 if (res.status != NFS4_OK && res.array_len <= 4) { 8309 e.error = geterrno4(res.status); 8310 PURGE_ATTRCACHE4(odvp); 8311 PURGE_ATTRCACHE4(ndvp); 8312 /* 8313 * System V defines rename to return EEXIST, not 8314 * ENOTEMPTY if the target directory is not empty. 8315 * Over the wire, the error is NFSERR_ENOTEMPTY 8316 * which geterrno4 maps to ENOTEMPTY. 8317 */ 8318 if (e.error == ENOTEMPTY) 8319 e.error = EEXIST; 8320 } else { 8321 8322 resop = &res.array[3]; /* rename res */ 8323 rn_res = &resop->nfs_resop4_u.oprename; 8324 8325 if (res.status == NFS4_OK) { 8326 /* 8327 * Update target attribute, readdir and dnlc 8328 * caches. 8329 */ 8330 dinfo.di_garp = 8331 &res.array[4].nfs_resop4_u.opgetattr.ga_res; 8332 dinfo.di_cred = cr; 8333 dinfop = &dinfo; 8334 } else 8335 dinfop = NULL; 8336 8337 nfs4_update_dircaches(&rn_res->target_cinfo, 8338 ndvp, NULL, NULL, dinfop); 8339 8340 /* 8341 * Update source attribute, readdir and dnlc caches 8342 * 8343 */ 8344 if (ndvp != odvp) { 8345 update_parentdir_sfh(renvp, ndvp); 8346 8347 if (dinfop) 8348 dinfo.di_garp = 8349 &(res.array[6].nfs_resop4_u. 8350 opgetattr.ga_res); 8351 8352 nfs4_update_dircaches(&rn_res->source_cinfo, 8353 odvp, NULL, NULL, dinfop); 8354 } 8355 8356 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, 8357 nnm); 8358 } 8359 } 8360 8361 if (resp) 8362 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8363 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 8364 kmem_free(argop, argoplist_size); 8365 8366 return (e.error); 8367 } 8368 8369 /* 8370 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when 8371 * it is possible for the filehandle to change due to the rename. 8372 * 8373 * The compound req in this case includes a post-rename lookup and getattr 8374 * to ensure that we have the correct fh and attributes for the object. 8375 * 8376 * Rename requires that the current fh be the target directory and the 8377 * saved fh be the source directory. After the operation, the current fh 8378 * is unchanged. 8379 * 8380 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can 8381 * update the filehandle for the renamed object. We also get the old 8382 * filehandle for historical reasons; this should be taken out sometime. 8383 * This results in a rather cumbersome compound... 8384 * 8385 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old), 8386 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR 8387 * 8388 */ 8389 static int 8390 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp, 8391 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 8392 { 8393 COMPOUND4args_clnt args; 8394 COMPOUND4res_clnt res, *resp = NULL; 8395 int argoplist_size; 8396 nfs_argop4 *argop; 8397 nfs_resop4 *resop; 8398 int doqueue; 8399 mntinfo4_t *mi; 8400 rnode4_t *odrp = VTOR4(odvp); /* old directory */ 8401 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */ 8402 rnode4_t *orp = VTOR4(ovp); /* object being renamed */ 8403 RENAME4res *rn_res; 8404 GETFH4res *ngf_res; 8405 bool_t needrecov; 8406 nfs4_recov_state_t recov_state; 8407 hrtime_t t; 8408 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8409 dirattr_info_t dinfo, *dinfop = &dinfo; 8410 8411 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 8412 8413 recov_state.rs_flags = 0; 8414 recov_state.rs_num_retry_despite_err = 0; 8415 8416 recov_retry: 8417 *statp = NFS4_OK; 8418 8419 /* 8420 * There is a window between the RPC and updating the path and 8421 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery 8422 * code, so that it doesn't try to use the old path during that 8423 * window. 8424 */ 8425 mutex_enter(&orp->r_statelock); 8426 while (orp->r_flags & R4RECEXPFH) { 8427 klwp_t *lwp = ttolwp(curthread); 8428 8429 if (lwp != NULL) 8430 lwp->lwp_nostop++; 8431 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) { 8432 mutex_exit(&orp->r_statelock); 8433 if (lwp != NULL) 8434 lwp->lwp_nostop--; 8435 return (EINTR); 8436 } 8437 if (lwp != NULL) 8438 lwp->lwp_nostop--; 8439 } 8440 orp->r_flags |= R4RECEXPFH; 8441 mutex_exit(&orp->r_statelock); 8442 8443 mi = VTOMI4(odvp); 8444 8445 args.ctag = TAG_RENAME_VFH; 8446 args.array_len = (odvp == ndvp) ? 10 : 12; 8447 argoplist_size = args.array_len * sizeof (nfs_argop4); 8448 argop = kmem_alloc(argoplist_size, KM_SLEEP); 8449 8450 /* 8451 * Rename ops: 8452 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old), 8453 * PUTFH(targetdir), RENAME, GETATTR(targetdir) 8454 * LOOKUP(trgt), GETFH(new), GETATTR, 8455 * 8456 * if (odvp != ndvp) 8457 * add putfh(sourcedir), getattr(sourcedir) } 8458 */ 8459 args.array = argop; 8460 8461 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8462 &recov_state, NULL); 8463 if (e.error) { 8464 kmem_free(argop, argoplist_size); 8465 mutex_enter(&orp->r_statelock); 8466 orp->r_flags &= ~R4RECEXPFH; 8467 cv_broadcast(&orp->r_cv); 8468 mutex_exit(&orp->r_statelock); 8469 return (e.error); 8470 } 8471 8472 /* 0: putfh source directory */ 8473 argop[0].argop = OP_CPUTFH; 8474 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8475 8476 /* 1: Save source fh to free up current for target */ 8477 argop[1].argop = OP_SAVEFH; 8478 8479 /* 2: Lookup pre-rename fh of renamed object */ 8480 argop[2].argop = OP_CLOOKUP; 8481 argop[2].nfs_argop4_u.opclookup.cname = onm; 8482 8483 /* 3: getfh fh of renamed object (before rename) */ 8484 argop[3].argop = OP_GETFH; 8485 8486 /* 4: putfh targetdir */ 8487 argop[4].argop = OP_CPUTFH; 8488 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8489 8490 /* 5: current_fh is targetdir, saved_fh is sourcedir */ 8491 argop[5].argop = OP_CRENAME; 8492 argop[5].nfs_argop4_u.opcrename.coldname = onm; 8493 argop[5].nfs_argop4_u.opcrename.cnewname = nnm; 8494 8495 /* 6: getattr of target dir (post op attrs) */ 8496 argop[6].argop = OP_GETATTR; 8497 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8498 argop[6].nfs_argop4_u.opgetattr.mi = mi; 8499 8500 /* 7: Lookup post-rename fh of renamed object */ 8501 argop[7].argop = OP_CLOOKUP; 8502 argop[7].nfs_argop4_u.opclookup.cname = nnm; 8503 8504 /* 8: getfh fh of renamed object (after rename) */ 8505 argop[8].argop = OP_GETFH; 8506 8507 /* 9: getattr of renamed object */ 8508 argop[9].argop = OP_GETATTR; 8509 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8510 argop[9].nfs_argop4_u.opgetattr.mi = mi; 8511 8512 /* 8513 * If source/target dirs are different, then get new post-op 8514 * attrs for source dir also. 8515 */ 8516 if (ndvp != odvp) { 8517 /* 10: putfh (sourcedir) */ 8518 argop[10].argop = OP_CPUTFH; 8519 argop[10].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8520 8521 /* 11: getattr (sourcedir) */ 8522 argop[11].argop = OP_GETATTR; 8523 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8524 argop[11].nfs_argop4_u.opgetattr.mi = mi; 8525 } 8526 8527 dnlc_remove(odvp, onm); 8528 dnlc_remove(ndvp, nnm); 8529 8530 doqueue = 1; 8531 t = gethrtime(); 8532 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8533 8534 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8535 if (e.error) { 8536 PURGE_ATTRCACHE4(odvp); 8537 PURGE_ATTRCACHE4(ndvp); 8538 if (!needrecov) { 8539 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8540 &recov_state, needrecov); 8541 goto out; 8542 } 8543 } else { 8544 *statp = res.status; 8545 } 8546 8547 if (needrecov) { 8548 bool_t abort; 8549 8550 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 8551 OP_RENAME, NULL, NULL, NULL); 8552 if (abort == FALSE) { 8553 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8554 &recov_state, needrecov); 8555 kmem_free(argop, argoplist_size); 8556 if (!e.error) 8557 (void) xdr_free(xdr_COMPOUND4res_clnt, 8558 (caddr_t)&res); 8559 mutex_enter(&orp->r_statelock); 8560 orp->r_flags &= ~R4RECEXPFH; 8561 cv_broadcast(&orp->r_cv); 8562 mutex_exit(&orp->r_statelock); 8563 goto recov_retry; 8564 } else { 8565 if (e.error != 0) { 8566 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8567 &recov_state, needrecov); 8568 goto out; 8569 } 8570 /* fall through for res.status case */ 8571 } 8572 } 8573 8574 resp = &res; 8575 /* 8576 * If OP_RENAME (or any prev op) failed, then return an error. 8577 * OP_RENAME is index 5, so if array len <= 6 we return an error. 8578 */ 8579 if ((res.status != NFS4_OK) && (res.array_len <= 6)) { 8580 /* 8581 * Error in an op other than last Getattr 8582 */ 8583 e.error = geterrno4(res.status); 8584 PURGE_ATTRCACHE4(odvp); 8585 PURGE_ATTRCACHE4(ndvp); 8586 /* 8587 * System V defines rename to return EEXIST, not 8588 * ENOTEMPTY if the target directory is not empty. 8589 * Over the wire, the error is NFSERR_ENOTEMPTY 8590 * which geterrno4 maps to ENOTEMPTY. 8591 */ 8592 if (e.error == ENOTEMPTY) 8593 e.error = EEXIST; 8594 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, 8595 needrecov); 8596 goto out; 8597 } 8598 8599 /* rename results */ 8600 rn_res = &res.array[5].nfs_resop4_u.oprename; 8601 8602 if (res.status == NFS4_OK) { 8603 /* Update target attribute, readdir and dnlc caches */ 8604 dinfo.di_garp = 8605 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 8606 dinfo.di_cred = cr; 8607 dinfo.di_time_call = t; 8608 } else 8609 dinfop = NULL; 8610 8611 /* Update source cache attribute, readdir and dnlc caches */ 8612 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop); 8613 8614 /* Update source cache attribute, readdir and dnlc caches */ 8615 if (ndvp != odvp) { 8616 update_parentdir_sfh(ovp, ndvp); 8617 8618 /* 8619 * If dinfop is non-NULL, then compound succeded, so 8620 * set di_garp to attrs for source dir. dinfop is only 8621 * set to NULL when compound fails. 8622 */ 8623 if (dinfop) 8624 dinfo.di_garp = 8625 &res.array[11].nfs_resop4_u.opgetattr.ga_res; 8626 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL, 8627 dinfop); 8628 } 8629 8630 /* 8631 * Update the rnode with the new component name and args, 8632 * and if the file handle changed, also update it with the new fh. 8633 * This is only necessary if the target object has an rnode 8634 * entry and there is no need to create one for it. 8635 */ 8636 resop = &res.array[8]; /* getfh new res */ 8637 ngf_res = &resop->nfs_resop4_u.opgetfh; 8638 8639 /* 8640 * Update the path and filehandle for the renamed object. 8641 */ 8642 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm); 8643 8644 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov); 8645 8646 if (res.status == NFS4_OK) { 8647 resop++; /* getattr res */ 8648 e.error = nfs4_update_attrcache(res.status, 8649 &resop->nfs_resop4_u.opgetattr.ga_res, 8650 t, ovp, cr); 8651 } 8652 8653 out: 8654 kmem_free(argop, argoplist_size); 8655 if (resp) 8656 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8657 mutex_enter(&orp->r_statelock); 8658 orp->r_flags &= ~R4RECEXPFH; 8659 cv_broadcast(&orp->r_cv); 8660 mutex_exit(&orp->r_statelock); 8661 8662 return (e.error); 8663 } 8664 8665 /* ARGSUSED */ 8666 static int 8667 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr, 8668 caller_context_t *ct, int flags, vsecattr_t *vsecp) 8669 { 8670 int error; 8671 vnode_t *vp; 8672 8673 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8674 return (EPERM); 8675 /* 8676 * As ".." has special meaning and rather than send a mkdir 8677 * over the wire to just let the server freak out, we just 8678 * short circuit it here and return EEXIST 8679 */ 8680 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8681 return (EEXIST); 8682 8683 /* 8684 * Decision to get the right gid and setgid bit of the 8685 * new directory is now made in call_nfs4_create_req. 8686 */ 8687 va->va_mask |= AT_MODE; 8688 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR); 8689 if (error) 8690 return (error); 8691 8692 *vpp = vp; 8693 return (0); 8694 } 8695 8696 8697 /* 8698 * rmdir is using the same remove v4 op as does remove. 8699 * Remove requires that the current fh be the target directory. 8700 * After the operation, the current fh is unchanged. 8701 * The compound op structure is: 8702 * PUTFH(targetdir), REMOVE 8703 */ 8704 /*ARGSUSED4*/ 8705 static int 8706 nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, 8707 caller_context_t *ct, int flags) 8708 { 8709 int need_end_op = FALSE; 8710 COMPOUND4args_clnt args; 8711 COMPOUND4res_clnt res, *resp = NULL; 8712 REMOVE4res *rm_res; 8713 nfs_argop4 argop[3]; 8714 nfs_resop4 *resop; 8715 vnode_t *vp; 8716 int doqueue; 8717 mntinfo4_t *mi; 8718 rnode4_t *drp; 8719 bool_t needrecov = FALSE; 8720 nfs4_recov_state_t recov_state; 8721 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8722 dirattr_info_t dinfo, *dinfop; 8723 8724 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8725 return (EPERM); 8726 /* 8727 * As ".." has special meaning and rather than send a rmdir 8728 * over the wire to just let the server freak out, we just 8729 * short circuit it here and return EEXIST 8730 */ 8731 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8732 return (EEXIST); 8733 8734 drp = VTOR4(dvp); 8735 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 8736 return (EINTR); 8737 8738 /* 8739 * Attempt to prevent a rmdir(".") from succeeding. 8740 */ 8741 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 8742 if (e.error) { 8743 nfs_rw_exit(&drp->r_rwlock); 8744 return (e.error); 8745 } 8746 if (vp == cdir) { 8747 VN_RELE(vp); 8748 nfs_rw_exit(&drp->r_rwlock); 8749 return (EINVAL); 8750 } 8751 8752 /* 8753 * Since nfsv4 remove op works on both files and directories, 8754 * check that the removed object is indeed a directory. 8755 */ 8756 if (vp->v_type != VDIR) { 8757 VN_RELE(vp); 8758 nfs_rw_exit(&drp->r_rwlock); 8759 return (ENOTDIR); 8760 } 8761 8762 /* 8763 * First just remove the entry from the name cache, as it 8764 * is most likely an entry for this vp. 8765 */ 8766 dnlc_remove(dvp, nm); 8767 8768 /* 8769 * If there vnode reference count is greater than one, then 8770 * there may be additional references in the DNLC which will 8771 * need to be purged. First, trying removing the entry for 8772 * the parent directory and see if that removes the additional 8773 * reference(s). If that doesn't do it, then use dnlc_purge_vp 8774 * to completely remove any references to the directory which 8775 * might still exist in the DNLC. 8776 */ 8777 if (vp->v_count > 1) { 8778 dnlc_remove(vp, ".."); 8779 if (vp->v_count > 1) 8780 dnlc_purge_vp(vp); 8781 } 8782 8783 mi = VTOMI4(dvp); 8784 recov_state.rs_flags = 0; 8785 recov_state.rs_num_retry_despite_err = 0; 8786 8787 recov_retry: 8788 args.ctag = TAG_RMDIR; 8789 8790 /* 8791 * Rmdir ops: putfh dir; remove 8792 */ 8793 args.array_len = 3; 8794 args.array = argop; 8795 8796 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 8797 if (e.error) { 8798 nfs_rw_exit(&drp->r_rwlock); 8799 return (e.error); 8800 } 8801 need_end_op = TRUE; 8802 8803 /* putfh directory */ 8804 argop[0].argop = OP_CPUTFH; 8805 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 8806 8807 /* remove */ 8808 argop[1].argop = OP_CREMOVE; 8809 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 8810 8811 /* getattr (postop attrs for dir that contained removed dir) */ 8812 argop[2].argop = OP_GETATTR; 8813 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8814 argop[2].nfs_argop4_u.opgetattr.mi = mi; 8815 8816 dinfo.di_time_call = gethrtime(); 8817 doqueue = 1; 8818 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8819 8820 PURGE_ATTRCACHE4(vp); 8821 8822 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8823 if (e.error) { 8824 PURGE_ATTRCACHE4(dvp); 8825 } 8826 8827 if (needrecov) { 8828 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL, 8829 NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) { 8830 if (!e.error) 8831 (void) xdr_free(xdr_COMPOUND4res_clnt, 8832 (caddr_t)&res); 8833 8834 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 8835 needrecov); 8836 need_end_op = FALSE; 8837 goto recov_retry; 8838 } 8839 } 8840 8841 if (!e.error) { 8842 resp = &res; 8843 8844 /* 8845 * Only return error if first 2 ops (OP_REMOVE or earlier) 8846 * failed. 8847 */ 8848 if (res.status != NFS4_OK && res.array_len <= 2) { 8849 e.error = geterrno4(res.status); 8850 PURGE_ATTRCACHE4(dvp); 8851 nfs4_end_op(VTOMI4(dvp), dvp, NULL, 8852 &recov_state, needrecov); 8853 need_end_op = FALSE; 8854 nfs4_purge_stale_fh(e.error, dvp, cr); 8855 /* 8856 * System V defines rmdir to return EEXIST, not 8857 * ENOTEMPTY if the directory is not empty. Over 8858 * the wire, the error is NFSERR_ENOTEMPTY which 8859 * geterrno4 maps to ENOTEMPTY. 8860 */ 8861 if (e.error == ENOTEMPTY) 8862 e.error = EEXIST; 8863 } else { 8864 resop = &res.array[1]; /* remove res */ 8865 rm_res = &resop->nfs_resop4_u.opremove; 8866 8867 if (res.status == NFS4_OK) { 8868 resop = &res.array[2]; /* dir attrs */ 8869 dinfo.di_garp = 8870 &resop->nfs_resop4_u.opgetattr.ga_res; 8871 dinfo.di_cred = cr; 8872 dinfop = &dinfo; 8873 } else 8874 dinfop = NULL; 8875 8876 /* Update dir attribute, readdir and dnlc caches */ 8877 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 8878 dinfop); 8879 8880 /* destroy rddir cache for dir that was removed */ 8881 if (VTOR4(vp)->r_dir != NULL) 8882 nfs4_purge_rddir_cache(vp); 8883 } 8884 } 8885 8886 if (need_end_op) 8887 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 8888 8889 nfs_rw_exit(&drp->r_rwlock); 8890 8891 if (resp) 8892 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8893 8894 if (e.error == 0) { 8895 vnode_t *tvp; 8896 rnode4_t *trp; 8897 trp = VTOR4(vp); 8898 tvp = vp; 8899 if (IS_SHADOW(vp, trp)) 8900 tvp = RTOV4(trp); 8901 vnevent_rmdir(tvp, dvp, nm, ct); 8902 } 8903 8904 VN_RELE(vp); 8905 8906 return (e.error); 8907 } 8908 8909 /* ARGSUSED */ 8910 static int 8911 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr, 8912 caller_context_t *ct, int flags) 8913 { 8914 int error; 8915 vnode_t *vp; 8916 rnode4_t *rp; 8917 char *contents; 8918 mntinfo4_t *mi = VTOMI4(dvp); 8919 8920 if (nfs_zone() != mi->mi_zone) 8921 return (EPERM); 8922 if (!(mi->mi_flags & MI4_SYMLINK)) 8923 return (EOPNOTSUPP); 8924 8925 error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK); 8926 if (error) 8927 return (error); 8928 8929 ASSERT(nfs4_consistent_type(vp)); 8930 rp = VTOR4(vp); 8931 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 8932 8933 contents = kmem_alloc(MAXPATHLEN, KM_SLEEP); 8934 8935 if (contents != NULL) { 8936 mutex_enter(&rp->r_statelock); 8937 if (rp->r_symlink.contents == NULL) { 8938 rp->r_symlink.len = strlen(tnm); 8939 bcopy(tnm, contents, rp->r_symlink.len); 8940 rp->r_symlink.contents = contents; 8941 rp->r_symlink.size = MAXPATHLEN; 8942 mutex_exit(&rp->r_statelock); 8943 } else { 8944 mutex_exit(&rp->r_statelock); 8945 kmem_free((void *)contents, MAXPATHLEN); 8946 } 8947 } 8948 } 8949 VN_RELE(vp); 8950 8951 return (error); 8952 } 8953 8954 8955 /* 8956 * Read directory entries. 8957 * There are some weird things to look out for here. The uio_loffset 8958 * field is either 0 or it is the offset returned from a previous 8959 * readdir. It is an opaque value used by the server to find the 8960 * correct directory block to read. The count field is the number 8961 * of blocks to read on the server. This is advisory only, the server 8962 * may return only one block's worth of entries. Entries may be compressed 8963 * on the server. 8964 */ 8965 /* ARGSUSED */ 8966 static int 8967 nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, 8968 caller_context_t *ct, int flags) 8969 { 8970 int error; 8971 uint_t count; 8972 rnode4_t *rp; 8973 rddir4_cache *rdc; 8974 rddir4_cache *rrdc; 8975 8976 if (nfs_zone() != VTOMI4(vp)->mi_zone) 8977 return (EIO); 8978 rp = VTOR4(vp); 8979 8980 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 8981 8982 /* 8983 * Make sure that the directory cache is valid. 8984 */ 8985 if (rp->r_dir != NULL) { 8986 if (nfs_disable_rddir_cache != 0) { 8987 /* 8988 * Setting nfs_disable_rddir_cache in /etc/system 8989 * allows interoperability with servers that do not 8990 * properly update the attributes of directories. 8991 * Any cached information gets purged before an 8992 * access is made to it. 8993 */ 8994 nfs4_purge_rddir_cache(vp); 8995 } 8996 8997 error = nfs4_validate_caches(vp, cr); 8998 if (error) 8999 return (error); 9000 } 9001 9002 count = MIN(uiop->uio_iov->iov_len, MAXBSIZE); 9003 9004 /* 9005 * Short circuit last readdir which always returns 0 bytes. 9006 * This can be done after the directory has been read through 9007 * completely at least once. This will set r_direof which 9008 * can be used to find the value of the last cookie. 9009 */ 9010 mutex_enter(&rp->r_statelock); 9011 if (rp->r_direof != NULL && 9012 uiop->uio_loffset == rp->r_direof->nfs4_ncookie) { 9013 mutex_exit(&rp->r_statelock); 9014 #ifdef DEBUG 9015 nfs4_readdir_cache_shorts++; 9016 #endif 9017 if (eofp) 9018 *eofp = 1; 9019 return (0); 9020 } 9021 9022 /* 9023 * Look for a cache entry. Cache entries are identified 9024 * by the NFS cookie value and the byte count requested. 9025 */ 9026 rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count); 9027 9028 /* 9029 * If rdc is NULL then the lookup resulted in an unrecoverable error. 9030 */ 9031 if (rdc == NULL) { 9032 mutex_exit(&rp->r_statelock); 9033 return (EINTR); 9034 } 9035 9036 /* 9037 * Check to see if we need to fill this entry in. 9038 */ 9039 if (rdc->flags & RDDIRREQ) { 9040 rdc->flags &= ~RDDIRREQ; 9041 rdc->flags |= RDDIR; 9042 mutex_exit(&rp->r_statelock); 9043 9044 /* 9045 * Do the readdir. 9046 */ 9047 nfs4readdir(vp, rdc, cr); 9048 9049 /* 9050 * Reacquire the lock, so that we can continue 9051 */ 9052 mutex_enter(&rp->r_statelock); 9053 /* 9054 * The entry is now complete 9055 */ 9056 rdc->flags &= ~RDDIR; 9057 } 9058 9059 ASSERT(!(rdc->flags & RDDIR)); 9060 9061 /* 9062 * If an error occurred while attempting 9063 * to fill the cache entry, mark the entry invalid and 9064 * just return the error. 9065 */ 9066 if (rdc->error) { 9067 error = rdc->error; 9068 rdc->flags |= RDDIRREQ; 9069 rddir4_cache_rele(rp, rdc); 9070 mutex_exit(&rp->r_statelock); 9071 return (error); 9072 } 9073 9074 /* 9075 * The cache entry is complete and good, 9076 * copyout the dirent structs to the calling 9077 * thread. 9078 */ 9079 error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop); 9080 9081 /* 9082 * If no error occurred during the copyout, 9083 * update the offset in the uio struct to 9084 * contain the value of the next NFS 4 cookie 9085 * and set the eof value appropriately. 9086 */ 9087 if (!error) { 9088 uiop->uio_loffset = rdc->nfs4_ncookie; 9089 if (eofp) 9090 *eofp = rdc->eof; 9091 } 9092 9093 /* 9094 * Decide whether to do readahead. Don't if we 9095 * have already read to the end of directory. 9096 */ 9097 if (rdc->eof) { 9098 /* 9099 * Make the entry the direof only if it is cached 9100 */ 9101 if (rdc->flags & RDDIRCACHED) 9102 rp->r_direof = rdc; 9103 rddir4_cache_rele(rp, rdc); 9104 mutex_exit(&rp->r_statelock); 9105 return (error); 9106 } 9107 9108 /* Determine if a readdir readahead should be done */ 9109 if (!(rp->r_flags & R4LOOKUP)) { 9110 rddir4_cache_rele(rp, rdc); 9111 mutex_exit(&rp->r_statelock); 9112 return (error); 9113 } 9114 9115 /* 9116 * Now look for a readahead entry. 9117 * 9118 * Check to see whether we found an entry for the readahead. 9119 * If so, we don't need to do anything further, so free the new 9120 * entry if one was allocated. Otherwise, allocate a new entry, add 9121 * it to the cache, and then initiate an asynchronous readdir 9122 * operation to fill it. 9123 */ 9124 rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count); 9125 9126 /* 9127 * A readdir cache entry could not be obtained for the readahead. In 9128 * this case we skip the readahead and return. 9129 */ 9130 if (rrdc == NULL) { 9131 rddir4_cache_rele(rp, rdc); 9132 mutex_exit(&rp->r_statelock); 9133 return (error); 9134 } 9135 9136 /* 9137 * Check to see if we need to fill this entry in. 9138 */ 9139 if (rrdc->flags & RDDIRREQ) { 9140 rrdc->flags &= ~RDDIRREQ; 9141 rrdc->flags |= RDDIR; 9142 rddir4_cache_rele(rp, rdc); 9143 mutex_exit(&rp->r_statelock); 9144 #ifdef DEBUG 9145 nfs4_readdir_readahead++; 9146 #endif 9147 /* 9148 * Do the readdir. 9149 */ 9150 nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir); 9151 return (error); 9152 } 9153 9154 rddir4_cache_rele(rp, rrdc); 9155 rddir4_cache_rele(rp, rdc); 9156 mutex_exit(&rp->r_statelock); 9157 return (error); 9158 } 9159 9160 static int 9161 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 9162 { 9163 int error; 9164 rnode4_t *rp; 9165 9166 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 9167 9168 rp = VTOR4(vp); 9169 9170 /* 9171 * Obtain the readdir results for the caller. 9172 */ 9173 nfs4readdir(vp, rdc, cr); 9174 9175 mutex_enter(&rp->r_statelock); 9176 /* 9177 * The entry is now complete 9178 */ 9179 rdc->flags &= ~RDDIR; 9180 9181 error = rdc->error; 9182 if (error) 9183 rdc->flags |= RDDIRREQ; 9184 rddir4_cache_rele(rp, rdc); 9185 mutex_exit(&rp->r_statelock); 9186 9187 return (error); 9188 } 9189 9190 /* 9191 * Read directory entries. 9192 * There are some weird things to look out for here. The uio_loffset 9193 * field is either 0 or it is the offset returned from a previous 9194 * readdir. It is an opaque value used by the server to find the 9195 * correct directory block to read. The count field is the number 9196 * of blocks to read on the server. This is advisory only, the server 9197 * may return only one block's worth of entries. Entries may be compressed 9198 * on the server. 9199 * 9200 * Generates the following compound request: 9201 * 1. If readdir offset is zero and no dnlc entry for parent exists, 9202 * must include a Lookupp as well. In this case, send: 9203 * { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr } 9204 * 2. Otherwise just do: { Putfh <fh>; Readdir } 9205 * 9206 * Get complete attributes and filehandles for entries if this is the 9207 * first read of the directory. Otherwise, just get fileid's. 9208 */ 9209 static void 9210 nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 9211 { 9212 COMPOUND4args_clnt args; 9213 COMPOUND4res_clnt res; 9214 READDIR4args *rargs; 9215 READDIR4res_clnt *rd_res; 9216 bitmap4 rd_bitsval; 9217 nfs_argop4 argop[5]; 9218 nfs_resop4 *resop; 9219 rnode4_t *rp = VTOR4(vp); 9220 mntinfo4_t *mi = VTOMI4(vp); 9221 int doqueue; 9222 u_longlong_t nodeid, pnodeid; /* id's of dir and its parents */ 9223 vnode_t *dvp; 9224 nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie; 9225 int num_ops, res_opcnt; 9226 bool_t needrecov = FALSE; 9227 nfs4_recov_state_t recov_state; 9228 hrtime_t t; 9229 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 9230 9231 ASSERT(nfs_zone() == mi->mi_zone); 9232 ASSERT(rdc->flags & RDDIR); 9233 ASSERT(rdc->entries == NULL); 9234 9235 /* 9236 * If rp were a stub, it should have triggered and caused 9237 * a mount for us to get this far. 9238 */ 9239 ASSERT(!RP_ISSTUB(rp)); 9240 9241 num_ops = 2; 9242 if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) { 9243 /* 9244 * Since nfsv4 readdir may not return entries for "." and "..", 9245 * the client must recreate them: 9246 * To find the correct nodeid, do the following: 9247 * For current node, get nodeid from dnlc. 9248 * - if current node is rootvp, set pnodeid to nodeid. 9249 * - else if parent is in the dnlc, get its nodeid from there. 9250 * - else add LOOKUPP+GETATTR to compound. 9251 */ 9252 nodeid = rp->r_attr.va_nodeid; 9253 if (vp->v_flag & VROOT) { 9254 pnodeid = nodeid; /* root of mount point */ 9255 } else { 9256 dvp = dnlc_lookup(vp, ".."); 9257 if (dvp != NULL && dvp != DNLC_NO_VNODE) { 9258 /* parent in dnlc cache - no need for otw */ 9259 pnodeid = VTOR4(dvp)->r_attr.va_nodeid; 9260 } else { 9261 /* 9262 * parent not in dnlc cache, 9263 * do lookupp to get its id 9264 */ 9265 num_ops = 5; 9266 pnodeid = 0; /* set later by getattr parent */ 9267 } 9268 if (dvp) 9269 VN_RELE(dvp); 9270 } 9271 } 9272 recov_state.rs_flags = 0; 9273 recov_state.rs_num_retry_despite_err = 0; 9274 9275 /* Save the original mount point security flavor */ 9276 (void) save_mnt_secinfo(mi->mi_curr_serv); 9277 9278 recov_retry: 9279 args.ctag = TAG_READDIR; 9280 9281 args.array = argop; 9282 args.array_len = num_ops; 9283 9284 if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 9285 &recov_state, NULL)) { 9286 /* 9287 * If readdir a node that is a stub for a crossed mount point, 9288 * keep the original secinfo flavor for the current file 9289 * system, not the crossed one. 9290 */ 9291 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 9292 rdc->error = e.error; 9293 return; 9294 } 9295 9296 /* 9297 * Determine which attrs to request for dirents. This code 9298 * must be protected by nfs4_start/end_fop because of r_server 9299 * (which will change during failover recovery). 9300 * 9301 */ 9302 if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) { 9303 /* 9304 * Get all vattr attrs plus filehandle and rdattr_error 9305 */ 9306 rd_bitsval = NFS4_VATTR_MASK | 9307 FATTR4_RDATTR_ERROR_MASK | 9308 FATTR4_FILEHANDLE_MASK; 9309 9310 if (rp->r_flags & R4READDIRWATTR) { 9311 mutex_enter(&rp->r_statelock); 9312 rp->r_flags &= ~R4READDIRWATTR; 9313 mutex_exit(&rp->r_statelock); 9314 } 9315 } else { 9316 servinfo4_t *svp = rp->r_server; 9317 9318 /* 9319 * Already read directory. Use readdir with 9320 * no attrs (except for mounted_on_fileid) for updates. 9321 */ 9322 rd_bitsval = FATTR4_RDATTR_ERROR_MASK; 9323 9324 /* 9325 * request mounted on fileid if supported, else request 9326 * fileid. maybe we should verify that fileid is supported 9327 * and request something else if not. 9328 */ 9329 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 9330 if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK) 9331 rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK; 9332 nfs_rw_exit(&svp->sv_lock); 9333 } 9334 9335 /* putfh directory fh */ 9336 argop[0].argop = OP_CPUTFH; 9337 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 9338 9339 argop[1].argop = OP_READDIR; 9340 rargs = &argop[1].nfs_argop4_u.opreaddir; 9341 /* 9342 * 1 and 2 are reserved for client "." and ".." entry offset. 9343 * cookie 0 should be used over-the-wire to start reading at 9344 * the beginning of the directory excluding "." and "..". 9345 */ 9346 if (rdc->nfs4_cookie == 0 || 9347 rdc->nfs4_cookie == 1 || 9348 rdc->nfs4_cookie == 2) { 9349 rargs->cookie = (nfs_cookie4)0; 9350 rargs->cookieverf = 0; 9351 } else { 9352 rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie; 9353 mutex_enter(&rp->r_statelock); 9354 rargs->cookieverf = rp->r_cookieverf4; 9355 mutex_exit(&rp->r_statelock); 9356 } 9357 rargs->dircount = MIN(rdc->buflen, mi->mi_tsize); 9358 rargs->maxcount = mi->mi_tsize; 9359 rargs->attr_request = rd_bitsval; 9360 rargs->rdc = rdc; 9361 rargs->dvp = vp; 9362 rargs->mi = mi; 9363 rargs->cr = cr; 9364 9365 9366 /* 9367 * If count < than the minimum required, we return no entries 9368 * and fail with EINVAL 9369 */ 9370 if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) { 9371 rdc->error = EINVAL; 9372 goto out; 9373 } 9374 9375 if (args.array_len == 5) { 9376 /* 9377 * Add lookupp and getattr for parent nodeid. 9378 */ 9379 argop[2].argop = OP_LOOKUPP; 9380 9381 argop[3].argop = OP_GETFH; 9382 9383 /* getattr parent */ 9384 argop[4].argop = OP_GETATTR; 9385 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 9386 argop[4].nfs_argop4_u.opgetattr.mi = mi; 9387 } 9388 9389 doqueue = 1; 9390 9391 if (mi->mi_io_kstats) { 9392 mutex_enter(&mi->mi_lock); 9393 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 9394 mutex_exit(&mi->mi_lock); 9395 } 9396 9397 /* capture the time of this call */ 9398 rargs->t = t = gethrtime(); 9399 9400 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 9401 9402 if (mi->mi_io_kstats) { 9403 mutex_enter(&mi->mi_lock); 9404 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 9405 mutex_exit(&mi->mi_lock); 9406 } 9407 9408 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 9409 9410 /* 9411 * If RPC error occurred and it isn't an error that 9412 * triggers recovery, then go ahead and fail now. 9413 */ 9414 if (e.error != 0 && !needrecov) { 9415 rdc->error = e.error; 9416 goto out; 9417 } 9418 9419 if (needrecov) { 9420 bool_t abort; 9421 9422 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 9423 "nfs4readdir: initiating recovery.\n")); 9424 9425 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 9426 NULL, OP_READDIR, NULL, NULL, NULL); 9427 if (abort == FALSE) { 9428 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 9429 &recov_state, needrecov); 9430 if (!e.error) 9431 (void) xdr_free(xdr_COMPOUND4res_clnt, 9432 (caddr_t)&res); 9433 if (rdc->entries != NULL) { 9434 kmem_free(rdc->entries, rdc->entlen); 9435 rdc->entries = NULL; 9436 } 9437 goto recov_retry; 9438 } 9439 9440 if (e.error != 0) { 9441 rdc->error = e.error; 9442 goto out; 9443 } 9444 9445 /* fall through for res.status case */ 9446 } 9447 9448 res_opcnt = res.array_len; 9449 9450 /* 9451 * If compound failed first 2 ops (PUTFH+READDIR), then return 9452 * failure here. Subsequent ops are for filling out dot-dot 9453 * dirent, and if they fail, we still want to give the caller 9454 * the dirents returned by (the successful) READDIR op, so we need 9455 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR). 9456 * 9457 * One example where PUTFH+READDIR ops would succeed but 9458 * LOOKUPP+GETATTR would fail would be a dir that has r perm 9459 * but lacks x. In this case, a POSIX server's VOP_READDIR 9460 * would succeed; however, VOP_LOOKUP(..) would fail since no 9461 * x perm. We need to come up with a non-vendor-specific way 9462 * for a POSIX server to return d_ino from dotdot's dirent if 9463 * client only requests mounted_on_fileid, and just say the 9464 * LOOKUPP succeeded and fill out the GETATTR. However, if 9465 * client requested any mandatory attrs, server would be required 9466 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR 9467 * for dotdot. 9468 */ 9469 9470 if (res.status) { 9471 if (res_opcnt <= 2) { 9472 e.error = geterrno4(res.status); 9473 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 9474 &recov_state, needrecov); 9475 nfs4_purge_stale_fh(e.error, vp, cr); 9476 rdc->error = e.error; 9477 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 9478 if (rdc->entries != NULL) { 9479 kmem_free(rdc->entries, rdc->entlen); 9480 rdc->entries = NULL; 9481 } 9482 /* 9483 * If readdir a node that is a stub for a 9484 * crossed mount point, keep the original 9485 * secinfo flavor for the current file system, 9486 * not the crossed one. 9487 */ 9488 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 9489 return; 9490 } 9491 } 9492 9493 resop = &res.array[1]; /* readdir res */ 9494 rd_res = &resop->nfs_resop4_u.opreaddirclnt; 9495 9496 mutex_enter(&rp->r_statelock); 9497 rp->r_cookieverf4 = rd_res->cookieverf; 9498 mutex_exit(&rp->r_statelock); 9499 9500 /* 9501 * For "." and ".." entries 9502 * e.g. 9503 * seek(cookie=0) -> "." entry with d_off = 1 9504 * seek(cookie=1) -> ".." entry with d_off = 2 9505 */ 9506 if (cookie == (nfs_cookie4) 0) { 9507 if (rd_res->dotp) 9508 rd_res->dotp->d_ino = nodeid; 9509 if (rd_res->dotdotp) 9510 rd_res->dotdotp->d_ino = pnodeid; 9511 } 9512 if (cookie == (nfs_cookie4) 1) { 9513 if (rd_res->dotdotp) 9514 rd_res->dotdotp->d_ino = pnodeid; 9515 } 9516 9517 9518 /* LOOKUPP+GETATTR attemped */ 9519 if (args.array_len == 5 && rd_res->dotdotp) { 9520 if (res.status == NFS4_OK && res_opcnt == 5) { 9521 nfs_fh4 *fhp; 9522 nfs4_sharedfh_t *sfhp; 9523 vnode_t *pvp; 9524 nfs4_ga_res_t *garp; 9525 9526 resop++; /* lookupp */ 9527 resop++; /* getfh */ 9528 fhp = &resop->nfs_resop4_u.opgetfh.object; 9529 9530 resop++; /* getattr of parent */ 9531 9532 /* 9533 * First, take care of finishing the 9534 * readdir results. 9535 */ 9536 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 9537 /* 9538 * The d_ino of .. must be the inode number 9539 * of the mounted filesystem. 9540 */ 9541 if (garp->n4g_va.va_mask & AT_NODEID) 9542 rd_res->dotdotp->d_ino = 9543 garp->n4g_va.va_nodeid; 9544 9545 9546 /* 9547 * Next, create the ".." dnlc entry 9548 */ 9549 sfhp = sfh4_get(fhp, mi); 9550 if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) { 9551 dnlc_update(vp, "..", pvp); 9552 VN_RELE(pvp); 9553 } 9554 sfh4_rele(&sfhp); 9555 } 9556 } 9557 9558 if (mi->mi_io_kstats) { 9559 mutex_enter(&mi->mi_lock); 9560 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 9561 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen; 9562 mutex_exit(&mi->mi_lock); 9563 } 9564 9565 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 9566 9567 out: 9568 /* 9569 * If readdir a node that is a stub for a crossed mount point, 9570 * keep the original secinfo flavor for the current file system, 9571 * not the crossed one. 9572 */ 9573 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 9574 9575 nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov); 9576 } 9577 9578 9579 static int 9580 nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead) 9581 { 9582 rnode4_t *rp = VTOR4(bp->b_vp); 9583 int count; 9584 int error; 9585 cred_t *cred_otw = NULL; 9586 offset_t offset; 9587 nfs4_open_stream_t *osp = NULL; 9588 bool_t first_time = TRUE; /* first time getting otw cred */ 9589 bool_t last_time = FALSE; /* last time getting otw cred */ 9590 9591 ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone); 9592 9593 DTRACE_IO1(start, struct buf *, bp); 9594 offset = ldbtob(bp->b_lblkno); 9595 9596 if (bp->b_flags & B_READ) { 9597 read_again: 9598 /* 9599 * Releases the osp, if it is provided. 9600 * Puts a hold on the cred_otw and the new osp (if found). 9601 */ 9602 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 9603 &first_time, &last_time); 9604 error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr, 9605 offset, bp->b_bcount, &bp->b_resid, cred_otw, 9606 readahead, NULL); 9607 crfree(cred_otw); 9608 if (!error) { 9609 if (bp->b_resid) { 9610 /* 9611 * Didn't get it all because we hit EOF, 9612 * zero all the memory beyond the EOF. 9613 */ 9614 /* bzero(rdaddr + */ 9615 bzero(bp->b_un.b_addr + 9616 bp->b_bcount - bp->b_resid, bp->b_resid); 9617 } 9618 mutex_enter(&rp->r_statelock); 9619 if (bp->b_resid == bp->b_bcount && 9620 offset >= rp->r_size) { 9621 /* 9622 * We didn't read anything at all as we are 9623 * past EOF. Return an error indicator back 9624 * but don't destroy the pages (yet). 9625 */ 9626 error = NFS_EOF; 9627 } 9628 mutex_exit(&rp->r_statelock); 9629 } else if (error == EACCES && last_time == FALSE) { 9630 goto read_again; 9631 } 9632 } else { 9633 if (!(rp->r_flags & R4STALE)) { 9634 write_again: 9635 /* 9636 * Releases the osp, if it is provided. 9637 * Puts a hold on the cred_otw and the new 9638 * osp (if found). 9639 */ 9640 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 9641 &first_time, &last_time); 9642 mutex_enter(&rp->r_statelock); 9643 count = MIN(bp->b_bcount, rp->r_size - offset); 9644 mutex_exit(&rp->r_statelock); 9645 if (count < 0) 9646 cmn_err(CE_PANIC, "nfs4_bio: write count < 0"); 9647 #ifdef DEBUG 9648 if (count == 0) { 9649 zoneid_t zoneid = getzoneid(); 9650 9651 zcmn_err(zoneid, CE_WARN, 9652 "nfs4_bio: zero length write at %lld", 9653 offset); 9654 zcmn_err(zoneid, CE_CONT, "flags=0x%x, " 9655 "b_bcount=%ld, file size=%lld", 9656 rp->r_flags, (long)bp->b_bcount, 9657 rp->r_size); 9658 sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh); 9659 if (nfs4_bio_do_stop) 9660 debug_enter("nfs4_bio"); 9661 } 9662 #endif 9663 error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset, 9664 count, cred_otw, stab_comm); 9665 if (error == EACCES && last_time == FALSE) { 9666 crfree(cred_otw); 9667 goto write_again; 9668 } 9669 bp->b_error = error; 9670 if (error && error != EINTR && 9671 !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) { 9672 /* 9673 * Don't print EDQUOT errors on the console. 9674 * Don't print asynchronous EACCES errors. 9675 * Don't print EFBIG errors. 9676 * Print all other write errors. 9677 */ 9678 if (error != EDQUOT && error != EFBIG && 9679 (error != EACCES || 9680 !(bp->b_flags & B_ASYNC))) 9681 nfs4_write_error(bp->b_vp, 9682 error, cred_otw); 9683 /* 9684 * Update r_error and r_flags as appropriate. 9685 * If the error was ESTALE, then mark the 9686 * rnode as not being writeable and save 9687 * the error status. Otherwise, save any 9688 * errors which occur from asynchronous 9689 * page invalidations. Any errors occurring 9690 * from other operations should be saved 9691 * by the caller. 9692 */ 9693 mutex_enter(&rp->r_statelock); 9694 if (error == ESTALE) { 9695 rp->r_flags |= R4STALE; 9696 if (!rp->r_error) 9697 rp->r_error = error; 9698 } else if (!rp->r_error && 9699 (bp->b_flags & 9700 (B_INVAL|B_FORCE|B_ASYNC)) == 9701 (B_INVAL|B_FORCE|B_ASYNC)) { 9702 rp->r_error = error; 9703 } 9704 mutex_exit(&rp->r_statelock); 9705 } 9706 crfree(cred_otw); 9707 } else { 9708 error = rp->r_error; 9709 /* 9710 * A close may have cleared r_error, if so, 9711 * propagate ESTALE error return properly 9712 */ 9713 if (error == 0) 9714 error = ESTALE; 9715 } 9716 } 9717 9718 if (error != 0 && error != NFS_EOF) 9719 bp->b_flags |= B_ERROR; 9720 9721 if (osp) 9722 open_stream_rele(osp, rp); 9723 9724 DTRACE_IO1(done, struct buf *, bp); 9725 9726 return (error); 9727 } 9728 9729 /* ARGSUSED */ 9730 int 9731 nfs4_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 9732 { 9733 return (EREMOTE); 9734 } 9735 9736 /* ARGSUSED2 */ 9737 int 9738 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9739 { 9740 rnode4_t *rp = VTOR4(vp); 9741 9742 if (!write_lock) { 9743 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9744 return (V_WRITELOCK_FALSE); 9745 } 9746 9747 if ((rp->r_flags & R4DIRECTIO) || 9748 (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) { 9749 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9750 if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp)) 9751 return (V_WRITELOCK_FALSE); 9752 nfs_rw_exit(&rp->r_rwlock); 9753 } 9754 9755 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE); 9756 return (V_WRITELOCK_TRUE); 9757 } 9758 9759 /* ARGSUSED */ 9760 void 9761 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9762 { 9763 rnode4_t *rp = VTOR4(vp); 9764 9765 nfs_rw_exit(&rp->r_rwlock); 9766 } 9767 9768 /* ARGSUSED */ 9769 static int 9770 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) 9771 { 9772 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9773 return (EIO); 9774 9775 /* 9776 * Because we stuff the readdir cookie into the offset field 9777 * someone may attempt to do an lseek with the cookie which 9778 * we want to succeed. 9779 */ 9780 if (vp->v_type == VDIR) 9781 return (0); 9782 if (*noffp < 0) 9783 return (EINVAL); 9784 return (0); 9785 } 9786 9787 9788 /* 9789 * Return all the pages from [off..off+len) in file 9790 */ 9791 /* ARGSUSED */ 9792 static int 9793 nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 9794 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 9795 enum seg_rw rw, cred_t *cr, caller_context_t *ct) 9796 { 9797 rnode4_t *rp; 9798 int error; 9799 mntinfo4_t *mi; 9800 9801 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9802 return (EIO); 9803 rp = VTOR4(vp); 9804 if (IS_SHADOW(vp, rp)) 9805 vp = RTOV4(rp); 9806 9807 if (vp->v_flag & VNOMAP) 9808 return (ENOSYS); 9809 9810 if (protp != NULL) 9811 *protp = PROT_ALL; 9812 9813 /* 9814 * Now validate that the caches are up to date. 9815 */ 9816 if (error = nfs4_validate_caches(vp, cr)) 9817 return (error); 9818 9819 mi = VTOMI4(vp); 9820 retry: 9821 mutex_enter(&rp->r_statelock); 9822 9823 /* 9824 * Don't create dirty pages faster than they 9825 * can be cleaned so that the system doesn't 9826 * get imbalanced. If the async queue is 9827 * maxed out, then wait for it to drain before 9828 * creating more dirty pages. Also, wait for 9829 * any threads doing pagewalks in the vop_getattr 9830 * entry points so that they don't block for 9831 * long periods. 9832 */ 9833 if (rw == S_CREATE) { 9834 while ((mi->mi_max_threads != 0 && 9835 rp->r_awcount > 2 * mi->mi_max_threads) || 9836 rp->r_gcount > 0) 9837 cv_wait(&rp->r_cv, &rp->r_statelock); 9838 } 9839 9840 /* 9841 * If we are getting called as a side effect of an nfs_write() 9842 * operation the local file size might not be extended yet. 9843 * In this case we want to be able to return pages of zeroes. 9844 */ 9845 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) { 9846 NFS4_DEBUG(nfs4_pageio_debug, 9847 (CE_NOTE, "getpage beyond EOF: off=%lld, " 9848 "len=%llu, size=%llu, attrsize =%llu", off, 9849 (u_longlong_t)len, rp->r_size, rp->r_attr.va_size)); 9850 mutex_exit(&rp->r_statelock); 9851 return (EFAULT); /* beyond EOF */ 9852 } 9853 9854 mutex_exit(&rp->r_statelock); 9855 9856 error = pvn_getpages(nfs4_getapage, vp, off, len, protp, 9857 pl, plsz, seg, addr, rw, cr); 9858 NFS4_DEBUG(nfs4_pageio_debug && error, 9859 (CE_NOTE, "getpages error %d; off=%lld, len=%lld", 9860 error, off, (u_longlong_t)len)); 9861 9862 switch (error) { 9863 case NFS_EOF: 9864 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE); 9865 goto retry; 9866 case ESTALE: 9867 nfs4_purge_stale_fh(error, vp, cr); 9868 } 9869 9870 return (error); 9871 } 9872 9873 /* 9874 * Called from pvn_getpages to get a particular page. 9875 */ 9876 /* ARGSUSED */ 9877 static int 9878 nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp, 9879 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 9880 enum seg_rw rw, cred_t *cr) 9881 { 9882 rnode4_t *rp; 9883 uint_t bsize; 9884 struct buf *bp; 9885 page_t *pp; 9886 u_offset_t lbn; 9887 u_offset_t io_off; 9888 u_offset_t blkoff; 9889 u_offset_t rablkoff; 9890 size_t io_len; 9891 uint_t blksize; 9892 int error; 9893 int readahead; 9894 int readahead_issued = 0; 9895 int ra_window; /* readahead window */ 9896 page_t *pagefound; 9897 page_t *savepp; 9898 9899 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9900 return (EIO); 9901 9902 rp = VTOR4(vp); 9903 ASSERT(!IS_SHADOW(vp, rp)); 9904 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 9905 9906 reread: 9907 bp = NULL; 9908 pp = NULL; 9909 pagefound = NULL; 9910 9911 if (pl != NULL) 9912 pl[0] = NULL; 9913 9914 error = 0; 9915 lbn = off / bsize; 9916 blkoff = lbn * bsize; 9917 9918 /* 9919 * Queueing up the readahead before doing the synchronous read 9920 * results in a significant increase in read throughput because 9921 * of the increased parallelism between the async threads and 9922 * the process context. 9923 */ 9924 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 && 9925 rw != S_CREATE && 9926 !(vp->v_flag & VNOCACHE)) { 9927 mutex_enter(&rp->r_statelock); 9928 9929 /* 9930 * Calculate the number of readaheads to do. 9931 * a) No readaheads at offset = 0. 9932 * b) Do maximum(nfs4_nra) readaheads when the readahead 9933 * window is closed. 9934 * c) Do readaheads between 1 to (nfs4_nra - 1) depending 9935 * upon how far the readahead window is open or close. 9936 * d) No readaheads if rp->r_nextr is not within the scope 9937 * of the readahead window (random i/o). 9938 */ 9939 9940 if (off == 0) 9941 readahead = 0; 9942 else if (blkoff == rp->r_nextr) 9943 readahead = nfs4_nra; 9944 else if (rp->r_nextr > blkoff && 9945 ((ra_window = (rp->r_nextr - blkoff) / bsize) 9946 <= (nfs4_nra - 1))) 9947 readahead = nfs4_nra - ra_window; 9948 else 9949 readahead = 0; 9950 9951 rablkoff = rp->r_nextr; 9952 while (readahead > 0 && rablkoff + bsize < rp->r_size) { 9953 mutex_exit(&rp->r_statelock); 9954 if (nfs4_async_readahead(vp, rablkoff + bsize, 9955 addr + (rablkoff + bsize - off), 9956 seg, cr, nfs4_readahead) < 0) { 9957 mutex_enter(&rp->r_statelock); 9958 break; 9959 } 9960 readahead--; 9961 rablkoff += bsize; 9962 /* 9963 * Indicate that we did a readahead so 9964 * readahead offset is not updated 9965 * by the synchronous read below. 9966 */ 9967 readahead_issued = 1; 9968 mutex_enter(&rp->r_statelock); 9969 /* 9970 * set readahead offset to 9971 * offset of last async readahead 9972 * request. 9973 */ 9974 rp->r_nextr = rablkoff; 9975 } 9976 mutex_exit(&rp->r_statelock); 9977 } 9978 9979 again: 9980 if ((pagefound = page_exists(vp, off)) == NULL) { 9981 if (pl == NULL) { 9982 (void) nfs4_async_readahead(vp, blkoff, addr, seg, cr, 9983 nfs4_readahead); 9984 } else if (rw == S_CREATE) { 9985 /* 9986 * Block for this page is not allocated, or the offset 9987 * is beyond the current allocation size, or we're 9988 * allocating a swap slot and the page was not found, 9989 * so allocate it and return a zero page. 9990 */ 9991 if ((pp = page_create_va(vp, off, 9992 PAGESIZE, PG_WAIT, seg, addr)) == NULL) 9993 cmn_err(CE_PANIC, "nfs4_getapage: page_create"); 9994 io_len = PAGESIZE; 9995 mutex_enter(&rp->r_statelock); 9996 rp->r_nextr = off + PAGESIZE; 9997 mutex_exit(&rp->r_statelock); 9998 } else { 9999 /* 10000 * Need to go to server to get a block 10001 */ 10002 mutex_enter(&rp->r_statelock); 10003 if (blkoff < rp->r_size && 10004 blkoff + bsize > rp->r_size) { 10005 /* 10006 * If less than a block left in 10007 * file read less than a block. 10008 */ 10009 if (rp->r_size <= off) { 10010 /* 10011 * Trying to access beyond EOF, 10012 * set up to get at least one page. 10013 */ 10014 blksize = off + PAGESIZE - blkoff; 10015 } else 10016 blksize = rp->r_size - blkoff; 10017 } else if ((off == 0) || 10018 (off != rp->r_nextr && !readahead_issued)) { 10019 blksize = PAGESIZE; 10020 blkoff = off; /* block = page here */ 10021 } else 10022 blksize = bsize; 10023 mutex_exit(&rp->r_statelock); 10024 10025 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 10026 &io_len, blkoff, blksize, 0); 10027 10028 /* 10029 * Some other thread has entered the page, 10030 * so just use it. 10031 */ 10032 if (pp == NULL) 10033 goto again; 10034 10035 /* 10036 * Now round the request size up to page boundaries. 10037 * This ensures that the entire page will be 10038 * initialized to zeroes if EOF is encountered. 10039 */ 10040 io_len = ptob(btopr(io_len)); 10041 10042 bp = pageio_setup(pp, io_len, vp, B_READ); 10043 ASSERT(bp != NULL); 10044 10045 /* 10046 * pageio_setup should have set b_addr to 0. This 10047 * is correct since we want to do I/O on a page 10048 * boundary. bp_mapin will use this addr to calculate 10049 * an offset, and then set b_addr to the kernel virtual 10050 * address it allocated for us. 10051 */ 10052 ASSERT(bp->b_un.b_addr == 0); 10053 10054 bp->b_edev = 0; 10055 bp->b_dev = 0; 10056 bp->b_lblkno = lbtodb(io_off); 10057 bp->b_file = vp; 10058 bp->b_offset = (offset_t)off; 10059 bp_mapin(bp); 10060 10061 /* 10062 * If doing a write beyond what we believe is EOF, 10063 * don't bother trying to read the pages from the 10064 * server, we'll just zero the pages here. We 10065 * don't check that the rw flag is S_WRITE here 10066 * because some implementations may attempt a 10067 * read access to the buffer before copying data. 10068 */ 10069 mutex_enter(&rp->r_statelock); 10070 if (io_off >= rp->r_size && seg == segkmap) { 10071 mutex_exit(&rp->r_statelock); 10072 bzero(bp->b_un.b_addr, io_len); 10073 } else { 10074 mutex_exit(&rp->r_statelock); 10075 error = nfs4_bio(bp, NULL, cr, FALSE); 10076 } 10077 10078 /* 10079 * Unmap the buffer before freeing it. 10080 */ 10081 bp_mapout(bp); 10082 pageio_done(bp); 10083 10084 savepp = pp; 10085 do { 10086 pp->p_fsdata = C_NOCOMMIT; 10087 } while ((pp = pp->p_next) != savepp); 10088 10089 if (error == NFS_EOF) { 10090 /* 10091 * If doing a write system call just return 10092 * zeroed pages, else user tried to get pages 10093 * beyond EOF, return error. We don't check 10094 * that the rw flag is S_WRITE here because 10095 * some implementations may attempt a read 10096 * access to the buffer before copying data. 10097 */ 10098 if (seg == segkmap) 10099 error = 0; 10100 else 10101 error = EFAULT; 10102 } 10103 10104 if (!readahead_issued && !error) { 10105 mutex_enter(&rp->r_statelock); 10106 rp->r_nextr = io_off + io_len; 10107 mutex_exit(&rp->r_statelock); 10108 } 10109 } 10110 } 10111 10112 out: 10113 if (pl == NULL) 10114 return (error); 10115 10116 if (error) { 10117 if (pp != NULL) 10118 pvn_read_done(pp, B_ERROR); 10119 return (error); 10120 } 10121 10122 if (pagefound) { 10123 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED); 10124 10125 /* 10126 * Page exists in the cache, acquire the appropriate lock. 10127 * If this fails, start all over again. 10128 */ 10129 if ((pp = page_lookup(vp, off, se)) == NULL) { 10130 #ifdef DEBUG 10131 nfs4_lostpage++; 10132 #endif 10133 goto reread; 10134 } 10135 pl[0] = pp; 10136 pl[1] = NULL; 10137 return (0); 10138 } 10139 10140 if (pp != NULL) 10141 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 10142 10143 return (error); 10144 } 10145 10146 static void 10147 nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg, 10148 cred_t *cr) 10149 { 10150 int error; 10151 page_t *pp; 10152 u_offset_t io_off; 10153 size_t io_len; 10154 struct buf *bp; 10155 uint_t bsize, blksize; 10156 rnode4_t *rp = VTOR4(vp); 10157 page_t *savepp; 10158 10159 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10160 10161 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 10162 10163 mutex_enter(&rp->r_statelock); 10164 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) { 10165 /* 10166 * If less than a block left in file read less 10167 * than a block. 10168 */ 10169 blksize = rp->r_size - blkoff; 10170 } else 10171 blksize = bsize; 10172 mutex_exit(&rp->r_statelock); 10173 10174 pp = pvn_read_kluster(vp, blkoff, segkmap, addr, 10175 &io_off, &io_len, blkoff, blksize, 1); 10176 /* 10177 * The isra flag passed to the kluster function is 1, we may have 10178 * gotten a return value of NULL for a variety of reasons (# of free 10179 * pages < minfree, someone entered the page on the vnode etc). In all 10180 * cases, we want to punt on the readahead. 10181 */ 10182 if (pp == NULL) 10183 return; 10184 10185 /* 10186 * Now round the request size up to page boundaries. 10187 * This ensures that the entire page will be 10188 * initialized to zeroes if EOF is encountered. 10189 */ 10190 io_len = ptob(btopr(io_len)); 10191 10192 bp = pageio_setup(pp, io_len, vp, B_READ); 10193 ASSERT(bp != NULL); 10194 10195 /* 10196 * pageio_setup should have set b_addr to 0. This is correct since 10197 * we want to do I/O on a page boundary. bp_mapin() will use this addr 10198 * to calculate an offset, and then set b_addr to the kernel virtual 10199 * address it allocated for us. 10200 */ 10201 ASSERT(bp->b_un.b_addr == 0); 10202 10203 bp->b_edev = 0; 10204 bp->b_dev = 0; 10205 bp->b_lblkno = lbtodb(io_off); 10206 bp->b_file = vp; 10207 bp->b_offset = (offset_t)blkoff; 10208 bp_mapin(bp); 10209 10210 /* 10211 * If doing a write beyond what we believe is EOF, don't bother trying 10212 * to read the pages from the server, we'll just zero the pages here. 10213 * We don't check that the rw flag is S_WRITE here because some 10214 * implementations may attempt a read access to the buffer before 10215 * copying data. 10216 */ 10217 mutex_enter(&rp->r_statelock); 10218 if (io_off >= rp->r_size && seg == segkmap) { 10219 mutex_exit(&rp->r_statelock); 10220 bzero(bp->b_un.b_addr, io_len); 10221 error = 0; 10222 } else { 10223 mutex_exit(&rp->r_statelock); 10224 error = nfs4_bio(bp, NULL, cr, TRUE); 10225 if (error == NFS_EOF) 10226 error = 0; 10227 } 10228 10229 /* 10230 * Unmap the buffer before freeing it. 10231 */ 10232 bp_mapout(bp); 10233 pageio_done(bp); 10234 10235 savepp = pp; 10236 do { 10237 pp->p_fsdata = C_NOCOMMIT; 10238 } while ((pp = pp->p_next) != savepp); 10239 10240 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 10241 10242 /* 10243 * In case of error set readahead offset 10244 * to the lowest offset. 10245 * pvn_read_done() calls VN_DISPOSE to destroy the pages 10246 */ 10247 if (error && rp->r_nextr > io_off) { 10248 mutex_enter(&rp->r_statelock); 10249 if (rp->r_nextr > io_off) 10250 rp->r_nextr = io_off; 10251 mutex_exit(&rp->r_statelock); 10252 } 10253 } 10254 10255 /* 10256 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 10257 * If len == 0, do from off to EOF. 10258 * 10259 * The normal cases should be len == 0 && off == 0 (entire vp list) or 10260 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 10261 * (from pageout). 10262 */ 10263 /* ARGSUSED */ 10264 static int 10265 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, 10266 caller_context_t *ct) 10267 { 10268 int error; 10269 rnode4_t *rp; 10270 10271 ASSERT(cr != NULL); 10272 10273 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 10274 return (EIO); 10275 10276 rp = VTOR4(vp); 10277 if (IS_SHADOW(vp, rp)) 10278 vp = RTOV4(rp); 10279 10280 /* 10281 * XXX - Why should this check be made here? 10282 */ 10283 if (vp->v_flag & VNOMAP) 10284 return (ENOSYS); 10285 10286 if (len == 0 && !(flags & B_INVAL) && 10287 (vp->v_vfsp->vfs_flag & VFS_RDONLY)) 10288 return (0); 10289 10290 mutex_enter(&rp->r_statelock); 10291 rp->r_count++; 10292 mutex_exit(&rp->r_statelock); 10293 error = nfs4_putpages(vp, off, len, flags, cr); 10294 mutex_enter(&rp->r_statelock); 10295 rp->r_count--; 10296 cv_broadcast(&rp->r_cv); 10297 mutex_exit(&rp->r_statelock); 10298 10299 return (error); 10300 } 10301 10302 /* 10303 * Write out a single page, possibly klustering adjacent dirty pages. 10304 */ 10305 int 10306 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, 10307 int flags, cred_t *cr) 10308 { 10309 u_offset_t io_off; 10310 u_offset_t lbn_off; 10311 u_offset_t lbn; 10312 size_t io_len; 10313 uint_t bsize; 10314 int error; 10315 rnode4_t *rp; 10316 10317 ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY)); 10318 ASSERT(pp != NULL); 10319 ASSERT(cr != NULL); 10320 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone); 10321 10322 rp = VTOR4(vp); 10323 ASSERT(rp->r_count > 0); 10324 ASSERT(!IS_SHADOW(vp, rp)); 10325 10326 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 10327 lbn = pp->p_offset / bsize; 10328 lbn_off = lbn * bsize; 10329 10330 /* 10331 * Find a kluster that fits in one block, or in 10332 * one page if pages are bigger than blocks. If 10333 * there is less file space allocated than a whole 10334 * page, we'll shorten the i/o request below. 10335 */ 10336 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off, 10337 roundup(bsize, PAGESIZE), flags); 10338 10339 /* 10340 * pvn_write_kluster shouldn't have returned a page with offset 10341 * behind the original page we were given. Verify that. 10342 */ 10343 ASSERT((pp->p_offset / bsize) >= lbn); 10344 10345 /* 10346 * Now pp will have the list of kept dirty pages marked for 10347 * write back. It will also handle invalidation and freeing 10348 * of pages that are not dirty. Check for page length rounding 10349 * problems. 10350 */ 10351 if (io_off + io_len > lbn_off + bsize) { 10352 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE); 10353 io_len = lbn_off + bsize - io_off; 10354 } 10355 /* 10356 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10357 * consistent value of r_size. R4MODINPROGRESS is set in writerp4(). 10358 * When R4MODINPROGRESS is set it indicates that a uiomove() is in 10359 * progress and the r_size has not been made consistent with the 10360 * new size of the file. When the uiomove() completes the r_size is 10361 * updated and the R4MODINPROGRESS flag is cleared. 10362 * 10363 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10364 * consistent value of r_size. Without this handshaking, it is 10365 * possible that nfs4_bio() picks up the old value of r_size 10366 * before the uiomove() in writerp4() completes. This will result 10367 * in the write through nfs4_bio() being dropped. 10368 * 10369 * More precisely, there is a window between the time the uiomove() 10370 * completes and the time the r_size is updated. If a VOP_PUTPAGE() 10371 * operation intervenes in this window, the page will be picked up, 10372 * because it is dirty (it will be unlocked, unless it was 10373 * pagecreate'd). When the page is picked up as dirty, the dirty 10374 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is 10375 * checked. This will still be the old size. Therefore the page will 10376 * not be written out. When segmap_release() calls VOP_PUTPAGE(), 10377 * the page will be found to be clean and the write will be dropped. 10378 */ 10379 if (rp->r_flags & R4MODINPROGRESS) { 10380 mutex_enter(&rp->r_statelock); 10381 if ((rp->r_flags & R4MODINPROGRESS) && 10382 rp->r_modaddr + MAXBSIZE > io_off && 10383 rp->r_modaddr < io_off + io_len) { 10384 page_t *plist; 10385 /* 10386 * A write is in progress for this region of the file. 10387 * If we did not detect R4MODINPROGRESS here then this 10388 * path through nfs_putapage() would eventually go to 10389 * nfs4_bio() and may not write out all of the data 10390 * in the pages. We end up losing data. So we decide 10391 * to set the modified bit on each page in the page 10392 * list and mark the rnode with R4DIRTY. This write 10393 * will be restarted at some later time. 10394 */ 10395 plist = pp; 10396 while (plist != NULL) { 10397 pp = plist; 10398 page_sub(&plist, pp); 10399 hat_setmod(pp); 10400 page_io_unlock(pp); 10401 page_unlock(pp); 10402 } 10403 rp->r_flags |= R4DIRTY; 10404 mutex_exit(&rp->r_statelock); 10405 if (offp) 10406 *offp = io_off; 10407 if (lenp) 10408 *lenp = io_len; 10409 return (0); 10410 } 10411 mutex_exit(&rp->r_statelock); 10412 } 10413 10414 if (flags & B_ASYNC) { 10415 error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr, 10416 nfs4_sync_putapage); 10417 } else 10418 error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr); 10419 10420 if (offp) 10421 *offp = io_off; 10422 if (lenp) 10423 *lenp = io_len; 10424 return (error); 10425 } 10426 10427 static int 10428 nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 10429 int flags, cred_t *cr) 10430 { 10431 int error; 10432 rnode4_t *rp; 10433 10434 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10435 10436 flags |= B_WRITE; 10437 10438 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 10439 10440 rp = VTOR4(vp); 10441 10442 if ((error == ENOSPC || error == EDQUOT || error == EFBIG || 10443 error == EACCES) && 10444 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) { 10445 if (!(rp->r_flags & R4OUTOFSPACE)) { 10446 mutex_enter(&rp->r_statelock); 10447 rp->r_flags |= R4OUTOFSPACE; 10448 mutex_exit(&rp->r_statelock); 10449 } 10450 flags |= B_ERROR; 10451 pvn_write_done(pp, flags); 10452 /* 10453 * If this was not an async thread, then try again to 10454 * write out the pages, but this time, also destroy 10455 * them whether or not the write is successful. This 10456 * will prevent memory from filling up with these 10457 * pages and destroying them is the only alternative 10458 * if they can't be written out. 10459 * 10460 * Don't do this if this is an async thread because 10461 * when the pages are unlocked in pvn_write_done, 10462 * some other thread could have come along, locked 10463 * them, and queued for an async thread. It would be 10464 * possible for all of the async threads to be tied 10465 * up waiting to lock the pages again and they would 10466 * all already be locked and waiting for an async 10467 * thread to handle them. Deadlock. 10468 */ 10469 if (!(flags & B_ASYNC)) { 10470 error = nfs4_putpage(vp, io_off, io_len, 10471 B_INVAL | B_FORCE, cr, NULL); 10472 } 10473 } else { 10474 if (error) 10475 flags |= B_ERROR; 10476 else if (rp->r_flags & R4OUTOFSPACE) { 10477 mutex_enter(&rp->r_statelock); 10478 rp->r_flags &= ~R4OUTOFSPACE; 10479 mutex_exit(&rp->r_statelock); 10480 } 10481 pvn_write_done(pp, flags); 10482 if (freemem < desfree) 10483 (void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr, 10484 NFS4_WRITE_NOWAIT); 10485 } 10486 10487 return (error); 10488 } 10489 10490 #ifdef DEBUG 10491 int nfs4_force_open_before_mmap = 0; 10492 #endif 10493 10494 /* ARGSUSED */ 10495 static int 10496 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 10497 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 10498 caller_context_t *ct) 10499 { 10500 struct segvn_crargs vn_a; 10501 int error = 0; 10502 rnode4_t *rp = VTOR4(vp); 10503 mntinfo4_t *mi = VTOMI4(vp); 10504 10505 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10506 return (EIO); 10507 10508 if (vp->v_flag & VNOMAP) 10509 return (ENOSYS); 10510 10511 if (off < 0 || (off + len) < 0) 10512 return (ENXIO); 10513 10514 if (vp->v_type != VREG) 10515 return (ENODEV); 10516 10517 /* 10518 * If the file is delegated to the client don't do anything. 10519 * If the file is not delegated, then validate the data cache. 10520 */ 10521 mutex_enter(&rp->r_statev4_lock); 10522 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) { 10523 mutex_exit(&rp->r_statev4_lock); 10524 error = nfs4_validate_caches(vp, cr); 10525 if (error) 10526 return (error); 10527 } else { 10528 mutex_exit(&rp->r_statev4_lock); 10529 } 10530 10531 /* 10532 * Check to see if the vnode is currently marked as not cachable. 10533 * This means portions of the file are locked (through VOP_FRLOCK). 10534 * In this case the map request must be refused. We use 10535 * rp->r_lkserlock to avoid a race with concurrent lock requests. 10536 * 10537 * Atomically increment r_inmap after acquiring r_rwlock. The 10538 * idea here is to acquire r_rwlock to block read/write and 10539 * not to protect r_inmap. r_inmap will inform nfs4_read/write() 10540 * that we are in nfs4_map(). Now, r_rwlock is acquired in order 10541 * and we can prevent the deadlock that would have occurred 10542 * when nfs4_addmap() would have acquired it out of order. 10543 * 10544 * Since we are not protecting r_inmap by any lock, we do not 10545 * hold any lock when we decrement it. We atomically decrement 10546 * r_inmap after we release r_lkserlock. 10547 */ 10548 10549 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR4(vp))) 10550 return (EINTR); 10551 atomic_inc_uint(&rp->r_inmap); 10552 nfs_rw_exit(&rp->r_rwlock); 10553 10554 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) { 10555 atomic_dec_uint(&rp->r_inmap); 10556 return (EINTR); 10557 } 10558 10559 10560 if (vp->v_flag & VNOCACHE) { 10561 error = EAGAIN; 10562 goto done; 10563 } 10564 10565 /* 10566 * Don't allow concurrent locks and mapping if mandatory locking is 10567 * enabled. 10568 */ 10569 if (flk_has_remote_locks(vp)) { 10570 struct vattr va; 10571 va.va_mask = AT_MODE; 10572 error = nfs4getattr(vp, &va, cr); 10573 if (error != 0) 10574 goto done; 10575 if (MANDLOCK(vp, va.va_mode)) { 10576 error = EAGAIN; 10577 goto done; 10578 } 10579 } 10580 10581 /* 10582 * It is possible that the rnode has a lost lock request that we 10583 * are still trying to recover, and that the request conflicts with 10584 * this map request. 10585 * 10586 * An alternative approach would be for nfs4_safemap() to consider 10587 * queued lock requests when deciding whether to set or clear 10588 * VNOCACHE. This would require the frlock code path to call 10589 * nfs4_safemap() after enqueing a lost request. 10590 */ 10591 if (nfs4_map_lost_lock_conflict(vp)) { 10592 error = EAGAIN; 10593 goto done; 10594 } 10595 10596 as_rangelock(as); 10597 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 10598 if (error != 0) { 10599 as_rangeunlock(as); 10600 goto done; 10601 } 10602 10603 if (vp->v_type == VREG) { 10604 /* 10605 * We need to retrieve the open stream 10606 */ 10607 nfs4_open_stream_t *osp = NULL; 10608 nfs4_open_owner_t *oop = NULL; 10609 10610 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10611 if (oop != NULL) { 10612 /* returns with 'os_sync_lock' held */ 10613 osp = find_open_stream(oop, rp); 10614 open_owner_rele(oop); 10615 } 10616 if (osp == NULL) { 10617 #ifdef DEBUG 10618 if (nfs4_force_open_before_mmap) { 10619 error = EIO; 10620 goto done; 10621 } 10622 #endif 10623 /* returns with 'os_sync_lock' held */ 10624 error = open_and_get_osp(vp, cr, &osp); 10625 if (osp == NULL) { 10626 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10627 "nfs4_map: we tried to OPEN the file " 10628 "but again no osp, so fail with EIO")); 10629 goto done; 10630 } 10631 } 10632 10633 if (osp->os_failed_reopen) { 10634 mutex_exit(&osp->os_sync_lock); 10635 open_stream_rele(osp, rp); 10636 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 10637 "nfs4_map: os_failed_reopen set on " 10638 "osp %p, cr %p, rp %s", (void *)osp, 10639 (void *)cr, rnode4info(rp))); 10640 error = EIO; 10641 goto done; 10642 } 10643 mutex_exit(&osp->os_sync_lock); 10644 open_stream_rele(osp, rp); 10645 } 10646 10647 vn_a.vp = vp; 10648 vn_a.offset = off; 10649 vn_a.type = (flags & MAP_TYPE); 10650 vn_a.prot = (uchar_t)prot; 10651 vn_a.maxprot = (uchar_t)maxprot; 10652 vn_a.flags = (flags & ~MAP_TYPE); 10653 vn_a.cred = cr; 10654 vn_a.amp = NULL; 10655 vn_a.szc = 0; 10656 vn_a.lgrp_mem_policy_flags = 0; 10657 10658 error = as_map(as, *addrp, len, segvn_create, &vn_a); 10659 as_rangeunlock(as); 10660 10661 done: 10662 nfs_rw_exit(&rp->r_lkserlock); 10663 atomic_dec_uint(&rp->r_inmap); 10664 return (error); 10665 } 10666 10667 /* 10668 * We're most likely dealing with a kernel module that likes to READ 10669 * and mmap without OPENing the file (ie: lookup/read/mmap), so lets 10670 * officially OPEN the file to create the necessary client state 10671 * for bookkeeping of os_mmap_read/write counts. 10672 * 10673 * Since VOP_MAP only passes in a pointer to the vnode rather than 10674 * a double pointer, we can't handle the case where nfs4open_otw() 10675 * returns a different vnode than the one passed into VOP_MAP (since 10676 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case, 10677 * we return NULL and let nfs4_map() fail. Note: the only case where 10678 * this should happen is if the file got removed and replaced with the 10679 * same name on the server (in addition to the fact that we're trying 10680 * to VOP_MAP withouth VOP_OPENing the file in the first place). 10681 */ 10682 static int 10683 open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp) 10684 { 10685 rnode4_t *rp, *drp; 10686 vnode_t *dvp, *open_vp; 10687 char file_name[MAXNAMELEN]; 10688 int just_created; 10689 nfs4_open_stream_t *osp; 10690 nfs4_open_owner_t *oop; 10691 int error; 10692 10693 *ospp = NULL; 10694 open_vp = map_vp; 10695 10696 rp = VTOR4(open_vp); 10697 if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0) 10698 return (error); 10699 drp = VTOR4(dvp); 10700 10701 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) { 10702 VN_RELE(dvp); 10703 return (EINTR); 10704 } 10705 10706 if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) { 10707 nfs_rw_exit(&drp->r_rwlock); 10708 VN_RELE(dvp); 10709 return (error); 10710 } 10711 10712 mutex_enter(&rp->r_statev4_lock); 10713 if (rp->created_v4) { 10714 rp->created_v4 = 0; 10715 mutex_exit(&rp->r_statev4_lock); 10716 10717 dnlc_update(dvp, file_name, open_vp); 10718 /* This is needed so we don't bump the open ref count */ 10719 just_created = 1; 10720 } else { 10721 mutex_exit(&rp->r_statev4_lock); 10722 just_created = 0; 10723 } 10724 10725 VN_HOLD(map_vp); 10726 10727 error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0, 10728 just_created); 10729 if (error) { 10730 nfs_rw_exit(&drp->r_rwlock); 10731 VN_RELE(dvp); 10732 VN_RELE(map_vp); 10733 return (error); 10734 } 10735 10736 nfs_rw_exit(&drp->r_rwlock); 10737 VN_RELE(dvp); 10738 10739 /* 10740 * If nfs4open_otw() returned a different vnode then "undo" 10741 * the open and return failure to the caller. 10742 */ 10743 if (!VN_CMP(open_vp, map_vp)) { 10744 nfs4_error_t e; 10745 10746 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10747 "open returned a different vnode")); 10748 /* 10749 * If there's an error, ignore it, 10750 * and let VOP_INACTIVE handle it. 10751 */ 10752 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10753 CLOSE_NORM, 0, 0, 0); 10754 VN_RELE(map_vp); 10755 return (EIO); 10756 } 10757 10758 VN_RELE(map_vp); 10759 10760 oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp)); 10761 if (!oop) { 10762 nfs4_error_t e; 10763 10764 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10765 "no open owner")); 10766 /* 10767 * If there's an error, ignore it, 10768 * and let VOP_INACTIVE handle it. 10769 */ 10770 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10771 CLOSE_NORM, 0, 0, 0); 10772 return (EIO); 10773 } 10774 osp = find_open_stream(oop, rp); 10775 open_owner_rele(oop); 10776 *ospp = osp; 10777 return (0); 10778 } 10779 10780 /* 10781 * Please be aware that when this function is called, the address space write 10782 * a_lock is held. Do not put over the wire calls in this function. 10783 */ 10784 /* ARGSUSED */ 10785 static int 10786 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10787 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 10788 caller_context_t *ct) 10789 { 10790 rnode4_t *rp; 10791 int error = 0; 10792 mntinfo4_t *mi; 10793 10794 mi = VTOMI4(vp); 10795 rp = VTOR4(vp); 10796 10797 if (nfs_zone() != mi->mi_zone) 10798 return (EIO); 10799 if (vp->v_flag & VNOMAP) 10800 return (ENOSYS); 10801 10802 /* 10803 * Don't need to update the open stream first, since this 10804 * mmap can't add any additional share access that isn't 10805 * already contained in the open stream (for the case where we 10806 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't 10807 * take into account os_mmap_read[write] counts). 10808 */ 10809 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len)); 10810 10811 if (vp->v_type == VREG) { 10812 /* 10813 * We need to retrieve the open stream and update the counts. 10814 * If there is no open stream here, something is wrong. 10815 */ 10816 nfs4_open_stream_t *osp = NULL; 10817 nfs4_open_owner_t *oop = NULL; 10818 10819 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10820 if (oop != NULL) { 10821 /* returns with 'os_sync_lock' held */ 10822 osp = find_open_stream(oop, rp); 10823 open_owner_rele(oop); 10824 } 10825 if (osp == NULL) { 10826 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10827 "nfs4_addmap: we should have an osp" 10828 "but we don't, so fail with EIO")); 10829 error = EIO; 10830 goto out; 10831 } 10832 10833 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p," 10834 " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot)); 10835 10836 /* 10837 * Update the map count in the open stream. 10838 * This is necessary in the case where we 10839 * open/mmap/close/, then the server reboots, and we 10840 * attempt to reopen. If the mmap doesn't add share 10841 * access then we send an invalid reopen with 10842 * access = NONE. 10843 * 10844 * We need to specifically check each PROT_* so a mmap 10845 * call of (PROT_WRITE | PROT_EXEC) will ensure us both 10846 * read and write access. A simple comparison of prot 10847 * to ~PROT_WRITE to determine read access is insufficient 10848 * since prot can be |= with PROT_USER, etc. 10849 */ 10850 10851 /* 10852 * Unless we're MAP_SHARED, no sense in adding os_mmap_write 10853 */ 10854 if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 10855 osp->os_mmap_write += btopr(len); 10856 if (maxprot & PROT_READ) 10857 osp->os_mmap_read += btopr(len); 10858 if (maxprot & PROT_EXEC) 10859 osp->os_mmap_read += btopr(len); 10860 /* 10861 * Ensure that os_mmap_read gets incremented, even if 10862 * maxprot were to look like PROT_NONE. 10863 */ 10864 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 10865 !(maxprot & PROT_EXEC)) 10866 osp->os_mmap_read += btopr(len); 10867 osp->os_mapcnt += btopr(len); 10868 mutex_exit(&osp->os_sync_lock); 10869 open_stream_rele(osp, rp); 10870 } 10871 10872 out: 10873 /* 10874 * If we got an error, then undo our 10875 * incrementing of 'r_mapcnt'. 10876 */ 10877 10878 if (error) { 10879 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len)); 10880 ASSERT(rp->r_mapcnt >= 0); 10881 } 10882 return (error); 10883 } 10884 10885 /* ARGSUSED */ 10886 static int 10887 nfs4_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) 10888 { 10889 10890 return (VTOR4(vp1) == VTOR4(vp2)); 10891 } 10892 10893 /* 10894 * Data structure for nfs4_lkserlock_callback() function. 10895 */ 10896 struct nfs4_lkserlock_callback_data { 10897 vnode_t *vp; 10898 int rc; 10899 }; 10900 10901 /* 10902 * Callback function for reclock(). 10903 */ 10904 static callb_cpr_t * 10905 nfs4_lkserlock_callback(flk_cb_when_t when, void *infop) 10906 { 10907 struct nfs4_lkserlock_callback_data *dp = 10908 (struct nfs4_lkserlock_callback_data *)infop; 10909 rnode4_t *rp = VTOR4(dp->vp); 10910 10911 if (when == FLK_BEFORE_SLEEP) 10912 nfs_rw_exit(&rp->r_lkserlock); 10913 else 10914 dp->rc = nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, 10915 INTR4(dp->vp)); 10916 10917 return (NULL); 10918 } 10919 10920 /* ARGSUSED */ 10921 static int 10922 nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10923 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr, 10924 caller_context_t *ct) 10925 { 10926 int rc = 0; 10927 rnode4_t *rp; 10928 int intr = INTR4(vp); 10929 nfs4_error_t e; 10930 int frcmd; 10931 struct lm_sysid *ls = NULL; 10932 10933 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10934 return (EIO); 10935 10936 /* check for valid cmd parameter and set frcmd appropriately */ 10937 switch (cmd) { 10938 case F_GETLK: 10939 frcmd = 0; 10940 break; 10941 case F_SETLK: 10942 frcmd = SETFLCK; 10943 break; 10944 case F_SETLKW: 10945 frcmd = SETFLCK | SLPFLCK; 10946 break; 10947 default: 10948 return (EINVAL); 10949 } 10950 10951 /* 10952 * If lock is relative to EOF, we need the newest length of the file. 10953 * Therefore invalidate the ATTR_CACHE. 10954 */ 10955 if (bfp->l_whence == 2) /* SEEK_END */ 10956 PURGE_ATTRCACHE4(vp); 10957 10958 /* 10959 * If the filesystem is mounted using local locking, pass the 10960 * request off to the local locking code. 10961 */ 10962 if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) { 10963 if (cmd == F_SETLK || cmd == F_SETLKW) { 10964 /* 10965 * For complete safety, we should be holding 10966 * r_lkserlock. However, we can't call 10967 * nfs4_safelock and then fs_frlock while 10968 * holding r_lkserlock, so just invoke 10969 * nfs4_safelock and expect that this will 10970 * catch enough of the cases. 10971 */ 10972 if (!nfs4_safelock(vp, bfp, cr)) 10973 return (EAGAIN); 10974 } 10975 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 10976 } 10977 10978 /* 10979 * Convert the offset. We need to do this to make sure our view of the 10980 * locking range is always the same through the rest of this function. 10981 * This is especially needed for bfp->l_whence == SEEK_END, because the 10982 * length of the file could change anytime and thus the locking range 10983 * would be a moving target for us. 10984 * 10985 * For the bfp->l_whence == SEEK_CUR case this is just a convenient 10986 * conversion to make the life easier for nfs4frlock(). 10987 */ 10988 rc = convoff(vp, bfp, 0, offset); 10989 if (rc != 0) 10990 return (rc); 10991 10992 if (bfp->l_type == F_UNLCK) { 10993 u_offset_t start, end; 10994 10995 /* 10996 * Shortcut for trivial case. 10997 */ 10998 if (cmd == F_GETLK) 10999 return (rc); 11000 11001 /* 11002 * For every lock or unlock request we need to do two steps: 11003 * (un)register the local lock, and (un)register the lock at 11004 * the NFSv4 server. It is essential to make sure the lock 11005 * status registered at the server and registered locally is 11006 * same and never goes out of sync. This means that if one 11007 * step fails, the other one needs to be either skipped, or 11008 * reverted. 11009 * 11010 * For lock requests the situation is easy since a lock 11011 * registration can be reverted without any risk of data 11012 * corruption. 11013 * 11014 * The unlock requests cannot be reverted because once a lock 11015 * is unregistered the race window is open and some other 11016 * process could grab a conflicting lock. This means that once 11017 * the first step (the first lock unregistration) succeeded, 11018 * the second step cannot fail. The second step for the unlock 11019 * request is the local lock unregistration by the reclock() 11020 * call. 11021 * 11022 * The only way how the reclock() call for an unlock request 11023 * could fail is the invalid unlock range so we check it here, 11024 * before the lock is unregistered at NFSv4 server. This 11025 * duplicates the check done in the reclock() function. 11026 */ 11027 rc = flk_convert_lock_data(vp, bfp, &start, &end, offset); 11028 if (rc != 0) 11029 return (rc); 11030 rc = flk_check_lock_data(start, end, MAXEND); 11031 if (rc != 0) 11032 return (rc); 11033 11034 intr = 0; 11035 } 11036 11037 /* 11038 * For F_SETLK and F_SETLKW we need to set sysid. 11039 */ 11040 if (cmd == F_SETLK || cmd == F_SETLKW) { 11041 rc = nfs4frlock_get_sysid(&ls, vp, bfp); 11042 if (rc != 0) 11043 return (rc); 11044 11045 /* 11046 * Client locks are registerred locally by oring the sysid with 11047 * LM_SYSID_CLIENT. The server registers locks locally using 11048 * just the sysid. We need to distinguish between the two to 11049 * avoid collision in a case one machine is used as both client 11050 * and server. 11051 */ 11052 bfp->l_sysid |= LM_SYSID_CLIENT; 11053 } 11054 11055 bfp->l_pid = curproc->p_pid; 11056 11057 rp = VTOR4(vp); 11058 11059 /* 11060 * Check whether the given lock request can proceed, given the 11061 * current file mappings. 11062 */ 11063 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr)) { 11064 if (ls != NULL) 11065 lm_rel_sysid(ls); 11066 return (EINTR); 11067 } 11068 if (cmd == F_SETLK || cmd == F_SETLKW) { 11069 if (!nfs4_safelock(vp, bfp, cr)) { 11070 rc = EAGAIN; 11071 goto done; 11072 } 11073 } 11074 11075 /* 11076 * For query we will try to find a conflicting local lock first by 11077 * calling reclock(). 11078 * 11079 * In a case this is a lock request we need to register it locally 11080 * first before we consult the NFSv4 server. 11081 */ 11082 if (cmd == F_GETLK || bfp->l_type != F_UNLCK) { 11083 /* 11084 * Save l_type. This is needed in a case the query (F_GETLK) 11085 * founds no local lock and we need to consult the server to 11086 * find possible conflicting lock. 11087 */ 11088 short saved_l_type = bfp->l_type; 11089 11090 /* 11091 * If we might sleep in reclock() we need to register a 11092 * callback to release the r_lkserlock during the sleep. 11093 */ 11094 if ((frcmd & SLPFLCK) == 0) { 11095 rc = reclock(vp, bfp, frcmd, flag, 0, flk_cbp); 11096 } else { 11097 flk_callback_t callback; 11098 struct nfs4_lkserlock_callback_data callback_data = 11099 {vp, 0}; 11100 11101 flk_add_callback(&callback, nfs4_lkserlock_callback, 11102 &callback_data, flk_cbp); 11103 rc = reclock(vp, bfp, frcmd, flag, 0, &callback); 11104 flk_del_callback(&callback); 11105 11106 if (callback_data.rc != 0) { 11107 /* 11108 * The nfs_rw_enter_sig() call in 11109 * nfs4_lkserlock_callback() failed. 11110 */ 11111 11112 if (rc == 0) { 11113 /* 11114 * The reclock() call above succeeded 11115 * so we need to revert it. 11116 */ 11117 bfp->l_type = F_UNLCK; 11118 rc = reclock(vp, bfp, frcmd, flag, 0, 11119 flk_cbp); 11120 /* The unlock cannot fail */ 11121 ASSERT(rc == 0); 11122 11123 /* 11124 * We are here because we failed to 11125 * acquire r_lkserlock in 11126 * nfs4_lkserlock_callback() due to a 11127 * signal. Return the appropriate 11128 * error. 11129 */ 11130 rc = EINTR; 11131 } 11132 11133 ASSERT(ls != NULL); 11134 lm_rel_sysid(ls); 11135 11136 return (rc); 11137 } 11138 11139 /* 11140 * We possibly released r_lkserlock in reclock() so 11141 * make sure it is still safe to lock the file. 11142 */ 11143 if (!nfs4_safelock(vp, bfp, cr)) { 11144 rc = EAGAIN; 11145 goto revert; 11146 } 11147 11148 } 11149 11150 /* 11151 * If the reclock() call failed we are done and we will return 11152 * an error to the caller. Similarly, if we found a 11153 * conflicting lock registered locally we are done too. We do 11154 * not need to consult the server. 11155 */ 11156 if ((rc != 0) || (cmd == F_GETLK && bfp->l_type != F_UNLCK)) 11157 goto done; 11158 11159 /* 11160 * If this is a query (F_GETLK) and we didn't found any 11161 * conflicting local lock (otherwise we would just jump out 11162 * above) the original l_type got replaced by F_UNLCK. Restore 11163 * its value so we will ask the server with original l_type. 11164 */ 11165 if (cmd == F_GETLK) 11166 bfp->l_type = saved_l_type; 11167 } 11168 11169 /* 11170 * Flush the cache after waiting for async I/O to finish. For new 11171 * locks, this is so that the process gets the latest bits from the 11172 * server. For unlocks, this is so that other clients see the 11173 * latest bits once the file has been unlocked. If currently dirty 11174 * pages can't be flushed, then don't allow a lock to be set. But 11175 * allow unlocks to succeed, to avoid having orphan locks on the 11176 * server. 11177 */ 11178 if (cmd != F_GETLK) { 11179 mutex_enter(&rp->r_statelock); 11180 while (rp->r_count > 0) { 11181 if (intr) { 11182 klwp_t *lwp = ttolwp(curthread); 11183 11184 if (lwp != NULL) 11185 lwp->lwp_nostop++; 11186 if (cv_wait_sig(&rp->r_cv, 11187 &rp->r_statelock) == 0) { 11188 if (lwp != NULL) 11189 lwp->lwp_nostop--; 11190 rc = EINTR; 11191 break; 11192 } 11193 if (lwp != NULL) 11194 lwp->lwp_nostop--; 11195 } else 11196 cv_wait(&rp->r_cv, &rp->r_statelock); 11197 } 11198 mutex_exit(&rp->r_statelock); 11199 if (rc != 0) { 11200 ASSERT(bfp->l_type != F_UNLCK); 11201 11202 goto revert; 11203 } 11204 11205 rc = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct); 11206 if (rc != 0) { 11207 if (rc == ENOSPC || rc == EDQUOT) { 11208 mutex_enter(&rp->r_statelock); 11209 if (!rp->r_error) 11210 rp->r_error = rc; 11211 mutex_exit(&rp->r_statelock); 11212 } 11213 11214 /* 11215 * If this was a lock request, make sure it is 11216 * reverted. 11217 */ 11218 if (bfp->l_type != F_UNLCK) { 11219 rc = ENOLCK; 11220 goto revert; 11221 } 11222 } 11223 } 11224 11225 /* 11226 * Call the lock manager to do the real work of contacting 11227 * the server and obtaining the lock. 11228 */ 11229 nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, cr, &e, NULL, NULL); 11230 rc = e.error; 11231 11232 if (rc == 0) 11233 nfs4_lockcompletion(vp, cmd); 11234 11235 revert: 11236 /* 11237 * If this is either successful unlock request or a lock request that 11238 * failed we should unregister/revert the local lock now. 11239 */ 11240 if ((rc == 0 && cmd != F_GETLK && bfp->l_type == F_UNLCK) || 11241 (rc != 0 && cmd != F_GETLK && bfp->l_type != F_UNLCK)) { 11242 int r; 11243 11244 bfp->l_type = F_UNLCK; 11245 r = reclock(vp, bfp, frcmd, flag, 0, flk_cbp); 11246 /* The unlock cannot fail */ 11247 ASSERT(r == 0); 11248 } 11249 11250 done: 11251 nfs_rw_exit(&rp->r_lkserlock); 11252 if (ls != NULL) 11253 lm_rel_sysid(ls); 11254 11255 return (rc); 11256 } 11257 11258 /* 11259 * Free storage space associated with the specified vnode. The portion 11260 * to be freed is specified by bfp->l_start and bfp->l_len (already 11261 * normalized to a "whence" of 0). 11262 * 11263 * This is an experimental facility whose continued existence is not 11264 * guaranteed. Currently, we only support the special case 11265 * of l_len == 0, meaning free to end of file. 11266 */ 11267 /* ARGSUSED */ 11268 static int 11269 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 11270 offset_t offset, cred_t *cr, caller_context_t *ct) 11271 { 11272 int error; 11273 11274 if (nfs_zone() != VTOMI4(vp)->mi_zone) 11275 return (EIO); 11276 ASSERT(vp->v_type == VREG); 11277 if (cmd != F_FREESP) 11278 return (EINVAL); 11279 11280 error = convoff(vp, bfp, 0, offset); 11281 if (!error) { 11282 ASSERT(bfp->l_start >= 0); 11283 if (bfp->l_len == 0) { 11284 struct vattr va; 11285 11286 va.va_mask = AT_SIZE; 11287 va.va_size = bfp->l_start; 11288 error = nfs4setattr(vp, &va, 0, cr, NULL); 11289 11290 if (error == 0 && bfp->l_start == 0) 11291 vnevent_truncate(vp, ct); 11292 } else 11293 error = EINVAL; 11294 } 11295 11296 return (error); 11297 } 11298 11299 /* ARGSUSED */ 11300 int 11301 nfs4_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) 11302 { 11303 rnode4_t *rp; 11304 rp = VTOR4(vp); 11305 11306 if (vp->v_type == VREG && IS_SHADOW(vp, rp)) { 11307 vp = RTOV4(rp); 11308 } 11309 *vpp = vp; 11310 return (0); 11311 } 11312 11313 /* 11314 * Setup and add an address space callback to do the work of the delmap call. 11315 * The callback will (and must be) deleted in the actual callback function. 11316 * 11317 * This is done in order to take care of the problem that we have with holding 11318 * the address space's a_lock for a long period of time (e.g. if the NFS server 11319 * is down). Callbacks will be executed in the address space code while the 11320 * a_lock is not held. Holding the address space's a_lock causes things such 11321 * as ps and fork to hang because they are trying to acquire this lock as well. 11322 */ 11323 /* ARGSUSED */ 11324 static int 11325 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 11326 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 11327 caller_context_t *ct) 11328 { 11329 int caller_found; 11330 int error; 11331 rnode4_t *rp; 11332 nfs4_delmap_args_t *dmapp; 11333 nfs4_delmapcall_t *delmap_call; 11334 11335 if (vp->v_flag & VNOMAP) 11336 return (ENOSYS); 11337 11338 /* 11339 * A process may not change zones if it has NFS pages mmap'ed 11340 * in, so we can't legitimately get here from the wrong zone. 11341 */ 11342 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11343 11344 rp = VTOR4(vp); 11345 11346 /* 11347 * The way that the address space of this process deletes its mapping 11348 * of this file is via the following call chains: 11349 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 11350 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 11351 * 11352 * With the use of address space callbacks we are allowed to drop the 11353 * address space lock, a_lock, while executing the NFS operations that 11354 * need to go over the wire. Returning EAGAIN to the caller of this 11355 * function is what drives the execution of the callback that we add 11356 * below. The callback will be executed by the address space code 11357 * after dropping the a_lock. When the callback is finished, since 11358 * we dropped the a_lock, it must be re-acquired and segvn_unmap() 11359 * is called again on the same segment to finish the rest of the work 11360 * that needs to happen during unmapping. 11361 * 11362 * This action of calling back into the segment driver causes 11363 * nfs4_delmap() to get called again, but since the callback was 11364 * already executed at this point, it already did the work and there 11365 * is nothing left for us to do. 11366 * 11367 * To Summarize: 11368 * - The first time nfs4_delmap is called by the current thread is when 11369 * we add the caller associated with this delmap to the delmap caller 11370 * list, add the callback, and return EAGAIN. 11371 * - The second time in this call chain when nfs4_delmap is called we 11372 * will find this caller in the delmap caller list and realize there 11373 * is no more work to do thus removing this caller from the list and 11374 * returning the error that was set in the callback execution. 11375 */ 11376 caller_found = nfs4_find_and_delete_delmapcall(rp, &error); 11377 if (caller_found) { 11378 /* 11379 * 'error' is from the actual delmap operations. To avoid 11380 * hangs, we need to handle the return of EAGAIN differently 11381 * since this is what drives the callback execution. 11382 * In this case, we don't want to return EAGAIN and do the 11383 * callback execution because there are none to execute. 11384 */ 11385 if (error == EAGAIN) 11386 return (0); 11387 else 11388 return (error); 11389 } 11390 11391 /* current caller was not in the list */ 11392 delmap_call = nfs4_init_delmapcall(); 11393 11394 mutex_enter(&rp->r_statelock); 11395 list_insert_tail(&rp->r_indelmap, delmap_call); 11396 mutex_exit(&rp->r_statelock); 11397 11398 dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP); 11399 11400 dmapp->vp = vp; 11401 dmapp->off = off; 11402 dmapp->addr = addr; 11403 dmapp->len = len; 11404 dmapp->prot = prot; 11405 dmapp->maxprot = maxprot; 11406 dmapp->flags = flags; 11407 dmapp->cr = cr; 11408 dmapp->caller = delmap_call; 11409 11410 error = as_add_callback(as, nfs4_delmap_callback, dmapp, 11411 AS_UNMAP_EVENT, addr, len, KM_SLEEP); 11412 11413 return (error ? error : EAGAIN); 11414 } 11415 11416 static nfs4_delmapcall_t * 11417 nfs4_init_delmapcall() 11418 { 11419 nfs4_delmapcall_t *delmap_call; 11420 11421 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP); 11422 delmap_call->call_id = curthread; 11423 delmap_call->error = 0; 11424 11425 return (delmap_call); 11426 } 11427 11428 static void 11429 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call) 11430 { 11431 kmem_free(delmap_call, sizeof (nfs4_delmapcall_t)); 11432 } 11433 11434 /* 11435 * Searches for the current delmap caller (based on curthread) in the list of 11436 * callers. If it is found, we remove it and free the delmap caller. 11437 * Returns: 11438 * 0 if the caller wasn't found 11439 * 1 if the caller was found, removed and freed. *errp will be set 11440 * to what the result of the delmap was. 11441 */ 11442 static int 11443 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp) 11444 { 11445 nfs4_delmapcall_t *delmap_call; 11446 11447 /* 11448 * If the list doesn't exist yet, we create it and return 11449 * that the caller wasn't found. No list = no callers. 11450 */ 11451 mutex_enter(&rp->r_statelock); 11452 if (!(rp->r_flags & R4DELMAPLIST)) { 11453 /* The list does not exist */ 11454 list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t), 11455 offsetof(nfs4_delmapcall_t, call_node)); 11456 rp->r_flags |= R4DELMAPLIST; 11457 mutex_exit(&rp->r_statelock); 11458 return (0); 11459 } else { 11460 /* The list exists so search it */ 11461 for (delmap_call = list_head(&rp->r_indelmap); 11462 delmap_call != NULL; 11463 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 11464 if (delmap_call->call_id == curthread) { 11465 /* current caller is in the list */ 11466 *errp = delmap_call->error; 11467 list_remove(&rp->r_indelmap, delmap_call); 11468 mutex_exit(&rp->r_statelock); 11469 nfs4_free_delmapcall(delmap_call); 11470 return (1); 11471 } 11472 } 11473 } 11474 mutex_exit(&rp->r_statelock); 11475 return (0); 11476 } 11477 11478 /* 11479 * Remove some pages from an mmap'd vnode. Just update the 11480 * count of pages. If doing close-to-open, then flush and 11481 * commit all of the pages associated with this file. 11482 * Otherwise, start an asynchronous page flush to write out 11483 * any dirty pages. This will also associate a credential 11484 * with the rnode which can be used to write the pages. 11485 */ 11486 /* ARGSUSED */ 11487 static void 11488 nfs4_delmap_callback(struct as *as, void *arg, uint_t event) 11489 { 11490 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 11491 rnode4_t *rp; 11492 mntinfo4_t *mi; 11493 nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg; 11494 11495 rp = VTOR4(dmapp->vp); 11496 mi = VTOMI4(dmapp->vp); 11497 11498 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len)); 11499 ASSERT(rp->r_mapcnt >= 0); 11500 11501 /* 11502 * Initiate a page flush and potential commit if there are 11503 * pages, the file system was not mounted readonly, the segment 11504 * was mapped shared, and the pages themselves were writeable. 11505 */ 11506 if (nfs4_has_pages(dmapp->vp) && 11507 !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) && 11508 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) { 11509 mutex_enter(&rp->r_statelock); 11510 rp->r_flags |= R4DIRTY; 11511 mutex_exit(&rp->r_statelock); 11512 e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off, 11513 dmapp->len, dmapp->cr); 11514 if (!e.error) { 11515 mutex_enter(&rp->r_statelock); 11516 e.error = rp->r_error; 11517 rp->r_error = 0; 11518 mutex_exit(&rp->r_statelock); 11519 } 11520 } else 11521 e.error = 0; 11522 11523 if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) 11524 (void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len, 11525 B_INVAL, dmapp->cr, NULL); 11526 11527 if (e.error) { 11528 e.stat = puterrno4(e.error); 11529 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 11530 OP_COMMIT, FALSE, NULL, 0, dmapp->vp); 11531 dmapp->caller->error = e.error; 11532 } 11533 11534 /* Check to see if we need to close the file */ 11535 11536 if (dmapp->vp->v_type == VREG) { 11537 nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e, 11538 CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags); 11539 11540 if (e.error != 0 || e.stat != NFS4_OK) { 11541 /* 11542 * Since it is possible that e.error == 0 and 11543 * e.stat != NFS4_OK (and vice versa), 11544 * we do the proper checking in order to get both 11545 * e.error and e.stat reporting the correct info. 11546 */ 11547 if (e.stat == NFS4_OK) 11548 e.stat = puterrno4(e.error); 11549 if (e.error == 0) 11550 e.error = geterrno4(e.stat); 11551 11552 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 11553 OP_CLOSE, FALSE, NULL, 0, dmapp->vp); 11554 dmapp->caller->error = e.error; 11555 } 11556 } 11557 11558 (void) as_delete_callback(as, arg); 11559 kmem_free(dmapp, sizeof (nfs4_delmap_args_t)); 11560 } 11561 11562 11563 static uint_t 11564 fattr4_maxfilesize_to_bits(uint64_t ll) 11565 { 11566 uint_t l = 1; 11567 11568 if (ll == 0) { 11569 return (0); 11570 } 11571 11572 if (ll & 0xffffffff00000000) { 11573 l += 32; ll >>= 32; 11574 } 11575 if (ll & 0xffff0000) { 11576 l += 16; ll >>= 16; 11577 } 11578 if (ll & 0xff00) { 11579 l += 8; ll >>= 8; 11580 } 11581 if (ll & 0xf0) { 11582 l += 4; ll >>= 4; 11583 } 11584 if (ll & 0xc) { 11585 l += 2; ll >>= 2; 11586 } 11587 if (ll & 0x2) { 11588 l += 1; 11589 } 11590 return (l); 11591 } 11592 11593 static int 11594 nfs4_have_xattrs(vnode_t *vp, ulong_t *valp, cred_t *cr) 11595 { 11596 vnode_t *avp = NULL; 11597 int error; 11598 11599 if ((error = nfs4lookup_xattr(vp, "", &avp, 11600 LOOKUP_XATTR, cr)) == 0) 11601 error = do_xattr_exists_check(avp, valp, cr); 11602 if (avp) 11603 VN_RELE(avp); 11604 11605 return (error); 11606 } 11607 11608 /* ARGSUSED */ 11609 int 11610 nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, 11611 caller_context_t *ct) 11612 { 11613 int error; 11614 hrtime_t t; 11615 rnode4_t *rp; 11616 nfs4_ga_res_t gar; 11617 nfs4_ga_ext_res_t ger; 11618 11619 gar.n4g_ext_res = &ger; 11620 11621 if (nfs_zone() != VTOMI4(vp)->mi_zone) 11622 return (EIO); 11623 if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) { 11624 *valp = MAXPATHLEN; 11625 return (0); 11626 } 11627 if (cmd == _PC_ACL_ENABLED) { 11628 *valp = _ACL_ACE_ENABLED; 11629 return (0); 11630 } 11631 11632 rp = VTOR4(vp); 11633 if (cmd == _PC_XATTR_EXISTS) { 11634 /* 11635 * The existence of the xattr directory is not sufficient 11636 * for determining whether generic user attributes exists. 11637 * The attribute directory could only be a transient directory 11638 * used for Solaris sysattr support. Do a small readdir 11639 * to verify if the only entries are sysattrs or not. 11640 * 11641 * pc4_xattr_valid can be only be trusted when r_xattr_dir 11642 * is NULL. Once the xadir vp exists, we can create xattrs, 11643 * and we don't have any way to update the "base" object's 11644 * pc4_xattr_exists from the xattr or xadir. Maybe FEM 11645 * could help out. 11646 */ 11647 if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid && 11648 rp->r_xattr_dir == NULL) { 11649 return (nfs4_have_xattrs(vp, valp, cr)); 11650 } 11651 } else { /* OLD CODE */ 11652 if (ATTRCACHE4_VALID(vp)) { 11653 mutex_enter(&rp->r_statelock); 11654 if (rp->r_pathconf.pc4_cache_valid) { 11655 error = 0; 11656 switch (cmd) { 11657 case _PC_FILESIZEBITS: 11658 *valp = 11659 rp->r_pathconf.pc4_filesizebits; 11660 break; 11661 case _PC_LINK_MAX: 11662 *valp = 11663 rp->r_pathconf.pc4_link_max; 11664 break; 11665 case _PC_NAME_MAX: 11666 *valp = 11667 rp->r_pathconf.pc4_name_max; 11668 break; 11669 case _PC_CHOWN_RESTRICTED: 11670 *valp = 11671 rp->r_pathconf.pc4_chown_restricted; 11672 break; 11673 case _PC_NO_TRUNC: 11674 *valp = 11675 rp->r_pathconf.pc4_no_trunc; 11676 break; 11677 default: 11678 error = EINVAL; 11679 break; 11680 } 11681 mutex_exit(&rp->r_statelock); 11682 #ifdef DEBUG 11683 nfs4_pathconf_cache_hits++; 11684 #endif 11685 return (error); 11686 } 11687 mutex_exit(&rp->r_statelock); 11688 } 11689 } 11690 #ifdef DEBUG 11691 nfs4_pathconf_cache_misses++; 11692 #endif 11693 11694 t = gethrtime(); 11695 11696 error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr); 11697 11698 if (error) { 11699 mutex_enter(&rp->r_statelock); 11700 rp->r_pathconf.pc4_cache_valid = FALSE; 11701 rp->r_pathconf.pc4_xattr_valid = FALSE; 11702 mutex_exit(&rp->r_statelock); 11703 return (error); 11704 } 11705 11706 /* interpret the max filesize */ 11707 gar.n4g_ext_res->n4g_pc4.pc4_filesizebits = 11708 fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize); 11709 11710 /* Store the attributes we just received */ 11711 nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL); 11712 11713 switch (cmd) { 11714 case _PC_FILESIZEBITS: 11715 *valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits; 11716 break; 11717 case _PC_LINK_MAX: 11718 *valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max; 11719 break; 11720 case _PC_NAME_MAX: 11721 *valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max; 11722 break; 11723 case _PC_CHOWN_RESTRICTED: 11724 *valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted; 11725 break; 11726 case _PC_NO_TRUNC: 11727 *valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc; 11728 break; 11729 case _PC_XATTR_EXISTS: 11730 if (gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists) { 11731 if (error = nfs4_have_xattrs(vp, valp, cr)) 11732 return (error); 11733 } 11734 break; 11735 default: 11736 return (EINVAL); 11737 } 11738 11739 return (0); 11740 } 11741 11742 /* 11743 * Called by async thread to do synchronous pageio. Do the i/o, wait 11744 * for it to complete, and cleanup the page list when done. 11745 */ 11746 static int 11747 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11748 int flags, cred_t *cr) 11749 { 11750 int error; 11751 11752 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11753 11754 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11755 if (flags & B_READ) 11756 pvn_read_done(pp, (error ? B_ERROR : 0) | flags); 11757 else 11758 pvn_write_done(pp, (error ? B_ERROR : 0) | flags); 11759 return (error); 11760 } 11761 11762 /* ARGSUSED */ 11763 static int 11764 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11765 int flags, cred_t *cr, caller_context_t *ct) 11766 { 11767 int error; 11768 rnode4_t *rp; 11769 11770 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 11771 return (EIO); 11772 11773 if (pp == NULL) 11774 return (EINVAL); 11775 11776 rp = VTOR4(vp); 11777 mutex_enter(&rp->r_statelock); 11778 rp->r_count++; 11779 mutex_exit(&rp->r_statelock); 11780 11781 if (flags & B_ASYNC) { 11782 error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr, 11783 nfs4_sync_pageio); 11784 } else 11785 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11786 mutex_enter(&rp->r_statelock); 11787 rp->r_count--; 11788 cv_broadcast(&rp->r_cv); 11789 mutex_exit(&rp->r_statelock); 11790 return (error); 11791 } 11792 11793 /* ARGSUSED */ 11794 static void 11795 nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr, 11796 caller_context_t *ct) 11797 { 11798 int error; 11799 rnode4_t *rp; 11800 page_t *plist; 11801 page_t *pptr; 11802 offset3 offset; 11803 count3 len; 11804 k_sigset_t smask; 11805 11806 /* 11807 * We should get called with fl equal to either B_FREE or 11808 * B_INVAL. Any other value is illegal. 11809 * 11810 * The page that we are either supposed to free or destroy 11811 * should be exclusive locked and its io lock should not 11812 * be held. 11813 */ 11814 ASSERT(fl == B_FREE || fl == B_INVAL); 11815 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 11816 11817 rp = VTOR4(vp); 11818 11819 /* 11820 * If the page doesn't need to be committed or we shouldn't 11821 * even bother attempting to commit it, then just make sure 11822 * that the p_fsdata byte is clear and then either free or 11823 * destroy the page as appropriate. 11824 */ 11825 if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) { 11826 pp->p_fsdata = C_NOCOMMIT; 11827 if (fl == B_FREE) 11828 page_free(pp, dn); 11829 else 11830 page_destroy(pp, dn); 11831 return; 11832 } 11833 11834 /* 11835 * If there is a page invalidation operation going on, then 11836 * if this is one of the pages being destroyed, then just 11837 * clear the p_fsdata byte and then either free or destroy 11838 * the page as appropriate. 11839 */ 11840 mutex_enter(&rp->r_statelock); 11841 if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) { 11842 mutex_exit(&rp->r_statelock); 11843 pp->p_fsdata = C_NOCOMMIT; 11844 if (fl == B_FREE) 11845 page_free(pp, dn); 11846 else 11847 page_destroy(pp, dn); 11848 return; 11849 } 11850 11851 /* 11852 * If we are freeing this page and someone else is already 11853 * waiting to do a commit, then just unlock the page and 11854 * return. That other thread will take care of commiting 11855 * this page. The page can be freed sometime after the 11856 * commit has finished. Otherwise, if the page is marked 11857 * as delay commit, then we may be getting called from 11858 * pvn_write_done, one page at a time. This could result 11859 * in one commit per page, so we end up doing lots of small 11860 * commits instead of fewer larger commits. This is bad, 11861 * we want do as few commits as possible. 11862 */ 11863 if (fl == B_FREE) { 11864 if (rp->r_flags & R4COMMITWAIT) { 11865 page_unlock(pp); 11866 mutex_exit(&rp->r_statelock); 11867 return; 11868 } 11869 if (pp->p_fsdata == C_DELAYCOMMIT) { 11870 pp->p_fsdata = C_COMMIT; 11871 page_unlock(pp); 11872 mutex_exit(&rp->r_statelock); 11873 return; 11874 } 11875 } 11876 11877 /* 11878 * Check to see if there is a signal which would prevent an 11879 * attempt to commit the pages from being successful. If so, 11880 * then don't bother with all of the work to gather pages and 11881 * generate the unsuccessful RPC. Just return from here and 11882 * let the page be committed at some later time. 11883 */ 11884 sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT); 11885 if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) { 11886 sigunintr(&smask); 11887 page_unlock(pp); 11888 mutex_exit(&rp->r_statelock); 11889 return; 11890 } 11891 sigunintr(&smask); 11892 11893 /* 11894 * We are starting to need to commit pages, so let's try 11895 * to commit as many as possible at once to reduce the 11896 * overhead. 11897 * 11898 * Set the `commit inprogress' state bit. We must 11899 * first wait until any current one finishes. Then 11900 * we initialize the c_pages list with this page. 11901 */ 11902 while (rp->r_flags & R4COMMIT) { 11903 rp->r_flags |= R4COMMITWAIT; 11904 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 11905 rp->r_flags &= ~R4COMMITWAIT; 11906 } 11907 rp->r_flags |= R4COMMIT; 11908 mutex_exit(&rp->r_statelock); 11909 ASSERT(rp->r_commit.c_pages == NULL); 11910 rp->r_commit.c_pages = pp; 11911 rp->r_commit.c_commbase = (offset3)pp->p_offset; 11912 rp->r_commit.c_commlen = PAGESIZE; 11913 11914 /* 11915 * Gather together all other pages which can be committed. 11916 * They will all be chained off r_commit.c_pages. 11917 */ 11918 nfs4_get_commit(vp); 11919 11920 /* 11921 * Clear the `commit inprogress' status and disconnect 11922 * the list of pages to be committed from the rnode. 11923 * At this same time, we also save the starting offset 11924 * and length of data to be committed on the server. 11925 */ 11926 plist = rp->r_commit.c_pages; 11927 rp->r_commit.c_pages = NULL; 11928 offset = rp->r_commit.c_commbase; 11929 len = rp->r_commit.c_commlen; 11930 mutex_enter(&rp->r_statelock); 11931 rp->r_flags &= ~R4COMMIT; 11932 cv_broadcast(&rp->r_commit.c_cv); 11933 mutex_exit(&rp->r_statelock); 11934 11935 if (curproc == proc_pageout || curproc == proc_fsflush || 11936 nfs_zone() != VTOMI4(vp)->mi_zone) { 11937 nfs4_async_commit(vp, plist, offset, len, 11938 cr, do_nfs4_async_commit); 11939 return; 11940 } 11941 11942 /* 11943 * Actually generate the COMMIT op over the wire operation. 11944 */ 11945 error = nfs4_commit(vp, (offset4)offset, (count4)len, cr); 11946 11947 /* 11948 * If we got an error during the commit, just unlock all 11949 * of the pages. The pages will get retransmitted to the 11950 * server during a putpage operation. 11951 */ 11952 if (error) { 11953 while (plist != NULL) { 11954 pptr = plist; 11955 page_sub(&plist, pptr); 11956 page_unlock(pptr); 11957 } 11958 return; 11959 } 11960 11961 /* 11962 * We've tried as hard as we can to commit the data to stable 11963 * storage on the server. We just unlock the rest of the pages 11964 * and clear the commit required state. They will be put 11965 * onto the tail of the cachelist if they are nolonger 11966 * mapped. 11967 */ 11968 while (plist != pp) { 11969 pptr = plist; 11970 page_sub(&plist, pptr); 11971 pptr->p_fsdata = C_NOCOMMIT; 11972 page_unlock(pptr); 11973 } 11974 11975 /* 11976 * It is possible that nfs4_commit didn't return error but 11977 * some other thread has modified the page we are going 11978 * to free/destroy. 11979 * In this case we need to rewrite the page. Do an explicit check 11980 * before attempting to free/destroy the page. If modified, needs to 11981 * be rewritten so unlock the page and return. 11982 */ 11983 if (hat_ismod(pp)) { 11984 pp->p_fsdata = C_NOCOMMIT; 11985 page_unlock(pp); 11986 return; 11987 } 11988 11989 /* 11990 * Now, as appropriate, either free or destroy the page 11991 * that we were called with. 11992 */ 11993 pp->p_fsdata = C_NOCOMMIT; 11994 if (fl == B_FREE) 11995 page_free(pp, dn); 11996 else 11997 page_destroy(pp, dn); 11998 } 11999 12000 /* 12001 * Commit requires that the current fh be the file written to. 12002 * The compound op structure is: 12003 * PUTFH(file), COMMIT 12004 */ 12005 static int 12006 nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr) 12007 { 12008 COMPOUND4args_clnt args; 12009 COMPOUND4res_clnt res; 12010 COMMIT4res *cm_res; 12011 nfs_argop4 argop[2]; 12012 nfs_resop4 *resop; 12013 int doqueue; 12014 mntinfo4_t *mi; 12015 rnode4_t *rp; 12016 cred_t *cred_otw = NULL; 12017 bool_t needrecov = FALSE; 12018 nfs4_recov_state_t recov_state; 12019 nfs4_open_stream_t *osp = NULL; 12020 bool_t first_time = TRUE; /* first time getting OTW cred */ 12021 bool_t last_time = FALSE; /* last time getting OTW cred */ 12022 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 12023 12024 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12025 12026 rp = VTOR4(vp); 12027 12028 mi = VTOMI4(vp); 12029 recov_state.rs_flags = 0; 12030 recov_state.rs_num_retry_despite_err = 0; 12031 get_commit_cred: 12032 /* 12033 * Releases the osp, if a valid open stream is provided. 12034 * Puts a hold on the cred_otw and the new osp (if found). 12035 */ 12036 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 12037 &first_time, &last_time); 12038 args.ctag = TAG_COMMIT; 12039 recov_retry: 12040 /* 12041 * Commit ops: putfh file; commit 12042 */ 12043 args.array_len = 2; 12044 args.array = argop; 12045 12046 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 12047 &recov_state, NULL); 12048 if (e.error) { 12049 crfree(cred_otw); 12050 if (osp != NULL) 12051 open_stream_rele(osp, rp); 12052 return (e.error); 12053 } 12054 12055 /* putfh directory */ 12056 argop[0].argop = OP_CPUTFH; 12057 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 12058 12059 /* commit */ 12060 argop[1].argop = OP_COMMIT; 12061 argop[1].nfs_argop4_u.opcommit.offset = offset; 12062 argop[1].nfs_argop4_u.opcommit.count = count; 12063 12064 doqueue = 1; 12065 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e); 12066 12067 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 12068 if (!needrecov && e.error) { 12069 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, 12070 needrecov); 12071 crfree(cred_otw); 12072 if (e.error == EACCES && last_time == FALSE) 12073 goto get_commit_cred; 12074 if (osp != NULL) 12075 open_stream_rele(osp, rp); 12076 return (e.error); 12077 } 12078 12079 if (needrecov) { 12080 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 12081 NULL, OP_COMMIT, NULL, NULL, NULL) == FALSE) { 12082 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 12083 &recov_state, needrecov); 12084 if (!e.error) 12085 (void) xdr_free(xdr_COMPOUND4res_clnt, 12086 (caddr_t)&res); 12087 goto recov_retry; 12088 } 12089 if (e.error) { 12090 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 12091 &recov_state, needrecov); 12092 crfree(cred_otw); 12093 if (osp != NULL) 12094 open_stream_rele(osp, rp); 12095 return (e.error); 12096 } 12097 /* fall through for res.status case */ 12098 } 12099 12100 if (res.status) { 12101 e.error = geterrno4(res.status); 12102 if (e.error == EACCES && last_time == FALSE) { 12103 crfree(cred_otw); 12104 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 12105 &recov_state, needrecov); 12106 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12107 goto get_commit_cred; 12108 } 12109 /* 12110 * Can't do a nfs4_purge_stale_fh here because this 12111 * can cause a deadlock. nfs4_commit can 12112 * be called from nfs4_dispose which can be called 12113 * indirectly via pvn_vplist_dirty. nfs4_purge_stale_fh 12114 * can call back to pvn_vplist_dirty. 12115 */ 12116 if (e.error == ESTALE) { 12117 mutex_enter(&rp->r_statelock); 12118 rp->r_flags |= R4STALE; 12119 if (!rp->r_error) 12120 rp->r_error = e.error; 12121 mutex_exit(&rp->r_statelock); 12122 PURGE_ATTRCACHE4(vp); 12123 } else { 12124 mutex_enter(&rp->r_statelock); 12125 if (!rp->r_error) 12126 rp->r_error = e.error; 12127 mutex_exit(&rp->r_statelock); 12128 } 12129 } else { 12130 ASSERT(rp->r_flags & R4HAVEVERF); 12131 resop = &res.array[1]; /* commit res */ 12132 cm_res = &resop->nfs_resop4_u.opcommit; 12133 mutex_enter(&rp->r_statelock); 12134 if (cm_res->writeverf == rp->r_writeverf) { 12135 mutex_exit(&rp->r_statelock); 12136 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12137 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 12138 &recov_state, needrecov); 12139 crfree(cred_otw); 12140 if (osp != NULL) 12141 open_stream_rele(osp, rp); 12142 return (0); 12143 } 12144 nfs4_set_mod(vp); 12145 rp->r_writeverf = cm_res->writeverf; 12146 mutex_exit(&rp->r_statelock); 12147 e.error = NFS_VERF_MISMATCH; 12148 } 12149 12150 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12151 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov); 12152 crfree(cred_otw); 12153 if (osp != NULL) 12154 open_stream_rele(osp, rp); 12155 12156 return (e.error); 12157 } 12158 12159 static void 12160 nfs4_set_mod(vnode_t *vp) 12161 { 12162 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12163 12164 /* make sure we're looking at the master vnode, not a shadow */ 12165 pvn_vplist_setdirty(RTOV4(VTOR4(vp)), nfs_setmod_check); 12166 } 12167 12168 /* 12169 * This function is used to gather a page list of the pages which 12170 * can be committed on the server. 12171 * 12172 * The calling thread must have set R4COMMIT. This bit is used to 12173 * serialize access to the commit structure in the rnode. As long 12174 * as the thread has set R4COMMIT, then it can manipulate the commit 12175 * structure without requiring any other locks. 12176 * 12177 * When this function is called from nfs4_dispose() the page passed 12178 * into nfs4_dispose() will be SE_EXCL locked, and so this function 12179 * will skip it. This is not a problem since we initially add the 12180 * page to the r_commit page list. 12181 * 12182 */ 12183 static void 12184 nfs4_get_commit(vnode_t *vp) 12185 { 12186 rnode4_t *rp; 12187 page_t *pp; 12188 kmutex_t *vphm; 12189 12190 rp = VTOR4(vp); 12191 12192 ASSERT(rp->r_flags & R4COMMIT); 12193 12194 /* make sure we're looking at the master vnode, not a shadow */ 12195 12196 if (IS_SHADOW(vp, rp)) 12197 vp = RTOV4(rp); 12198 12199 vphm = page_vnode_mutex(vp); 12200 mutex_enter(vphm); 12201 12202 /* 12203 * If there are no pages associated with this vnode, then 12204 * just return. 12205 */ 12206 if ((pp = vp->v_pages) == NULL) { 12207 mutex_exit(vphm); 12208 return; 12209 } 12210 12211 /* 12212 * Step through all of the pages associated with this vnode 12213 * looking for pages which need to be committed. 12214 */ 12215 do { 12216 /* Skip marker pages. */ 12217 if (pp->p_hash == PVN_VPLIST_HASH_TAG) 12218 continue; 12219 12220 /* 12221 * First short-cut everything (without the page_lock) 12222 * and see if this page does not need to be committed 12223 * or is modified if so then we'll just skip it. 12224 */ 12225 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) 12226 continue; 12227 12228 /* 12229 * Attempt to lock the page. If we can't, then 12230 * someone else is messing with it or we have been 12231 * called from nfs4_dispose and this is the page that 12232 * nfs4_dispose was called with.. anyway just skip it. 12233 */ 12234 if (!page_trylock(pp, SE_EXCL)) 12235 continue; 12236 12237 /* 12238 * Lets check again now that we have the page lock. 12239 */ 12240 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 12241 page_unlock(pp); 12242 continue; 12243 } 12244 12245 /* this had better not be a free page */ 12246 ASSERT(PP_ISFREE(pp) == 0); 12247 12248 /* 12249 * The page needs to be committed and we locked it. 12250 * Update the base and length parameters and add it 12251 * to r_pages. 12252 */ 12253 if (rp->r_commit.c_pages == NULL) { 12254 rp->r_commit.c_commbase = (offset3)pp->p_offset; 12255 rp->r_commit.c_commlen = PAGESIZE; 12256 } else if (pp->p_offset < rp->r_commit.c_commbase) { 12257 rp->r_commit.c_commlen = rp->r_commit.c_commbase - 12258 (offset3)pp->p_offset + rp->r_commit.c_commlen; 12259 rp->r_commit.c_commbase = (offset3)pp->p_offset; 12260 } else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen) 12261 <= pp->p_offset) { 12262 rp->r_commit.c_commlen = (offset3)pp->p_offset - 12263 rp->r_commit.c_commbase + PAGESIZE; 12264 } 12265 page_add(&rp->r_commit.c_pages, pp); 12266 } while ((pp = pp->p_vpnext) != vp->v_pages); 12267 12268 mutex_exit(vphm); 12269 } 12270 12271 /* 12272 * This routine is used to gather together a page list of the pages 12273 * which are to be committed on the server. This routine must not 12274 * be called if the calling thread holds any locked pages. 12275 * 12276 * The calling thread must have set R4COMMIT. This bit is used to 12277 * serialize access to the commit structure in the rnode. As long 12278 * as the thread has set R4COMMIT, then it can manipulate the commit 12279 * structure without requiring any other locks. 12280 */ 12281 static void 12282 nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len) 12283 { 12284 12285 rnode4_t *rp; 12286 page_t *pp; 12287 u_offset_t end; 12288 u_offset_t off; 12289 ASSERT(len != 0); 12290 rp = VTOR4(vp); 12291 ASSERT(rp->r_flags & R4COMMIT); 12292 12293 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12294 12295 /* make sure we're looking at the master vnode, not a shadow */ 12296 12297 if (IS_SHADOW(vp, rp)) 12298 vp = RTOV4(rp); 12299 12300 /* 12301 * If there are no pages associated with this vnode, then 12302 * just return. 12303 */ 12304 if ((pp = vp->v_pages) == NULL) 12305 return; 12306 /* 12307 * Calculate the ending offset. 12308 */ 12309 end = soff + len; 12310 for (off = soff; off < end; off += PAGESIZE) { 12311 /* 12312 * Lookup each page by vp, offset. 12313 */ 12314 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL) 12315 continue; 12316 /* 12317 * If this page does not need to be committed or is 12318 * modified, then just skip it. 12319 */ 12320 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 12321 page_unlock(pp); 12322 continue; 12323 } 12324 12325 ASSERT(PP_ISFREE(pp) == 0); 12326 /* 12327 * The page needs to be committed and we locked it. 12328 * Update the base and length parameters and add it 12329 * to r_pages. 12330 */ 12331 if (rp->r_commit.c_pages == NULL) { 12332 rp->r_commit.c_commbase = (offset3)pp->p_offset; 12333 rp->r_commit.c_commlen = PAGESIZE; 12334 } else { 12335 rp->r_commit.c_commlen = (offset3)pp->p_offset - 12336 rp->r_commit.c_commbase + PAGESIZE; 12337 } 12338 page_add(&rp->r_commit.c_pages, pp); 12339 } 12340 } 12341 12342 /* 12343 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap(). 12344 * Flushes and commits data to the server. 12345 */ 12346 static int 12347 nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr) 12348 { 12349 int error; 12350 verifier4 write_verf; 12351 rnode4_t *rp = VTOR4(vp); 12352 12353 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12354 12355 /* 12356 * Flush the data portion of the file and then commit any 12357 * portions which need to be committed. This may need to 12358 * be done twice if the server has changed state since 12359 * data was last written. The data will need to be 12360 * rewritten to the server and then a new commit done. 12361 * 12362 * In fact, this may need to be done several times if the 12363 * server is having problems and crashing while we are 12364 * attempting to do this. 12365 */ 12366 12367 top: 12368 /* 12369 * Do a flush based on the poff and plen arguments. This 12370 * will synchronously write out any modified pages in the 12371 * range specified by (poff, plen). This starts all of the 12372 * i/o operations which will be waited for in the next 12373 * call to nfs4_putpage 12374 */ 12375 12376 mutex_enter(&rp->r_statelock); 12377 write_verf = rp->r_writeverf; 12378 mutex_exit(&rp->r_statelock); 12379 12380 error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr, NULL); 12381 if (error == EAGAIN) 12382 error = 0; 12383 12384 /* 12385 * Do a flush based on the poff and plen arguments. This 12386 * will synchronously write out any modified pages in the 12387 * range specified by (poff, plen) and wait until all of 12388 * the asynchronous i/o's in that range are done as well. 12389 */ 12390 if (!error) 12391 error = nfs4_putpage(vp, poff, plen, 0, cr, NULL); 12392 12393 if (error) 12394 return (error); 12395 12396 mutex_enter(&rp->r_statelock); 12397 if (rp->r_writeverf != write_verf) { 12398 mutex_exit(&rp->r_statelock); 12399 goto top; 12400 } 12401 mutex_exit(&rp->r_statelock); 12402 12403 /* 12404 * Now commit any pages which might need to be committed. 12405 * If the error, NFS_VERF_MISMATCH, is returned, then 12406 * start over with the flush operation. 12407 */ 12408 error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT); 12409 12410 if (error == NFS_VERF_MISMATCH) 12411 goto top; 12412 12413 return (error); 12414 } 12415 12416 /* 12417 * nfs4_commit_vp() will wait for other pending commits and 12418 * will either commit the whole file or a range, plen dictates 12419 * if we commit whole file. a value of zero indicates the whole 12420 * file. Called from nfs4_putpage_commit() or nfs4_sync_putapage() 12421 */ 12422 static int 12423 nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, 12424 cred_t *cr, int wait_on_writes) 12425 { 12426 rnode4_t *rp; 12427 page_t *plist; 12428 offset3 offset; 12429 count3 len; 12430 12431 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12432 12433 rp = VTOR4(vp); 12434 12435 /* 12436 * before we gather commitable pages make 12437 * sure there are no outstanding async writes 12438 */ 12439 if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) { 12440 mutex_enter(&rp->r_statelock); 12441 while (rp->r_count > 0) { 12442 cv_wait(&rp->r_cv, &rp->r_statelock); 12443 } 12444 mutex_exit(&rp->r_statelock); 12445 } 12446 12447 /* 12448 * Set the `commit inprogress' state bit. We must 12449 * first wait until any current one finishes. 12450 */ 12451 mutex_enter(&rp->r_statelock); 12452 while (rp->r_flags & R4COMMIT) { 12453 rp->r_flags |= R4COMMITWAIT; 12454 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 12455 rp->r_flags &= ~R4COMMITWAIT; 12456 } 12457 rp->r_flags |= R4COMMIT; 12458 mutex_exit(&rp->r_statelock); 12459 12460 /* 12461 * Gather all of the pages which need to be 12462 * committed. 12463 */ 12464 if (plen == 0) 12465 nfs4_get_commit(vp); 12466 else 12467 nfs4_get_commit_range(vp, poff, plen); 12468 12469 /* 12470 * Clear the `commit inprogress' bit and disconnect the 12471 * page list which was gathered by nfs4_get_commit. 12472 */ 12473 plist = rp->r_commit.c_pages; 12474 rp->r_commit.c_pages = NULL; 12475 offset = rp->r_commit.c_commbase; 12476 len = rp->r_commit.c_commlen; 12477 mutex_enter(&rp->r_statelock); 12478 rp->r_flags &= ~R4COMMIT; 12479 cv_broadcast(&rp->r_commit.c_cv); 12480 mutex_exit(&rp->r_statelock); 12481 12482 /* 12483 * If any pages need to be committed, commit them and 12484 * then unlock them so that they can be freed some 12485 * time later. 12486 */ 12487 if (plist == NULL) 12488 return (0); 12489 12490 /* 12491 * No error occurred during the flush portion 12492 * of this operation, so now attempt to commit 12493 * the data to stable storage on the server. 12494 * 12495 * This will unlock all of the pages on the list. 12496 */ 12497 return (nfs4_sync_commit(vp, plist, offset, len, cr)); 12498 } 12499 12500 static int 12501 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 12502 cred_t *cr) 12503 { 12504 int error; 12505 page_t *pp; 12506 12507 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12508 12509 error = nfs4_commit(vp, (offset4)offset, (count3)count, cr); 12510 12511 /* 12512 * If we got an error, then just unlock all of the pages 12513 * on the list. 12514 */ 12515 if (error) { 12516 while (plist != NULL) { 12517 pp = plist; 12518 page_sub(&plist, pp); 12519 page_unlock(pp); 12520 } 12521 return (error); 12522 } 12523 /* 12524 * We've tried as hard as we can to commit the data to stable 12525 * storage on the server. We just unlock the pages and clear 12526 * the commit required state. They will get freed later. 12527 */ 12528 while (plist != NULL) { 12529 pp = plist; 12530 page_sub(&plist, pp); 12531 pp->p_fsdata = C_NOCOMMIT; 12532 page_unlock(pp); 12533 } 12534 12535 return (error); 12536 } 12537 12538 static void 12539 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 12540 cred_t *cr) 12541 { 12542 12543 (void) nfs4_sync_commit(vp, plist, offset, count, cr); 12544 } 12545 12546 /*ARGSUSED*/ 12547 static int 12548 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 12549 caller_context_t *ct) 12550 { 12551 int error = 0; 12552 mntinfo4_t *mi; 12553 vattr_t va; 12554 vsecattr_t nfsace4_vsap; 12555 12556 mi = VTOMI4(vp); 12557 if (nfs_zone() != mi->mi_zone) 12558 return (EIO); 12559 if (mi->mi_flags & MI4_ACL) { 12560 /* if we have a delegation, return it */ 12561 if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE) 12562 (void) nfs4delegreturn(VTOR4(vp), 12563 NFS4_DR_REOPEN|NFS4_DR_PUSH); 12564 12565 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, 12566 NFS4_ACL_SET); 12567 if (error) /* EINVAL */ 12568 return (error); 12569 12570 if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) { 12571 /* 12572 * These are aclent_t type entries. 12573 */ 12574 error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap, 12575 vp->v_type == VDIR, FALSE); 12576 if (error) 12577 return (error); 12578 } else { 12579 /* 12580 * These are ace_t type entries. 12581 */ 12582 error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap, 12583 FALSE); 12584 if (error) 12585 return (error); 12586 } 12587 bzero(&va, sizeof (va)); 12588 error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap); 12589 vs_ace4_destroy(&nfsace4_vsap); 12590 return (error); 12591 } 12592 return (ENOSYS); 12593 } 12594 12595 /* ARGSUSED */ 12596 int 12597 nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 12598 caller_context_t *ct) 12599 { 12600 int error; 12601 mntinfo4_t *mi; 12602 nfs4_ga_res_t gar; 12603 rnode4_t *rp = VTOR4(vp); 12604 12605 mi = VTOMI4(vp); 12606 if (nfs_zone() != mi->mi_zone) 12607 return (EIO); 12608 12609 bzero(&gar, sizeof (gar)); 12610 gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask; 12611 12612 /* 12613 * vsecattr->vsa_mask holds the original acl request mask. 12614 * This is needed when determining what to return. 12615 * (See: nfs4_create_getsecattr_return()) 12616 */ 12617 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET); 12618 if (error) /* EINVAL */ 12619 return (error); 12620 12621 /* 12622 * If this is a referral stub, don't try to go OTW for an ACL 12623 */ 12624 if (RP_ISSTUB_REFERRAL(VTOR4(vp))) 12625 return (fs_fab_acl(vp, vsecattr, flag, cr, ct)); 12626 12627 if (mi->mi_flags & MI4_ACL) { 12628 /* 12629 * Check if the data is cached and the cache is valid. If it 12630 * is we don't go over the wire. 12631 */ 12632 if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) { 12633 mutex_enter(&rp->r_statelock); 12634 if (rp->r_secattr != NULL) { 12635 error = nfs4_create_getsecattr_return( 12636 rp->r_secattr, vsecattr, rp->r_attr.va_uid, 12637 rp->r_attr.va_gid, 12638 vp->v_type == VDIR); 12639 if (!error) { /* error == 0 - Success! */ 12640 mutex_exit(&rp->r_statelock); 12641 return (error); 12642 } 12643 } 12644 mutex_exit(&rp->r_statelock); 12645 } 12646 12647 /* 12648 * The getattr otw call will always get both the acl, in 12649 * the form of a list of nfsace4's, and the number of acl 12650 * entries; independent of the value of gar.n4g_va.va_mask. 12651 */ 12652 error = nfs4_getattr_otw(vp, &gar, cr, 1); 12653 if (error) { 12654 vs_ace4_destroy(&gar.n4g_vsa); 12655 if (error == ENOTSUP || error == EOPNOTSUPP) 12656 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12657 return (error); 12658 } 12659 12660 if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) { 12661 /* 12662 * No error was returned, but according to the response 12663 * bitmap, neither was an acl. 12664 */ 12665 vs_ace4_destroy(&gar.n4g_vsa); 12666 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12667 return (error); 12668 } 12669 12670 /* 12671 * Update the cache with the ACL. 12672 */ 12673 nfs4_acl_fill_cache(rp, &gar.n4g_vsa); 12674 12675 error = nfs4_create_getsecattr_return(&gar.n4g_vsa, 12676 vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid, 12677 vp->v_type == VDIR); 12678 vs_ace4_destroy(&gar.n4g_vsa); 12679 if ((error) && (vsecattr->vsa_mask & 12680 (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) && 12681 (error != EACCES)) { 12682 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12683 } 12684 return (error); 12685 } 12686 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12687 return (error); 12688 } 12689 12690 /* 12691 * The function returns: 12692 * - 0 (zero) if the passed in "acl_mask" is a valid request. 12693 * - EINVAL if the passed in "acl_mask" is an invalid request. 12694 * 12695 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if: 12696 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12697 * 12698 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if: 12699 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12700 * - We have a count field set without the corresponding acl field set. (e.g. - 12701 * VSA_ACECNT is set, but VSA_ACE is not) 12702 */ 12703 static int 12704 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op) 12705 { 12706 /* Shortcut the masks that are always valid. */ 12707 if (acl_mask == (VSA_ACE | VSA_ACECNT)) 12708 return (0); 12709 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) 12710 return (0); 12711 12712 if (acl_mask & (VSA_ACE | VSA_ACECNT)) { 12713 /* 12714 * We can't have any VSA_ACL type stuff in the mask now. 12715 */ 12716 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12717 VSA_DFACLCNT)) 12718 return (EINVAL); 12719 12720 if (op == NFS4_ACL_SET) { 12721 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE)) 12722 return (EINVAL); 12723 } 12724 } 12725 12726 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) { 12727 /* 12728 * We can't have any VSA_ACE type stuff in the mask now. 12729 */ 12730 if (acl_mask & (VSA_ACE | VSA_ACECNT)) 12731 return (EINVAL); 12732 12733 if (op == NFS4_ACL_SET) { 12734 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL)) 12735 return (EINVAL); 12736 12737 if ((acl_mask & VSA_DFACLCNT) && 12738 !(acl_mask & VSA_DFACL)) 12739 return (EINVAL); 12740 } 12741 } 12742 return (0); 12743 } 12744 12745 /* 12746 * The theory behind creating the correct getsecattr return is simply this: 12747 * "Don't return anything that the caller is not expecting to have to free." 12748 */ 12749 static int 12750 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap, 12751 uid_t uid, gid_t gid, int isdir) 12752 { 12753 int error = 0; 12754 /* Save the mask since the translators modify it. */ 12755 uint_t orig_mask = vsap->vsa_mask; 12756 12757 if (orig_mask & (VSA_ACE | VSA_ACECNT)) { 12758 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, FALSE); 12759 12760 if (error) 12761 return (error); 12762 12763 /* 12764 * If the caller only asked for the ace count (VSA_ACECNT) 12765 * don't give them the full acl (VSA_ACE), free it. 12766 */ 12767 if (!orig_mask & VSA_ACE) { 12768 if (vsap->vsa_aclentp != NULL) { 12769 kmem_free(vsap->vsa_aclentp, 12770 vsap->vsa_aclcnt * sizeof (ace_t)); 12771 vsap->vsa_aclentp = NULL; 12772 } 12773 } 12774 vsap->vsa_mask = orig_mask; 12775 12776 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12777 VSA_DFACLCNT)) { 12778 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid, 12779 isdir, FALSE); 12780 12781 if (error) 12782 return (error); 12783 12784 /* 12785 * If the caller only asked for the acl count (VSA_ACLCNT) 12786 * and/or the default acl count (VSA_DFACLCNT) don't give them 12787 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it. 12788 */ 12789 if (!orig_mask & VSA_ACL) { 12790 if (vsap->vsa_aclentp != NULL) { 12791 kmem_free(vsap->vsa_aclentp, 12792 vsap->vsa_aclcnt * sizeof (aclent_t)); 12793 vsap->vsa_aclentp = NULL; 12794 } 12795 } 12796 12797 if (!orig_mask & VSA_DFACL) { 12798 if (vsap->vsa_dfaclentp != NULL) { 12799 kmem_free(vsap->vsa_dfaclentp, 12800 vsap->vsa_dfaclcnt * sizeof (aclent_t)); 12801 vsap->vsa_dfaclentp = NULL; 12802 } 12803 } 12804 vsap->vsa_mask = orig_mask; 12805 } 12806 return (0); 12807 } 12808 12809 /* ARGSUSED */ 12810 int 12811 nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr, 12812 caller_context_t *ct) 12813 { 12814 int error; 12815 12816 if (nfs_zone() != VTOMI4(vp)->mi_zone) 12817 return (EIO); 12818 /* 12819 * check for valid cmd parameter 12820 */ 12821 if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS) 12822 return (EINVAL); 12823 12824 /* 12825 * Check access permissions 12826 */ 12827 if ((cmd & F_SHARE) && 12828 (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) || 12829 (shr->s_access == F_WRACC && (flag & FWRITE) == 0))) 12830 return (EBADF); 12831 12832 /* 12833 * If the filesystem is mounted using local locking, pass the 12834 * request off to the local share code. 12835 */ 12836 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) 12837 return (fs_shrlock(vp, cmd, shr, flag, cr, ct)); 12838 12839 switch (cmd) { 12840 case F_SHARE: 12841 case F_UNSHARE: 12842 /* 12843 * This will be properly implemented later, 12844 * see RFE: 4823948 . 12845 */ 12846 error = EAGAIN; 12847 break; 12848 12849 case F_HASREMOTELOCKS: 12850 /* 12851 * NFS client can't store remote locks itself 12852 */ 12853 shr->s_access = 0; 12854 error = 0; 12855 break; 12856 12857 default: 12858 error = EINVAL; 12859 break; 12860 } 12861 12862 return (error); 12863 } 12864 12865 /* 12866 * Common code called by directory ops to update the attrcache 12867 */ 12868 static int 12869 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp, 12870 hrtime_t t, vnode_t *vp, cred_t *cr) 12871 { 12872 int error = 0; 12873 12874 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12875 12876 if (status != NFS4_OK) { 12877 /* getattr not done or failed */ 12878 PURGE_ATTRCACHE4(vp); 12879 return (error); 12880 } 12881 12882 if (garp) { 12883 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL); 12884 } else { 12885 PURGE_ATTRCACHE4(vp); 12886 } 12887 return (error); 12888 } 12889 12890 /* 12891 * Update directory caches for directory modification ops (link, rename, etc.) 12892 * When dinfo is NULL, manage dircaches in the old way. 12893 */ 12894 static void 12895 nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm, 12896 dirattr_info_t *dinfo) 12897 { 12898 rnode4_t *drp = VTOR4(dvp); 12899 12900 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 12901 12902 /* Purge rddir cache for dir since it changed */ 12903 if (drp->r_dir != NULL) 12904 nfs4_purge_rddir_cache(dvp); 12905 12906 /* 12907 * If caller provided dinfo, then use it to manage dir caches. 12908 */ 12909 if (dinfo != NULL) { 12910 if (vp != NULL) { 12911 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12912 if (!VTOR4(vp)->created_v4) { 12913 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12914 dnlc_update(dvp, nm, vp); 12915 } else { 12916 /* 12917 * XXX don't update if the created_v4 flag is 12918 * set 12919 */ 12920 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12921 NFS4_DEBUG(nfs4_client_state_debug, 12922 (CE_NOTE, "nfs4_update_dircaches: " 12923 "don't update dnlc: created_v4 flag")); 12924 } 12925 } 12926 12927 nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call, 12928 dinfo->di_cred, FALSE, cinfo); 12929 12930 return; 12931 } 12932 12933 /* 12934 * Caller didn't provide dinfo, then check change_info4 to update DNLC. 12935 * Since caller modified dir but didn't receive post-dirmod-op dir 12936 * attrs, the dir's attrs must be purged. 12937 * 12938 * XXX this check and dnlc update/purge should really be atomic, 12939 * XXX but can't use rnode statelock because it'll deadlock in 12940 * XXX dnlc_purge_vp, however, the risk is minimal even if a race 12941 * XXX does occur. 12942 * 12943 * XXX We also may want to check that atomic is true in the 12944 * XXX change_info struct. If it is not, the change_info may 12945 * XXX reflect changes by more than one clients which means that 12946 * XXX our cache may not be valid. 12947 */ 12948 PURGE_ATTRCACHE4(dvp); 12949 if (drp->r_change == cinfo->before) { 12950 /* no changes took place in the directory prior to our link */ 12951 if (vp != NULL) { 12952 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12953 if (!VTOR4(vp)->created_v4) { 12954 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12955 dnlc_update(dvp, nm, vp); 12956 } else { 12957 /* 12958 * XXX dont' update if the created_v4 flag 12959 * is set 12960 */ 12961 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12962 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 12963 "nfs4_update_dircaches: don't" 12964 " update dnlc: created_v4 flag")); 12965 } 12966 } 12967 } else { 12968 /* Another client modified directory - purge its dnlc cache */ 12969 dnlc_purge_vp(dvp); 12970 } 12971 } 12972 12973 /* 12974 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a 12975 * file. 12976 * 12977 * The 'reopening_file' boolean should be set to TRUE if we are reopening this 12978 * file (ie: client recovery) and otherwise set to FALSE. 12979 * 12980 * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery 12981 * initiated) calling functions. 12982 * 12983 * 'resend' is set to TRUE if this is a OPEN_CONFIRM issued as a result 12984 * of resending a 'lost' open request. 12985 * 12986 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken 12987 * server that hands out BAD_SEQID on open confirm. 12988 * 12989 * Errors are returned via the nfs4_error_t parameter. 12990 */ 12991 void 12992 nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr, 12993 bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop, 12994 bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp) 12995 { 12996 COMPOUND4args_clnt args; 12997 COMPOUND4res_clnt res; 12998 nfs_argop4 argop[2]; 12999 nfs_resop4 *resop; 13000 int doqueue = 1; 13001 mntinfo4_t *mi; 13002 OPEN_CONFIRM4args *open_confirm_args; 13003 int needrecov; 13004 13005 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13006 #if DEBUG 13007 mutex_enter(&oop->oo_lock); 13008 ASSERT(oop->oo_seqid_inuse); 13009 mutex_exit(&oop->oo_lock); 13010 #endif 13011 13012 recov_retry_confirm: 13013 nfs4_error_zinit(ep); 13014 *retry_open = FALSE; 13015 13016 if (resend) 13017 args.ctag = TAG_OPEN_CONFIRM_LOST; 13018 else 13019 args.ctag = TAG_OPEN_CONFIRM; 13020 13021 args.array_len = 2; 13022 args.array = argop; 13023 13024 /* putfh target fh */ 13025 argop[0].argop = OP_CPUTFH; 13026 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 13027 13028 argop[1].argop = OP_OPEN_CONFIRM; 13029 open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm; 13030 13031 (*seqid) += 1; 13032 open_confirm_args->seqid = *seqid; 13033 open_confirm_args->open_stateid = *stateid; 13034 13035 mi = VTOMI4(vp); 13036 13037 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep); 13038 13039 if (!ep->error && nfs4_need_to_bump_seqid(&res)) { 13040 nfs4_set_open_seqid((*seqid), oop, args.ctag); 13041 } 13042 13043 needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp); 13044 if (!needrecov && ep->error) 13045 return; 13046 13047 if (needrecov) { 13048 bool_t abort = FALSE; 13049 13050 if (reopening_file == FALSE) { 13051 nfs4_bseqid_entry_t *bsep = NULL; 13052 13053 if (!ep->error && res.status == NFS4ERR_BAD_SEQID) 13054 bsep = nfs4_create_bseqid_entry(oop, NULL, 13055 vp, 0, args.ctag, 13056 open_confirm_args->seqid); 13057 13058 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, 13059 NULL, NULL, OP_OPEN_CONFIRM, bsep, NULL, NULL); 13060 if (bsep) { 13061 kmem_free(bsep, sizeof (*bsep)); 13062 if (num_bseqid_retryp && 13063 --(*num_bseqid_retryp) == 0) 13064 abort = TRUE; 13065 } 13066 } 13067 if ((ep->error == ETIMEDOUT || 13068 res.status == NFS4ERR_RESOURCE) && 13069 abort == FALSE && resend == FALSE) { 13070 if (!ep->error) 13071 (void) xdr_free(xdr_COMPOUND4res_clnt, 13072 (caddr_t)&res); 13073 13074 delay(SEC_TO_TICK(confirm_retry_sec)); 13075 goto recov_retry_confirm; 13076 } 13077 /* State may have changed so retry the entire OPEN op */ 13078 if (abort == FALSE) 13079 *retry_open = TRUE; 13080 else 13081 *retry_open = FALSE; 13082 if (!ep->error) 13083 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 13084 return; 13085 } 13086 13087 if (res.status) { 13088 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 13089 return; 13090 } 13091 13092 resop = &res.array[1]; /* open confirm res */ 13093 bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid, 13094 stateid, sizeof (*stateid)); 13095 13096 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 13097 } 13098 13099 /* 13100 * Return the credentials associated with a client state object. The 13101 * caller is responsible for freeing the credentials. 13102 */ 13103 13104 static cred_t * 13105 state_to_cred(nfs4_open_stream_t *osp) 13106 { 13107 cred_t *cr; 13108 13109 /* 13110 * It's ok to not lock the open stream and open owner to get 13111 * the oo_cred since this is only written once (upon creation) 13112 * and will not change. 13113 */ 13114 cr = osp->os_open_owner->oo_cred; 13115 crhold(cr); 13116 13117 return (cr); 13118 } 13119 13120 /* 13121 * nfs4_find_sysid 13122 * 13123 * Find the sysid for the knetconfig associated with the given mi. 13124 */ 13125 static struct lm_sysid * 13126 nfs4_find_sysid(mntinfo4_t *mi) 13127 { 13128 ASSERT(nfs_zone() == mi->mi_zone); 13129 13130 /* 13131 * Switch from RDMA knconf to original mount knconf 13132 */ 13133 return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr, 13134 mi->mi_curr_serv->sv_hostname, NULL)); 13135 } 13136 13137 #ifdef DEBUG 13138 /* 13139 * Return a string version of the call type for easy reading. 13140 */ 13141 static char * 13142 nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype) 13143 { 13144 switch (ctype) { 13145 case NFS4_LCK_CTYPE_NORM: 13146 return ("NORMAL"); 13147 case NFS4_LCK_CTYPE_RECLAIM: 13148 return ("RECLAIM"); 13149 case NFS4_LCK_CTYPE_RESEND: 13150 return ("RESEND"); 13151 case NFS4_LCK_CTYPE_REINSTATE: 13152 return ("REINSTATE"); 13153 default: 13154 cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal " 13155 "type %d", ctype); 13156 return (""); 13157 } 13158 } 13159 #endif 13160 13161 /* 13162 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type 13163 * Unlock requests don't have an over-the-wire locktype, so we just return 13164 * something non-threatening. 13165 */ 13166 13167 static nfs_lock_type4 13168 flk_to_locktype(int cmd, int l_type) 13169 { 13170 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK); 13171 13172 switch (l_type) { 13173 case F_UNLCK: 13174 return (READ_LT); 13175 case F_RDLCK: 13176 if (cmd == F_SETLK) 13177 return (READ_LT); 13178 else 13179 return (READW_LT); 13180 case F_WRLCK: 13181 if (cmd == F_SETLK) 13182 return (WRITE_LT); 13183 else 13184 return (WRITEW_LT); 13185 } 13186 panic("flk_to_locktype"); 13187 /*NOTREACHED*/ 13188 } 13189 13190 /* 13191 * Set the flock64's lm_sysid for nfs4frlock. 13192 */ 13193 static int 13194 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk) 13195 { 13196 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13197 13198 /* Find the lm_sysid */ 13199 *lspp = nfs4_find_sysid(VTOMI4(vp)); 13200 13201 if (*lspp == NULL) { 13202 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13203 "nfs4frlock_get_sysid: no sysid, return ENOLCK")); 13204 return (ENOLCK); 13205 } 13206 13207 flk->l_sysid = lm_sysidt(*lspp); 13208 13209 return (0); 13210 } 13211 13212 /* 13213 * Do the remaining preliminary setup for nfs4frlock. 13214 */ 13215 static void 13216 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep, 13217 vnode_t *vp, cred_t *search_cr, cred_t **cred_otw) 13218 { 13219 /* 13220 * set tick_delay to the base delay time. 13221 * (nfs4_base_wait_time is in msecs) 13222 */ 13223 13224 *tick_delayp = drv_usectohz(nfs4_base_wait_time * 1000); 13225 13226 recov_statep->rs_flags = 0; 13227 recov_statep->rs_num_retry_despite_err = 0; 13228 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL); 13229 } 13230 13231 /* 13232 * Initialize and allocate the data structures necessary for 13233 * the nfs4frlock call. 13234 * Allocates argsp's op array, frees up the saved_rqstpp if there is one. 13235 */ 13236 static void 13237 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp, 13238 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd, 13239 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp, 13240 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp) 13241 { 13242 int argoplist_size; 13243 int num_ops = 2; 13244 13245 *retry = FALSE; 13246 *did_start_fop = FALSE; 13247 *skip_get_err = FALSE; 13248 lost_rqstp->lr_op = 0; 13249 argoplist_size = num_ops * sizeof (nfs_argop4); 13250 /* fill array with zero */ 13251 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP); 13252 13253 *argspp = argsp; 13254 *respp = NULL; 13255 13256 argsp->array_len = num_ops; 13257 argsp->array = *argopp; 13258 13259 /* initialize in case of error; will get real value down below */ 13260 argsp->ctag = TAG_NONE; 13261 13262 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) 13263 *op_hintp = OH_LOCKU; 13264 else 13265 *op_hintp = OH_OTHER; 13266 } 13267 13268 /* 13269 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign 13270 * the proper nfs4_server_t for this instance of nfs4frlock. 13271 * Returns 0 (success) or an errno value. 13272 */ 13273 static int 13274 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp, 13275 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep, 13276 bool_t *did_start_fop, bool_t *startrecovp) 13277 { 13278 int error = 0; 13279 rnode4_t *rp; 13280 13281 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13282 13283 if (ctype == NFS4_LCK_CTYPE_NORM) { 13284 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint, 13285 recov_statep, startrecovp); 13286 if (error) 13287 return (error); 13288 *did_start_fop = TRUE; 13289 } else { 13290 *did_start_fop = FALSE; 13291 *startrecovp = FALSE; 13292 } 13293 13294 if (!error) { 13295 rp = VTOR4(vp); 13296 13297 /* If the file failed recovery, just quit. */ 13298 mutex_enter(&rp->r_statelock); 13299 if (rp->r_flags & R4RECOVERR) { 13300 error = EIO; 13301 } 13302 mutex_exit(&rp->r_statelock); 13303 } 13304 13305 return (error); 13306 } 13307 13308 /* 13309 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A 13310 * resend nfs4frlock call is initiated by the recovery framework. 13311 * Acquires the lop and oop seqid synchronization. 13312 */ 13313 static void 13314 nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp, 13315 COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp, 13316 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13317 LOCK4args **lock_argsp, LOCKU4args **locku_argsp) 13318 { 13319 mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp); 13320 int error; 13321 13322 NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug), 13323 (CE_NOTE, 13324 "nfs4frlock_setup_resend_lock_args: have lost lock to resend")); 13325 ASSERT(resend_rqstp != NULL); 13326 ASSERT(resend_rqstp->lr_op == OP_LOCK || 13327 resend_rqstp->lr_op == OP_LOCKU); 13328 13329 *oopp = resend_rqstp->lr_oop; 13330 if (resend_rqstp->lr_oop) { 13331 open_owner_hold(resend_rqstp->lr_oop); 13332 error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi); 13333 ASSERT(error == 0); /* recov thread always succeeds */ 13334 } 13335 13336 /* Must resend this lost lock/locku request. */ 13337 ASSERT(resend_rqstp->lr_lop != NULL); 13338 *lopp = resend_rqstp->lr_lop; 13339 lock_owner_hold(resend_rqstp->lr_lop); 13340 error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi); 13341 ASSERT(error == 0); /* recov thread always succeeds */ 13342 13343 *ospp = resend_rqstp->lr_osp; 13344 if (*ospp) 13345 open_stream_hold(resend_rqstp->lr_osp); 13346 13347 if (resend_rqstp->lr_op == OP_LOCK) { 13348 LOCK4args *lock_args; 13349 13350 argop->argop = OP_LOCK; 13351 *lock_argsp = lock_args = &argop->nfs_argop4_u.oplock; 13352 lock_args->locktype = resend_rqstp->lr_locktype; 13353 lock_args->reclaim = 13354 (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM); 13355 lock_args->offset = resend_rqstp->lr_flk->l_start; 13356 lock_args->length = resend_rqstp->lr_flk->l_len; 13357 if (lock_args->length == 0) 13358 lock_args->length = ~lock_args->length; 13359 nfs4_setup_lock_args(*lopp, *oopp, *ospp, 13360 mi2clientid(mi), &lock_args->locker); 13361 13362 switch (resend_rqstp->lr_ctype) { 13363 case NFS4_LCK_CTYPE_RESEND: 13364 argsp->ctag = TAG_LOCK_RESEND; 13365 break; 13366 case NFS4_LCK_CTYPE_REINSTATE: 13367 argsp->ctag = TAG_LOCK_REINSTATE; 13368 break; 13369 case NFS4_LCK_CTYPE_RECLAIM: 13370 argsp->ctag = TAG_LOCK_RECLAIM; 13371 break; 13372 default: 13373 argsp->ctag = TAG_LOCK_UNKNOWN; 13374 break; 13375 } 13376 } else { 13377 LOCKU4args *locku_args; 13378 nfs4_lock_owner_t *lop = resend_rqstp->lr_lop; 13379 13380 argop->argop = OP_LOCKU; 13381 *locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku; 13382 locku_args->locktype = READ_LT; 13383 locku_args->seqid = lop->lock_seqid + 1; 13384 mutex_enter(&lop->lo_lock); 13385 locku_args->lock_stateid = lop->lock_stateid; 13386 mutex_exit(&lop->lo_lock); 13387 locku_args->offset = resend_rqstp->lr_flk->l_start; 13388 locku_args->length = resend_rqstp->lr_flk->l_len; 13389 if (locku_args->length == 0) 13390 locku_args->length = ~locku_args->length; 13391 13392 switch (resend_rqstp->lr_ctype) { 13393 case NFS4_LCK_CTYPE_RESEND: 13394 argsp->ctag = TAG_LOCKU_RESEND; 13395 break; 13396 case NFS4_LCK_CTYPE_REINSTATE: 13397 argsp->ctag = TAG_LOCKU_REINSTATE; 13398 break; 13399 default: 13400 argsp->ctag = TAG_LOCK_UNKNOWN; 13401 break; 13402 } 13403 } 13404 } 13405 13406 /* 13407 * Setup the LOCKT4 arguments. 13408 */ 13409 static void 13410 nfs4frlock_setup_lockt_args(nfs_argop4 *argop, LOCKT4args **lockt_argsp, 13411 COMPOUND4args_clnt *argsp, flock64_t *flk, rnode4_t *rp) 13412 { 13413 LOCKT4args *lockt_args; 13414 13415 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone); 13416 argop->argop = OP_LOCKT; 13417 argsp->ctag = TAG_LOCKT; 13418 lockt_args = &argop->nfs_argop4_u.oplockt; 13419 13420 /* 13421 * The locktype will be READ_LT unless it's 13422 * a write lock. We do this because the Solaris 13423 * system call allows the combination of 13424 * F_UNLCK and F_GETLK* and so in that case the 13425 * unlock is mapped to a read. 13426 */ 13427 if (flk->l_type == F_WRLCK) 13428 lockt_args->locktype = WRITE_LT; 13429 else 13430 lockt_args->locktype = READ_LT; 13431 13432 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp))); 13433 /* set the lock owner4 args */ 13434 nfs4_setlockowner_args(&lockt_args->owner, rp, flk->l_pid); 13435 lockt_args->offset = flk->l_start; 13436 lockt_args->length = flk->l_len; 13437 if (flk->l_len == 0) 13438 lockt_args->length = ~lockt_args->length; 13439 13440 *lockt_argsp = lockt_args; 13441 } 13442 13443 /* 13444 * If the client is holding a delegation, and the open stream to be used 13445 * with this lock request is a delegation open stream, then re-open the stream. 13446 * Sets the nfs4_error_t to all zeros unless the open stream has already 13447 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY 13448 * means the caller should retry (like a recovery retry). 13449 */ 13450 static void 13451 nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt) 13452 { 13453 open_delegation_type4 dt; 13454 bool_t reopen_needed, force; 13455 nfs4_open_stream_t *osp; 13456 open_claim_type4 oclaim; 13457 rnode4_t *rp = VTOR4(vp); 13458 mntinfo4_t *mi = VTOMI4(vp); 13459 13460 ASSERT(nfs_zone() == mi->mi_zone); 13461 13462 nfs4_error_zinit(ep); 13463 13464 mutex_enter(&rp->r_statev4_lock); 13465 dt = rp->r_deleg_type; 13466 mutex_exit(&rp->r_statev4_lock); 13467 13468 if (dt != OPEN_DELEGATE_NONE) { 13469 nfs4_open_owner_t *oop; 13470 13471 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 13472 if (!oop) { 13473 ep->stat = NFS4ERR_IO; 13474 return; 13475 } 13476 /* returns with 'os_sync_lock' held */ 13477 osp = find_open_stream(oop, rp); 13478 if (!osp) { 13479 open_owner_rele(oop); 13480 ep->stat = NFS4ERR_IO; 13481 return; 13482 } 13483 13484 if (osp->os_failed_reopen) { 13485 NFS4_DEBUG((nfs4_open_stream_debug || 13486 nfs4_client_lock_debug), (CE_NOTE, 13487 "nfs4frlock_check_deleg: os_failed_reopen set " 13488 "for osp %p, cr %p, rp %s", (void *)osp, 13489 (void *)cr, rnode4info(rp))); 13490 mutex_exit(&osp->os_sync_lock); 13491 open_stream_rele(osp, rp); 13492 open_owner_rele(oop); 13493 ep->stat = NFS4ERR_IO; 13494 return; 13495 } 13496 13497 /* 13498 * Determine whether a reopen is needed. If this 13499 * is a delegation open stream, then send the open 13500 * to the server to give visibility to the open owner. 13501 * Even if it isn't a delegation open stream, we need 13502 * to check if the previous open CLAIM_DELEGATE_CUR 13503 * was sufficient. 13504 */ 13505 13506 reopen_needed = osp->os_delegation || 13507 ((lt == F_RDLCK && 13508 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) || 13509 (lt == F_WRLCK && 13510 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE))); 13511 13512 mutex_exit(&osp->os_sync_lock); 13513 open_owner_rele(oop); 13514 13515 if (reopen_needed) { 13516 /* 13517 * Always use CLAIM_PREVIOUS after server reboot. 13518 * The server will reject CLAIM_DELEGATE_CUR if 13519 * it is used during the grace period. 13520 */ 13521 mutex_enter(&mi->mi_lock); 13522 if (mi->mi_recovflags & MI4R_SRV_REBOOT) { 13523 oclaim = CLAIM_PREVIOUS; 13524 force = TRUE; 13525 } else { 13526 oclaim = CLAIM_DELEGATE_CUR; 13527 force = FALSE; 13528 } 13529 mutex_exit(&mi->mi_lock); 13530 13531 nfs4_reopen(vp, osp, ep, oclaim, force, FALSE); 13532 if (ep->error == EAGAIN) { 13533 nfs4_error_zinit(ep); 13534 ep->stat = NFS4ERR_DELAY; 13535 } 13536 } 13537 open_stream_rele(osp, rp); 13538 osp = NULL; 13539 } 13540 } 13541 13542 /* 13543 * Setup the LOCKU4 arguments. 13544 * Returns errors via the nfs4_error_t. 13545 * NFS4_OK no problems. *go_otwp is TRUE if call should go 13546 * over-the-wire. The caller must release the 13547 * reference on *lopp. 13548 * NFS4ERR_DELAY caller should retry (like recovery retry) 13549 * (other) unrecoverable error. 13550 */ 13551 static void 13552 nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 13553 LOCKU4args **locku_argsp, flock64_t *flk, 13554 nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp, 13555 vnode_t *vp, cred_t *cr, bool_t *skip_get_err, bool_t *go_otwp) 13556 { 13557 nfs4_lock_owner_t *lop = NULL; 13558 LOCKU4args *locku_args; 13559 pid_t pid = flk->l_pid; 13560 bool_t is_spec = FALSE; 13561 rnode4_t *rp = VTOR4(vp); 13562 13563 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13564 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13565 13566 nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK); 13567 if (ep->error || ep->stat) 13568 return; 13569 13570 argop->argop = OP_LOCKU; 13571 if (ctype == NFS4_LCK_CTYPE_REINSTATE) 13572 argsp->ctag = TAG_LOCKU_REINSTATE; 13573 else 13574 argsp->ctag = TAG_LOCKU; 13575 locku_args = &argop->nfs_argop4_u.oplocku; 13576 *locku_argsp = locku_args; 13577 13578 /* 13579 * XXX what should locku_args->locktype be? 13580 * setting to ALWAYS be READ_LT so at least 13581 * it is a valid locktype. 13582 */ 13583 13584 locku_args->locktype = READ_LT; 13585 13586 /* 13587 * Get the lock owner stateid. If no lock owner 13588 * exists, return success. 13589 */ 13590 lop = find_lock_owner(rp, pid, LOWN_ANY); 13591 *lopp = lop; 13592 if (lop && CLNT_ISSPECIAL(&lop->lock_stateid)) 13593 is_spec = TRUE; 13594 if (!lop || is_spec) { 13595 /* 13596 * No lock owner so no locks to unlock. 13597 * Return success. 13598 * 13599 * If the lockowner is using a special stateid, 13600 * then the original lock request (that created 13601 * this lockowner) was never successful, so we 13602 * have no lock to undo OTW. 13603 */ 13604 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13605 "nfs4frlock_setup_locku_args: LOCKU: no lock owner " 13606 "(%ld) so return success", (long)pid)); 13607 13608 /* 13609 * Release our hold and NULL out so final_cleanup 13610 * doesn't try to end a lock seqid sync we 13611 * never started. 13612 */ 13613 if (is_spec) { 13614 lock_owner_rele(lop); 13615 *lopp = NULL; 13616 } 13617 *skip_get_err = TRUE; 13618 *go_otwp = FALSE; 13619 return; 13620 } 13621 13622 ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp)); 13623 if (ep->error == EAGAIN) { 13624 lock_owner_rele(lop); 13625 *lopp = NULL; 13626 return; 13627 } 13628 13629 mutex_enter(&lop->lo_lock); 13630 locku_args->lock_stateid = lop->lock_stateid; 13631 mutex_exit(&lop->lo_lock); 13632 locku_args->seqid = lop->lock_seqid + 1; 13633 13634 /* leave the ref count on lop, rele after RPC call */ 13635 13636 locku_args->offset = flk->l_start; 13637 locku_args->length = flk->l_len; 13638 if (flk->l_len == 0) 13639 locku_args->length = ~locku_args->length; 13640 13641 *go_otwp = TRUE; 13642 } 13643 13644 /* 13645 * Setup the LOCK4 arguments. 13646 * 13647 * Returns errors via the nfs4_error_t. 13648 * NFS4_OK no problems 13649 * NFS4ERR_DELAY caller should retry (like recovery retry) 13650 * (other) unrecoverable error 13651 */ 13652 static void 13653 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp, 13654 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13655 nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp, 13656 flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep) 13657 { 13658 LOCK4args *lock_args; 13659 nfs4_open_owner_t *oop = NULL; 13660 nfs4_open_stream_t *osp = NULL; 13661 nfs4_lock_owner_t *lop = NULL; 13662 pid_t pid = flk->l_pid; 13663 rnode4_t *rp = VTOR4(vp); 13664 13665 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13666 13667 nfs4frlock_check_deleg(vp, ep, cr, flk->l_type); 13668 if (ep->error || ep->stat != NFS4_OK) 13669 return; 13670 13671 argop->argop = OP_LOCK; 13672 if (ctype == NFS4_LCK_CTYPE_NORM) 13673 argsp->ctag = TAG_LOCK; 13674 else if (ctype == NFS4_LCK_CTYPE_RECLAIM) 13675 argsp->ctag = TAG_RELOCK; 13676 else 13677 argsp->ctag = TAG_LOCK_REINSTATE; 13678 lock_args = &argop->nfs_argop4_u.oplock; 13679 lock_args->locktype = flk_to_locktype(cmd, flk->l_type); 13680 lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0; 13681 /* 13682 * Get the lock owner. If no lock owner exists, 13683 * create a 'temporary' one and grab the open seqid 13684 * synchronization (which puts a hold on the open 13685 * owner and open stream). 13686 * This also grabs the lock seqid synchronization. 13687 */ 13688 ep->stat = 13689 nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop); 13690 13691 if (ep->stat != NFS4_OK) 13692 goto out; 13693 13694 nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)), 13695 &lock_args->locker); 13696 13697 lock_args->offset = flk->l_start; 13698 lock_args->length = flk->l_len; 13699 if (flk->l_len == 0) 13700 lock_args->length = ~lock_args->length; 13701 *lock_argsp = lock_args; 13702 out: 13703 *oopp = oop; 13704 *ospp = osp; 13705 *lopp = lop; 13706 } 13707 13708 /* 13709 * After we get the reply from the server, record the proper information 13710 * for possible resend lock requests. 13711 * 13712 * Allocates memory for the saved_rqstp if we have a lost lock to save. 13713 */ 13714 static void 13715 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error, 13716 nfs_lock_type4 locktype, nfs4_open_owner_t *oop, 13717 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13718 nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp) 13719 { 13720 bool_t unlock = (flk->l_type == F_UNLCK); 13721 13722 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13723 ASSERT(ctype == NFS4_LCK_CTYPE_NORM || 13724 ctype == NFS4_LCK_CTYPE_REINSTATE); 13725 13726 if (error != 0 && !unlock) { 13727 NFS4_DEBUG((nfs4_lost_rqst_debug || 13728 nfs4_client_lock_debug), (CE_NOTE, 13729 "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 " 13730 " for lop %p", (void *)lop)); 13731 ASSERT(lop != NULL); 13732 mutex_enter(&lop->lo_lock); 13733 lop->lo_pending_rqsts = 1; 13734 mutex_exit(&lop->lo_lock); 13735 } 13736 13737 lost_rqstp->lr_putfirst = FALSE; 13738 lost_rqstp->lr_op = 0; 13739 13740 /* 13741 * For lock/locku requests, we treat EINTR as ETIMEDOUT for 13742 * recovery purposes so that the lock request that was sent 13743 * can be saved and re-issued later. Ditto for EIO from a forced 13744 * unmount. This is done to have the client's local locking state 13745 * match the v4 server's state; that is, the request was 13746 * potentially received and accepted by the server but the client 13747 * thinks it was not. 13748 */ 13749 if (error == ETIMEDOUT || error == EINTR || 13750 NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 13751 NFS4_DEBUG((nfs4_lost_rqst_debug || 13752 nfs4_client_lock_debug), (CE_NOTE, 13753 "nfs4frlock_save_lost_rqst: got a lost %s lock for " 13754 "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK", 13755 (void *)lop, (void *)oop, (void *)osp)); 13756 if (unlock) 13757 lost_rqstp->lr_op = OP_LOCKU; 13758 else { 13759 lost_rqstp->lr_op = OP_LOCK; 13760 lost_rqstp->lr_locktype = locktype; 13761 } 13762 /* 13763 * Objects are held and rele'd via the recovery code. 13764 * See nfs4_save_lost_rqst. 13765 */ 13766 lost_rqstp->lr_vp = vp; 13767 lost_rqstp->lr_dvp = NULL; 13768 lost_rqstp->lr_oop = oop; 13769 lost_rqstp->lr_osp = osp; 13770 lost_rqstp->lr_lop = lop; 13771 lost_rqstp->lr_cr = cr; 13772 switch (ctype) { 13773 case NFS4_LCK_CTYPE_NORM: 13774 lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND; 13775 break; 13776 case NFS4_LCK_CTYPE_REINSTATE: 13777 lost_rqstp->lr_putfirst = TRUE; 13778 lost_rqstp->lr_ctype = ctype; 13779 break; 13780 default: 13781 break; 13782 } 13783 lost_rqstp->lr_flk = flk; 13784 } 13785 } 13786 13787 /* 13788 * Update lop's seqid. Also update the seqid stored in a resend request, 13789 * if any. (Some recovery errors increment the seqid, and we may have to 13790 * send the resend request again.) 13791 */ 13792 13793 static void 13794 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args, 13795 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type) 13796 { 13797 if (lock_args) { 13798 if (lock_args->locker.new_lock_owner == TRUE) 13799 nfs4_get_and_set_next_open_seqid(oop, tag_type); 13800 else { 13801 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13802 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop); 13803 } 13804 } else if (locku_args) { 13805 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13806 nfs4_set_lock_seqid(lop->lock_seqid +1, lop); 13807 } 13808 } 13809 13810 /* 13811 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13812 * COMPOUND4 args/res for calls that need to retry. 13813 * Switches the *cred_otwp to base_cr. 13814 */ 13815 static void 13816 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint, 13817 nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop, 13818 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error, 13819 nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp, 13820 nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp) 13821 { 13822 nfs4_open_owner_t *oop = *oopp; 13823 nfs4_open_stream_t *osp = *ospp; 13824 nfs4_lock_owner_t *lop = *lopp; 13825 nfs_argop4 *argop = (*argspp)->array; 13826 13827 if (*did_start_fop) { 13828 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13829 needrecov); 13830 *did_start_fop = FALSE; 13831 } 13832 ASSERT((*argspp)->array_len == 2); 13833 if (argop[1].argop == OP_LOCK) 13834 nfs4args_lock_free(&argop[1]); 13835 else if (argop[1].argop == OP_LOCKT) 13836 nfs4args_lockt_free(&argop[1]); 13837 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13838 if (!error) 13839 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13840 *argspp = NULL; 13841 *respp = NULL; 13842 13843 if (lop) { 13844 nfs4_end_lock_seqid_sync(lop); 13845 lock_owner_rele(lop); 13846 *lopp = NULL; 13847 } 13848 13849 /* need to free up the reference on osp for lock args */ 13850 if (osp != NULL) { 13851 open_stream_rele(osp, VTOR4(vp)); 13852 *ospp = NULL; 13853 } 13854 13855 /* need to free up the reference on oop for lock args */ 13856 if (oop != NULL) { 13857 nfs4_end_open_seqid_sync(oop); 13858 open_owner_rele(oop); 13859 *oopp = NULL; 13860 } 13861 13862 crfree(*cred_otwp); 13863 *cred_otwp = base_cr; 13864 crhold(*cred_otwp); 13865 } 13866 13867 /* 13868 * Function to process the client's recovery for nfs4frlock. 13869 * Returns TRUE if we should retry the lock request; FALSE otherwise. 13870 * 13871 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13872 * COMPOUND4 args/res for calls that need to retry. 13873 * 13874 * Note: the rp's r_lkserlock is *not* dropped during this path. 13875 */ 13876 static bool_t 13877 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep, 13878 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13879 LOCK4args *lock_args, LOCKU4args *locku_args, 13880 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13881 nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp, 13882 nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint, 13883 bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk) 13884 { 13885 nfs4_open_owner_t *oop = *oopp; 13886 nfs4_open_stream_t *osp = *ospp; 13887 nfs4_lock_owner_t *lop = *lopp; 13888 13889 bool_t abort, retry; 13890 13891 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13892 ASSERT((*argspp) != NULL); 13893 ASSERT((*respp) != NULL); 13894 if (lock_args || locku_args) 13895 ASSERT(lop != NULL); 13896 13897 NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug), 13898 (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n")); 13899 13900 retry = TRUE; 13901 abort = FALSE; 13902 if (needrecov) { 13903 nfs4_bseqid_entry_t *bsep = NULL; 13904 nfs_opnum4 op; 13905 13906 op = lock_args ? OP_LOCK : locku_args ? OP_LOCKU : OP_LOCKT; 13907 13908 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) { 13909 seqid4 seqid; 13910 13911 if (lock_args) { 13912 if (lock_args->locker.new_lock_owner == TRUE) 13913 seqid = lock_args->locker.locker4_u. 13914 open_owner.open_seqid; 13915 else 13916 seqid = lock_args->locker.locker4_u. 13917 lock_owner.lock_seqid; 13918 } else if (locku_args) { 13919 seqid = locku_args->seqid; 13920 } else { 13921 seqid = 0; 13922 } 13923 13924 bsep = nfs4_create_bseqid_entry(oop, lop, vp, 13925 flk->l_pid, (*argspp)->ctag, seqid); 13926 } 13927 13928 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 13929 (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK || 13930 lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp : 13931 NULL, op, bsep, NULL, NULL); 13932 13933 if (bsep) 13934 kmem_free(bsep, sizeof (*bsep)); 13935 } 13936 13937 /* 13938 * Return that we do not want to retry the request for 3 cases: 13939 * 1. If we received EINTR or are bailing out because of a forced 13940 * unmount, we came into this code path just for the sake of 13941 * initiating recovery, we now need to return the error. 13942 * 2. If we have aborted recovery. 13943 * 3. We received NFS4ERR_BAD_SEQID. 13944 */ 13945 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) || 13946 abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID)) 13947 retry = FALSE; 13948 13949 if (*did_start_fop == TRUE) { 13950 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13951 needrecov); 13952 *did_start_fop = FALSE; 13953 } 13954 13955 if (retry == TRUE) { 13956 nfs_argop4 *argop; 13957 13958 argop = (*argspp)->array; 13959 ASSERT((*argspp)->array_len == 2); 13960 13961 if (argop[1].argop == OP_LOCK) 13962 nfs4args_lock_free(&argop[1]); 13963 else if (argop[1].argop == OP_LOCKT) 13964 nfs4args_lockt_free(&argop[1]); 13965 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13966 if (!ep->error) 13967 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13968 *respp = NULL; 13969 *argspp = NULL; 13970 } 13971 13972 if (lop != NULL) { 13973 nfs4_end_lock_seqid_sync(lop); 13974 lock_owner_rele(lop); 13975 } 13976 13977 *lopp = NULL; 13978 13979 /* need to free up the reference on osp for lock args */ 13980 if (osp != NULL) { 13981 open_stream_rele(osp, rp); 13982 *ospp = NULL; 13983 } 13984 13985 /* need to free up the reference on oop for lock args */ 13986 if (oop != NULL) { 13987 nfs4_end_open_seqid_sync(oop); 13988 open_owner_rele(oop); 13989 *oopp = NULL; 13990 } 13991 13992 return (retry); 13993 } 13994 13995 /* 13996 * Handle the DENIED reply from the server for nfs4frlock. 13997 * Returns TRUE if we should retry the request; FALSE otherwise. 13998 * 13999 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 14000 * COMPOUND4 args/res for calls that need to retry. Can also 14001 * drop and regrab the r_lkserlock. 14002 */ 14003 static bool_t 14004 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args, 14005 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp, 14006 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd, 14007 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint, 14008 nfs4_recov_state_t *recov_statep, int needrecov, 14009 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 14010 clock_t *tick_delayp, int *errorp, 14011 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop, 14012 bool_t *skip_get_err) 14013 { 14014 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14015 14016 if (lock_args) { 14017 nfs4_open_owner_t *oop = *oopp; 14018 nfs4_open_stream_t *osp = *ospp; 14019 nfs4_lock_owner_t *lop = *lopp; 14020 int intr; 14021 14022 /* 14023 * Blocking lock needs to sleep and retry from the request. 14024 * 14025 * Do not block and wait for 'resend' or 'reinstate' 14026 * lock requests, just return the error. 14027 * 14028 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW. 14029 */ 14030 if (cmd == F_SETLKW) { 14031 rnode4_t *rp = VTOR4(vp); 14032 nfs_argop4 *argop = (*argspp)->array; 14033 14034 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 14035 14036 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 14037 recov_statep, needrecov); 14038 *did_start_fop = FALSE; 14039 ASSERT((*argspp)->array_len == 2); 14040 if (argop[1].argop == OP_LOCK) 14041 nfs4args_lock_free(&argop[1]); 14042 else if (argop[1].argop == OP_LOCKT) 14043 nfs4args_lockt_free(&argop[1]); 14044 kmem_free(argop, 2 * sizeof (nfs_argop4)); 14045 if (*respp) 14046 (void) xdr_free(xdr_COMPOUND4res_clnt, 14047 (caddr_t)*respp); 14048 *argspp = NULL; 14049 *respp = NULL; 14050 nfs4_end_lock_seqid_sync(lop); 14051 lock_owner_rele(lop); 14052 *lopp = NULL; 14053 if (osp != NULL) { 14054 open_stream_rele(osp, rp); 14055 *ospp = NULL; 14056 } 14057 if (oop != NULL) { 14058 nfs4_end_open_seqid_sync(oop); 14059 open_owner_rele(oop); 14060 *oopp = NULL; 14061 } 14062 14063 nfs_rw_exit(&rp->r_lkserlock); 14064 14065 intr = nfs4_block_and_wait(tick_delayp); 14066 14067 (void) nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, 14068 FALSE); 14069 14070 if (intr) { 14071 *errorp = EINTR; 14072 return (FALSE); 14073 } 14074 14075 /* 14076 * Make sure we are still safe to lock with 14077 * regards to mmapping. 14078 */ 14079 if (!nfs4_safelock(vp, flk, cr)) { 14080 *errorp = EAGAIN; 14081 return (FALSE); 14082 } 14083 14084 return (TRUE); 14085 } 14086 if (ctype == NFS4_LCK_CTYPE_NORM) 14087 *errorp = EAGAIN; 14088 *skip_get_err = TRUE; 14089 flk->l_whence = 0; 14090 return (FALSE); 14091 } else if (lockt_args) { 14092 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14093 "nfs4frlock_results_denied: OP_LOCKT DENIED")); 14094 14095 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied, 14096 flk, lockt_args); 14097 14098 /* according to NLM code */ 14099 *errorp = 0; 14100 *skip_get_err = TRUE; 14101 return (FALSE); 14102 } 14103 return (FALSE); 14104 } 14105 14106 /* 14107 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock. 14108 */ 14109 static void 14110 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp) 14111 { 14112 switch (resp->status) { 14113 case NFS4ERR_ACCESS: 14114 case NFS4ERR_ADMIN_REVOKED: 14115 case NFS4ERR_BADHANDLE: 14116 case NFS4ERR_BAD_RANGE: 14117 case NFS4ERR_BAD_SEQID: 14118 case NFS4ERR_BAD_STATEID: 14119 case NFS4ERR_BADXDR: 14120 case NFS4ERR_DEADLOCK: 14121 case NFS4ERR_DELAY: 14122 case NFS4ERR_EXPIRED: 14123 case NFS4ERR_FHEXPIRED: 14124 case NFS4ERR_GRACE: 14125 case NFS4ERR_INVAL: 14126 case NFS4ERR_ISDIR: 14127 case NFS4ERR_LEASE_MOVED: 14128 case NFS4ERR_LOCK_NOTSUPP: 14129 case NFS4ERR_LOCK_RANGE: 14130 case NFS4ERR_MOVED: 14131 case NFS4ERR_NOFILEHANDLE: 14132 case NFS4ERR_NO_GRACE: 14133 case NFS4ERR_OLD_STATEID: 14134 case NFS4ERR_OPENMODE: 14135 case NFS4ERR_RECLAIM_BAD: 14136 case NFS4ERR_RECLAIM_CONFLICT: 14137 case NFS4ERR_RESOURCE: 14138 case NFS4ERR_SERVERFAULT: 14139 case NFS4ERR_STALE: 14140 case NFS4ERR_STALE_CLIENTID: 14141 case NFS4ERR_STALE_STATEID: 14142 return; 14143 default: 14144 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14145 "nfs4frlock_results_default: got unrecognizable " 14146 "res.status %d", resp->status)); 14147 *errorp = NFS4ERR_INVAL; 14148 } 14149 } 14150 14151 /* 14152 * The lock request was successful, so update the client's state. 14153 */ 14154 static void 14155 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args, 14156 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop, 14157 vnode_t *vp, flock64_t *flk, cred_t *cr, 14158 nfs4_lost_rqst_t *resend_rqstp) 14159 { 14160 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14161 14162 if (lock_args) { 14163 LOCK4res *lock_res; 14164 14165 lock_res = &resop->nfs_resop4_u.oplock; 14166 /* update the stateid with server's response */ 14167 14168 if (lock_args->locker.new_lock_owner == TRUE) { 14169 mutex_enter(&lop->lo_lock); 14170 lop->lo_just_created = NFS4_PERM_CREATED; 14171 mutex_exit(&lop->lo_lock); 14172 } 14173 14174 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid); 14175 14176 /* 14177 * If the lock was the result of a resending a lost 14178 * request, we've synched up the stateid and seqid 14179 * with the server, but now the server might be out of sync 14180 * with what the application thinks it has for locks. 14181 * Clean that up here. It's unclear whether we should do 14182 * this even if the filesystem has been forcibly unmounted. 14183 * For most servers, it's probably wasted effort, but 14184 * RFC3530 lets servers require that unlocks exactly match 14185 * the locks that are held. 14186 */ 14187 if (resend_rqstp != NULL && 14188 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) { 14189 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop); 14190 } else { 14191 flk->l_whence = 0; 14192 } 14193 } else if (locku_args) { 14194 LOCKU4res *locku_res; 14195 14196 locku_res = &resop->nfs_resop4_u.oplocku; 14197 14198 /* Update the stateid with the server's response */ 14199 nfs4_set_lock_stateid(lop, locku_res->lock_stateid); 14200 } else if (lockt_args) { 14201 /* Switch the lock type to express success, see fcntl */ 14202 flk->l_type = F_UNLCK; 14203 flk->l_whence = 0; 14204 } 14205 } 14206 14207 /* 14208 * Do final cleanup before exiting nfs4frlock. 14209 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 14210 * COMPOUND4 args/res for calls that haven't already. 14211 */ 14212 static void 14213 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp, 14214 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint, 14215 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop, 14216 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, 14217 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args, 14218 bool_t did_start_fop, bool_t skip_get_err, 14219 cred_t *cred_otw, cred_t *cred) 14220 { 14221 mntinfo4_t *mi = VTOMI4(vp); 14222 rnode4_t *rp = VTOR4(vp); 14223 int error = *errorp; 14224 nfs_argop4 *argop; 14225 int do_flush_pages = 0; 14226 14227 ASSERT(nfs_zone() == mi->mi_zone); 14228 /* 14229 * The client recovery code wants the raw status information, 14230 * so don't map the NFS status code to an errno value for 14231 * non-normal call types. 14232 */ 14233 if (ctype == NFS4_LCK_CTYPE_NORM) { 14234 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE) 14235 *errorp = geterrno4(resp->status); 14236 if (did_start_fop == TRUE) 14237 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep, 14238 needrecov); 14239 14240 /* 14241 * We've established a new lock on the server, so invalidate 14242 * the pages associated with the vnode to get the most up to 14243 * date pages from the server after acquiring the lock. We 14244 * want to be sure that the read operation gets the newest data. 14245 * 14246 * We flush the pages below after calling nfs4_end_fop above. 14247 * 14248 * The flush of the page cache must be done after 14249 * nfs4_end_open_seqid_sync() to avoid a 4-way hang. 14250 */ 14251 if (!error && resp && resp->status == NFS4_OK) 14252 do_flush_pages = 1; 14253 } 14254 if (argsp) { 14255 ASSERT(argsp->array_len == 2); 14256 argop = argsp->array; 14257 if (argop[1].argop == OP_LOCK) 14258 nfs4args_lock_free(&argop[1]); 14259 else if (argop[1].argop == OP_LOCKT) 14260 nfs4args_lockt_free(&argop[1]); 14261 kmem_free(argop, 2 * sizeof (nfs_argop4)); 14262 if (resp) 14263 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 14264 } 14265 14266 /* free the reference on the lock owner */ 14267 if (lop != NULL) { 14268 nfs4_end_lock_seqid_sync(lop); 14269 lock_owner_rele(lop); 14270 } 14271 14272 /* need to free up the reference on osp for lock args */ 14273 if (osp != NULL) 14274 open_stream_rele(osp, rp); 14275 14276 /* need to free up the reference on oop for lock args */ 14277 if (oop != NULL) { 14278 nfs4_end_open_seqid_sync(oop); 14279 open_owner_rele(oop); 14280 } 14281 14282 if (do_flush_pages) 14283 nfs4_flush_pages(vp, cred); 14284 14285 /* 14286 * Record debug information in the event we get EINVAL. 14287 */ 14288 mutex_enter(&mi->mi_lock); 14289 if (*errorp == EINVAL && (lock_args || locku_args) && 14290 (!(mi->mi_flags & MI4_POSIX_LOCK))) { 14291 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) { 14292 zcmn_err(getzoneid(), CE_NOTE, 14293 "%s operation failed with " 14294 "EINVAL probably since the server, %s," 14295 " doesn't support POSIX style locking", 14296 lock_args ? "LOCK" : "LOCKU", 14297 mi->mi_curr_serv->sv_hostname); 14298 mi->mi_flags |= MI4_LOCK_DEBUG; 14299 } 14300 } 14301 mutex_exit(&mi->mi_lock); 14302 14303 if (cred_otw) 14304 crfree(cred_otw); 14305 } 14306 14307 /* 14308 * This calls the server. 14309 * 14310 * Blocking lock requests will continually retry to acquire the lock 14311 * forever. 14312 * 14313 * The ctype is defined as follows: 14314 * NFS4_LCK_CTYPE_NORM: normal lock request. 14315 * 14316 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client 14317 * recovery. 14318 * 14319 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition 14320 * that we will use the information passed in via resend_rqstp to setup the 14321 * lock/locku request. This resend is the exact same request as the 'lost 14322 * lock', and is initiated by the recovery framework. A successful resend 14323 * request can initiate one or more reinstate requests. 14324 * 14325 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it 14326 * does not trigger additional reinstate requests. This lock call type is 14327 * set for setting the v4 server's locking state back to match what the 14328 * client's local locking state is in the event of a received 'lost lock'. 14329 * 14330 * Errors are returned via the nfs4_error_t parameter. 14331 */ 14332 void 14333 nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk, 14334 cred_t *cr, nfs4_error_t *ep, nfs4_lost_rqst_t *resend_rqstp, 14335 int *did_reclaimp) 14336 { 14337 COMPOUND4args_clnt args, *argsp = NULL; 14338 COMPOUND4res_clnt res, *resp = NULL; 14339 nfs_argop4 *argop; 14340 nfs_resop4 *resop; 14341 rnode4_t *rp; 14342 int doqueue = 1; 14343 clock_t tick_delay; /* delay in clock ticks */ 14344 LOCK4args *lock_args = NULL; 14345 LOCKU4args *locku_args = NULL; 14346 LOCKT4args *lockt_args = NULL; 14347 nfs4_open_owner_t *oop = NULL; 14348 nfs4_open_stream_t *osp = NULL; 14349 nfs4_lock_owner_t *lop = NULL; 14350 bool_t needrecov = FALSE; 14351 nfs4_recov_state_t recov_state; 14352 nfs4_op_hint_t op_hint; 14353 nfs4_lost_rqst_t lost_rqst; 14354 bool_t retry = FALSE; 14355 bool_t did_start_fop = FALSE; 14356 bool_t skip_get_err = FALSE; 14357 cred_t *cred_otw = NULL; 14358 bool_t recovonly; /* just queue request */ 14359 int frc_no_reclaim = 0; 14360 #ifdef DEBUG 14361 char *name; 14362 #endif 14363 14364 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14365 14366 #ifdef DEBUG 14367 name = fn_name(VTOSV(vp)->sv_name); 14368 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: " 14369 "%s: cmd %d, type %d, start %"PRIx64", " 14370 "length %"PRIu64", pid %d, sysid %d, call type %s, " 14371 "resend request %s", name, cmd, flk->l_type, flk->l_start, 14372 flk->l_len, flk->l_pid, flk->l_sysid, 14373 nfs4frlock_get_call_type(ctype), 14374 resend_rqstp ? "TRUE" : "FALSE")); 14375 kmem_free(name, MAXNAMELEN); 14376 #endif 14377 14378 nfs4_error_zinit(ep); 14379 14380 nfs4frlock_pre_setup(&tick_delay, &recov_state, vp, cr, &cred_otw); 14381 14382 rp = VTOR4(vp); 14383 14384 recov_retry: 14385 nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd, 14386 &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst); 14387 14388 ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state, 14389 &did_start_fop, &recovonly); 14390 14391 if (ep->error) 14392 goto out; 14393 14394 if (recovonly) { 14395 /* 14396 * Leave the request for the recovery system to deal with. 14397 */ 14398 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 14399 ASSERT(cmd != F_GETLK); 14400 ASSERT(flk->l_type == F_UNLCK); 14401 14402 nfs4_error_init(ep, EINTR); 14403 needrecov = TRUE; 14404 lop = find_lock_owner(rp, flk->l_pid, LOWN_ANY); 14405 if (lop != NULL) { 14406 nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT, 14407 NULL, NULL, lop, flk, &lost_rqst, cr, vp); 14408 (void) nfs4_start_recovery(ep, 14409 VTOMI4(vp), vp, NULL, NULL, 14410 (lost_rqst.lr_op == OP_LOCK || 14411 lost_rqst.lr_op == OP_LOCKU) ? 14412 &lost_rqst : NULL, OP_LOCKU, NULL, NULL, NULL); 14413 lock_owner_rele(lop); 14414 lop = NULL; 14415 } 14416 goto out; 14417 } 14418 14419 /* putfh directory fh */ 14420 argop[0].argop = OP_CPUTFH; 14421 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 14422 14423 /* 14424 * Set up the over-the-wire arguments and get references to the 14425 * open owner, etc. 14426 */ 14427 14428 if (ctype == NFS4_LCK_CTYPE_RESEND || 14429 ctype == NFS4_LCK_CTYPE_REINSTATE) { 14430 nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp, 14431 &argop[1], &lop, &oop, &osp, &lock_args, &locku_args); 14432 } else { 14433 bool_t go_otw = TRUE; 14434 14435 ASSERT(resend_rqstp == NULL); 14436 14437 switch (cmd) { 14438 case F_GETLK: 14439 case F_O_GETLK: 14440 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 14441 nfs4frlock_setup_lockt_args(&argop[1], &lockt_args, 14442 argsp, flk, rp); 14443 break; 14444 case F_SETLKW: 14445 case F_SETLK: 14446 if (flk->l_type == F_UNLCK) 14447 nfs4frlock_setup_locku_args(ctype, 14448 &argop[1], &locku_args, flk, 14449 &lop, ep, argsp, vp, cr, 14450 &skip_get_err, &go_otw); 14451 else 14452 nfs4frlock_setup_lock_args(ctype, 14453 &lock_args, &oop, &osp, &lop, &argop[1], 14454 argsp, flk, cmd, vp, cr, ep); 14455 14456 if (ep->error) 14457 goto out; 14458 14459 switch (ep->stat) { 14460 case NFS4_OK: 14461 break; 14462 case NFS4ERR_DELAY: 14463 /* recov thread never gets this error */ 14464 ASSERT(resend_rqstp == NULL); 14465 ASSERT(did_start_fop); 14466 14467 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 14468 &recov_state, TRUE); 14469 did_start_fop = FALSE; 14470 if (argop[1].argop == OP_LOCK) 14471 nfs4args_lock_free(&argop[1]); 14472 else if (argop[1].argop == OP_LOCKT) 14473 nfs4args_lockt_free(&argop[1]); 14474 kmem_free(argop, 2 * sizeof (nfs_argop4)); 14475 argsp = NULL; 14476 goto recov_retry; 14477 default: 14478 ep->error = EIO; 14479 goto out; 14480 } 14481 break; 14482 default: 14483 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14484 "nfs4_frlock: invalid cmd %d", cmd)); 14485 ep->error = EINVAL; 14486 goto out; 14487 } 14488 14489 if (!go_otw) 14490 goto out; 14491 } 14492 14493 /* 14494 * Send the server the lock request. Continually loop with a delay 14495 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE. 14496 */ 14497 resp = &res; 14498 14499 NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug), 14500 (CE_NOTE, 14501 "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first", 14502 rnode4info(rp))); 14503 14504 if (lock_args && frc_no_reclaim) { 14505 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM); 14506 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14507 "nfs4frlock: frc_no_reclaim: clearing reclaim")); 14508 lock_args->reclaim = FALSE; 14509 if (did_reclaimp) 14510 *did_reclaimp = 0; 14511 } 14512 14513 /* 14514 * Do the OTW call. 14515 */ 14516 rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep); 14517 14518 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14519 "nfs4frlock: error %d, status %d", ep->error, resp->status)); 14520 14521 needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp); 14522 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14523 "nfs4frlock: needrecov %d", needrecov)); 14524 14525 if (ep->error == 0 && nfs4_need_to_bump_seqid(resp)) 14526 nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop, 14527 args.ctag); 14528 14529 /* 14530 * Check if one of these mutually exclusive error cases has 14531 * happened: 14532 * need to swap credentials due to access error 14533 * recovery is needed 14534 * different error (only known case is missing Kerberos ticket) 14535 */ 14536 14537 if ((ep->error == EACCES || 14538 (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) && 14539 cred_otw != cr) { 14540 nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov, 14541 &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp, 14542 cr, &cred_otw); 14543 goto recov_retry; 14544 } 14545 14546 if (needrecov) { 14547 /* 14548 * LOCKT requests don't need to recover from lost 14549 * requests since they don't create/modify state. 14550 */ 14551 if ((ep->error == EINTR || 14552 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) && 14553 lockt_args) 14554 goto out; 14555 /* 14556 * Do not attempt recovery for requests initiated by 14557 * the recovery framework. Let the framework redrive them. 14558 */ 14559 if (ctype != NFS4_LCK_CTYPE_NORM) 14560 goto out; 14561 else { 14562 ASSERT(resend_rqstp == NULL); 14563 } 14564 14565 nfs4frlock_save_lost_rqst(ctype, ep->error, 14566 flk_to_locktype(cmd, flk->l_type), 14567 oop, osp, lop, flk, &lost_rqst, cred_otw, vp); 14568 14569 retry = nfs4frlock_recovery(needrecov, ep, &argsp, 14570 &resp, lock_args, locku_args, &oop, &osp, &lop, 14571 rp, vp, &recov_state, op_hint, &did_start_fop, 14572 cmd != F_GETLK ? &lost_rqst : NULL, flk); 14573 14574 if (retry) { 14575 ASSERT(oop == NULL); 14576 ASSERT(osp == NULL); 14577 ASSERT(lop == NULL); 14578 goto recov_retry; 14579 } 14580 goto out; 14581 } 14582 14583 /* 14584 * Bail out if have reached this point with ep->error set. Can 14585 * happen if (ep->error == EACCES && !needrecov && cred_otw == cr). 14586 * This happens if Kerberos ticket has expired or has been 14587 * destroyed. 14588 */ 14589 if (ep->error != 0) 14590 goto out; 14591 14592 /* 14593 * Process the reply. 14594 */ 14595 switch (resp->status) { 14596 case NFS4_OK: 14597 resop = &resp->array[1]; 14598 /* 14599 * Have a successful lock operation, now update state. 14600 */ 14601 nfs4frlock_update_state(lock_args, locku_args, lockt_args, 14602 resop, lop, vp, flk, cr, resend_rqstp); 14603 break; 14604 14605 case NFS4ERR_DENIED: 14606 resop = &resp->array[1]; 14607 retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args, 14608 &oop, &osp, &lop, cmd, vp, flk, op_hint, 14609 &recov_state, needrecov, &argsp, &resp, 14610 &tick_delay, &ep->error, resop, cr, 14611 &did_start_fop, &skip_get_err); 14612 14613 if (retry) { 14614 ASSERT(oop == NULL); 14615 ASSERT(osp == NULL); 14616 ASSERT(lop == NULL); 14617 goto recov_retry; 14618 } 14619 break; 14620 /* 14621 * If the server won't let us reclaim, fall-back to trying to lock 14622 * the file from scratch. Code elsewhere will check the changeinfo 14623 * to ensure the file hasn't been changed. 14624 */ 14625 case NFS4ERR_NO_GRACE: 14626 if (lock_args && lock_args->reclaim == TRUE) { 14627 ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM); 14628 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14629 "nfs4frlock: reclaim: NFS4ERR_NO_GRACE")); 14630 frc_no_reclaim = 1; 14631 /* clean up before retrying */ 14632 needrecov = 0; 14633 (void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp, 14634 lock_args, locku_args, &oop, &osp, &lop, rp, vp, 14635 &recov_state, op_hint, &did_start_fop, NULL, flk); 14636 goto recov_retry; 14637 } 14638 /* FALLTHROUGH */ 14639 14640 default: 14641 nfs4frlock_results_default(resp, &ep->error); 14642 break; 14643 } 14644 out: 14645 /* 14646 * Process and cleanup from error. Make interrupted unlock 14647 * requests look successful, since they will be handled by the 14648 * client recovery code. 14649 */ 14650 nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state, 14651 needrecov, oop, osp, lop, &ep->error, 14652 lock_args, locku_args, did_start_fop, 14653 skip_get_err, cred_otw, cr); 14654 14655 if (ep->error == EINTR && flk->l_type == F_UNLCK && 14656 (cmd == F_SETLK || cmd == F_SETLKW)) 14657 ep->error = 0; 14658 } 14659 14660 /* 14661 * nfs4_safelock: 14662 * 14663 * Return non-zero if the given lock request can be handled without 14664 * violating the constraints on concurrent mapping and locking. 14665 */ 14666 14667 static int 14668 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr) 14669 { 14670 rnode4_t *rp = VTOR4(vp); 14671 struct vattr va; 14672 int error; 14673 14674 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14675 ASSERT(rp->r_mapcnt >= 0); 14676 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: " 14677 "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ? 14678 "write" : bfp->l_type == F_RDLCK ? "read" : "unlock", 14679 bfp->l_start, bfp->l_len, rp->r_mapcnt)); 14680 14681 if (rp->r_mapcnt == 0) 14682 return (1); /* always safe if not mapped */ 14683 14684 /* 14685 * If the file is already mapped and there are locks, then they 14686 * should be all safe locks. So adding or removing a lock is safe 14687 * as long as the new request is safe (i.e., whole-file, meaning 14688 * length and starting offset are both zero). 14689 */ 14690 14691 if (bfp->l_start != 0 || bfp->l_len != 0) { 14692 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14693 "cannot lock a memory mapped file unless locking the " 14694 "entire file: start %"PRIx64", len %"PRIx64, 14695 bfp->l_start, bfp->l_len)); 14696 return (0); 14697 } 14698 14699 /* mandatory locking and mapping don't mix */ 14700 va.va_mask = AT_MODE; 14701 error = VOP_GETATTR(vp, &va, 0, cr, NULL); 14702 if (error != 0) { 14703 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14704 "getattr error %d", error)); 14705 return (0); /* treat errors conservatively */ 14706 } 14707 if (MANDLOCK(vp, va.va_mode)) { 14708 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: " 14709 "cannot mandatory lock and mmap a file")); 14710 return (0); 14711 } 14712 14713 return (1); 14714 } 14715 14716 /* 14717 * nfs4_lockrelease: 14718 * 14719 * Release any locks on the given vnode that are held by the current 14720 * process. Also removes the lock owner (if one exists) from the rnode's 14721 * list. 14722 */ 14723 static int 14724 nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 14725 { 14726 flock64_t ld; 14727 int ret, error; 14728 rnode4_t *rp; 14729 nfs4_lock_owner_t *lop; 14730 nfs4_recov_state_t recov_state; 14731 mntinfo4_t *mi; 14732 bool_t possible_orphan = FALSE; 14733 bool_t recovonly; 14734 14735 ASSERT((uintptr_t)vp > KERNELBASE); 14736 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14737 14738 rp = VTOR4(vp); 14739 mi = VTOMI4(vp); 14740 14741 /* 14742 * If we have not locked anything then we can 14743 * just return since we have no work to do. 14744 */ 14745 if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) { 14746 return (0); 14747 } 14748 14749 /* 14750 * We need to comprehend that another thread may 14751 * kick off recovery and the lock_owner we have stashed 14752 * in lop might be invalid so we should NOT cache it 14753 * locally! 14754 */ 14755 recov_state.rs_flags = 0; 14756 recov_state.rs_num_retry_despite_err = 0; 14757 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14758 &recovonly); 14759 if (error) { 14760 mutex_enter(&rp->r_statelock); 14761 rp->r_flags |= R4LODANGLERS; 14762 mutex_exit(&rp->r_statelock); 14763 return (error); 14764 } 14765 14766 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14767 14768 /* 14769 * Check if the lock owner might have a lock (request was sent but 14770 * no response was received). Also check if there are any remote 14771 * locks on the file. (In theory we shouldn't have to make this 14772 * second check if there's no lock owner, but for now we'll be 14773 * conservative and do it anyway.) If either condition is true, 14774 * send an unlock for the entire file to the server. 14775 * 14776 * Note that no explicit synchronization is needed here. At worst, 14777 * flk_has_remote_locks() will return a false positive, in which case 14778 * the unlock call wastes time but doesn't harm correctness. 14779 */ 14780 14781 if (lop) { 14782 mutex_enter(&lop->lo_lock); 14783 possible_orphan = lop->lo_pending_rqsts; 14784 mutex_exit(&lop->lo_lock); 14785 lock_owner_rele(lop); 14786 } 14787 14788 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14789 14790 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14791 "nfs4_lockrelease: possible orphan %d, remote locks %d, for " 14792 "lop %p.", possible_orphan, flk_has_remote_locks(vp), 14793 (void *)lop)); 14794 14795 if (possible_orphan || flk_has_remote_locks(vp)) { 14796 ld.l_type = F_UNLCK; /* set to unlock entire file */ 14797 ld.l_whence = 0; /* unlock from start of file */ 14798 ld.l_start = 0; 14799 ld.l_len = 0; /* do entire file */ 14800 14801 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, 14802 cr, NULL); 14803 14804 if (ret != 0) { 14805 /* 14806 * If VOP_FRLOCK fails, make sure we unregister 14807 * local locks before we continue. 14808 */ 14809 struct lm_sysid *lmsid = nfs4_find_sysid(VTOMI4(vp)); 14810 14811 if (lmsid != NULL) { 14812 cleanlocks(vp, curproc->p_pid, 14813 lm_sysidt(lmsid) | LM_SYSID_CLIENT); 14814 lm_rel_sysid(lmsid); 14815 } 14816 14817 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14818 "nfs4_lockrelease: lock release error on vp" 14819 " %p: error %d.\n", (void *)vp, ret)); 14820 } 14821 } 14822 14823 recov_state.rs_flags = 0; 14824 recov_state.rs_num_retry_despite_err = 0; 14825 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14826 &recovonly); 14827 if (error) { 14828 mutex_enter(&rp->r_statelock); 14829 rp->r_flags |= R4LODANGLERS; 14830 mutex_exit(&rp->r_statelock); 14831 return (error); 14832 } 14833 14834 /* 14835 * So, here we're going to need to retrieve the lock-owner 14836 * again (in case recovery has done a switch-a-roo) and 14837 * remove it because we can. 14838 */ 14839 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14840 14841 if (lop) { 14842 nfs4_rnode_remove_lock_owner(rp, lop); 14843 lock_owner_rele(lop); 14844 } 14845 14846 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14847 return (0); 14848 } 14849 14850 /* 14851 * Wait for 'tick_delay' clock ticks. 14852 * Implement exponential backoff until hit the lease_time of this nfs4_server. 14853 * 14854 * The client should retry to acquire the lock faster than the lease period. 14855 * We use roughly half of the lease time to use a similar calculation as it is 14856 * used in nfs4_renew_lease_thread(). 14857 * 14858 * XXX For future improvements, should implement a waiting queue scheme. 14859 */ 14860 static int 14861 nfs4_block_and_wait(clock_t *tick_delay) 14862 { 14863 /* wait tick_delay clock ticks or siginteruptus */ 14864 if (delay_sig(*tick_delay)) { 14865 return (EINTR); 14866 } 14867 14868 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: " 14869 "reissue the lock request: blocked for %ld clock ticks: %ld " 14870 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000)); 14871 14872 *tick_delay = MIN(drv_usectohz(nfs4_max_base_wait_time * 1000), 14873 *tick_delay * 1.5); 14874 return (0); 14875 } 14876 14877 void 14878 nfs4_vnops_init(void) 14879 { 14880 } 14881 14882 void 14883 nfs4_vnops_fini(void) 14884 { 14885 } 14886 14887 /* 14888 * Return a reference to the directory (parent) vnode for a given vnode, 14889 * using the saved pathname information and the directory file handle. The 14890 * caller is responsible for disposing of the reference. 14891 * Returns zero or an errno value. 14892 * 14893 * Caller should set need_start_op to FALSE if it is the recovery 14894 * thread, or if a start_fop has already been done. Otherwise, TRUE. 14895 */ 14896 int 14897 vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op) 14898 { 14899 svnode_t *svnp; 14900 vnode_t *dvp = NULL; 14901 servinfo4_t *svp; 14902 nfs4_fname_t *mfname; 14903 int error; 14904 14905 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14906 14907 if (vp->v_flag & VROOT) { 14908 nfs4_sharedfh_t *sfh; 14909 nfs_fh4 fh; 14910 mntinfo4_t *mi; 14911 14912 ASSERT(vp->v_type == VREG); 14913 14914 mi = VTOMI4(vp); 14915 svp = mi->mi_curr_serv; 14916 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14917 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len; 14918 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf; 14919 sfh = sfh4_get(&fh, VTOMI4(vp)); 14920 nfs_rw_exit(&svp->sv_lock); 14921 mfname = mi->mi_fname; 14922 fn_hold(mfname); 14923 dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0); 14924 sfh4_rele(&sfh); 14925 14926 if (dvp->v_type == VNON) 14927 dvp->v_type = VDIR; 14928 *dvpp = dvp; 14929 return (0); 14930 } 14931 14932 svnp = VTOSV(vp); 14933 14934 if (svnp == NULL) { 14935 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14936 "shadow node is NULL")); 14937 return (EINVAL); 14938 } 14939 14940 if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) { 14941 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14942 "shadow node name or dfh val == NULL")); 14943 return (EINVAL); 14944 } 14945 14946 error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp, 14947 (int)need_start_op); 14948 if (error != 0) { 14949 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14950 "nfs4_make_dotdot returned %d", error)); 14951 return (error); 14952 } 14953 if (!dvp) { 14954 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14955 "nfs4_make_dotdot returned a NULL dvp")); 14956 return (EIO); 14957 } 14958 if (dvp->v_type == VNON) 14959 dvp->v_type = VDIR; 14960 ASSERT(dvp->v_type == VDIR); 14961 if (VTOR4(vp)->r_flags & R4ISXATTR) { 14962 mutex_enter(&dvp->v_lock); 14963 dvp->v_flag |= V_XATTRDIR; 14964 mutex_exit(&dvp->v_lock); 14965 } 14966 *dvpp = dvp; 14967 return (0); 14968 } 14969 14970 /* 14971 * Copy the (final) component name of vp to fnamep. maxlen is the maximum 14972 * length that fnamep can accept, including the trailing null. 14973 * Returns 0 if okay, returns an errno value if there was a problem. 14974 */ 14975 14976 int 14977 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen) 14978 { 14979 char *fn; 14980 int err = 0; 14981 servinfo4_t *svp; 14982 svnode_t *shvp; 14983 14984 /* 14985 * If the file being opened has VROOT set, then this is 14986 * a "file" mount. sv_name will not be interesting, so 14987 * go back to the servinfo4 to get the original mount 14988 * path and strip off all but the final edge. Otherwise 14989 * just return the name from the shadow vnode. 14990 */ 14991 14992 if (vp->v_flag & VROOT) { 14993 14994 svp = VTOMI4(vp)->mi_curr_serv; 14995 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14996 14997 fn = strrchr(svp->sv_path, '/'); 14998 if (fn == NULL) 14999 err = EINVAL; 15000 else 15001 fn++; 15002 } else { 15003 shvp = VTOSV(vp); 15004 fn = fn_name(shvp->sv_name); 15005 } 15006 15007 if (err == 0) 15008 if (strlen(fn) < maxlen) 15009 (void) strcpy(fnamep, fn); 15010 else 15011 err = ENAMETOOLONG; 15012 15013 if (vp->v_flag & VROOT) 15014 nfs_rw_exit(&svp->sv_lock); 15015 else 15016 kmem_free(fn, MAXNAMELEN); 15017 15018 return (err); 15019 } 15020 15021 /* 15022 * Bookkeeping for a close that doesn't need to go over the wire. 15023 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise 15024 * it is left at 1. 15025 */ 15026 void 15027 nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp) 15028 { 15029 rnode4_t *rp; 15030 mntinfo4_t *mi; 15031 15032 mi = VTOMI4(vp); 15033 rp = VTOR4(vp); 15034 15035 NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: " 15036 "rp=%p osp=%p", (void *)rp, (void *)osp)); 15037 ASSERT(nfs_zone() == mi->mi_zone); 15038 ASSERT(mutex_owned(&osp->os_sync_lock)); 15039 ASSERT(*have_lockp); 15040 15041 if (!osp->os_valid || 15042 osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) { 15043 return; 15044 } 15045 15046 /* 15047 * This removes the reference obtained at OPEN; ie, 15048 * when the open stream structure was created. 15049 * 15050 * We don't have to worry about calling 'open_stream_rele' 15051 * since we our currently holding a reference to this 15052 * open stream which means the count can not go to 0 with 15053 * this decrement. 15054 */ 15055 ASSERT(osp->os_ref_count >= 2); 15056 osp->os_ref_count--; 15057 osp->os_valid = 0; 15058 mutex_exit(&osp->os_sync_lock); 15059 *have_lockp = 0; 15060 15061 nfs4_dec_state_ref_count(mi); 15062 } 15063 15064 /* 15065 * Close all remaining open streams on the rnode. These open streams 15066 * could be here because: 15067 * - The close attempted at either close or delmap failed 15068 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE 15069 * - Someone did mknod on a regular file but never opened it 15070 */ 15071 int 15072 nfs4close_all(vnode_t *vp, cred_t *cr) 15073 { 15074 nfs4_open_stream_t *osp; 15075 int error; 15076 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 15077 rnode4_t *rp; 15078 15079 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 15080 15081 error = 0; 15082 rp = VTOR4(vp); 15083 15084 /* 15085 * At this point, all we know is that the last time 15086 * someone called vn_rele, the count was 1. Since then, 15087 * the vnode could have been re-activated. We want to 15088 * loop through the open streams and close each one, but 15089 * we have to be careful since once we release the rnode 15090 * hash bucket lock, someone else is free to come in and 15091 * re-activate the rnode and add new open streams. The 15092 * strategy is take the rnode hash bucket lock, verify that 15093 * the count is still 1, grab the open stream off the 15094 * head of the list and mark it invalid, then release the 15095 * rnode hash bucket lock and proceed with that open stream. 15096 * This is ok because nfs4close_one() will acquire the proper 15097 * open/create to close/destroy synchronization for open 15098 * streams, and will ensure that if someone has reopened 15099 * the open stream after we've dropped the hash bucket lock 15100 * then we'll just simply return without destroying the 15101 * open stream. 15102 * Repeat until the list is empty. 15103 */ 15104 15105 for (;;) { 15106 15107 /* make sure vnode hasn't been reactivated */ 15108 rw_enter(&rp->r_hashq->r_lock, RW_READER); 15109 mutex_enter(&vp->v_lock); 15110 if (vp->v_count > 1) { 15111 mutex_exit(&vp->v_lock); 15112 rw_exit(&rp->r_hashq->r_lock); 15113 break; 15114 } 15115 /* 15116 * Grabbing r_os_lock before releasing v_lock prevents 15117 * a window where the rnode/open stream could get 15118 * reactivated (and os_force_close set to 0) before we 15119 * had a chance to set os_force_close to 1. 15120 */ 15121 mutex_enter(&rp->r_os_lock); 15122 mutex_exit(&vp->v_lock); 15123 15124 osp = list_head(&rp->r_open_streams); 15125 if (!osp) { 15126 /* nothing left to CLOSE OTW, so return */ 15127 mutex_exit(&rp->r_os_lock); 15128 rw_exit(&rp->r_hashq->r_lock); 15129 break; 15130 } 15131 15132 mutex_enter(&rp->r_statev4_lock); 15133 /* the file can't still be mem mapped */ 15134 ASSERT(rp->r_mapcnt == 0); 15135 if (rp->created_v4) 15136 rp->created_v4 = 0; 15137 mutex_exit(&rp->r_statev4_lock); 15138 15139 /* 15140 * Grab a ref on this open stream; nfs4close_one 15141 * will mark it as invalid 15142 */ 15143 mutex_enter(&osp->os_sync_lock); 15144 osp->os_ref_count++; 15145 osp->os_force_close = 1; 15146 mutex_exit(&osp->os_sync_lock); 15147 mutex_exit(&rp->r_os_lock); 15148 rw_exit(&rp->r_hashq->r_lock); 15149 15150 nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0); 15151 15152 /* Update error if it isn't already non-zero */ 15153 if (error == 0) { 15154 if (e.error) 15155 error = e.error; 15156 else if (e.stat) 15157 error = geterrno4(e.stat); 15158 } 15159 15160 #ifdef DEBUG 15161 nfs4close_all_cnt++; 15162 #endif 15163 /* Release the ref on osp acquired above. */ 15164 open_stream_rele(osp, rp); 15165 15166 /* Proceed to the next open stream, if any */ 15167 } 15168 return (error); 15169 } 15170 15171 /* 15172 * nfs4close_one - close one open stream for a file if needed. 15173 * 15174 * "close_type" indicates which close path this is: 15175 * CLOSE_NORM: close initiated via VOP_CLOSE. 15176 * CLOSE_DELMAP: close initiated via VOP_DELMAP. 15177 * CLOSE_FORCE: close initiated via VOP_INACTIVE. This path forces 15178 * the close and release of client state for this open stream 15179 * (unless someone else has the open stream open). 15180 * CLOSE_RESEND: indicates the request is a replay of an earlier request 15181 * (e.g., due to abort because of a signal). 15182 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN. 15183 * 15184 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client 15185 * recovery. Instead, the caller is expected to deal with retries. 15186 * 15187 * The caller can either pass in the osp ('provided_osp') or not. 15188 * 15189 * 'access_bits' represents the access we are closing/downgrading. 15190 * 15191 * 'len', 'prot', and 'mmap_flags' are used for CLOSE_DELMAP. 'len' is the 15192 * number of bytes we are unmapping, 'maxprot' is the mmap protection, and 15193 * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED). 15194 * 15195 * Errors are returned via the nfs4_error_t. 15196 */ 15197 void 15198 nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr, 15199 int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep, 15200 nfs4_close_type_t close_type, size_t len, uint_t maxprot, 15201 uint_t mmap_flags) 15202 { 15203 nfs4_open_owner_t *oop; 15204 nfs4_open_stream_t *osp = NULL; 15205 int retry = 0; 15206 int num_retries = NFS4_NUM_RECOV_RETRIES; 15207 rnode4_t *rp; 15208 mntinfo4_t *mi; 15209 nfs4_recov_state_t recov_state; 15210 cred_t *cred_otw = NULL; 15211 bool_t recovonly = FALSE; 15212 int isrecov; 15213 int force_close; 15214 int close_failed = 0; 15215 int did_dec_count = 0; 15216 int did_start_op = 0; 15217 int did_force_recovlock = 0; 15218 int did_start_seqid_sync = 0; 15219 int have_sync_lock = 0; 15220 15221 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 15222 15223 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, " 15224 "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x", 15225 (void *)vp, (void *)provided_osp, (void *)lrp, close_type, 15226 len, maxprot, mmap_flags, access_bits)); 15227 15228 nfs4_error_zinit(ep); 15229 rp = VTOR4(vp); 15230 mi = VTOMI4(vp); 15231 isrecov = (close_type == CLOSE_RESEND || 15232 close_type == CLOSE_AFTER_RESEND); 15233 15234 /* 15235 * First get the open owner. 15236 */ 15237 if (!provided_osp) { 15238 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 15239 } else { 15240 oop = provided_osp->os_open_owner; 15241 ASSERT(oop != NULL); 15242 open_owner_hold(oop); 15243 } 15244 15245 if (!oop) { 15246 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15247 "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, " 15248 "close type %d", (void *)rp, (void *)mi, (void *)cr, 15249 (void *)provided_osp, close_type)); 15250 ep->error = EIO; 15251 goto out; 15252 } 15253 15254 cred_otw = nfs4_get_otw_cred(cr, mi, oop); 15255 recov_retry: 15256 osp = NULL; 15257 close_failed = 0; 15258 force_close = (close_type == CLOSE_FORCE); 15259 retry = 0; 15260 did_start_op = 0; 15261 did_force_recovlock = 0; 15262 did_start_seqid_sync = 0; 15263 have_sync_lock = 0; 15264 recovonly = FALSE; 15265 recov_state.rs_flags = 0; 15266 recov_state.rs_num_retry_despite_err = 0; 15267 15268 /* 15269 * Second synchronize with recovery. 15270 */ 15271 if (!isrecov) { 15272 ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE, 15273 &recov_state, &recovonly); 15274 if (!ep->error) { 15275 did_start_op = 1; 15276 } else { 15277 close_failed = 1; 15278 /* 15279 * If we couldn't get start_fop, but have to 15280 * cleanup state, then at least acquire the 15281 * mi_recovlock so we can synchronize with 15282 * recovery. 15283 */ 15284 if (close_type == CLOSE_FORCE) { 15285 (void) nfs_rw_enter_sig(&mi->mi_recovlock, 15286 RW_READER, FALSE); 15287 did_force_recovlock = 1; 15288 } else 15289 goto out; 15290 } 15291 } 15292 15293 /* 15294 * We cannot attempt to get the open seqid sync if nfs4_start_fop 15295 * set 'recovonly' to TRUE since most likely this is due to 15296 * reovery being active (MI4_RECOV_ACTIV). If recovery is active, 15297 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us 15298 * to retry, causing us to loop until recovery finishes. Plus we 15299 * don't need protection over the open seqid since we're not going 15300 * OTW, hence don't need to use the seqid. 15301 */ 15302 if (recovonly == FALSE) { 15303 /* need to grab the open owner sync before 'os_sync_lock' */ 15304 ep->error = nfs4_start_open_seqid_sync(oop, mi); 15305 if (ep->error == EAGAIN) { 15306 ASSERT(!isrecov); 15307 if (did_start_op) 15308 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15309 &recov_state, TRUE); 15310 if (did_force_recovlock) 15311 nfs_rw_exit(&mi->mi_recovlock); 15312 goto recov_retry; 15313 } 15314 did_start_seqid_sync = 1; 15315 } 15316 15317 /* 15318 * Third get an open stream and acquire 'os_sync_lock' to 15319 * sychronize the opening/creating of an open stream with the 15320 * closing/destroying of an open stream. 15321 */ 15322 if (!provided_osp) { 15323 /* returns with 'os_sync_lock' held */ 15324 osp = find_open_stream(oop, rp); 15325 if (!osp) { 15326 ep->error = EIO; 15327 goto out; 15328 } 15329 } else { 15330 osp = provided_osp; 15331 open_stream_hold(osp); 15332 mutex_enter(&osp->os_sync_lock); 15333 } 15334 have_sync_lock = 1; 15335 15336 ASSERT(oop == osp->os_open_owner); 15337 15338 /* 15339 * Fourth, do any special pre-OTW CLOSE processing 15340 * based on the specific close type. 15341 */ 15342 if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) && 15343 !did_dec_count) { 15344 ASSERT(osp->os_open_ref_count > 0); 15345 osp->os_open_ref_count--; 15346 did_dec_count = 1; 15347 if (osp->os_open_ref_count == 0) 15348 osp->os_final_close = 1; 15349 } 15350 15351 if (close_type == CLOSE_FORCE) { 15352 /* see if somebody reopened the open stream. */ 15353 if (!osp->os_force_close) { 15354 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, 15355 "nfs4close_one: skip CLOSE_FORCE as osp %p " 15356 "was reopened, vp %p", (void *)osp, (void *)vp)); 15357 ep->error = 0; 15358 ep->stat = NFS4_OK; 15359 goto out; 15360 } 15361 15362 if (!osp->os_final_close && !did_dec_count) { 15363 osp->os_open_ref_count--; 15364 did_dec_count = 1; 15365 } 15366 15367 /* 15368 * We can't depend on os_open_ref_count being 0 due to the 15369 * way executables are opened (VN_RELE to match a VOP_OPEN). 15370 */ 15371 #ifdef NOTYET 15372 ASSERT(osp->os_open_ref_count == 0); 15373 #endif 15374 if (osp->os_open_ref_count != 0) { 15375 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, 15376 "nfs4close_one: should panic here on an " 15377 "ASSERT(osp->os_open_ref_count == 0). Ignoring " 15378 "since this is probably the exec problem.")); 15379 15380 osp->os_open_ref_count = 0; 15381 } 15382 15383 /* 15384 * There is the possibility that nfs4close_one() 15385 * for close_type == CLOSE_DELMAP couldn't find the 15386 * open stream, thus couldn't decrement its os_mapcnt; 15387 * therefore we can't use this ASSERT yet. 15388 */ 15389 #ifdef NOTYET 15390 ASSERT(osp->os_mapcnt == 0); 15391 #endif 15392 osp->os_mapcnt = 0; 15393 } 15394 15395 if (close_type == CLOSE_DELMAP && !did_dec_count) { 15396 ASSERT(osp->os_mapcnt >= btopr(len)); 15397 15398 if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 15399 osp->os_mmap_write -= btopr(len); 15400 if (maxprot & PROT_READ) 15401 osp->os_mmap_read -= btopr(len); 15402 if (maxprot & PROT_EXEC) 15403 osp->os_mmap_read -= btopr(len); 15404 /* mirror the PROT_NONE check in nfs4_addmap() */ 15405 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 15406 !(maxprot & PROT_EXEC)) 15407 osp->os_mmap_read -= btopr(len); 15408 osp->os_mapcnt -= btopr(len); 15409 did_dec_count = 1; 15410 } 15411 15412 if (recovonly) { 15413 nfs4_lost_rqst_t lost_rqst; 15414 15415 /* request should not already be in recovery queue */ 15416 ASSERT(lrp == NULL); 15417 nfs4_error_init(ep, EINTR); 15418 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop, 15419 osp, cred_otw, vp); 15420 mutex_exit(&osp->os_sync_lock); 15421 have_sync_lock = 0; 15422 (void) nfs4_start_recovery(ep, mi, vp, NULL, NULL, 15423 lost_rqst.lr_op == OP_CLOSE ? 15424 &lost_rqst : NULL, OP_CLOSE, NULL, NULL, NULL); 15425 close_failed = 1; 15426 force_close = 0; 15427 goto close_cleanup; 15428 } 15429 15430 /* 15431 * If a previous OTW call got NFS4ERR_BAD_SEQID, then 15432 * we stopped operating on the open owner's <old oo_name, old seqid> 15433 * space, which means we stopped operating on the open stream 15434 * too. So don't go OTW (as the seqid is likely bad, and the 15435 * stateid could be stale, potentially triggering a false 15436 * setclientid), and just clean up the client's internal state. 15437 */ 15438 if (osp->os_orig_oo_name != oop->oo_name) { 15439 NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug, 15440 (CE_NOTE, "nfs4close_one: skip OTW close for osp %p " 15441 "oop %p due to bad seqid (orig oo_name %" PRIx64 " current " 15442 "oo_name %" PRIx64")", 15443 (void *)osp, (void *)oop, osp->os_orig_oo_name, 15444 oop->oo_name)); 15445 close_failed = 1; 15446 } 15447 15448 /* If the file failed recovery, just quit. */ 15449 mutex_enter(&rp->r_statelock); 15450 if (rp->r_flags & R4RECOVERR) { 15451 close_failed = 1; 15452 } 15453 mutex_exit(&rp->r_statelock); 15454 15455 /* 15456 * If the force close path failed to obtain start_fop 15457 * then skip the OTW close and just remove the state. 15458 */ 15459 if (close_failed) 15460 goto close_cleanup; 15461 15462 /* 15463 * Fifth, check to see if there are still mapped pages or other 15464 * opens using this open stream. If there are then we can't 15465 * close yet but we can see if an OPEN_DOWNGRADE is necessary. 15466 */ 15467 if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) { 15468 nfs4_lost_rqst_t new_lost_rqst; 15469 bool_t needrecov = FALSE; 15470 cred_t *odg_cred_otw = NULL; 15471 seqid4 open_dg_seqid = 0; 15472 15473 if (osp->os_delegation) { 15474 /* 15475 * If this open stream was never OPENed OTW then we 15476 * surely can't DOWNGRADE it (especially since the 15477 * osp->open_stateid is really a delegation stateid 15478 * when os_delegation is 1). 15479 */ 15480 if (access_bits & FREAD) 15481 osp->os_share_acc_read--; 15482 if (access_bits & FWRITE) 15483 osp->os_share_acc_write--; 15484 osp->os_share_deny_none--; 15485 nfs4_error_zinit(ep); 15486 goto out; 15487 } 15488 nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr, 15489 lrp, ep, &odg_cred_otw, &open_dg_seqid); 15490 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp); 15491 if (needrecov && !isrecov) { 15492 bool_t abort; 15493 nfs4_bseqid_entry_t *bsep = NULL; 15494 15495 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) 15496 bsep = nfs4_create_bseqid_entry(oop, NULL, 15497 vp, 0, 15498 lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG, 15499 open_dg_seqid); 15500 15501 nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst, 15502 oop, osp, odg_cred_otw, vp, access_bits, 0); 15503 mutex_exit(&osp->os_sync_lock); 15504 have_sync_lock = 0; 15505 abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL, 15506 new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ? 15507 &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE, 15508 bsep, NULL, NULL); 15509 if (odg_cred_otw) 15510 crfree(odg_cred_otw); 15511 if (bsep) 15512 kmem_free(bsep, sizeof (*bsep)); 15513 15514 if (abort == TRUE) 15515 goto out; 15516 15517 if (did_start_seqid_sync) { 15518 nfs4_end_open_seqid_sync(oop); 15519 did_start_seqid_sync = 0; 15520 } 15521 open_stream_rele(osp, rp); 15522 15523 if (did_start_op) 15524 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15525 &recov_state, FALSE); 15526 if (did_force_recovlock) 15527 nfs_rw_exit(&mi->mi_recovlock); 15528 15529 goto recov_retry; 15530 } else { 15531 if (odg_cred_otw) 15532 crfree(odg_cred_otw); 15533 } 15534 goto out; 15535 } 15536 15537 /* 15538 * If this open stream was created as the results of an open 15539 * while holding a delegation, then just release it; no need 15540 * to do an OTW close. Otherwise do a "normal" OTW close. 15541 */ 15542 if (osp->os_delegation) { 15543 nfs4close_notw(vp, osp, &have_sync_lock); 15544 nfs4_error_zinit(ep); 15545 goto out; 15546 } 15547 15548 /* 15549 * If this stream is not valid, we're done. 15550 */ 15551 if (!osp->os_valid) { 15552 nfs4_error_zinit(ep); 15553 goto out; 15554 } 15555 15556 /* 15557 * Last open or mmap ref has vanished, need to do an OTW close. 15558 * First check to see if a close is still necessary. 15559 */ 15560 if (osp->os_failed_reopen) { 15561 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15562 "don't close OTW osp %p since reopen failed.", 15563 (void *)osp)); 15564 /* 15565 * Reopen of the open stream failed, hence the 15566 * stateid of the open stream is invalid/stale, and 15567 * sending this OTW would incorrectly cause another 15568 * round of recovery. In this case, we need to set 15569 * the 'os_valid' bit to 0 so another thread doesn't 15570 * come in and re-open this open stream before 15571 * this "closing" thread cleans up state (decrementing 15572 * the nfs4_server_t's state_ref_count and decrementing 15573 * the os_ref_count). 15574 */ 15575 osp->os_valid = 0; 15576 /* 15577 * This removes the reference obtained at OPEN; ie, 15578 * when the open stream structure was created. 15579 * 15580 * We don't have to worry about calling 'open_stream_rele' 15581 * since we our currently holding a reference to this 15582 * open stream which means the count can not go to 0 with 15583 * this decrement. 15584 */ 15585 ASSERT(osp->os_ref_count >= 2); 15586 osp->os_ref_count--; 15587 nfs4_error_zinit(ep); 15588 close_failed = 0; 15589 goto close_cleanup; 15590 } 15591 15592 ASSERT(osp->os_ref_count > 1); 15593 15594 /* 15595 * Sixth, try the CLOSE OTW. 15596 */ 15597 nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync, 15598 close_type, ep, &have_sync_lock); 15599 15600 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) { 15601 /* 15602 * Let the recovery thread be responsible for 15603 * removing the state for CLOSE. 15604 */ 15605 close_failed = 1; 15606 force_close = 0; 15607 retry = 0; 15608 } 15609 15610 /* See if we need to retry with a different cred */ 15611 if ((ep->error == EACCES || 15612 (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) && 15613 cred_otw != cr) { 15614 crfree(cred_otw); 15615 cred_otw = cr; 15616 crhold(cred_otw); 15617 retry = 1; 15618 } 15619 15620 if (ep->error || ep->stat) 15621 close_failed = 1; 15622 15623 if (retry && !isrecov && num_retries-- > 0) { 15624 if (have_sync_lock) { 15625 mutex_exit(&osp->os_sync_lock); 15626 have_sync_lock = 0; 15627 } 15628 if (did_start_seqid_sync) { 15629 nfs4_end_open_seqid_sync(oop); 15630 did_start_seqid_sync = 0; 15631 } 15632 open_stream_rele(osp, rp); 15633 15634 if (did_start_op) 15635 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15636 &recov_state, FALSE); 15637 if (did_force_recovlock) 15638 nfs_rw_exit(&mi->mi_recovlock); 15639 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15640 "nfs4close_one: need to retry the close " 15641 "operation")); 15642 goto recov_retry; 15643 } 15644 close_cleanup: 15645 /* 15646 * Seventh and lastly, process our results. 15647 */ 15648 if (close_failed && force_close) { 15649 /* 15650 * It's ok to drop and regrab the 'os_sync_lock' since 15651 * nfs4close_notw() will recheck to make sure the 15652 * "close"/removal of state should happen. 15653 */ 15654 if (!have_sync_lock) { 15655 mutex_enter(&osp->os_sync_lock); 15656 have_sync_lock = 1; 15657 } 15658 /* 15659 * This is last call, remove the ref on the open 15660 * stream created by open and clean everything up. 15661 */ 15662 osp->os_pending_close = 0; 15663 nfs4close_notw(vp, osp, &have_sync_lock); 15664 nfs4_error_zinit(ep); 15665 } 15666 15667 if (!close_failed) { 15668 if (have_sync_lock) { 15669 osp->os_pending_close = 0; 15670 mutex_exit(&osp->os_sync_lock); 15671 have_sync_lock = 0; 15672 } else { 15673 mutex_enter(&osp->os_sync_lock); 15674 osp->os_pending_close = 0; 15675 mutex_exit(&osp->os_sync_lock); 15676 } 15677 if (did_start_op && recov_state.rs_sp != NULL) { 15678 mutex_enter(&recov_state.rs_sp->s_lock); 15679 nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi); 15680 mutex_exit(&recov_state.rs_sp->s_lock); 15681 } else { 15682 nfs4_dec_state_ref_count(mi); 15683 } 15684 nfs4_error_zinit(ep); 15685 } 15686 15687 out: 15688 if (have_sync_lock) 15689 mutex_exit(&osp->os_sync_lock); 15690 if (did_start_op) 15691 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state, 15692 recovonly ? TRUE : FALSE); 15693 if (did_force_recovlock) 15694 nfs_rw_exit(&mi->mi_recovlock); 15695 if (cred_otw) 15696 crfree(cred_otw); 15697 if (osp) 15698 open_stream_rele(osp, rp); 15699 if (oop) { 15700 if (did_start_seqid_sync) 15701 nfs4_end_open_seqid_sync(oop); 15702 open_owner_rele(oop); 15703 } 15704 } 15705 15706 /* 15707 * Convert information returned by the server in the LOCK4denied 15708 * structure to the form required by fcntl. 15709 */ 15710 static void 15711 denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args) 15712 { 15713 nfs4_lo_name_t *lo; 15714 15715 #ifdef DEBUG 15716 if (denied_to_flk_debug) { 15717 lockt_denied_debug = lockt_denied; 15718 debug_enter("lockt_denied"); 15719 } 15720 #endif 15721 15722 flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK; 15723 flk->l_whence = 0; /* aka SEEK_SET */ 15724 flk->l_start = lockt_denied->offset; 15725 flk->l_len = lockt_denied->length; 15726 15727 /* 15728 * If the blocking clientid matches our client id, then we can 15729 * interpret the lockowner (since we built it). If not, then 15730 * fabricate a sysid and pid. Note that the l_sysid field 15731 * in *flk already has the local sysid. 15732 */ 15733 15734 if (lockt_denied->owner.clientid == lockt_args->owner.clientid) { 15735 15736 if (lockt_denied->owner.owner_len == sizeof (*lo)) { 15737 lo = (nfs4_lo_name_t *) 15738 lockt_denied->owner.owner_val; 15739 15740 flk->l_pid = lo->ln_pid; 15741 } else { 15742 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 15743 "denied_to_flk: bad lock owner length\n")); 15744 15745 flk->l_pid = lo_to_pid(&lockt_denied->owner); 15746 } 15747 } else { 15748 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 15749 "denied_to_flk: foreign clientid\n")); 15750 15751 /* 15752 * Construct a new sysid which should be different from 15753 * sysids of other systems. 15754 */ 15755 15756 flk->l_sysid++; 15757 flk->l_pid = lo_to_pid(&lockt_denied->owner); 15758 } 15759 } 15760 15761 static pid_t 15762 lo_to_pid(lock_owner4 *lop) 15763 { 15764 pid_t pid = 0; 15765 uchar_t *cp; 15766 int i; 15767 15768 cp = (uchar_t *)&lop->clientid; 15769 15770 for (i = 0; i < sizeof (lop->clientid); i++) 15771 pid += (pid_t)*cp++; 15772 15773 cp = (uchar_t *)lop->owner_val; 15774 15775 for (i = 0; i < lop->owner_len; i++) 15776 pid += (pid_t)*cp++; 15777 15778 return (pid); 15779 } 15780 15781 /* 15782 * Given a lock pointer, returns the length of that lock. 15783 * "end" is the last locked offset the "l_len" covers from 15784 * the start of the lock. 15785 */ 15786 static off64_t 15787 lock_to_end(flock64_t *lock) 15788 { 15789 off64_t lock_end; 15790 15791 if (lock->l_len == 0) 15792 lock_end = (off64_t)MAXEND; 15793 else 15794 lock_end = lock->l_start + lock->l_len - 1; 15795 15796 return (lock_end); 15797 } 15798 15799 /* 15800 * Given the end of a lock, it will return you the length "l_len" for that lock. 15801 */ 15802 static off64_t 15803 end_to_len(off64_t start, off64_t end) 15804 { 15805 off64_t lock_len; 15806 15807 ASSERT(end >= start); 15808 if (end == MAXEND) 15809 lock_len = 0; 15810 else 15811 lock_len = end - start + 1; 15812 15813 return (lock_len); 15814 } 15815 15816 /* 15817 * On given end for a lock it determines if it is the last locked offset 15818 * or not, if so keeps it as is, else adds one to return the length for 15819 * valid start. 15820 */ 15821 static off64_t 15822 start_check(off64_t x) 15823 { 15824 if (x == MAXEND) 15825 return (x); 15826 else 15827 return (x + 1); 15828 } 15829 15830 /* 15831 * See if these two locks overlap, and if so return 1; 15832 * otherwise, return 0. 15833 */ 15834 static int 15835 locks_intersect(flock64_t *llfp, flock64_t *curfp) 15836 { 15837 off64_t llfp_end, curfp_end; 15838 15839 llfp_end = lock_to_end(llfp); 15840 curfp_end = lock_to_end(curfp); 15841 15842 if (((llfp_end >= curfp->l_start) && 15843 (llfp->l_start <= curfp->l_start)) || 15844 ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start))) 15845 return (1); 15846 return (0); 15847 } 15848 15849 /* 15850 * Determine what the intersecting lock region is, and add that to the 15851 * 'nl_llpp' locklist in increasing order (by l_start). 15852 */ 15853 static void 15854 nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp, 15855 locklist_t **nl_llpp, vnode_t *vp) 15856 { 15857 locklist_t *intersect_llp, *tmp_fllp, *cur_fllp; 15858 off64_t lost_flp_end, local_flp_end, len, start; 15859 15860 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:")); 15861 15862 if (!locks_intersect(lost_flp, local_flp)) 15863 return; 15864 15865 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: " 15866 "locks intersect")); 15867 15868 lost_flp_end = lock_to_end(lost_flp); 15869 local_flp_end = lock_to_end(local_flp); 15870 15871 /* Find the starting point of the intersecting region */ 15872 if (local_flp->l_start > lost_flp->l_start) 15873 start = local_flp->l_start; 15874 else 15875 start = lost_flp->l_start; 15876 15877 /* Find the lenght of the intersecting region */ 15878 if (lost_flp_end < local_flp_end) 15879 len = end_to_len(start, lost_flp_end); 15880 else 15881 len = end_to_len(start, local_flp_end); 15882 15883 /* 15884 * Prepare the flock structure for the intersection found and insert 15885 * it into the new list in increasing l_start order. This list contains 15886 * intersections of locks registered by the client with the local host 15887 * and the lost lock. 15888 * The lock type of this lock is the same as that of the local_flp. 15889 */ 15890 intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP); 15891 intersect_llp->ll_flock.l_start = start; 15892 intersect_llp->ll_flock.l_len = len; 15893 intersect_llp->ll_flock.l_type = local_flp->l_type; 15894 intersect_llp->ll_flock.l_pid = local_flp->l_pid; 15895 intersect_llp->ll_flock.l_sysid = local_flp->l_sysid; 15896 intersect_llp->ll_flock.l_whence = 0; /* aka SEEK_SET */ 15897 intersect_llp->ll_vp = vp; 15898 15899 tmp_fllp = *nl_llpp; 15900 cur_fllp = NULL; 15901 while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start < 15902 intersect_llp->ll_flock.l_start) { 15903 cur_fllp = tmp_fllp; 15904 tmp_fllp = tmp_fllp->ll_next; 15905 } 15906 if (cur_fllp == NULL) { 15907 /* first on the list */ 15908 intersect_llp->ll_next = *nl_llpp; 15909 *nl_llpp = intersect_llp; 15910 } else { 15911 intersect_llp->ll_next = cur_fllp->ll_next; 15912 cur_fllp->ll_next = intersect_llp; 15913 } 15914 15915 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: " 15916 "created lock region: start %"PRIx64" end %"PRIx64" : %s\n", 15917 intersect_llp->ll_flock.l_start, 15918 intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len, 15919 intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE")); 15920 } 15921 15922 /* 15923 * Our local locking current state is potentially different than 15924 * what the NFSv4 server thinks we have due to a lost lock that was 15925 * resent and then received. We need to reset our "NFSv4" locking 15926 * state to match the current local locking state for this pid since 15927 * that is what the user/application sees as what the world is. 15928 * 15929 * We cannot afford to drop the open/lock seqid sync since then we can 15930 * get confused about what the current local locking state "is" versus 15931 * "was". 15932 * 15933 * If we are unable to fix up the locks, we send SIGLOST to the affected 15934 * process. This is not done if the filesystem has been forcibly 15935 * unmounted, in case the process has already exited and a new process 15936 * exists with the same pid. 15937 */ 15938 static void 15939 nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr, 15940 nfs4_lock_owner_t *lop) 15941 { 15942 locklist_t *locks, *llp, *ri_llp, *tmp_llp; 15943 mntinfo4_t *mi = VTOMI4(vp); 15944 const int cmd = F_SETLK; 15945 off64_t cur_start, llp_ll_flock_end, lost_flp_end; 15946 flock64_t ul_fl; 15947 15948 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15949 "nfs4_reinstitute_local_lock_state")); 15950 15951 /* 15952 * Find active locks for this vp from the local locking code. 15953 * Scan through this list and find out the locks that intersect with 15954 * the lost lock. Once we find the lock that intersects, add the 15955 * intersection area as a new lock to a new list "ri_llp". The lock 15956 * type of the intersection region lock added to ri_llp is the same 15957 * as that found in the active lock list, "list". The intersecting 15958 * region locks are added to ri_llp in increasing l_start order. 15959 */ 15960 ASSERT(nfs_zone() == mi->mi_zone); 15961 15962 locks = flk_active_locks_for_vp(vp); 15963 ri_llp = NULL; 15964 15965 for (llp = locks; llp != NULL; llp = llp->ll_next) { 15966 ASSERT(llp->ll_vp == vp); 15967 /* 15968 * Pick locks that belong to this pid/lockowner 15969 */ 15970 if (llp->ll_flock.l_pid != lost_flp->l_pid) 15971 continue; 15972 15973 nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp); 15974 } 15975 15976 /* 15977 * Now we have the list of intersections with the lost lock. These are 15978 * the locks that were/are active before the server replied to the 15979 * last/lost lock. Issue these locks to the server here. Playing these 15980 * locks to the server will re-establish aur current local locking state 15981 * with the v4 server. 15982 * If we get an error, send SIGLOST to the application for that lock. 15983 */ 15984 15985 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) { 15986 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15987 "nfs4_reinstitute_local_lock_state: need to issue " 15988 "flock: [%"PRIx64" - %"PRIx64"] : %s", 15989 llp->ll_flock.l_start, 15990 llp->ll_flock.l_start + llp->ll_flock.l_len, 15991 llp->ll_flock.l_type == F_RDLCK ? "READ" : 15992 llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID")); 15993 /* 15994 * No need to relock what we already have 15995 */ 15996 if (llp->ll_flock.l_type == lost_flp->l_type) 15997 continue; 15998 15999 push_reinstate(vp, cmd, &llp->ll_flock, cr, lop); 16000 } 16001 16002 /* 16003 * Now keeping the start of the lost lock as our reference parse the 16004 * newly created ri_llp locklist to find the ranges that we have locked 16005 * with the v4 server but not in the current local locking. We need 16006 * to unlock these ranges. 16007 * These ranges can also be reffered to as those ranges, where the lost 16008 * lock does not overlap with the locks in the ri_llp but are locked 16009 * since the server replied to the lost lock. 16010 */ 16011 cur_start = lost_flp->l_start; 16012 lost_flp_end = lock_to_end(lost_flp); 16013 16014 ul_fl.l_type = F_UNLCK; 16015 ul_fl.l_whence = 0; /* aka SEEK_SET */ 16016 ul_fl.l_sysid = lost_flp->l_sysid; 16017 ul_fl.l_pid = lost_flp->l_pid; 16018 16019 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) { 16020 llp_ll_flock_end = lock_to_end(&llp->ll_flock); 16021 16022 if (llp->ll_flock.l_start <= cur_start) { 16023 cur_start = start_check(llp_ll_flock_end); 16024 continue; 16025 } 16026 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 16027 "nfs4_reinstitute_local_lock_state: " 16028 "UNLOCK [%"PRIx64" - %"PRIx64"]", 16029 cur_start, llp->ll_flock.l_start)); 16030 16031 ul_fl.l_start = cur_start; 16032 ul_fl.l_len = end_to_len(cur_start, 16033 (llp->ll_flock.l_start - 1)); 16034 16035 push_reinstate(vp, cmd, &ul_fl, cr, lop); 16036 cur_start = start_check(llp_ll_flock_end); 16037 } 16038 16039 /* 16040 * In the case where the lost lock ends after all intersecting locks, 16041 * unlock the last part of the lost lock range. 16042 */ 16043 if (cur_start != start_check(lost_flp_end)) { 16044 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 16045 "nfs4_reinstitute_local_lock_state: UNLOCK end of the " 16046 "lost lock region [%"PRIx64" - %"PRIx64"]", 16047 cur_start, lost_flp->l_start + lost_flp->l_len)); 16048 16049 ul_fl.l_start = cur_start; 16050 /* 16051 * Is it an to-EOF lock? if so unlock till the end 16052 */ 16053 if (lost_flp->l_len == 0) 16054 ul_fl.l_len = 0; 16055 else 16056 ul_fl.l_len = start_check(lost_flp_end) - cur_start; 16057 16058 push_reinstate(vp, cmd, &ul_fl, cr, lop); 16059 } 16060 16061 if (locks != NULL) 16062 flk_free_locklist(locks); 16063 16064 /* Free up our newly created locklist */ 16065 for (llp = ri_llp; llp != NULL; ) { 16066 tmp_llp = llp->ll_next; 16067 kmem_free(llp, sizeof (locklist_t)); 16068 llp = tmp_llp; 16069 } 16070 16071 /* 16072 * Now return back to the original calling nfs4frlock() 16073 * and let us naturally drop our seqid syncs. 16074 */ 16075 } 16076 16077 /* 16078 * Create a lost state record for the given lock reinstantiation request 16079 * and push it onto the lost state queue. 16080 */ 16081 static void 16082 push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr, 16083 nfs4_lock_owner_t *lop) 16084 { 16085 nfs4_lost_rqst_t req; 16086 nfs_lock_type4 locktype; 16087 nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS }; 16088 16089 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 16090 16091 locktype = flk_to_locktype(cmd, flk->l_type); 16092 nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype, 16093 NULL, NULL, lop, flk, &req, cr, vp); 16094 (void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 16095 (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ? 16096 &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK, 16097 NULL, NULL, NULL); 16098 } 16099