1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 24 */ 25 26 /* 27 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 28 * Use is subject to license terms. 29 */ 30 31 /* 32 * Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T. 33 * All Rights Reserved 34 */ 35 36 /* 37 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 38 */ 39 40 /* 41 * Copyright (c) 2014, STRATO AG. All rights reserved. 
42 */ 43 44 #include <sys/param.h> 45 #include <sys/types.h> 46 #include <sys/systm.h> 47 #include <sys/cred.h> 48 #include <sys/time.h> 49 #include <sys/vnode.h> 50 #include <sys/vfs.h> 51 #include <sys/vfs_opreg.h> 52 #include <sys/file.h> 53 #include <sys/filio.h> 54 #include <sys/uio.h> 55 #include <sys/buf.h> 56 #include <sys/mman.h> 57 #include <sys/pathname.h> 58 #include <sys/dirent.h> 59 #include <sys/debug.h> 60 #include <sys/vmsystm.h> 61 #include <sys/fcntl.h> 62 #include <sys/flock.h> 63 #include <sys/swap.h> 64 #include <sys/errno.h> 65 #include <sys/strsubr.h> 66 #include <sys/sysmacros.h> 67 #include <sys/kmem.h> 68 #include <sys/cmn_err.h> 69 #include <sys/pathconf.h> 70 #include <sys/utsname.h> 71 #include <sys/dnlc.h> 72 #include <sys/acl.h> 73 #include <sys/systeminfo.h> 74 #include <sys/policy.h> 75 #include <sys/sdt.h> 76 #include <sys/list.h> 77 #include <sys/stat.h> 78 #include <sys/zone.h> 79 80 #include <rpc/types.h> 81 #include <rpc/auth.h> 82 #include <rpc/clnt.h> 83 84 #include <nfs/nfs.h> 85 #include <nfs/nfs_clnt.h> 86 #include <nfs/nfs_acl.h> 87 #include <nfs/lm.h> 88 #include <nfs/nfs4.h> 89 #include <nfs/nfs4_kprot.h> 90 #include <nfs/rnode4.h> 91 #include <nfs/nfs4_clnt.h> 92 93 #include <vm/hat.h> 94 #include <vm/as.h> 95 #include <vm/page.h> 96 #include <vm/pvn.h> 97 #include <vm/seg.h> 98 #include <vm/seg_map.h> 99 #include <vm/seg_kpm.h> 100 #include <vm/seg_vn.h> 101 102 #include <fs/fs_subr.h> 103 104 #include <sys/ddi.h> 105 #include <sys/int_fmtio.h> 106 #include <sys/fs/autofs.h> 107 108 typedef struct { 109 nfs4_ga_res_t *di_garp; 110 cred_t *di_cred; 111 hrtime_t di_time_call; 112 } dirattr_info_t; 113 114 typedef enum nfs4_acl_op { 115 NFS4_ACL_GET, 116 NFS4_ACL_SET 117 } nfs4_acl_op_t; 118 119 static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *mi); 120 static int nfs4frlock_get_sysid(struct lm_sysid **, vnode_t *, flock64_t *); 121 122 static void nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *, 123 char 
*, dirattr_info_t *); 124 125 static void nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *, 126 nfs4_open_stream_t *, int *, int *, nfs4_close_type_t, 127 nfs4_error_t *, int *); 128 static int nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int, 129 cred_t *); 130 static int nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *, 131 stable_how4 *); 132 static int nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *, 133 cred_t *, bool_t, struct uio *); 134 static int nfs4setattr(vnode_t *, struct vattr *, int, cred_t *, 135 vsecattr_t *); 136 static int nfs4openattr(vnode_t *, vnode_t **, int, cred_t *); 137 static int nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int); 138 static int nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *); 139 static int nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *); 140 static int nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *); 141 static int nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl, 142 int, vnode_t **, cred_t *); 143 static int nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **, 144 cred_t *, int, int, enum createmode4, int); 145 static int nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *, 146 caller_context_t *); 147 static int nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *, 148 vnode_t *, char *, cred_t *, nfsstat4 *); 149 static int nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *, 150 vnode_t *, char *, cred_t *, nfsstat4 *); 151 static int do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 152 static void nfs4readdir(vnode_t *, rddir4_cache *, cred_t *); 153 static int nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t); 154 static int nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *, 155 page_t *[], size_t, struct seg *, caddr_t, 156 enum seg_rw, cred_t *); 157 static void nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *, 158 cred_t *); 159 static int nfs4_sync_putapage(vnode_t *, page_t *, 
u_offset_t, size_t, 160 int, cred_t *); 161 static int nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t, 162 int, cred_t *); 163 static int nfs4_commit(vnode_t *, offset4, count4, cred_t *); 164 static void nfs4_set_mod(vnode_t *); 165 static void nfs4_get_commit(vnode_t *); 166 static void nfs4_get_commit_range(vnode_t *, u_offset_t, size_t); 167 static int nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *); 168 static int nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int); 169 static int nfs4_sync_commit(vnode_t *, page_t *, offset3, count3, 170 cred_t *); 171 static void do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3, 172 cred_t *); 173 static int nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *, 174 hrtime_t, vnode_t *, cred_t *); 175 static int nfs4_open_non_reg_file(vnode_t **, int, cred_t *); 176 static int nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *); 177 static int nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *); 178 static int nfs4_block_and_wait(clock_t *); 179 static cred_t *state_to_cred(nfs4_open_stream_t *); 180 static void denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *); 181 static pid_t lo_to_pid(lock_owner4 *); 182 static void nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *, 183 cred_t *, nfs4_lock_owner_t *); 184 static void push_reinstate(vnode_t *, int, flock64_t *, cred_t *, 185 nfs4_lock_owner_t *); 186 static int open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **); 187 static void nfs4_delmap_callback(struct as *, void *, uint_t); 188 static void nfs4_free_delmapcall(nfs4_delmapcall_t *); 189 static nfs4_delmapcall_t *nfs4_init_delmapcall(); 190 static int nfs4_find_and_delete_delmapcall(rnode4_t *, int *); 191 static int nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t); 192 static int nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *, 193 uid_t, gid_t, int); 194 195 /* 196 * Routines that implement the setting of v4 args for the misc. 
ops 197 */ 198 static void nfs4args_lock_free(nfs_argop4 *); 199 static void nfs4args_lockt_free(nfs_argop4 *); 200 static void nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *, 201 int, rnode4_t *, cred_t *, bitmap4, int *, 202 nfs4_stateid_types_t *); 203 static void nfs4args_setattr_free(nfs_argop4 *); 204 static int nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4, 205 bitmap4); 206 static void nfs4args_verify_free(nfs_argop4 *); 207 static void nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *, 208 WRITE4args **, nfs4_stateid_types_t *); 209 210 /* 211 * These are the vnode ops functions that implement the vnode interface to 212 * the networked file system. See more comments below at nfs4_vnodeops. 213 */ 214 static int nfs4_open(vnode_t **, int, cred_t *, caller_context_t *); 215 static int nfs4_close(vnode_t *, int, int, offset_t, cred_t *, 216 caller_context_t *); 217 static int nfs4_read(vnode_t *, struct uio *, int, cred_t *, 218 caller_context_t *); 219 static int nfs4_write(vnode_t *, struct uio *, int, cred_t *, 220 caller_context_t *); 221 static int nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *, 222 caller_context_t *); 223 static int nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *, 224 caller_context_t *); 225 static int nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *); 226 static int nfs4_readlink(vnode_t *, struct uio *, cred_t *, 227 caller_context_t *); 228 static int nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *); 229 static int nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl, 230 int, vnode_t **, cred_t *, int, caller_context_t *, 231 vsecattr_t *); 232 static int nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *, 233 int); 234 static int nfs4_link(vnode_t *, vnode_t *, char *, cred_t *, 235 caller_context_t *, int); 236 static int nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *, 237 caller_context_t *, int); 238 static int nfs4_mkdir(vnode_t 
*, char *, struct vattr *, vnode_t **, 239 cred_t *, caller_context_t *, int, vsecattr_t *); 240 static int nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *, 241 caller_context_t *, int); 242 static int nfs4_symlink(vnode_t *, char *, struct vattr *, char *, 243 cred_t *, caller_context_t *, int); 244 static int nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *, 245 caller_context_t *, int); 246 static int nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *); 247 static int nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *, 248 page_t *[], size_t, struct seg *, caddr_t, 249 enum seg_rw, cred_t *, caller_context_t *); 250 static int nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *, 251 caller_context_t *); 252 static int nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t, 253 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); 254 static int nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, 255 uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *); 256 static int nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *); 257 static int nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t, 258 struct flk_callback *, cred_t *, caller_context_t *); 259 static int nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t, 260 cred_t *, caller_context_t *); 261 static int nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t, 262 uint_t, uint_t, uint_t, cred_t *, caller_context_t *); 263 static int nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int, 264 cred_t *, caller_context_t *); 265 static void nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *, 266 caller_context_t *); 267 static int nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 268 caller_context_t *); 269 /* 270 * These vnode ops are required to be called from outside this source file, 271 * e.g. by ephemeral mount stub vnode ops, and so may not be declared 272 * as static. 
273 */ 274 int nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *, 275 caller_context_t *); 276 void nfs4_inactive(vnode_t *, cred_t *, caller_context_t *); 277 int nfs4_lookup(vnode_t *, char *, vnode_t **, 278 struct pathname *, int, vnode_t *, cred_t *, 279 caller_context_t *, int *, pathname_t *); 280 int nfs4_fid(vnode_t *, fid_t *, caller_context_t *); 281 int nfs4_rwlock(vnode_t *, int, caller_context_t *); 282 void nfs4_rwunlock(vnode_t *, int, caller_context_t *); 283 int nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *); 284 int nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *, 285 caller_context_t *); 286 int nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *, 287 caller_context_t *); 288 int nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *, 289 caller_context_t *); 290 291 /* 292 * Used for nfs4_commit_vp() to indicate if we should 293 * wait on pending writes. 294 */ 295 #define NFS4_WRITE_NOWAIT 0 296 #define NFS4_WRITE_WAIT 1 297 298 /* 299 * Error flags used to pass information about certain special errors 300 * which need to be handled specially. 301 */ 302 #define NFS_EOF -98 303 #define NFS_VERF_MISMATCH -97 304 305 /* 306 * Flags used to differentiate between which operation drove the 307 * potential CLOSE OTW. 
(see nfs4_close_otw_if_necessary) 308 */ 309 #define NFS4_CLOSE_OP 0x1 310 #define NFS4_DELMAP_OP 0x2 311 #define NFS4_INACTIVE_OP 0x3 312 313 #define ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO)) 314 315 /* ALIGN64 aligns the given buffer and adjust buffer size to 64 bit */ 316 #define ALIGN64(x, ptr, sz) \ 317 x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1); \ 318 if (x) { \ 319 x = sizeof (uint64_t) - (x); \ 320 sz -= (x); \ 321 ptr += (x); \ 322 } 323 324 #ifdef DEBUG 325 int nfs4_client_attr_debug = 0; 326 int nfs4_client_state_debug = 0; 327 int nfs4_client_shadow_debug = 0; 328 int nfs4_client_lock_debug = 0; 329 int nfs4_seqid_sync = 0; 330 int nfs4_client_map_debug = 0; 331 static int nfs4_pageio_debug = 0; 332 int nfs4_client_inactive_debug = 0; 333 int nfs4_client_recov_debug = 0; 334 int nfs4_client_failover_debug = 0; 335 int nfs4_client_call_debug = 0; 336 int nfs4_client_lookup_debug = 0; 337 int nfs4_client_zone_debug = 0; 338 int nfs4_lost_rqst_debug = 0; 339 int nfs4_rdattrerr_debug = 0; 340 int nfs4_open_stream_debug = 0; 341 342 int nfs4read_error_inject; 343 344 static int nfs4_create_misses = 0; 345 346 static int nfs4_readdir_cache_shorts = 0; 347 static int nfs4_readdir_readahead = 0; 348 349 static int nfs4_bio_do_stop = 0; 350 351 static int nfs4_lostpage = 0; /* number of times we lost original page */ 352 353 int nfs4_mmap_debug = 0; 354 355 static int nfs4_pathconf_cache_hits = 0; 356 static int nfs4_pathconf_cache_misses = 0; 357 358 int nfs4close_all_cnt; 359 int nfs4close_one_debug = 0; 360 int nfs4close_notw_debug = 0; 361 362 int denied_to_flk_debug = 0; 363 void *lockt_denied_debug; 364 365 #endif 366 367 /* 368 * In milliseconds. Should be less than half of the lease time or better, 369 * less than one second. 370 */ 371 int nfs4_base_wait_time = 20; 372 int nfs4_max_base_wait_time = 1 * 1000; /* 1 sec */ 373 374 /* 375 * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT 376 * or NFS4ERR_RESOURCE. 
377 */ 378 static int confirm_retry_sec = 30; 379 380 static int nfs4_lookup_neg_cache = 1; 381 382 /* 383 * number of pages to read ahead 384 * optimized for 100 base-T. 385 */ 386 static int nfs4_nra = 4; 387 388 static int nfs4_do_symlink_cache = 1; 389 390 static int nfs4_pathconf_disable_cache = 0; 391 392 /* 393 * These are the vnode ops routines which implement the vnode interface to 394 * the networked file system. These routines just take their parameters, 395 * make them look networkish by putting the right info into interface structs, 396 * and then calling the appropriate remote routine(s) to do the work. 397 * 398 * Note on directory name lookup cacheing: If we detect a stale fhandle, 399 * we purge the directory cache relative to that vnode. This way, the 400 * user won't get burned by the cache repeatedly. See <nfs/rnode4.h> for 401 * more details on rnode locking. 402 */ 403 404 struct vnodeops *nfs4_vnodeops; 405 406 const fs_operation_def_t nfs4_vnodeops_template[] = { 407 VOPNAME_OPEN, { .vop_open = nfs4_open }, 408 VOPNAME_CLOSE, { .vop_close = nfs4_close }, 409 VOPNAME_READ, { .vop_read = nfs4_read }, 410 VOPNAME_WRITE, { .vop_write = nfs4_write }, 411 VOPNAME_IOCTL, { .vop_ioctl = nfs4_ioctl }, 412 VOPNAME_GETATTR, { .vop_getattr = nfs4_getattr }, 413 VOPNAME_SETATTR, { .vop_setattr = nfs4_setattr }, 414 VOPNAME_ACCESS, { .vop_access = nfs4_access }, 415 VOPNAME_LOOKUP, { .vop_lookup = nfs4_lookup }, 416 VOPNAME_CREATE, { .vop_create = nfs4_create }, 417 VOPNAME_REMOVE, { .vop_remove = nfs4_remove }, 418 VOPNAME_LINK, { .vop_link = nfs4_link }, 419 VOPNAME_RENAME, { .vop_rename = nfs4_rename }, 420 VOPNAME_MKDIR, { .vop_mkdir = nfs4_mkdir }, 421 VOPNAME_RMDIR, { .vop_rmdir = nfs4_rmdir }, 422 VOPNAME_READDIR, { .vop_readdir = nfs4_readdir }, 423 VOPNAME_SYMLINK, { .vop_symlink = nfs4_symlink }, 424 VOPNAME_READLINK, { .vop_readlink = nfs4_readlink }, 425 VOPNAME_FSYNC, { .vop_fsync = nfs4_fsync }, 426 VOPNAME_INACTIVE, { .vop_inactive = 
nfs4_inactive }, 427 VOPNAME_FID, { .vop_fid = nfs4_fid }, 428 VOPNAME_RWLOCK, { .vop_rwlock = nfs4_rwlock }, 429 VOPNAME_RWUNLOCK, { .vop_rwunlock = nfs4_rwunlock }, 430 VOPNAME_SEEK, { .vop_seek = nfs4_seek }, 431 VOPNAME_FRLOCK, { .vop_frlock = nfs4_frlock }, 432 VOPNAME_SPACE, { .vop_space = nfs4_space }, 433 VOPNAME_REALVP, { .vop_realvp = nfs4_realvp }, 434 VOPNAME_GETPAGE, { .vop_getpage = nfs4_getpage }, 435 VOPNAME_PUTPAGE, { .vop_putpage = nfs4_putpage }, 436 VOPNAME_MAP, { .vop_map = nfs4_map }, 437 VOPNAME_ADDMAP, { .vop_addmap = nfs4_addmap }, 438 VOPNAME_DELMAP, { .vop_delmap = nfs4_delmap }, 439 /* no separate nfs4_dump */ 440 VOPNAME_DUMP, { .vop_dump = nfs_dump }, 441 VOPNAME_PATHCONF, { .vop_pathconf = nfs4_pathconf }, 442 VOPNAME_PAGEIO, { .vop_pageio = nfs4_pageio }, 443 VOPNAME_DISPOSE, { .vop_dispose = nfs4_dispose }, 444 VOPNAME_SETSECATTR, { .vop_setsecattr = nfs4_setsecattr }, 445 VOPNAME_GETSECATTR, { .vop_getsecattr = nfs4_getsecattr }, 446 VOPNAME_SHRLOCK, { .vop_shrlock = nfs4_shrlock }, 447 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, 448 NULL, NULL 449 }; 450 451 /* 452 * The following are subroutines and definitions to set args or get res 453 * for the different nfsv4 ops 454 */ 455 456 void 457 nfs4args_lookup_free(nfs_argop4 *argop, int arglen) 458 { 459 int i; 460 461 for (i = 0; i < arglen; i++) { 462 if (argop[i].argop == OP_LOOKUP) { 463 kmem_free( 464 argop[i].nfs_argop4_u.oplookup. 465 objname.utf8string_val, 466 argop[i].nfs_argop4_u.oplookup. 
467 objname.utf8string_len); 468 } 469 } 470 } 471 472 static void 473 nfs4args_lock_free(nfs_argop4 *argop) 474 { 475 locker4 *locker = &argop->nfs_argop4_u.oplock.locker; 476 477 if (locker->new_lock_owner == TRUE) { 478 open_to_lock_owner4 *open_owner; 479 480 open_owner = &locker->locker4_u.open_owner; 481 if (open_owner->lock_owner.owner_val != NULL) { 482 kmem_free(open_owner->lock_owner.owner_val, 483 open_owner->lock_owner.owner_len); 484 } 485 } 486 } 487 488 static void 489 nfs4args_lockt_free(nfs_argop4 *argop) 490 { 491 lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner; 492 493 if (lowner->owner_val != NULL) { 494 kmem_free(lowner->owner_val, lowner->owner_len); 495 } 496 } 497 498 static void 499 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags, 500 rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error, 501 nfs4_stateid_types_t *sid_types) 502 { 503 fattr4 *attr = &argop->nfs_argop4_u.opsetattr.obj_attributes; 504 mntinfo4_t *mi; 505 506 argop->argop = OP_SETATTR; 507 /* 508 * The stateid is set to 0 if client is not modifying the size 509 * and otherwise to whatever nfs4_get_stateid() returns. 510 * 511 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no 512 * state struct could be found for the process/file pair. We may 513 * want to change this in the future (by OPENing the file). See 514 * bug # 4474852. 
515 */ 516 if (vap->va_mask & AT_SIZE) { 517 518 ASSERT(rp != NULL); 519 mi = VTOMI4(RTOV4(rp)); 520 521 argop->nfs_argop4_u.opsetattr.stateid = 522 nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi, 523 OP_SETATTR, sid_types, FALSE); 524 } else { 525 bzero(&argop->nfs_argop4_u.opsetattr.stateid, 526 sizeof (stateid4)); 527 } 528 529 *error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp); 530 if (*error) 531 bzero(attr, sizeof (*attr)); 532 } 533 534 static void 535 nfs4args_setattr_free(nfs_argop4 *argop) 536 { 537 nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes); 538 } 539 540 static int 541 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op, 542 bitmap4 supp) 543 { 544 fattr4 *attr; 545 int error = 0; 546 547 argop->argop = op; 548 switch (op) { 549 case OP_VERIFY: 550 attr = &argop->nfs_argop4_u.opverify.obj_attributes; 551 break; 552 case OP_NVERIFY: 553 attr = &argop->nfs_argop4_u.opnverify.obj_attributes; 554 break; 555 default: 556 return (EINVAL); 557 } 558 if (!error) 559 error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp); 560 if (error) 561 bzero(attr, sizeof (*attr)); 562 return (error); 563 } 564 565 static void 566 nfs4args_verify_free(nfs_argop4 *argop) 567 { 568 switch (argop->argop) { 569 case OP_VERIFY: 570 nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes); 571 break; 572 case OP_NVERIFY: 573 nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes); 574 break; 575 default: 576 break; 577 } 578 } 579 580 static void 581 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr, 582 WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp) 583 { 584 WRITE4args *wargs = &argop->nfs_argop4_u.opwrite; 585 mntinfo4_t *mi = VTOMI4(RTOV4(rp)); 586 587 argop->argop = OP_WRITE; 588 wargs->stable = stable; 589 wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id, 590 mi, OP_WRITE, sid_tp); 591 wargs->mblk = NULL; 592 *wargs_pp = wargs; 593 } 594 595 void 596 
nfs4args_copen_free(OPEN4cargs *open_args) 597 { 598 if (open_args->owner.owner_val) { 599 kmem_free(open_args->owner.owner_val, 600 open_args->owner.owner_len); 601 } 602 if ((open_args->opentype == OPEN4_CREATE) && 603 (open_args->mode != EXCLUSIVE4)) { 604 nfs4_fattr4_free(&open_args->createhow4_u.createattrs); 605 } 606 } 607 608 /* 609 * XXX: This is referenced in modstubs.s 610 */ 611 struct vnodeops * 612 nfs4_getvnodeops(void) 613 { 614 return (nfs4_vnodeops); 615 } 616 617 /* 618 * The OPEN operation opens a regular file. 619 */ 620 /*ARGSUSED3*/ 621 static int 622 nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct) 623 { 624 vnode_t *dvp = NULL; 625 rnode4_t *rp, *drp; 626 int error; 627 int just_been_created; 628 char fn[MAXNAMELEN]; 629 630 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: ")); 631 if (nfs_zone() != VTOMI4(*vpp)->mi_zone) 632 return (EIO); 633 rp = VTOR4(*vpp); 634 635 /* 636 * Check to see if opening something besides a regular file; 637 * if so skip the OTW call 638 */ 639 if ((*vpp)->v_type != VREG) { 640 error = nfs4_open_non_reg_file(vpp, flag, cr); 641 return (error); 642 } 643 644 /* 645 * XXX - would like a check right here to know if the file is 646 * executable or not, so as to skip OTW 647 */ 648 649 if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0) 650 return (error); 651 652 drp = VTOR4(dvp); 653 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 654 return (EINTR); 655 656 if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) { 657 nfs_rw_exit(&drp->r_rwlock); 658 return (error); 659 } 660 661 /* 662 * See if this file has just been CREATEd. 663 * If so, clear the flag and update the dnlc, which was previously 664 * skipped in nfs4_create. 665 * XXX need better serilization on this. 666 * XXX move this into the nf4open_otw call, after we have 667 * XXX acquired the open owner seqid sync. 
668 */ 669 mutex_enter(&rp->r_statev4_lock); 670 if (rp->created_v4) { 671 rp->created_v4 = 0; 672 mutex_exit(&rp->r_statev4_lock); 673 674 dnlc_update(dvp, fn, *vpp); 675 /* This is needed so we don't bump the open ref count */ 676 just_been_created = 1; 677 } else { 678 mutex_exit(&rp->r_statev4_lock); 679 just_been_created = 0; 680 } 681 682 /* 683 * If caller specified O_TRUNC/FTRUNC, then be sure to set 684 * FWRITE (to drive successful setattr(size=0) after open) 685 */ 686 if (flag & FTRUNC) 687 flag |= FWRITE; 688 689 error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0, 690 just_been_created); 691 692 if (!error && !((*vpp)->v_flag & VROOT)) 693 dnlc_update(dvp, fn, *vpp); 694 695 nfs_rw_exit(&drp->r_rwlock); 696 697 /* release the hold from vtodv */ 698 VN_RELE(dvp); 699 700 /* exchange the shadow for the master vnode, if needed */ 701 702 if (error == 0 && IS_SHADOW(*vpp, rp)) 703 sv_exchange(vpp); 704 705 return (error); 706 } 707 708 /* 709 * See if there's a "lost open" request to be saved and recovered. 710 */ 711 static void 712 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 713 nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp, 714 vnode_t *dvp, OPEN4cargs *open_args) 715 { 716 vfs_t *vfsp; 717 char *srccfp; 718 719 vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp); 720 721 if (error != ETIMEDOUT && error != EINTR && 722 !NFS4_FRC_UNMT_ERR(error, vfsp)) { 723 lost_rqstp->lr_op = 0; 724 return; 725 } 726 727 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 728 "nfs4open_save_lost_rqst: error %d", error)); 729 730 lost_rqstp->lr_op = OP_OPEN; 731 732 /* 733 * The vp (if it is not NULL) and dvp are held and rele'd via 734 * the recovery code. See nfs4_save_lost_rqst. 
735 */ 736 lost_rqstp->lr_vp = vp; 737 lost_rqstp->lr_dvp = dvp; 738 lost_rqstp->lr_oop = oop; 739 lost_rqstp->lr_osp = NULL; 740 lost_rqstp->lr_lop = NULL; 741 lost_rqstp->lr_cr = cr; 742 lost_rqstp->lr_flk = NULL; 743 lost_rqstp->lr_oacc = open_args->share_access; 744 lost_rqstp->lr_odeny = open_args->share_deny; 745 lost_rqstp->lr_oclaim = open_args->claim; 746 if (open_args->claim == CLAIM_DELEGATE_CUR) { 747 lost_rqstp->lr_ostateid = 748 open_args->open_claim4_u.delegate_cur_info.delegate_stateid; 749 srccfp = open_args->open_claim4_u.delegate_cur_info.cfile; 750 } else { 751 srccfp = open_args->open_claim4_u.cfile; 752 } 753 lost_rqstp->lr_ofile.utf8string_len = 0; 754 lost_rqstp->lr_ofile.utf8string_val = NULL; 755 (void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile); 756 lost_rqstp->lr_putfirst = FALSE; 757 } 758 759 struct nfs4_excl_time { 760 uint32 seconds; 761 uint32 nseconds; 762 }; 763 764 /* 765 * The OPEN operation creates and/or opens a regular file 766 * 767 * ARGSUSED 768 */ 769 static int 770 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va, 771 vnode_t **vpp, cred_t *cr, int create_flag, int open_flag, 772 enum createmode4 createmode, int file_just_been_created) 773 { 774 rnode4_t *rp; 775 rnode4_t *drp = VTOR4(dvp); 776 vnode_t *vp = NULL; 777 vnode_t *vpi = *vpp; 778 bool_t needrecov = FALSE; 779 780 int doqueue = 1; 781 782 COMPOUND4args_clnt args; 783 COMPOUND4res_clnt res; 784 nfs_argop4 *argop; 785 nfs_resop4 *resop; 786 int argoplist_size; 787 int idx_open, idx_fattr; 788 789 GETFH4res *gf_res = NULL; 790 OPEN4res *op_res = NULL; 791 nfs4_ga_res_t *garp; 792 fattr4 *attr = NULL; 793 struct nfs4_excl_time verf; 794 bool_t did_excl_setup = FALSE; 795 int created_osp; 796 797 OPEN4cargs *open_args; 798 nfs4_open_owner_t *oop = NULL; 799 nfs4_open_stream_t *osp = NULL; 800 seqid4 seqid = 0; 801 bool_t retry_open = FALSE; 802 nfs4_recov_state_t recov_state; 803 nfs4_lost_rqst_t lost_rqst; 804 nfs4_error_t e = { 0, NFS4_OK, 
RPC_SUCCESS }; 805 hrtime_t t; 806 int acc = 0; 807 cred_t *cred_otw = NULL; /* cred used to do the RPC call */ 808 cred_t *ncr = NULL; 809 810 nfs4_sharedfh_t *otw_sfh; 811 nfs4_sharedfh_t *orig_sfh; 812 int fh_differs = 0; 813 int numops, setgid_flag; 814 int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1; 815 816 /* 817 * Make sure we properly deal with setting the right gid on 818 * a newly created file to reflect the parent's setgid bit 819 */ 820 setgid_flag = 0; 821 if (create_flag && in_va) { 822 823 /* 824 * If there is grpid mount flag used or 825 * the parent's directory has the setgid bit set 826 * _and_ the client was able to get a valid mapping 827 * for the parent dir's owner_group, we want to 828 * append NVERIFY(owner_group == dva.va_gid) and 829 * SETATTR to the CREATE compound. 830 */ 831 mutex_enter(&drp->r_statelock); 832 if ((VTOMI4(dvp)->mi_flags & MI4_GRPID || 833 drp->r_attr.va_mode & VSGID) && 834 drp->r_attr.va_gid != GID_NOBODY) { 835 in_va->va_mask |= AT_GID; 836 in_va->va_gid = drp->r_attr.va_gid; 837 setgid_flag = 1; 838 } 839 mutex_exit(&drp->r_statelock); 840 } 841 842 /* 843 * Normal/non-create compound: 844 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) 845 * 846 * Open(create) compound no setgid: 847 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) + 848 * RESTOREFH + GETATTR 849 * 850 * Open(create) setgid: 851 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) + 852 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH + 853 * NVERIFY(grp) + SETATTR 854 */ 855 if (setgid_flag) { 856 numops = 10; 857 idx_open = 1; 858 idx_fattr = 3; 859 } else if (create_flag) { 860 numops = 7; 861 idx_open = 2; 862 idx_fattr = 4; 863 } else { 864 numops = 4; 865 idx_open = 1; 866 idx_fattr = 3; 867 } 868 869 args.array_len = numops; 870 argoplist_size = numops * sizeof (nfs_argop4); 871 argop = kmem_alloc(argoplist_size, KM_SLEEP); 872 873 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: " 874 "open %s open flag 0x%x cred 
%p", file_name, open_flag, 875 (void *)cr)); 876 877 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 878 if (create_flag) { 879 /* 880 * We are to create a file. Initialize the passed in vnode 881 * pointer. 882 */ 883 vpi = NULL; 884 } else { 885 /* 886 * Check to see if the client owns a read delegation and is 887 * trying to open for write. If so, then return the delegation 888 * to avoid the server doing a cb_recall and returning DELAY. 889 * NB - we don't use the statev4_lock here because we'd have 890 * to drop the lock anyway and the result would be stale. 891 */ 892 if ((open_flag & FWRITE) && 893 VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ) 894 (void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN); 895 896 /* 897 * If the file has a delegation, then do an access check up 898 * front. This avoids having to an access check later after 899 * we've already done start_op, which could deadlock. 900 */ 901 if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) { 902 if (open_flag & FREAD && 903 nfs4_access(vpi, VREAD, 0, cr, NULL) == 0) 904 acc |= VREAD; 905 if (open_flag & FWRITE && 906 nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0) 907 acc |= VWRITE; 908 } 909 } 910 911 drp = VTOR4(dvp); 912 913 recov_state.rs_flags = 0; 914 recov_state.rs_num_retry_despite_err = 0; 915 cred_otw = cr; 916 917 recov_retry: 918 fh_differs = 0; 919 nfs4_error_zinit(&e); 920 921 e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state); 922 if (e.error) { 923 if (ncr != NULL) 924 crfree(ncr); 925 kmem_free(argop, argoplist_size); 926 return (e.error); 927 } 928 929 args.ctag = TAG_OPEN; 930 args.array_len = numops; 931 args.array = argop; 932 933 /* putfh directory fh */ 934 argop[0].argop = OP_CPUTFH; 935 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 936 937 /* OPEN: either op 1 or op 2 depending upon create/setgid flags */ 938 argop[idx_open].argop = OP_COPEN; 939 open_args = &argop[idx_open].nfs_argop4_u.opcopen; 940 open_args->claim = CLAIM_NULL; 941 942 /* name of file */ 943 
open_args->open_claim4_u.cfile = file_name; 944 open_args->owner.owner_len = 0; 945 open_args->owner.owner_val = NULL; 946 947 if (create_flag) { 948 /* CREATE a file */ 949 open_args->opentype = OPEN4_CREATE; 950 open_args->mode = createmode; 951 if (createmode == EXCLUSIVE4) { 952 if (did_excl_setup == FALSE) { 953 verf.seconds = zone_get_hostid(NULL); 954 if (verf.seconds != 0) 955 verf.nseconds = newnum(); 956 else { 957 timestruc_t now; 958 959 gethrestime(&now); 960 verf.seconds = now.tv_sec; 961 verf.nseconds = now.tv_nsec; 962 } 963 /* 964 * Since the server will use this value for the 965 * mtime, make sure that it can't overflow. Zero 966 * out the MSB. The actual value does not matter 967 * here, only its uniqeness. 968 */ 969 verf.seconds &= INT32_MAX; 970 did_excl_setup = TRUE; 971 } 972 973 /* Now copy over verifier to OPEN4args. */ 974 open_args->createhow4_u.createverf = *(uint64_t *)&verf; 975 } else { 976 int v_error; 977 bitmap4 supp_attrs; 978 servinfo4_t *svp; 979 980 attr = &open_args->createhow4_u.createattrs; 981 982 svp = drp->r_server; 983 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 984 supp_attrs = svp->sv_supp_attrs; 985 nfs_rw_exit(&svp->sv_lock); 986 987 /* GUARDED4 or UNCHECKED4 */ 988 v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN, 989 supp_attrs); 990 if (v_error) { 991 bzero(attr, sizeof (*attr)); 992 nfs4args_copen_free(open_args); 993 nfs4_end_op(VTOMI4(dvp), dvp, vpi, 994 &recov_state, FALSE); 995 if (ncr != NULL) 996 crfree(ncr); 997 kmem_free(argop, argoplist_size); 998 return (v_error); 999 } 1000 } 1001 } else { 1002 /* NO CREATE */ 1003 open_args->opentype = OPEN4_NOCREATE; 1004 } 1005 1006 if (recov_state.rs_sp != NULL) { 1007 mutex_enter(&recov_state.rs_sp->s_lock); 1008 open_args->owner.clientid = recov_state.rs_sp->clientid; 1009 mutex_exit(&recov_state.rs_sp->s_lock); 1010 } else { 1011 /* XXX should we just fail here? 
*/ 1012 open_args->owner.clientid = 0; 1013 } 1014 1015 /* 1016 * This increments oop's ref count or creates a temporary 'just_created' 1017 * open owner that will become valid when this OPEN/OPEN_CONFIRM call 1018 * completes. 1019 */ 1020 mutex_enter(&VTOMI4(dvp)->mi_lock); 1021 1022 /* See if a permanent or just created open owner exists */ 1023 oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp)); 1024 if (!oop) { 1025 /* 1026 * This open owner does not exist so create a temporary 1027 * just created one. 1028 */ 1029 oop = create_open_owner(cr, VTOMI4(dvp)); 1030 ASSERT(oop != NULL); 1031 } 1032 mutex_exit(&VTOMI4(dvp)->mi_lock); 1033 1034 /* this length never changes, do alloc before seqid sync */ 1035 open_args->owner.owner_len = sizeof (oop->oo_name); 1036 open_args->owner.owner_val = 1037 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1038 1039 e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp)); 1040 if (e.error == EAGAIN) { 1041 open_owner_rele(oop); 1042 nfs4args_copen_free(open_args); 1043 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 1044 if (ncr != NULL) { 1045 crfree(ncr); 1046 ncr = NULL; 1047 } 1048 goto recov_retry; 1049 } 1050 1051 /* Check to see if we need to do the OTW call */ 1052 if (!create_flag) { 1053 if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi, 1054 file_just_been_created, &e.error, acc, &recov_state)) { 1055 1056 /* 1057 * The OTW open is not necessary. Either 1058 * the open can succeed without it (eg. 1059 * delegation, error == 0) or the open 1060 * must fail due to an access failure 1061 * (error != 0). In either case, tidy 1062 * up and return. 
1063 */ 1064 1065 nfs4_end_open_seqid_sync(oop); 1066 open_owner_rele(oop); 1067 nfs4args_copen_free(open_args); 1068 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE); 1069 if (ncr != NULL) 1070 crfree(ncr); 1071 kmem_free(argop, argoplist_size); 1072 return (e.error); 1073 } 1074 } 1075 1076 bcopy(&oop->oo_name, open_args->owner.owner_val, 1077 open_args->owner.owner_len); 1078 1079 seqid = nfs4_get_open_seqid(oop) + 1; 1080 open_args->seqid = seqid; 1081 open_args->share_access = 0; 1082 if (open_flag & FREAD) 1083 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1084 if (open_flag & FWRITE) 1085 open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1086 open_args->share_deny = OPEN4_SHARE_DENY_NONE; 1087 1088 1089 1090 /* 1091 * getfh w/sanity check for idx_open/idx_fattr 1092 */ 1093 ASSERT((idx_open + 1) == (idx_fattr - 1)); 1094 argop[idx_open + 1].argop = OP_GETFH; 1095 1096 /* getattr */ 1097 argop[idx_fattr].argop = OP_GETATTR; 1098 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1099 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1100 1101 if (setgid_flag) { 1102 vattr_t _v; 1103 servinfo4_t *svp; 1104 bitmap4 supp_attrs; 1105 1106 svp = drp->r_server; 1107 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 1108 supp_attrs = svp->sv_supp_attrs; 1109 nfs_rw_exit(&svp->sv_lock); 1110 1111 /* 1112 * For setgid case, we need to: 1113 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 1114 */ 1115 argop[4].argop = OP_SAVEFH; 1116 1117 argop[5].argop = OP_CPUTFH; 1118 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 1119 1120 argop[6].argop = OP_GETATTR; 1121 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1122 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1123 1124 argop[7].argop = OP_RESTOREFH; 1125 1126 /* 1127 * nverify 1128 */ 1129 _v.va_mask = AT_GID; 1130 _v.va_gid = in_va->va_gid; 1131 if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 1132 supp_attrs))) { 1133 1134 /* 
1135 * setattr 1136 * 1137 * We _know_ we're not messing with AT_SIZE or 1138 * AT_XTIME, so no need for stateid or flags. 1139 * Also we specify NULL rp since we're only 1140 * interested in setting owner_group attributes. 1141 */ 1142 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, 1143 supp_attrs, &e.error, 0); 1144 if (e.error) 1145 nfs4args_verify_free(&argop[8]); 1146 } 1147 1148 if (e.error) { 1149 /* 1150 * XXX - Revisit the last argument to nfs4_end_op() 1151 * once 5020486 is fixed. 1152 */ 1153 nfs4_end_open_seqid_sync(oop); 1154 open_owner_rele(oop); 1155 nfs4args_copen_free(open_args); 1156 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE); 1157 if (ncr != NULL) 1158 crfree(ncr); 1159 kmem_free(argop, argoplist_size); 1160 return (e.error); 1161 } 1162 } else if (create_flag) { 1163 argop[1].argop = OP_SAVEFH; 1164 1165 argop[5].argop = OP_RESTOREFH; 1166 1167 argop[6].argop = OP_GETATTR; 1168 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1169 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 1170 } 1171 1172 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 1173 "nfs4open_otw: %s call, nm %s, rp %s", 1174 needrecov ? 
"recov" : "first", file_name, 1175 rnode4info(VTOR4(dvp)))); 1176 1177 t = gethrtime(); 1178 1179 rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e); 1180 1181 if (!e.error && nfs4_need_to_bump_seqid(&res)) 1182 nfs4_set_open_seqid(seqid, oop, args.ctag); 1183 1184 needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp); 1185 1186 if (e.error || needrecov) { 1187 bool_t abort = FALSE; 1188 1189 if (needrecov) { 1190 nfs4_bseqid_entry_t *bsep = NULL; 1191 1192 nfs4open_save_lost_rqst(e.error, &lost_rqst, oop, 1193 cred_otw, vpi, dvp, open_args); 1194 1195 if (!e.error && res.status == NFS4ERR_BAD_SEQID) { 1196 bsep = nfs4_create_bseqid_entry(oop, NULL, 1197 vpi, 0, args.ctag, open_args->seqid); 1198 num_bseqid_retry--; 1199 } 1200 1201 abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi, 1202 NULL, lost_rqst.lr_op == OP_OPEN ? 1203 &lost_rqst : NULL, OP_OPEN, bsep, NULL, NULL); 1204 1205 if (bsep) 1206 kmem_free(bsep, sizeof (*bsep)); 1207 /* give up if we keep getting BAD_SEQID */ 1208 if (num_bseqid_retry == 0) 1209 abort = TRUE; 1210 if (abort == TRUE && e.error == 0) 1211 e.error = geterrno4(res.status); 1212 } 1213 nfs4_end_open_seqid_sync(oop); 1214 open_owner_rele(oop); 1215 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1216 nfs4args_copen_free(open_args); 1217 if (setgid_flag) { 1218 nfs4args_verify_free(&argop[8]); 1219 nfs4args_setattr_free(&argop[9]); 1220 } 1221 if (!e.error) 1222 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1223 if (ncr != NULL) { 1224 crfree(ncr); 1225 ncr = NULL; 1226 } 1227 if (!needrecov || abort == TRUE || e.error == EINTR || 1228 NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) { 1229 kmem_free(argop, argoplist_size); 1230 return (e.error); 1231 } 1232 goto recov_retry; 1233 } 1234 1235 /* 1236 * Will check and update lease after checking the rflag for 1237 * OPEN_CONFIRM in the successful OPEN call. 
1238 */ 1239 if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) { 1240 1241 /* 1242 * XXX what if we're crossing mount points from server1:/drp 1243 * to server2:/drp/rp. 1244 */ 1245 1246 /* Signal our end of use of the open seqid */ 1247 nfs4_end_open_seqid_sync(oop); 1248 1249 /* 1250 * This will destroy the open owner if it was just created, 1251 * and no one else has put a reference on it. 1252 */ 1253 open_owner_rele(oop); 1254 if (create_flag && (createmode != EXCLUSIVE4) && 1255 res.status == NFS4ERR_BADOWNER) 1256 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1257 1258 e.error = geterrno4(res.status); 1259 nfs4args_copen_free(open_args); 1260 if (setgid_flag) { 1261 nfs4args_verify_free(&argop[8]); 1262 nfs4args_setattr_free(&argop[9]); 1263 } 1264 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1265 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1266 /* 1267 * If the reply is NFS4ERR_ACCESS, it may be because 1268 * we are root (no root net access). If the real uid 1269 * is not root, then retry with the real uid instead. 
1270 */ 1271 if (ncr != NULL) { 1272 crfree(ncr); 1273 ncr = NULL; 1274 } 1275 if (res.status == NFS4ERR_ACCESS && 1276 (ncr = crnetadjust(cred_otw)) != NULL) { 1277 cred_otw = ncr; 1278 goto recov_retry; 1279 } 1280 kmem_free(argop, argoplist_size); 1281 return (e.error); 1282 } 1283 1284 resop = &res.array[idx_open]; /* open res */ 1285 op_res = &resop->nfs_resop4_u.opopen; 1286 1287 #ifdef DEBUG 1288 /* 1289 * verify attrset bitmap 1290 */ 1291 if (create_flag && 1292 (createmode == UNCHECKED4 || createmode == GUARDED4)) { 1293 /* make sure attrset returned is what we asked for */ 1294 /* XXX Ignore this 'error' for now */ 1295 if (attr->attrmask != op_res->attrset) 1296 /* EMPTY */; 1297 } 1298 #endif 1299 1300 if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) { 1301 mutex_enter(&VTOMI4(dvp)->mi_lock); 1302 VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK; 1303 mutex_exit(&VTOMI4(dvp)->mi_lock); 1304 } 1305 1306 resop = &res.array[idx_open + 1]; /* getfh res */ 1307 gf_res = &resop->nfs_resop4_u.opgetfh; 1308 1309 otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp)); 1310 1311 /* 1312 * The open stateid has been updated on the server but not 1313 * on the client yet. There is a path: makenfs4node->nfs4_attr_cache-> 1314 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW 1315 * WRITE call. That, however, will use the old stateid, so go ahead 1316 * and upate the open stateid now, before any call to makenfs4node. 1317 */ 1318 if (vpi) { 1319 nfs4_open_stream_t *tmp_osp; 1320 rnode4_t *tmp_rp = VTOR4(vpi); 1321 1322 tmp_osp = find_open_stream(oop, tmp_rp); 1323 if (tmp_osp) { 1324 tmp_osp->open_stateid = op_res->stateid; 1325 mutex_exit(&tmp_osp->os_sync_lock); 1326 open_stream_rele(tmp_osp, tmp_rp); 1327 } 1328 1329 /* 1330 * We must determine if the file handle given by the otw open 1331 * is the same as the file handle which was passed in with 1332 * *vpp. 
This case can be reached if the file we are trying 1333 * to open has been removed and another file has been created 1334 * having the same file name. The passed in vnode is released 1335 * later. 1336 */ 1337 orig_sfh = VTOR4(vpi)->r_fh; 1338 fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh); 1339 } 1340 1341 garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res; 1342 1343 if (create_flag || fh_differs) { 1344 int rnode_err = 0; 1345 1346 vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr, 1347 dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh)); 1348 1349 if (e.error) 1350 PURGE_ATTRCACHE4(vp); 1351 /* 1352 * For the newly created vp case, make sure the rnode 1353 * isn't bad before using it. 1354 */ 1355 mutex_enter(&(VTOR4(vp))->r_statelock); 1356 if (VTOR4(vp)->r_flags & R4RECOVERR) 1357 rnode_err = EIO; 1358 mutex_exit(&(VTOR4(vp))->r_statelock); 1359 1360 if (rnode_err) { 1361 nfs4_end_open_seqid_sync(oop); 1362 nfs4args_copen_free(open_args); 1363 if (setgid_flag) { 1364 nfs4args_verify_free(&argop[8]); 1365 nfs4args_setattr_free(&argop[9]); 1366 } 1367 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1368 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1369 needrecov); 1370 open_owner_rele(oop); 1371 VN_RELE(vp); 1372 if (ncr != NULL) 1373 crfree(ncr); 1374 sfh4_rele(&otw_sfh); 1375 kmem_free(argop, argoplist_size); 1376 return (EIO); 1377 } 1378 } else { 1379 vp = vpi; 1380 } 1381 sfh4_rele(&otw_sfh); 1382 1383 /* 1384 * It seems odd to get a full set of attrs and then not update 1385 * the object's attrcache in the non-create case. Create case uses 1386 * the attrs since makenfs4node checks to see if the attrs need to 1387 * be updated (and then updates them). The non-create case should 1388 * update attrs also. 1389 */ 1390 if (! create_flag && ! 
fh_differs && !e.error) { 1391 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 1392 } 1393 1394 nfs4_error_zinit(&e); 1395 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 1396 /* This does not do recovery for vp explicitly. */ 1397 nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE, 1398 &retry_open, oop, FALSE, &e, &num_bseqid_retry); 1399 1400 if (e.error || e.stat) { 1401 nfs4_end_open_seqid_sync(oop); 1402 nfs4args_copen_free(open_args); 1403 if (setgid_flag) { 1404 nfs4args_verify_free(&argop[8]); 1405 nfs4args_setattr_free(&argop[9]); 1406 } 1407 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1408 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, 1409 needrecov); 1410 open_owner_rele(oop); 1411 if (create_flag || fh_differs) { 1412 /* rele the makenfs4node */ 1413 VN_RELE(vp); 1414 } 1415 if (ncr != NULL) { 1416 crfree(ncr); 1417 ncr = NULL; 1418 } 1419 if (retry_open == TRUE) { 1420 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1421 "nfs4open_otw: retry the open since OPEN " 1422 "CONFIRM failed with error %d stat %d", 1423 e.error, e.stat)); 1424 if (create_flag && createmode == GUARDED4) { 1425 NFS4_DEBUG(nfs4_client_recov_debug, 1426 (CE_NOTE, "nfs4open_otw: switch " 1427 "createmode from GUARDED4 to " 1428 "UNCHECKED4")); 1429 createmode = UNCHECKED4; 1430 } 1431 goto recov_retry; 1432 } 1433 if (!e.error) { 1434 if (create_flag && (createmode != EXCLUSIVE4) && 1435 e.stat == NFS4ERR_BADOWNER) 1436 nfs4_log_badowner(VTOMI4(dvp), OP_OPEN); 1437 1438 e.error = geterrno4(e.stat); 1439 } 1440 kmem_free(argop, argoplist_size); 1441 return (e.error); 1442 } 1443 } 1444 1445 rp = VTOR4(vp); 1446 1447 mutex_enter(&rp->r_statev4_lock); 1448 if (create_flag) 1449 rp->created_v4 = 1; 1450 mutex_exit(&rp->r_statev4_lock); 1451 1452 mutex_enter(&oop->oo_lock); 1453 /* Doesn't matter if 'oo_just_created' already was set as this */ 1454 oop->oo_just_created = NFS4_PERM_CREATED; 1455 if (oop->oo_cred_otw) 1456 crfree(oop->oo_cred_otw); 1457 
oop->oo_cred_otw = cred_otw; 1458 crhold(oop->oo_cred_otw); 1459 mutex_exit(&oop->oo_lock); 1460 1461 /* returns with 'os_sync_lock' held */ 1462 osp = find_or_create_open_stream(oop, rp, &created_osp); 1463 if (!osp) { 1464 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1465 "nfs4open_otw: failed to create an open stream")); 1466 NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: " 1467 "signal our end of use of the open seqid")); 1468 1469 nfs4_end_open_seqid_sync(oop); 1470 open_owner_rele(oop); 1471 nfs4args_copen_free(open_args); 1472 if (setgid_flag) { 1473 nfs4args_verify_free(&argop[8]); 1474 nfs4args_setattr_free(&argop[9]); 1475 } 1476 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1477 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1478 if (create_flag || fh_differs) 1479 VN_RELE(vp); 1480 if (ncr != NULL) 1481 crfree(ncr); 1482 1483 kmem_free(argop, argoplist_size); 1484 return (EINVAL); 1485 1486 } 1487 1488 osp->open_stateid = op_res->stateid; 1489 1490 if (open_flag & FREAD) 1491 osp->os_share_acc_read++; 1492 if (open_flag & FWRITE) 1493 osp->os_share_acc_write++; 1494 osp->os_share_deny_none++; 1495 1496 /* 1497 * Need to reset this bitfield for the possible case where we were 1498 * going to OTW CLOSE the file, got a non-recoverable error, and before 1499 * we could retry the CLOSE, OPENed the file again. 
1500 */ 1501 ASSERT(osp->os_open_owner->oo_seqid_inuse); 1502 osp->os_final_close = 0; 1503 osp->os_force_close = 0; 1504 #ifdef DEBUG 1505 if (osp->os_failed_reopen) 1506 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:" 1507 " clearing os_failed_reopen for osp %p, cr %p, rp %s", 1508 (void *)osp, (void *)cr, rnode4info(rp))); 1509 #endif 1510 osp->os_failed_reopen = 0; 1511 1512 mutex_exit(&osp->os_sync_lock); 1513 1514 nfs4_end_open_seqid_sync(oop); 1515 1516 if (created_osp && recov_state.rs_sp != NULL) { 1517 mutex_enter(&recov_state.rs_sp->s_lock); 1518 nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp)); 1519 mutex_exit(&recov_state.rs_sp->s_lock); 1520 } 1521 1522 /* get rid of our reference to find oop */ 1523 open_owner_rele(oop); 1524 1525 open_stream_rele(osp, rp); 1526 1527 /* accept delegation, if any */ 1528 nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw); 1529 1530 nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov); 1531 1532 if (createmode == EXCLUSIVE4 && 1533 (in_va->va_mask & ~(AT_GID | AT_SIZE))) { 1534 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:" 1535 " EXCLUSIVE4: sending a SETATTR")); 1536 /* 1537 * If doing an exclusive create, then generate 1538 * a SETATTR to set the initial attributes. 1539 * Try to set the mtime and the atime to the 1540 * server's current time. It is somewhat 1541 * expected that these fields will be used to 1542 * store the exclusive create cookie. If not, 1543 * server implementors will need to know that 1544 * a SETATTR will follow an exclusive create 1545 * and the cookie should be destroyed if 1546 * appropriate. 1547 * 1548 * The AT_GID and AT_SIZE bits are turned off 1549 * so that the SETATTR request will not attempt 1550 * to process these. The gid will be set 1551 * separately if appropriate. 
The size is turned 1552 * off because it is assumed that a new file will 1553 * be created empty and if the file wasn't empty, 1554 * then the exclusive create will have failed 1555 * because the file must have existed already. 1556 * Therefore, no truncate operation is needed. 1557 */ 1558 in_va->va_mask &= ~(AT_GID | AT_SIZE); 1559 in_va->va_mask |= (AT_MTIME | AT_ATIME); 1560 1561 e.error = nfs4setattr(vp, in_va, 0, cr, NULL); 1562 if (e.error) { 1563 /* 1564 * Couldn't correct the attributes of 1565 * the newly created file and the 1566 * attributes are wrong. Remove the 1567 * file and return an error to the 1568 * application. 1569 */ 1570 /* XXX will this take care of client state ? */ 1571 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 1572 "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:" 1573 " remove file", e.error)); 1574 VN_RELE(vp); 1575 (void) nfs4_remove(dvp, file_name, cr, NULL, 0); 1576 /* 1577 * Since we've reled the vnode and removed 1578 * the file we now need to return the error. 1579 * At this point we don't want to update the 1580 * dircaches, call nfs4_waitfor_purge_complete 1581 * or set vpp to vp so we need to skip these 1582 * as well. 1583 */ 1584 goto skip_update_dircaches; 1585 } 1586 } 1587 1588 /* 1589 * If we created or found the correct vnode, due to create_flag or 1590 * fh_differs being set, then update directory cache attribute, readdir 1591 * and dnlc caches. 1592 */ 1593 if (create_flag || fh_differs) { 1594 dirattr_info_t dinfo, *dinfop; 1595 1596 /* 1597 * Make sure getattr succeeded before using results. 1598 * note: op 7 is getattr(dir) for both flavors of 1599 * open(create). 
1600 */ 1601 if (create_flag && res.status == NFS4_OK) { 1602 dinfo.di_time_call = t; 1603 dinfo.di_cred = cr; 1604 dinfo.di_garp = 1605 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 1606 dinfop = &dinfo; 1607 } else { 1608 dinfop = NULL; 1609 } 1610 1611 nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name, 1612 dinfop); 1613 } 1614 1615 /* 1616 * If the page cache for this file was flushed from actions 1617 * above, it was done asynchronously and if that is true, 1618 * there is a need to wait here for it to complete. This must 1619 * be done outside of start_fop/end_fop. 1620 */ 1621 (void) nfs4_waitfor_purge_complete(vp); 1622 1623 /* 1624 * It is implicit that we are in the open case (create_flag == 0) since 1625 * fh_differs can only be set to a non-zero value in the open case. 1626 */ 1627 if (fh_differs != 0 && vpi != NULL) 1628 VN_RELE(vpi); 1629 1630 /* 1631 * Be sure to set *vpp to the correct value before returning. 1632 */ 1633 *vpp = vp; 1634 1635 skip_update_dircaches: 1636 1637 nfs4args_copen_free(open_args); 1638 if (setgid_flag) { 1639 nfs4args_verify_free(&argop[8]); 1640 nfs4args_setattr_free(&argop[9]); 1641 } 1642 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1643 1644 if (ncr) 1645 crfree(ncr); 1646 kmem_free(argop, argoplist_size); 1647 return (e.error); 1648 } 1649 1650 /* 1651 * Reopen an open instance. cf. nfs4open_otw(). 1652 * 1653 * Errors are returned by the nfs4_error_t parameter. 1654 * - ep->error contains an errno value or zero. 1655 * - if it is zero, ep->stat is set to an NFS status code, if any. 1656 * If the file could not be reopened, but the caller should continue, the 1657 * file is marked dead and no error values are returned. If the caller 1658 * should stop recovering open files and start over, either the ep->error 1659 * value or ep->stat will indicate an error (either something that requires 1660 * recovery or EAGAIN). 
Note that some recovery (e.g., expired volatile 1661 * filehandles) may be handled silently by this routine. 1662 * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state 1663 * will be started, so the caller should not do it. 1664 * 1665 * Gotos: 1666 * - kill_file : reopen failed in such a fashion to constitute marking the 1667 * file dead and setting the open stream's 'os_failed_reopen' as 1. This 1668 * is for cases where recovery is not possible. 1669 * - failed_reopen : same as above, except that the file has already been 1670 * marked dead, so no need to do it again. 1671 * - bailout : reopen failed but we are able to recover and retry the reopen - 1672 * either within this function immediately or via the calling function. 1673 */ 1674 1675 void 1676 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep, 1677 open_claim_type4 claim, bool_t frc_use_claim_previous, 1678 bool_t is_recov) 1679 { 1680 COMPOUND4args_clnt args; 1681 COMPOUND4res_clnt res; 1682 nfs_argop4 argop[4]; 1683 nfs_resop4 *resop; 1684 OPEN4res *op_res = NULL; 1685 OPEN4cargs *open_args; 1686 GETFH4res *gf_res; 1687 rnode4_t *rp = VTOR4(vp); 1688 int doqueue = 1; 1689 cred_t *cr = NULL, *cred_otw = NULL; 1690 nfs4_open_owner_t *oop = NULL; 1691 seqid4 seqid; 1692 nfs4_ga_res_t *garp; 1693 char fn[MAXNAMELEN]; 1694 nfs4_recov_state_t recov = {NULL, 0}; 1695 nfs4_lost_rqst_t lost_rqst; 1696 mntinfo4_t *mi = VTOMI4(vp); 1697 bool_t abort; 1698 char *failed_msg = ""; 1699 int fh_different; 1700 hrtime_t t; 1701 nfs4_bseqid_entry_t *bsep = NULL; 1702 1703 ASSERT(nfs4_consistent_type(vp)); 1704 ASSERT(nfs_zone() == mi->mi_zone); 1705 1706 nfs4_error_zinit(ep); 1707 1708 /* this is the cred used to find the open owner */ 1709 cr = state_to_cred(osp); 1710 if (cr == NULL) { 1711 failed_msg = "Couldn't reopen: no cred"; 1712 goto kill_file; 1713 } 1714 /* use this cred for OTW operations */ 1715 cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner); 1716 1717 top: 
1718 nfs4_error_zinit(ep); 1719 1720 if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) { 1721 /* File system has been unmounted, quit */ 1722 ep->error = EIO; 1723 failed_msg = "Couldn't reopen: file system has been unmounted"; 1724 goto kill_file; 1725 } 1726 1727 oop = osp->os_open_owner; 1728 1729 ASSERT(oop != NULL); 1730 if (oop == NULL) { /* be defensive in non-DEBUG */ 1731 failed_msg = "can't reopen: no open owner"; 1732 goto kill_file; 1733 } 1734 open_owner_hold(oop); 1735 1736 ep->error = nfs4_start_open_seqid_sync(oop, mi); 1737 if (ep->error) { 1738 open_owner_rele(oop); 1739 oop = NULL; 1740 goto bailout; 1741 } 1742 1743 /* 1744 * If the rnode has a delegation and the delegation has been 1745 * recovered and the server didn't request a recall and the caller 1746 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during 1747 * recovery) and the rnode hasn't been marked dead, then install 1748 * the delegation stateid in the open stream. Otherwise, proceed 1749 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN. 1750 */ 1751 mutex_enter(&rp->r_statev4_lock); 1752 if (rp->r_deleg_type != OPEN_DELEGATE_NONE && 1753 !rp->r_deleg_return_pending && 1754 (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) && 1755 !rp->r_deleg_needs_recall && 1756 claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous && 1757 !(rp->r_flags & R4RECOVERR)) { 1758 mutex_enter(&osp->os_sync_lock); 1759 osp->os_delegation = 1; 1760 osp->open_stateid = rp->r_deleg_stateid; 1761 mutex_exit(&osp->os_sync_lock); 1762 mutex_exit(&rp->r_statev4_lock); 1763 goto bailout; 1764 } 1765 mutex_exit(&rp->r_statev4_lock); 1766 1767 /* 1768 * If the file failed recovery, just quit. This failure need not 1769 * affect other reopens, so don't return an error. 
1770 */ 1771 mutex_enter(&rp->r_statelock); 1772 if (rp->r_flags & R4RECOVERR) { 1773 mutex_exit(&rp->r_statelock); 1774 ep->error = 0; 1775 goto failed_reopen; 1776 } 1777 mutex_exit(&rp->r_statelock); 1778 1779 /* 1780 * argop is empty here 1781 * 1782 * PUTFH, OPEN, GETATTR 1783 */ 1784 args.ctag = TAG_REOPEN; 1785 args.array_len = 4; 1786 args.array = argop; 1787 1788 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 1789 "nfs4_reopen: file is type %d, id %s", 1790 vp->v_type, rnode4info(VTOR4(vp)))); 1791 1792 argop[0].argop = OP_CPUTFH; 1793 1794 if (claim != CLAIM_PREVIOUS) { 1795 /* 1796 * if this is a file mount then 1797 * use the mntinfo parentfh 1798 */ 1799 argop[0].nfs_argop4_u.opcputfh.sfh = 1800 (vp->v_flag & VROOT) ? mi->mi_srvparentfh : 1801 VTOSV(vp)->sv_dfh; 1802 } else { 1803 /* putfh fh to reopen */ 1804 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 1805 } 1806 1807 argop[1].argop = OP_COPEN; 1808 open_args = &argop[1].nfs_argop4_u.opcopen; 1809 open_args->claim = claim; 1810 1811 if (claim == CLAIM_NULL) { 1812 1813 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1814 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1815 "failed for vp 0x%p for CLAIM_NULL with %m", 1816 (void *)vp); 1817 failed_msg = "Couldn't reopen: vtoname failed for " 1818 "CLAIM_NULL"; 1819 /* nothing allocated yet */ 1820 goto kill_file; 1821 } 1822 1823 open_args->open_claim4_u.cfile = fn; 1824 } else if (claim == CLAIM_PREVIOUS) { 1825 1826 /* 1827 * We have two cases to deal with here: 1828 * 1) We're being called to reopen files in order to satisfy 1829 * a lock operation request which requires us to explicitly 1830 * reopen files which were opened under a delegation. If 1831 * we're in recovery, we *must* use CLAIM_PREVIOUS. In 1832 * that case, frc_use_claim_previous is TRUE and we must 1833 * use the rnode's current delegation type (r_deleg_type). 1834 * 2) We're reopening files during some form of recovery. 
1835 * In this case, frc_use_claim_previous is FALSE and we 1836 * use the delegation type appropriate for recovery 1837 * (r_deleg_needs_recovery). 1838 */ 1839 mutex_enter(&rp->r_statev4_lock); 1840 open_args->open_claim4_u.delegate_type = 1841 frc_use_claim_previous ? 1842 rp->r_deleg_type : 1843 rp->r_deleg_needs_recovery; 1844 mutex_exit(&rp->r_statev4_lock); 1845 1846 } else if (claim == CLAIM_DELEGATE_CUR) { 1847 1848 if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) { 1849 nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname " 1850 "failed for vp 0x%p for CLAIM_DELEGATE_CUR " 1851 "with %m", (void *)vp); 1852 failed_msg = "Couldn't reopen: vtoname failed for " 1853 "CLAIM_DELEGATE_CUR"; 1854 /* nothing allocated yet */ 1855 goto kill_file; 1856 } 1857 1858 mutex_enter(&rp->r_statev4_lock); 1859 open_args->open_claim4_u.delegate_cur_info.delegate_stateid = 1860 rp->r_deleg_stateid; 1861 mutex_exit(&rp->r_statev4_lock); 1862 1863 open_args->open_claim4_u.delegate_cur_info.cfile = fn; 1864 } 1865 open_args->opentype = OPEN4_NOCREATE; 1866 open_args->owner.clientid = mi2clientid(mi); 1867 open_args->owner.owner_len = sizeof (oop->oo_name); 1868 open_args->owner.owner_val = 1869 kmem_alloc(open_args->owner.owner_len, KM_SLEEP); 1870 bcopy(&oop->oo_name, open_args->owner.owner_val, 1871 open_args->owner.owner_len); 1872 open_args->share_access = 0; 1873 open_args->share_deny = 0; 1874 1875 mutex_enter(&osp->os_sync_lock); 1876 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp " 1877 "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: " 1878 "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ", 1879 (void *)osp, (void *)rp, osp->os_share_acc_read, 1880 osp->os_share_acc_write, osp->os_open_ref_count, 1881 osp->os_mmap_read, osp->os_mmap_write, claim)); 1882 1883 if (osp->os_share_acc_read || osp->os_mmap_read) 1884 open_args->share_access |= OPEN4_SHARE_ACCESS_READ; 1885 if (osp->os_share_acc_write || osp->os_mmap_write) 1886 
open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE; 1887 if (osp->os_share_deny_read) 1888 open_args->share_deny |= OPEN4_SHARE_DENY_READ; 1889 if (osp->os_share_deny_write) 1890 open_args->share_deny |= OPEN4_SHARE_DENY_WRITE; 1891 mutex_exit(&osp->os_sync_lock); 1892 1893 seqid = nfs4_get_open_seqid(oop) + 1; 1894 open_args->seqid = seqid; 1895 1896 /* Construct the getfh part of the compound */ 1897 argop[2].argop = OP_GETFH; 1898 1899 /* Construct the getattr part of the compound */ 1900 argop[3].argop = OP_GETATTR; 1901 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 1902 argop[3].nfs_argop4_u.opgetattr.mi = mi; 1903 1904 t = gethrtime(); 1905 1906 rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep); 1907 1908 if (ep->error) { 1909 if (!is_recov && !frc_use_claim_previous && 1910 (ep->error == EINTR || ep->error == ETIMEDOUT || 1911 NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) { 1912 nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop, 1913 cred_otw, vp, NULL, open_args); 1914 abort = nfs4_start_recovery(ep, 1915 VTOMI4(vp), vp, NULL, NULL, 1916 lost_rqst.lr_op == OP_OPEN ? 
1917 &lost_rqst : NULL, OP_OPEN, NULL, NULL, NULL); 1918 nfs4args_copen_free(open_args); 1919 goto bailout; 1920 } 1921 1922 nfs4args_copen_free(open_args); 1923 1924 if (ep->error == EACCES && cred_otw != cr) { 1925 crfree(cred_otw); 1926 cred_otw = cr; 1927 crhold(cred_otw); 1928 nfs4_end_open_seqid_sync(oop); 1929 open_owner_rele(oop); 1930 oop = NULL; 1931 goto top; 1932 } 1933 if (ep->error == ETIMEDOUT) 1934 goto bailout; 1935 failed_msg = "Couldn't reopen: rpc error"; 1936 goto kill_file; 1937 } 1938 1939 if (nfs4_need_to_bump_seqid(&res)) 1940 nfs4_set_open_seqid(seqid, oop, args.ctag); 1941 1942 switch (res.status) { 1943 case NFS4_OK: 1944 if (recov.rs_flags & NFS4_RS_DELAY_MSG) { 1945 mutex_enter(&rp->r_statelock); 1946 rp->r_delay_interval = 0; 1947 mutex_exit(&rp->r_statelock); 1948 } 1949 break; 1950 case NFS4ERR_BAD_SEQID: 1951 bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0, 1952 args.ctag, open_args->seqid); 1953 1954 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, 1955 NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst : 1956 NULL, OP_OPEN, bsep, NULL, NULL); 1957 1958 nfs4args_copen_free(open_args); 1959 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1960 nfs4_end_open_seqid_sync(oop); 1961 open_owner_rele(oop); 1962 oop = NULL; 1963 kmem_free(bsep, sizeof (*bsep)); 1964 1965 goto kill_file; 1966 case NFS4ERR_NO_GRACE: 1967 nfs4args_copen_free(open_args); 1968 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1969 nfs4_end_open_seqid_sync(oop); 1970 open_owner_rele(oop); 1971 oop = NULL; 1972 if (claim == CLAIM_PREVIOUS) { 1973 /* 1974 * Retry as a plain open. We don't need to worry about 1975 * checking the changeinfo: it is acceptable for a 1976 * client to re-open a file and continue processing 1977 * (in the absence of locks). 
1978 */ 1979 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 1980 "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; " 1981 "will retry as CLAIM_NULL")); 1982 claim = CLAIM_NULL; 1983 nfs4_mi_kstat_inc_no_grace(mi); 1984 goto top; 1985 } 1986 failed_msg = 1987 "Couldn't reopen: tried reclaim outside grace period. "; 1988 goto kill_file; 1989 case NFS4ERR_GRACE: 1990 nfs4_set_grace_wait(mi); 1991 nfs4args_copen_free(open_args); 1992 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 1993 nfs4_end_open_seqid_sync(oop); 1994 open_owner_rele(oop); 1995 oop = NULL; 1996 ep->error = nfs4_wait_for_grace(mi, &recov); 1997 if (ep->error != 0) 1998 goto bailout; 1999 goto top; 2000 case NFS4ERR_DELAY: 2001 nfs4_set_delay_wait(vp); 2002 nfs4args_copen_free(open_args); 2003 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2004 nfs4_end_open_seqid_sync(oop); 2005 open_owner_rele(oop); 2006 oop = NULL; 2007 ep->error = nfs4_wait_for_delay(vp, &recov); 2008 nfs4_mi_kstat_inc_delay(mi); 2009 if (ep->error != 0) 2010 goto bailout; 2011 goto top; 2012 case NFS4ERR_FHEXPIRED: 2013 /* recover filehandle and retry */ 2014 abort = nfs4_start_recovery(ep, 2015 mi, vp, NULL, NULL, NULL, OP_OPEN, NULL, NULL, NULL); 2016 nfs4args_copen_free(open_args); 2017 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2018 nfs4_end_open_seqid_sync(oop); 2019 open_owner_rele(oop); 2020 oop = NULL; 2021 if (abort == FALSE) 2022 goto top; 2023 failed_msg = "Couldn't reopen: recovery aborted"; 2024 goto kill_file; 2025 case NFS4ERR_RESOURCE: 2026 case NFS4ERR_STALE_CLIENTID: 2027 case NFS4ERR_WRONGSEC: 2028 case NFS4ERR_EXPIRED: 2029 /* 2030 * Do not mark the file dead and let the calling 2031 * function initiate recovery. 
2032 */ 2033 nfs4args_copen_free(open_args); 2034 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2035 nfs4_end_open_seqid_sync(oop); 2036 open_owner_rele(oop); 2037 oop = NULL; 2038 goto bailout; 2039 case NFS4ERR_ACCESS: 2040 if (cred_otw != cr) { 2041 crfree(cred_otw); 2042 cred_otw = cr; 2043 crhold(cred_otw); 2044 nfs4args_copen_free(open_args); 2045 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2046 nfs4_end_open_seqid_sync(oop); 2047 open_owner_rele(oop); 2048 oop = NULL; 2049 goto top; 2050 } 2051 /* fall through */ 2052 default: 2053 NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE, 2054 "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s", 2055 (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv, 2056 rnode4info(VTOR4(vp)))); 2057 failed_msg = "Couldn't reopen: NFSv4 error"; 2058 nfs4args_copen_free(open_args); 2059 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2060 goto kill_file; 2061 } 2062 2063 resop = &res.array[1]; /* open res */ 2064 op_res = &resop->nfs_resop4_u.opopen; 2065 2066 garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res; 2067 2068 /* 2069 * Check if the path we reopened really is the same 2070 * file. We could end up in a situation where the file 2071 * was removed and a new file created with the same name. 
2072 */ 2073 resop = &res.array[2]; 2074 gf_res = &resop->nfs_resop4_u.opgetfh; 2075 (void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0); 2076 fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0); 2077 if (fh_different) { 2078 if (mi->mi_fh_expire_type == FH4_PERSISTENT || 2079 mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) { 2080 /* Oops, we don't have the same file */ 2081 if (mi->mi_fh_expire_type == FH4_PERSISTENT) 2082 failed_msg = "Couldn't reopen: Persistent " 2083 "file handle changed"; 2084 else 2085 failed_msg = "Couldn't reopen: Volatile " 2086 "(no expire on open) file handle changed"; 2087 2088 nfs4args_copen_free(open_args); 2089 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2090 nfs_rw_exit(&mi->mi_fh_lock); 2091 goto kill_file; 2092 2093 } else { 2094 /* 2095 * We have volatile file handles that don't compare. 2096 * If the fids are the same then we assume that the 2097 * file handle expired but the rnode still refers to 2098 * the same file object. 2099 * 2100 * First check that we have fids or not. 2101 * If we don't we have a dumb server so we will 2102 * just assume every thing is ok for now. 2103 */ 2104 if (!ep->error && garp->n4g_va.va_mask & AT_NODEID && 2105 rp->r_attr.va_mask & AT_NODEID && 2106 rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) { 2107 /* 2108 * We have fids, but they don't 2109 * compare. So kill the file. 2110 */ 2111 failed_msg = 2112 "Couldn't reopen: file handle changed" 2113 " due to mismatched fids"; 2114 nfs4args_copen_free(open_args); 2115 (void) xdr_free(xdr_COMPOUND4res_clnt, 2116 (caddr_t)&res); 2117 nfs_rw_exit(&mi->mi_fh_lock); 2118 goto kill_file; 2119 } else { 2120 /* 2121 * We have volatile file handles that refers 2122 * to the same file (at least they have the 2123 * same fid) or we don't have fids so we 2124 * can't tell. :(. We'll be a kind and accepting 2125 * client so we'll update the rnode's file 2126 * handle with the otw handle. 
2127 * 2128 * We need to drop mi->mi_fh_lock since 2129 * sh4_update acquires it. Since there is 2130 * only one recovery thread there is no 2131 * race. 2132 */ 2133 nfs_rw_exit(&mi->mi_fh_lock); 2134 sfh4_update(rp->r_fh, &gf_res->object); 2135 } 2136 } 2137 } else { 2138 nfs_rw_exit(&mi->mi_fh_lock); 2139 } 2140 2141 ASSERT(nfs4_consistent_type(vp)); 2142 2143 /* 2144 * If the server wanted an OPEN_CONFIRM but that fails, just start 2145 * over. Presumably if there is a persistent error it will show up 2146 * when we resend the OPEN. 2147 */ 2148 if (op_res->rflags & OPEN4_RESULT_CONFIRM) { 2149 bool_t retry_open = FALSE; 2150 2151 nfs4open_confirm(vp, &seqid, &op_res->stateid, 2152 cred_otw, is_recov, &retry_open, 2153 oop, FALSE, ep, NULL); 2154 if (ep->error || ep->stat) { 2155 nfs4args_copen_free(open_args); 2156 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2157 nfs4_end_open_seqid_sync(oop); 2158 open_owner_rele(oop); 2159 oop = NULL; 2160 goto top; 2161 } 2162 } 2163 2164 mutex_enter(&osp->os_sync_lock); 2165 osp->open_stateid = op_res->stateid; 2166 osp->os_delegation = 0; 2167 /* 2168 * Need to reset this bitfield for the possible case where we were 2169 * going to OTW CLOSE the file, got a non-recoverable error, and before 2170 * we could retry the CLOSE, OPENed the file again. 
2171 */ 2172 ASSERT(osp->os_open_owner->oo_seqid_inuse); 2173 osp->os_final_close = 0; 2174 osp->os_force_close = 0; 2175 if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS) 2176 osp->os_dc_openacc = open_args->share_access; 2177 mutex_exit(&osp->os_sync_lock); 2178 2179 nfs4_end_open_seqid_sync(oop); 2180 2181 /* accept delegation, if any */ 2182 nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw); 2183 2184 nfs4args_copen_free(open_args); 2185 2186 nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL); 2187 2188 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 2189 2190 ASSERT(nfs4_consistent_type(vp)); 2191 2192 open_owner_rele(oop); 2193 crfree(cr); 2194 crfree(cred_otw); 2195 return; 2196 2197 kill_file: 2198 nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat); 2199 failed_reopen: 2200 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 2201 "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s", 2202 (void *)osp, (void *)cr, rnode4info(rp))); 2203 mutex_enter(&osp->os_sync_lock); 2204 osp->os_failed_reopen = 1; 2205 mutex_exit(&osp->os_sync_lock); 2206 bailout: 2207 if (oop != NULL) { 2208 nfs4_end_open_seqid_sync(oop); 2209 open_owner_rele(oop); 2210 } 2211 if (cr != NULL) 2212 crfree(cr); 2213 if (cred_otw != NULL) 2214 crfree(cred_otw); 2215 } 2216 2217 /* for . and .. OPENs */ 2218 /* ARGSUSED */ 2219 static int 2220 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr) 2221 { 2222 rnode4_t *rp; 2223 nfs4_ga_res_t gar; 2224 2225 ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone); 2226 2227 /* 2228 * If close-to-open consistency checking is turned off or 2229 * if there is no cached data, we can avoid 2230 * the over the wire getattr. Otherwise, force a 2231 * call to the server to get fresh attributes and to 2232 * check caches. This is required for close-to-open 2233 * consistency. 
2234 */ 2235 rp = VTOR4(*vpp); 2236 if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO || 2237 (rp->r_dir == NULL && !nfs4_has_pages(*vpp))) 2238 return (0); 2239 2240 return (nfs4_getattr_otw(*vpp, &gar, cr, 0)); 2241 } 2242 2243 /* 2244 * CLOSE a file 2245 */ 2246 /* ARGSUSED */ 2247 static int 2248 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, 2249 caller_context_t *ct) 2250 { 2251 rnode4_t *rp; 2252 int error = 0; 2253 int r_error = 0; 2254 int n4error = 0; 2255 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 2256 2257 /* 2258 * Remove client state for this (lockowner, file) pair. 2259 * Issue otw v4 call to have the server do the same. 2260 */ 2261 2262 rp = VTOR4(vp); 2263 2264 /* 2265 * zone_enter(2) prevents processes from changing zones with NFS files 2266 * open; if we happen to get here from the wrong zone we can't do 2267 * anything over the wire. 2268 */ 2269 if (VTOMI4(vp)->mi_zone != nfs_zone()) { 2270 /* 2271 * We could attempt to clean up locks, except we're sure 2272 * that the current process didn't acquire any locks on 2273 * the file: any attempt to lock a file belong to another zone 2274 * will fail, and one can't lock an NFS file and then change 2275 * zones, as that fails too. 2276 * 2277 * Returning an error here is the sane thing to do. A 2278 * subsequent call to VN_RELE() which translates to a 2279 * nfs4_inactive() will clean up state: if the zone of the 2280 * vnode's origin is still alive and kicking, the inactive 2281 * thread will handle the request (from the correct zone), and 2282 * everything (minus the OTW close call) should be OK. If the 2283 * zone is going away nfs4_async_inactive() will throw away 2284 * delegations, open streams and cached pages inline. 2285 */ 2286 return (EIO); 2287 } 2288 2289 /* 2290 * If we are using local locking for this filesystem, then 2291 * release all of the SYSV style record locks. 
Otherwise, 2292 * we are doing network locking and we need to release all 2293 * of the network locks. All of the locks held by this 2294 * process on this file are released no matter what the 2295 * incoming reference count is. 2296 */ 2297 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) { 2298 cleanlocks(vp, ttoproc(curthread)->p_pid, 0); 2299 cleanshares(vp, ttoproc(curthread)->p_pid); 2300 } else 2301 e.error = nfs4_lockrelease(vp, flag, offset, cr); 2302 2303 if (e.error) { 2304 struct lm_sysid *lmsid; 2305 lmsid = nfs4_find_sysid(VTOMI4(vp)); 2306 if (lmsid == NULL) { 2307 DTRACE_PROBE2(unknown__sysid, int, e.error, 2308 vnode_t *, vp); 2309 } else { 2310 cleanlocks(vp, ttoproc(curthread)->p_pid, 2311 (lm_sysidt(lmsid) | LM_SYSID_CLIENT)); 2312 2313 lm_rel_sysid(lmsid); 2314 } 2315 return (e.error); 2316 } 2317 2318 if (count > 1) 2319 return (0); 2320 2321 /* 2322 * If the file has been `unlinked', then purge the 2323 * DNLC so that this vnode will get reycled quicker 2324 * and the .nfs* file on the server will get removed. 2325 */ 2326 if (rp->r_unldvp != NULL) 2327 dnlc_purge_vp(vp); 2328 2329 /* 2330 * If the file was open for write and there are pages, 2331 * do a synchronous flush and commit of all of the 2332 * dirty and uncommitted pages. 2333 */ 2334 ASSERT(!e.error); 2335 if ((flag & FWRITE) && nfs4_has_pages(vp)) 2336 error = nfs4_putpage_commit(vp, 0, 0, cr); 2337 2338 mutex_enter(&rp->r_statelock); 2339 r_error = rp->r_error; 2340 rp->r_error = 0; 2341 mutex_exit(&rp->r_statelock); 2342 2343 /* 2344 * If this file type is one for which no explicit 'open' was 2345 * done, then bail now (ie. no need for protocol 'close'). If 2346 * there was an error w/the vm subsystem, return _that_ error, 2347 * otherwise, return any errors that may've been reported via 2348 * the rnode. 2349 */ 2350 if (vp->v_type != VREG) 2351 return (error ? 
error : r_error); 2352 2353 /* 2354 * The sync putpage commit may have failed above, but since 2355 * we're working w/a regular file, we need to do the protocol 2356 * 'close' (nfs4close_one will figure out if an otw close is 2357 * needed or not). Report any errors _after_ doing the protocol 2358 * 'close'. 2359 */ 2360 nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0); 2361 n4error = e.error ? e.error : geterrno4(e.stat); 2362 2363 /* 2364 * Error reporting prio (Hi -> Lo) 2365 * 2366 * i) nfs4_putpage_commit (error) 2367 * ii) rnode's (r_error) 2368 * iii) nfs4close_one (n4error) 2369 */ 2370 return (error ? error : (r_error ? r_error : n4error)); 2371 } 2372 2373 /* 2374 * Initialize *lost_rqstp. 2375 */ 2376 2377 static void 2378 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp, 2379 nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr, 2380 vnode_t *vp) 2381 { 2382 if (error != ETIMEDOUT && error != EINTR && 2383 !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 2384 lost_rqstp->lr_op = 0; 2385 return; 2386 } 2387 2388 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 2389 "nfs4close_save_lost_rqst: error %d", error)); 2390 2391 lost_rqstp->lr_op = OP_CLOSE; 2392 /* 2393 * The vp is held and rele'd via the recovery code. 2394 * See nfs4_save_lost_rqst. 2395 */ 2396 lost_rqstp->lr_vp = vp; 2397 lost_rqstp->lr_dvp = NULL; 2398 lost_rqstp->lr_oop = oop; 2399 lost_rqstp->lr_osp = osp; 2400 ASSERT(osp != NULL); 2401 ASSERT(mutex_owned(&osp->os_sync_lock)); 2402 osp->os_pending_close = 1; 2403 lost_rqstp->lr_lop = NULL; 2404 lost_rqstp->lr_cr = cr; 2405 lost_rqstp->lr_flk = NULL; 2406 lost_rqstp->lr_putfirst = FALSE; 2407 } 2408 2409 /* 2410 * Assumes you already have the open seqid sync grabbed as well as the 2411 * 'os_sync_lock'. Note: this will release the open seqid sync and 2412 * 'os_sync_lock' if client recovery starts. Calling functions have to 2413 * be prepared to handle this. 
/*
 * Send the over-the-wire CLOSE for an open stream.
 *
 * Assumes you already have the open seqid sync grabbed as well as the
 * 'os_sync_lock'.  Note: this will release the open seqid sync and
 * 'os_sync_lock' if client recovery starts.  Calling functions have to
 * be prepared to handle this.
 *
 * The request is a three-op compound: PUTFH (rp's file handle), GETATTR,
 * CLOSE.  The CLOSE carries the open owner's current seqid + 1 and the
 * open stream's current open_stateid.
 *
 * Parameters:
 *   rp			rnode of the file being closed.
 *   cred_otw		credential used for the call and for caching the
 *			returned attributes.
 *   oop		open owner supplying (and receiving) the seqid.
 *   osp		open stream being closed; 'os_sync_lock' is held on
 *			entry.
 *   recov		out: 1 if the CLOSE detected that client recovery
 *			was needed and was started, and the calling function
 *			should retry this function; otherwise 0.
 *   did_start_seqid_syncp
 *			out: set to 0 when this function drops the open
 *			seqid sync (recovery path only).
 *   close_type		selects the compound tag; for CLOSE_RESEND no lost
 *			request is saved, and neither CLOSE_RESEND nor
 *			CLOSE_AFTER_RESEND ever asks the caller to retry.
 *   ep			out: errors are returned via this nfs4_error_t.
 *   have_sync_lockp	out: set to 0 whenever this function drops
 *			'os_sync_lock' (recovery path and success path).
 *
 * On success the open stream's stateid is replaced with the one from the
 * CLOSE result, os_valid is cleared, the reference obtained at OPEN is
 * dropped, and the GETATTR result is fed to nfs4_attr_cache().
 */
static void
nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
    nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
    nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	CLOSE4args *close_args;
	nfs_resop4 *resop;
	nfs_argop4 argop[3];
	int doqueue = 1;
	mntinfo4_t *mi;
	seqid4 seqid;
	vnode_t *vp;
	bool_t needrecov = FALSE;
	nfs4_lost_rqst_t lost_rqst;
	hrtime_t t;

	ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);

	ASSERT(MUTEX_HELD(&osp->os_sync_lock));

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));

	/* Only set this to 1 if recovery is started */
	*recov = 0;

	/* do the OTW call to close the file */

	/* The compound tag identifies which kind of CLOSE this is. */
	if (close_type == CLOSE_RESEND)
		args.ctag = TAG_CLOSE_LOST;
	else if (close_type == CLOSE_AFTER_RESEND)
		args.ctag = TAG_CLOSE_UNDO;
	else
		args.ctag = TAG_CLOSE;

	args.array_len = 3;
	args.array = argop;

	vp = RTOV4(rp);

	mi = VTOMI4(vp);

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* getattr: the result is cached below once the CLOSE succeeds */
	argop[1].argop = OP_GETATTR;
	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[1].nfs_argop4_u.opgetattr.mi = mi;

	/* close */
	argop[2].argop = OP_CLOSE;
	close_args = &argop[2].nfs_argop4_u.opclose;

	/* The request uses the open owner's current seqid plus one. */
	seqid = nfs4_get_open_seqid(oop) + 1;

	close_args->seqid = seqid;
	close_args->open_stateid = osp->open_stateid;

	/*
	 * NOTE(review): needrecov is still FALSE at this point, so this
	 * message always reports "first".
	 */
	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
	    "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(rp)));

	/* Timestamp taken before the call; handed to nfs4_attr_cache(). */
	t = gethrtime();

	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);

	/* Record the new seqid only when the reply says it was consumed. */
	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
		nfs4_set_open_seqid(seqid, oop, args.ctag);
	}

	needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
	if (ep->error && !needrecov) {
		/*
		 * If there was an error and no recovery is to be done
		 * then set up the file to flush its cache if
		 * needed for the next caller.
		 *
		 * 'res' is not freed on this path (ep->error is set).
		 */
		mutex_enter(&rp->r_statelock);
		PURGE_ATTRCACHE4_LOCKED(rp);
		rp->r_flags &= ~R4WRITEMODIFIED;
		mutex_exit(&rp->r_statelock);
		return;
	}

	if (needrecov) {
		bool_t abort;
		nfs4_bseqid_entry_t *bsep = NULL;

		/*
		 * A resend is already a lost request; only save a new one
		 * for the other close types.  lost_rqst is left untouched
		 * (uninitialized) for CLOSE_RESEND, which is why it is
		 * only examined below under the same close_type test.
		 */
		if (close_type != CLOSE_RESEND)
			nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
			    osp, cred_otw, vp);

		/* The server rejected our seqid: describe it for recovery. */
		if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
			bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
			    0, args.ctag, close_args->seqid);

		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_otw: initiating recovery. error %d "
		    "res.status %d", ep->error, res.status));

		/*
		 * Drop the 'os_sync_lock' here so we don't hit
		 * a potential recursive mutex_enter via an
		 * 'open_stream_hold()'.
		 */
		mutex_exit(&osp->os_sync_lock);
		*have_sync_lockp = 0;
		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
		    (close_type != CLOSE_RESEND &&
		    lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
		    OP_CLOSE, bsep, NULL, NULL);

		/* drop open seq sync, and let the calling function regrab it */
		nfs4_end_open_seqid_sync(oop);
		*did_start_seqid_syncp = 0;

		/* The bad-seqid entry is freed here after the call above. */
		if (bsep)
			kmem_free(bsep, sizeof (*bsep));
		/*
		 * For signals, the caller wants to quit, so don't say to
		 * retry.  For forced unmount, if it's a user thread, it
		 * wants to quit.  If it's a recovery thread, the retry
		 * will happen higher-up on the call stack.  Either way,
		 * don't say to retry.
		 */
		if (abort == FALSE && ep->error != EINTR &&
		    !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
		    close_type != CLOSE_RESEND &&
		    close_type != CLOSE_AFTER_RESEND)
			*recov = 1;
		else
			*recov = 0;

		/* Results are freed only when the call itself succeeded. */
		if (!ep->error)
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	/*
	 * The call went through and needs no recovery, but the compound
	 * failed: leave the open stream untouched ('os_sync_lock' still
	 * held) and return with the status left in *ep by the call.
	 */
	if (res.status) {
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		return;
	}

	mutex_enter(&rp->r_statev4_lock);
	rp->created_v4 = 0;
	mutex_exit(&rp->r_statev4_lock);

	/* Adopt the stateid returned by CLOSE and invalidate the stream. */
	resop = &res.array[2];
	osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
	osp->os_valid = 0;

	/*
	 * This removes the reference obtained at OPEN; ie, when the
	 * open stream structure was created.
	 *
	 * We don't have to worry about calling 'open_stream_rele'
	 * since we are currently holding a reference to the open
	 * stream which means the count cannot go to 0 with this
	 * decrement.
	 */
	ASSERT(osp->os_ref_count >= 2);
	osp->os_ref_count--;

	/*
	 * Every path that reaches this point has ep->error == 0 (the
	 * error cases returned above), so 'os_sync_lock' is always dropped
	 * here before the attributes are cached.
	 */
	if (ep->error == 0) {
		mutex_exit(&osp->os_sync_lock);
		*have_sync_lockp = 0;

		nfs4_attr_cache(vp,
		    &res.array[1].nfs_resop4_u.opgetattr.ga_res,
		    t, cred_otw, TRUE, NULL);
	}

	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
	    " returning %d", ep->error));

	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
}
/*
 * Read from an NFSv4 regular file into the caller's uio.
 *
 * The caller must hold rp->r_rwlock as reader (asserted below).
 *
 * Two paths are used:
 *   - a direct path through nfs4read() when the vnode is VNOCACHE, or
 *     when direct I/O is enabled (on the rnode or the mount) and the file
 *     is neither mapped nor has cached pages;
 *   - otherwise a cached path that copies data out MAXBSIZE-aligned
 *     window by window, via vpm_data_copy() when vpm_enable is set and
 *     via segmap_getmapflt()/uiomove() when it is not.
 *
 * Returns 0 on success (including a zero-length request and reads at or
 * past r_size), or an errno:
 *   EISDIR	vnode is not VREG
 *   EIO	called from the wrong zone, or the rnode has R4RECOVERRP
 *		set with no r_error recorded
 *   EINVAL	negative offset, or offset + length is negative
 *   EINTR	interrupted while waiting for a cache purge to finish
 *   other	r_error of a R4RECOVERRP rnode, or whatever
 *		nfs4_validate_caches(), the copy routines or the
 *		release/sync routines return
 */
/* ARGSUSED */
static int
nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
    caller_context_t *ct)
{
	rnode4_t *rp;
	u_offset_t off;
	offset_t diff;
	uint_t on;
	uint_t n;
	caddr_t base;
	uint_t flags;
	int error;
	mntinfo4_t *mi;

	rp = VTOR4(vp);

	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));

	/* Operate on the rnode's own vnode rather than a shadow vnode. */
	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	if (vp->v_type != VREG)
		return (EISDIR);

	mi = VTOMI4(vp);

	if (nfs_zone() != mi->mi_zone)
		return (EIO);

	if (uiop->uio_resid == 0)
		return (0);

	if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
		return (EINVAL);

	/*
	 * Fail the read if the rnode is flagged R4RECOVERRP, using the
	 * recorded r_error when there is one and EIO otherwise.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERRP)
		error = (rp->r_error ? rp->r_error : EIO);
	else
		error = 0;
	mutex_exit(&rp->r_statelock);
	if (error)
		return (error);

	/*
	 * Bypass VM if caching has been disabled (e.g., locking) or if
	 * using client-side direct I/O and the file is not mmap'd and
	 * there are no cached pages.
	 */
	if ((vp->v_flag & VNOCACHE) ||
	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
		size_t resid = 0;

		return (nfs4read(vp, NULL, uiop->uio_loffset,
		    uiop->uio_resid, &resid, cr, FALSE, uiop));
	}

	error = 0;

	/* Cached path: one MAXBSIZE-aligned window per iteration. */
	do {
		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
		n = MIN(MAXBSIZE - on, uiop->uio_resid);

		if (error = nfs4_validate_caches(vp, cr))
			break;

		/*
		 * Wait (interruptibly) while R4INCACHEPURGE is set, then
		 * sample r_size under the lock to clip the read at EOF.
		 */
		mutex_enter(&rp->r_statelock);
		while (rp->r_flags & R4INCACHEPURGE) {
			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
				mutex_exit(&rp->r_statelock);
				return (EINTR);
			}
		}
		diff = rp->r_size - uiop->uio_loffset;
		mutex_exit(&rp->r_statelock);
		/* At or beyond EOF: nothing (more) to read. */
		if (diff <= 0)
			break;
		if (diff < n)
			n = (uint_t)diff;

		if (vpm_enable) {
			/*
			 * Copy data.
			 */
			error = vpm_data_copy(vp, off + on, n, uiop,
			    1, NULL, 0, S_READ);
		} else {
			base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
			    S_READ);

			error = uiomove(base + on, n, UIO_READ, uiop);
		}

		if (!error) {
			/*
			 * If read a whole block or read to eof,
			 * won't need this buffer again soon.
			 */
			mutex_enter(&rp->r_statelock);
			if (n + on == MAXBSIZE ||
			    uiop->uio_loffset == rp->r_size)
				flags = SM_DONTNEED;
			else
				flags = 0;
			mutex_exit(&rp->r_statelock);
			if (vpm_enable) {
				error = vpm_sync_pages(vp, off, n, flags);
			} else {
				error = segmap_release(segkmap, base, flags);
			}
		} else {
			/*
			 * The copy failed: still sync/release the window,
			 * but keep the copy error as the result.
			 */
			if (vpm_enable) {
				(void) vpm_sync_pages(vp, off, n, 0);
			} else {
				(void) segmap_release(segkmap, base, 0);
			}
		}
	} while (!error && uiop->uio_resid > 0);

	return (error);
}
2700 */ 2701 mutex_enter(&rp->r_statelock); 2702 if (n + on == MAXBSIZE || 2703 uiop->uio_loffset == rp->r_size) 2704 flags = SM_DONTNEED; 2705 else 2706 flags = 0; 2707 mutex_exit(&rp->r_statelock); 2708 if (vpm_enable) { 2709 error = vpm_sync_pages(vp, off, n, flags); 2710 } else { 2711 error = segmap_release(segkmap, base, flags); 2712 } 2713 } else { 2714 if (vpm_enable) { 2715 (void) vpm_sync_pages(vp, off, n, 0); 2716 } else { 2717 (void) segmap_release(segkmap, base, 0); 2718 } 2719 } 2720 } while (!error && uiop->uio_resid > 0); 2721 2722 return (error); 2723 } 2724 2725 /* ARGSUSED */ 2726 static int 2727 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr, 2728 caller_context_t *ct) 2729 { 2730 rlim64_t limit = uiop->uio_llimit; 2731 rnode4_t *rp; 2732 u_offset_t off; 2733 caddr_t base; 2734 uint_t flags; 2735 int remainder; 2736 size_t n; 2737 int on; 2738 int error; 2739 int resid; 2740 u_offset_t offset; 2741 mntinfo4_t *mi; 2742 uint_t bsize; 2743 2744 rp = VTOR4(vp); 2745 2746 if (IS_SHADOW(vp, rp)) 2747 vp = RTOV4(rp); 2748 2749 if (vp->v_type != VREG) 2750 return (EISDIR); 2751 2752 mi = VTOMI4(vp); 2753 2754 if (nfs_zone() != mi->mi_zone) 2755 return (EIO); 2756 2757 if (uiop->uio_resid == 0) 2758 return (0); 2759 2760 mutex_enter(&rp->r_statelock); 2761 if (rp->r_flags & R4RECOVERRP) 2762 error = (rp->r_error ? rp->r_error : EIO); 2763 else 2764 error = 0; 2765 mutex_exit(&rp->r_statelock); 2766 if (error) 2767 return (error); 2768 2769 if (ioflag & FAPPEND) { 2770 struct vattr va; 2771 2772 /* 2773 * Must serialize if appending. 
2774 */ 2775 if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) { 2776 nfs_rw_exit(&rp->r_rwlock); 2777 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, 2778 INTR4(vp))) 2779 return (EINTR); 2780 } 2781 2782 va.va_mask = AT_SIZE; 2783 error = nfs4getattr(vp, &va, cr); 2784 if (error) 2785 return (error); 2786 uiop->uio_loffset = va.va_size; 2787 } 2788 2789 offset = uiop->uio_loffset + uiop->uio_resid; 2790 2791 if (uiop->uio_loffset < (offset_t)0 || offset < 0) 2792 return (EINVAL); 2793 2794 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) 2795 limit = MAXOFFSET_T; 2796 2797 /* 2798 * Check to make sure that the process will not exceed 2799 * its limit on file size. It is okay to write up to 2800 * the limit, but not beyond. Thus, the write which 2801 * reaches the limit will be short and the next write 2802 * will return an error. 2803 */ 2804 remainder = 0; 2805 if (offset > uiop->uio_llimit) { 2806 remainder = offset - uiop->uio_llimit; 2807 uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset; 2808 if (uiop->uio_resid <= 0) { 2809 proc_t *p = ttoproc(curthread); 2810 2811 uiop->uio_resid += remainder; 2812 mutex_enter(&p->p_lock); 2813 (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], 2814 p->p_rctls, p, RCA_UNSAFE_SIGINFO); 2815 mutex_exit(&p->p_lock); 2816 return (EFBIG); 2817 } 2818 } 2819 2820 /* update the change attribute, if we have a write delegation */ 2821 2822 mutex_enter(&rp->r_statev4_lock); 2823 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) 2824 rp->r_deleg_change++; 2825 2826 mutex_exit(&rp->r_statev4_lock); 2827 2828 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, INTR4(vp))) 2829 return (EINTR); 2830 2831 /* 2832 * Bypass VM if caching has been disabled (e.g., locking) or if 2833 * using client-side direct I/O and the file is not mmap'd and 2834 * there are no cached pages. 
2835 */ 2836 if ((vp->v_flag & VNOCACHE) || 2837 (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) && 2838 rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) { 2839 size_t bufsize; 2840 int count; 2841 u_offset_t org_offset; 2842 stable_how4 stab_comm; 2843 nfs4_fwrite: 2844 if (rp->r_flags & R4STALE) { 2845 resid = uiop->uio_resid; 2846 offset = uiop->uio_loffset; 2847 error = rp->r_error; 2848 /* 2849 * A close may have cleared r_error, if so, 2850 * propagate ESTALE error return properly 2851 */ 2852 if (error == 0) 2853 error = ESTALE; 2854 goto bottom; 2855 } 2856 2857 bufsize = MIN(uiop->uio_resid, mi->mi_stsize); 2858 base = kmem_alloc(bufsize, KM_SLEEP); 2859 do { 2860 if (ioflag & FDSYNC) 2861 stab_comm = DATA_SYNC4; 2862 else 2863 stab_comm = FILE_SYNC4; 2864 resid = uiop->uio_resid; 2865 offset = uiop->uio_loffset; 2866 count = MIN(uiop->uio_resid, bufsize); 2867 org_offset = uiop->uio_loffset; 2868 error = uiomove(base, count, UIO_WRITE, uiop); 2869 if (!error) { 2870 error = nfs4write(vp, base, org_offset, 2871 count, cr, &stab_comm); 2872 if (!error) { 2873 mutex_enter(&rp->r_statelock); 2874 if (rp->r_size < uiop->uio_loffset) 2875 rp->r_size = uiop->uio_loffset; 2876 mutex_exit(&rp->r_statelock); 2877 } 2878 } 2879 } while (!error && uiop->uio_resid > 0); 2880 kmem_free(base, bufsize); 2881 goto bottom; 2882 } 2883 2884 bsize = vp->v_vfsp->vfs_bsize; 2885 2886 do { 2887 off = uiop->uio_loffset & MAXBMASK; /* mapping offset */ 2888 on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */ 2889 n = MIN(MAXBSIZE - on, uiop->uio_resid); 2890 2891 resid = uiop->uio_resid; 2892 offset = uiop->uio_loffset; 2893 2894 if (rp->r_flags & R4STALE) { 2895 error = rp->r_error; 2896 /* 2897 * A close may have cleared r_error, if so, 2898 * propagate ESTALE error return properly 2899 */ 2900 if (error == 0) 2901 error = ESTALE; 2902 break; 2903 } 2904 2905 /* 2906 * Don't create dirty pages faster than they 2907 * can be cleaned so that 
the system doesn't 2908 * get imbalanced. If the async queue is 2909 * maxed out, then wait for it to drain before 2910 * creating more dirty pages. Also, wait for 2911 * any threads doing pagewalks in the vop_getattr 2912 * entry points so that they don't block for 2913 * long periods. 2914 */ 2915 mutex_enter(&rp->r_statelock); 2916 while ((mi->mi_max_threads != 0 && 2917 rp->r_awcount > 2 * mi->mi_max_threads) || 2918 rp->r_gcount > 0) { 2919 if (INTR4(vp)) { 2920 klwp_t *lwp = ttolwp(curthread); 2921 2922 if (lwp != NULL) 2923 lwp->lwp_nostop++; 2924 if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { 2925 mutex_exit(&rp->r_statelock); 2926 if (lwp != NULL) 2927 lwp->lwp_nostop--; 2928 error = EINTR; 2929 goto bottom; 2930 } 2931 if (lwp != NULL) 2932 lwp->lwp_nostop--; 2933 } else 2934 cv_wait(&rp->r_cv, &rp->r_statelock); 2935 } 2936 mutex_exit(&rp->r_statelock); 2937 2938 /* 2939 * Touch the page and fault it in if it is not in core 2940 * before segmap_getmapflt or vpm_data_copy can lock it. 2941 * This is to avoid the deadlock if the buffer is mapped 2942 * to the same file through mmap which we want to write. 2943 */ 2944 uio_prefaultpages((long)n, uiop); 2945 2946 if (vpm_enable) { 2947 /* 2948 * It will use kpm mappings, so no need to 2949 * pass an address. 
2950 */ 2951 error = writerp4(rp, NULL, n, uiop, 0); 2952 } else { 2953 if (segmap_kpm) { 2954 int pon = uiop->uio_loffset & PAGEOFFSET; 2955 size_t pn = MIN(PAGESIZE - pon, 2956 uiop->uio_resid); 2957 int pagecreate; 2958 2959 mutex_enter(&rp->r_statelock); 2960 pagecreate = (pon == 0) && (pn == PAGESIZE || 2961 uiop->uio_loffset + pn >= rp->r_size); 2962 mutex_exit(&rp->r_statelock); 2963 2964 base = segmap_getmapflt(segkmap, vp, off + on, 2965 pn, !pagecreate, S_WRITE); 2966 2967 error = writerp4(rp, base + pon, n, uiop, 2968 pagecreate); 2969 2970 } else { 2971 base = segmap_getmapflt(segkmap, vp, off + on, 2972 n, 0, S_READ); 2973 error = writerp4(rp, base + on, n, uiop, 0); 2974 } 2975 } 2976 2977 if (!error) { 2978 if (mi->mi_flags & MI4_NOAC) 2979 flags = SM_WRITE; 2980 else if ((uiop->uio_loffset % bsize) == 0 || 2981 IS_SWAPVP(vp)) { 2982 /* 2983 * Have written a whole block. 2984 * Start an asynchronous write 2985 * and mark the buffer to 2986 * indicate that it won't be 2987 * needed again soon. 2988 */ 2989 flags = SM_WRITE | SM_ASYNC | SM_DONTNEED; 2990 } else 2991 flags = 0; 2992 if ((ioflag & (FSYNC|FDSYNC)) || 2993 (rp->r_flags & R4OUTOFSPACE)) { 2994 flags &= ~SM_ASYNC; 2995 flags |= SM_WRITE; 2996 } 2997 if (vpm_enable) { 2998 error = vpm_sync_pages(vp, off, n, flags); 2999 } else { 3000 error = segmap_release(segkmap, base, flags); 3001 } 3002 } else { 3003 if (vpm_enable) { 3004 (void) vpm_sync_pages(vp, off, n, 0); 3005 } else { 3006 (void) segmap_release(segkmap, base, 0); 3007 } 3008 /* 3009 * In the event that we got an access error while 3010 * faulting in a page for a write-only file just 3011 * force a write. 
3012 */ 3013 if (error == EACCES) 3014 goto nfs4_fwrite; 3015 } 3016 } while (!error && uiop->uio_resid > 0); 3017 3018 bottom: 3019 if (error) { 3020 uiop->uio_resid = resid + remainder; 3021 uiop->uio_loffset = offset; 3022 } else { 3023 uiop->uio_resid += remainder; 3024 3025 mutex_enter(&rp->r_statev4_lock); 3026 if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 3027 gethrestime(&rp->r_attr.va_mtime); 3028 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3029 } 3030 mutex_exit(&rp->r_statev4_lock); 3031 } 3032 3033 nfs_rw_exit(&rp->r_lkserlock); 3034 3035 return (error); 3036 } 3037 3038 /* 3039 * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED} 3040 */ 3041 static int 3042 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len, 3043 int flags, cred_t *cr) 3044 { 3045 struct buf *bp; 3046 int error; 3047 page_t *savepp; 3048 uchar_t fsdata; 3049 stable_how4 stab_comm; 3050 3051 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3052 bp = pageio_setup(pp, len, vp, flags); 3053 ASSERT(bp != NULL); 3054 3055 /* 3056 * pageio_setup should have set b_addr to 0. This 3057 * is correct since we want to do I/O on a page 3058 * boundary. bp_mapin will use this addr to calculate 3059 * an offset, and then set b_addr to the kernel virtual 3060 * address it allocated for us. 
3061 */ 3062 ASSERT(bp->b_un.b_addr == 0); 3063 3064 bp->b_edev = 0; 3065 bp->b_dev = 0; 3066 bp->b_lblkno = lbtodb(off); 3067 bp->b_file = vp; 3068 bp->b_offset = (offset_t)off; 3069 bp_mapin(bp); 3070 3071 if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) && 3072 freemem > desfree) 3073 stab_comm = UNSTABLE4; 3074 else 3075 stab_comm = FILE_SYNC4; 3076 3077 error = nfs4_bio(bp, &stab_comm, cr, FALSE); 3078 3079 bp_mapout(bp); 3080 pageio_done(bp); 3081 3082 if (stab_comm == UNSTABLE4) 3083 fsdata = C_DELAYCOMMIT; 3084 else 3085 fsdata = C_NOCOMMIT; 3086 3087 savepp = pp; 3088 do { 3089 pp->p_fsdata = fsdata; 3090 } while ((pp = pp->p_next) != savepp); 3091 3092 return (error); 3093 } 3094 3095 /* 3096 */ 3097 static int 3098 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr) 3099 { 3100 nfs4_open_owner_t *oop; 3101 nfs4_open_stream_t *osp; 3102 rnode4_t *rp = VTOR4(vp); 3103 mntinfo4_t *mi = VTOMI4(vp); 3104 int reopen_needed; 3105 3106 ASSERT(nfs_zone() == mi->mi_zone); 3107 3108 3109 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 3110 if (!oop) 3111 return (EIO); 3112 3113 /* returns with 'os_sync_lock' held */ 3114 osp = find_open_stream(oop, rp); 3115 if (!osp) { 3116 open_owner_rele(oop); 3117 return (EIO); 3118 } 3119 3120 if (osp->os_failed_reopen) { 3121 mutex_exit(&osp->os_sync_lock); 3122 open_stream_rele(osp, rp); 3123 open_owner_rele(oop); 3124 return (EIO); 3125 } 3126 3127 /* 3128 * Determine whether a reopen is needed. If this 3129 * is a delegation open stream, then the os_delegation bit 3130 * should be set. 
3131 */ 3132 3133 reopen_needed = osp->os_delegation; 3134 3135 mutex_exit(&osp->os_sync_lock); 3136 open_owner_rele(oop); 3137 3138 if (reopen_needed) { 3139 nfs4_error_zinit(ep); 3140 nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE); 3141 mutex_enter(&osp->os_sync_lock); 3142 if (ep->error || ep->stat || osp->os_failed_reopen) { 3143 mutex_exit(&osp->os_sync_lock); 3144 open_stream_rele(osp, rp); 3145 return (EIO); 3146 } 3147 mutex_exit(&osp->os_sync_lock); 3148 } 3149 open_stream_rele(osp, rp); 3150 3151 return (0); 3152 } 3153 3154 /* 3155 * Write to file. Writes to remote server in largest size 3156 * chunks that the server can handle. Write is synchronous. 3157 */ 3158 static int 3159 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr, 3160 stable_how4 *stab_comm) 3161 { 3162 mntinfo4_t *mi; 3163 COMPOUND4args_clnt args; 3164 COMPOUND4res_clnt res; 3165 WRITE4args *wargs; 3166 WRITE4res *wres; 3167 nfs_argop4 argop[2]; 3168 nfs_resop4 *resop; 3169 int tsize; 3170 stable_how4 stable; 3171 rnode4_t *rp; 3172 int doqueue = 1; 3173 bool_t needrecov; 3174 nfs4_recov_state_t recov_state; 3175 nfs4_stateid_types_t sid_types; 3176 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3177 int recov; 3178 3179 rp = VTOR4(vp); 3180 mi = VTOMI4(vp); 3181 3182 ASSERT(nfs_zone() == mi->mi_zone); 3183 3184 stable = *stab_comm; 3185 *stab_comm = FILE_SYNC4; 3186 3187 needrecov = FALSE; 3188 recov_state.rs_flags = 0; 3189 recov_state.rs_num_retry_despite_err = 0; 3190 nfs4_init_stateid_types(&sid_types); 3191 3192 /* Is curthread the recovery thread? */ 3193 mutex_enter(&mi->mi_lock); 3194 recov = (mi->mi_recovthread == curthread); 3195 mutex_exit(&mi->mi_lock); 3196 3197 recov_retry: 3198 args.ctag = TAG_WRITE; 3199 args.array_len = 2; 3200 args.array = argop; 3201 3202 if (!recov) { 3203 e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3204 &recov_state, NULL); 3205 if (e.error) 3206 return (e.error); 3207 } 3208 3209 /* 0. 
putfh target fh */ 3210 argop[0].argop = OP_CPUTFH; 3211 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3212 3213 /* 1. write */ 3214 nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types); 3215 3216 do { 3217 3218 wargs->offset = (offset4)offset; 3219 wargs->data_val = base; 3220 3221 if (mi->mi_io_kstats) { 3222 mutex_enter(&mi->mi_lock); 3223 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 3224 mutex_exit(&mi->mi_lock); 3225 } 3226 3227 if ((vp->v_flag & VNOCACHE) || 3228 (rp->r_flags & R4DIRECTIO) || 3229 (mi->mi_flags & MI4_DIRECTIO)) 3230 tsize = MIN(mi->mi_stsize, count); 3231 else 3232 tsize = MIN(mi->mi_curwrite, count); 3233 wargs->data_len = (uint_t)tsize; 3234 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 3235 3236 if (mi->mi_io_kstats) { 3237 mutex_enter(&mi->mi_lock); 3238 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 3239 mutex_exit(&mi->mi_lock); 3240 } 3241 3242 if (!recov) { 3243 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 3244 if (e.error && !needrecov) { 3245 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3246 &recov_state, needrecov); 3247 return (e.error); 3248 } 3249 } else { 3250 if (e.error) 3251 return (e.error); 3252 } 3253 3254 /* 3255 * Do handling of OLD_STATEID outside 3256 * of the normal recovery framework. 3257 * 3258 * If write receives a BAD stateid error while using a 3259 * delegation stateid, retry using the open stateid (if it 3260 * exists). If it doesn't have an open stateid, reopen the 3261 * file first, then retry. 
3262 */ 3263 if (!e.error && res.status == NFS4ERR_OLD_STATEID && 3264 sid_types.cur_sid_type != SPEC_SID) { 3265 nfs4_save_stateid(&wargs->stateid, &sid_types); 3266 if (!recov) 3267 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3268 &recov_state, needrecov); 3269 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3270 goto recov_retry; 3271 } else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID && 3272 sid_types.cur_sid_type == DEL_SID) { 3273 nfs4_save_stateid(&wargs->stateid, &sid_types); 3274 mutex_enter(&rp->r_statev4_lock); 3275 rp->r_deleg_return_pending = TRUE; 3276 mutex_exit(&rp->r_statev4_lock); 3277 if (nfs4rdwr_check_osid(vp, &e, cr)) { 3278 if (!recov) 3279 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3280 &recov_state, needrecov); 3281 (void) xdr_free(xdr_COMPOUND4res_clnt, 3282 (caddr_t)&res); 3283 return (EIO); 3284 } 3285 if (!recov) 3286 nfs4_end_fop(mi, vp, NULL, OH_WRITE, 3287 &recov_state, needrecov); 3288 /* hold needed for nfs4delegreturn_thread */ 3289 VN_HOLD(vp); 3290 nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN| 3291 NFS4_DR_DISCARD), FALSE); 3292 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3293 goto recov_retry; 3294 } 3295 3296 if (needrecov) { 3297 bool_t abort; 3298 3299 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 3300 "nfs4write: client got error %d, res.status %d" 3301 ", so start recovery", e.error, res.status)); 3302 3303 abort = nfs4_start_recovery(&e, 3304 VTOMI4(vp), vp, NULL, &wargs->stateid, 3305 NULL, OP_WRITE, NULL, NULL, NULL); 3306 if (!e.error) { 3307 e.error = geterrno4(res.status); 3308 (void) xdr_free(xdr_COMPOUND4res_clnt, 3309 (caddr_t)&res); 3310 } 3311 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3312 &recov_state, needrecov); 3313 if (abort == FALSE) 3314 goto recov_retry; 3315 return (e.error); 3316 } 3317 3318 if (res.status) { 3319 e.error = geterrno4(res.status); 3320 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3321 if (!recov) 3322 nfs4_end_fop(VTOMI4(vp), vp, NULL, 
OH_WRITE, 3323 &recov_state, needrecov); 3324 return (e.error); 3325 } 3326 3327 resop = &res.array[1]; /* write res */ 3328 wres = &resop->nfs_resop4_u.opwrite; 3329 3330 if ((int)wres->count > tsize) { 3331 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3332 3333 zcmn_err(getzoneid(), CE_WARN, 3334 "nfs4write: server wrote %u, requested was %u", 3335 (int)wres->count, tsize); 3336 if (!recov) 3337 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, 3338 &recov_state, needrecov); 3339 return (EIO); 3340 } 3341 if (wres->committed == UNSTABLE4) { 3342 *stab_comm = UNSTABLE4; 3343 if (wargs->stable == DATA_SYNC4 || 3344 wargs->stable == FILE_SYNC4) { 3345 (void) xdr_free(xdr_COMPOUND4res_clnt, 3346 (caddr_t)&res); 3347 zcmn_err(getzoneid(), CE_WARN, 3348 "nfs4write: server %s did not commit " 3349 "to stable storage", 3350 rp->r_server->sv_hostname); 3351 if (!recov) 3352 nfs4_end_fop(VTOMI4(vp), vp, NULL, 3353 OH_WRITE, &recov_state, needrecov); 3354 return (EIO); 3355 } 3356 } 3357 3358 tsize = (int)wres->count; 3359 count -= tsize; 3360 base += tsize; 3361 offset += tsize; 3362 if (mi->mi_io_kstats) { 3363 mutex_enter(&mi->mi_lock); 3364 KSTAT_IO_PTR(mi->mi_io_kstats)->writes++; 3365 KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten += 3366 tsize; 3367 mutex_exit(&mi->mi_lock); 3368 } 3369 lwp_stat_update(LWP_STAT_OUBLK, 1); 3370 mutex_enter(&rp->r_statelock); 3371 if (rp->r_flags & R4HAVEVERF) { 3372 if (rp->r_writeverf != wres->writeverf) { 3373 nfs4_set_mod(vp); 3374 rp->r_writeverf = wres->writeverf; 3375 } 3376 } else { 3377 rp->r_writeverf = wres->writeverf; 3378 rp->r_flags |= R4HAVEVERF; 3379 } 3380 PURGE_ATTRCACHE4_LOCKED(rp); 3381 rp->r_flags |= R4WRITEMODIFIED; 3382 gethrestime(&rp->r_attr.va_mtime); 3383 rp->r_attr.va_ctime = rp->r_attr.va_mtime; 3384 mutex_exit(&rp->r_statelock); 3385 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3386 } while (count); 3387 3388 if (!recov) 3389 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state, 3390 
/*
 * Read from a file.  Reads data in largest chunks our interface can handle.
 *
 * vp		- vnode of the file being read
 * base		- destination buffer, used when uiop is NULL; may be NULL
 *		  when uiop is supplied
 * offset	- file offset of the first byte
 * count	- number of bytes wanted
 * residp	- out: bytes still unread when the loop ends (count reached
 *		  zero or the server reported EOF); only written on the
 *		  normal exit path
 * cr		- credentials for the over-the-wire calls
 * async	- TRUE for read-ahead: selects TAG_READAHEAD and the async
 *		  stateid retry policy described in the loop
 * uiop		- if non-NULL, passed to the READ op as the result
 *		  destination instead of base
 *
 * Returns 0 on success or an errno value.
 */
static int
nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count,
    size_t *residp, cred_t *cr, bool_t async, struct uio *uiop)
{
	mntinfo4_t *mi;
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	READ4args *rargs;
	nfs_argop4 argop[2];
	int tsize;
	int doqueue;
	rnode4_t *rp;
	int data_len;
	bool_t is_eof;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_stateid_types_t sid_types;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	rp = VTOR4(vp);
	mi = VTOMI4(vp);
	doqueue = 1;

	ASSERT(nfs_zone() == mi->mi_zone);

	args.ctag = async ? TAG_READAHEAD : TAG_READ;

	/* Compound is { Putfh; Read } */
	args.array_len = 2;
	args.array = argop;

	nfs4_init_stateid_types(&sid_types);

	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/*
	 * Every retry re-enters here: each path that jumps back has
	 * already called nfs4_end_fop() and released res.
	 */
recov_retry:
	e.error = nfs4_start_fop(mi, vp, NULL, OH_READ,
	    &recov_state, NULL);
	if (e.error)
		return (e.error);

	/* putfh target fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* read */
	argop[1].argop = OP_READ;
	rargs = &argop[1].nfs_argop4_u.opread;
	rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
	    OP_READ, &sid_types, async);

	do {
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
		    "nfs4read: %s call, rp %s",
		    needrecov ? "recov" : "first",
		    rnode4info(rp)));

		/* Direct I/O uses mi_tsize, cached I/O uses mi_curread */
		if ((vp->v_flag & VNOCACHE) ||
		    (rp->r_flags & R4DIRECTIO) ||
		    (mi->mi_flags & MI4_DIRECTIO))
			tsize = MIN(mi->mi_tsize, count);
		else
			tsize = MIN(mi->mi_curread, count);

		/* Reset every result-destination field for this chunk */
		rargs->offset = (offset4)offset;
		rargs->count = (count4)tsize;
		rargs->res_data_val_alt = NULL;
		rargs->res_mblk = NULL;
		rargs->res_uiop = NULL;
		rargs->res_maxsize = 0;
		rargs->wlist = NULL;

		/* Results go to the uio when one was given, else to base */
		if (uiop)
			rargs->res_uiop = uiop;
		else
			rargs->res_data_val_alt = base;
		rargs->res_maxsize = tsize;

		rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
#ifdef DEBUG
		/* One-shot fault injection: override the status, then disarm */
		if (nfs4read_error_inject) {
			res.status = nfs4read_error_inject;
			nfs4read_error_inject = 0;
		}
#endif

		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
			mutex_exit(&mi->mi_lock);
		}

		needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
		if (e.error != 0 && !needrecov) {
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			return (e.error);
		}

		/*
		 * Do proper retry for OLD and BAD stateid errors outside
		 * of the normal recovery framework.  There are two differences
		 * between async and sync reads.  The first is that we allow
		 * retry on BAD_STATEID for async reads, but not sync reads.
		 * The second is that we mark the file dead for a failed
		 * attempt with a special stateid for sync reads, but just
		 * return EIO for async reads.
		 *
		 * If a sync read receives a BAD stateid error while using a
		 * delegation stateid, retry using the open stateid (if it
		 * exists).  If it doesn't have an open stateid, reopen the
		 * file first, then retry.
		 */
		if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID ||
		    res.status == NFS4ERR_BAD_STATEID) && async) {
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			if (sid_types.cur_sid_type == SPEC_SID) {
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				return (EIO);
			}
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		} else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
		    !async && sid_types.cur_sid_type != SPEC_SID) {
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		} else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
		    sid_types.cur_sid_type == DEL_SID) {
			nfs4_save_stateid(&rargs->stateid, &sid_types);
			/* Flag the delegation for return before retrying */
			mutex_enter(&rp->r_statev4_lock);
			rp->r_deleg_return_pending = TRUE;
			mutex_exit(&rp->r_statev4_lock);
			if (nfs4rdwr_check_osid(vp, &e, cr)) {
				nfs4_end_fop(mi, vp, NULL, OH_READ,
				    &recov_state, needrecov);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				return (EIO);
			}
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			/* hold needed for nfs4delegreturn_thread */
			VN_HOLD(vp);
			nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
			    NFS4_DR_DISCARD), FALSE);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto recov_retry;
		}
		if (needrecov) {
			bool_t abort;

			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
			    "nfs4read: initiating recovery\n"));
			abort = nfs4_start_recovery(&e,
			    mi, vp, NULL, &rargs->stateid,
			    NULL, OP_READ, NULL, NULL, NULL);
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			/*
			 * Do not retry if we got OLD_STATEID using a special
			 * stateid.  This avoids looping with a broken server.
			 */
			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
			    sid_types.cur_sid_type == SPEC_SID)
				abort = TRUE;

			if (abort == FALSE) {
				/*
				 * Need to retry all possible stateids in
				 * case the recovery error wasn't stateid
				 * related or the stateids have become
				 * stale (server reboot).
				 */
				nfs4_init_stateid_types(&sid_types);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
				goto recov_retry;
			}

			/* Giving up: map the NFS status if the RPC itself was ok */
			if (!e.error) {
				e.error = geterrno4(res.status);
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			}
			return (e.error);
		}

		if (res.status) {
			e.error = geterrno4(res.status);
			nfs4_end_fop(mi, vp, NULL, OH_READ,
			    &recov_state, needrecov);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			return (e.error);
		}

		/* Advance by the amount the server actually returned */
		data_len = res.array[1].nfs_resop4_u.opread.data_len;
		count -= data_len;
		if (base)
			base += data_len;
		offset += data_len;
		if (mi->mi_io_kstats) {
			mutex_enter(&mi->mi_lock);
			KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
			KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len;
			mutex_exit(&mi->mi_lock);
		}
		lwp_stat_update(LWP_STAT_INBLK, 1);
		/* Copy eof out before the results are released */
		is_eof = res.array[1].nfs_resop4_u.opread.eof;
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);

	} while (count && !is_eof);

	*residp = count;

	nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);

	return (e.error);
}
3570 * stateid. This avoids looping with a broken server. 3571 */ 3572 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3573 sid_types.cur_sid_type == SPEC_SID) 3574 abort = TRUE; 3575 3576 if (abort == FALSE) { 3577 /* 3578 * Need to retry all possible stateids in 3579 * case the recovery error wasn't stateid 3580 * related or the stateids have become 3581 * stale (server reboot). 3582 */ 3583 nfs4_init_stateid_types(&sid_types); 3584 (void) xdr_free(xdr_COMPOUND4res_clnt, 3585 (caddr_t)&res); 3586 goto recov_retry; 3587 } 3588 3589 if (!e.error) { 3590 e.error = geterrno4(res.status); 3591 (void) xdr_free(xdr_COMPOUND4res_clnt, 3592 (caddr_t)&res); 3593 } 3594 return (e.error); 3595 } 3596 3597 if (res.status) { 3598 e.error = geterrno4(res.status); 3599 nfs4_end_fop(mi, vp, NULL, OH_READ, 3600 &recov_state, needrecov); 3601 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3602 return (e.error); 3603 } 3604 3605 data_len = res.array[1].nfs_resop4_u.opread.data_len; 3606 count -= data_len; 3607 if (base) 3608 base += data_len; 3609 offset += data_len; 3610 if (mi->mi_io_kstats) { 3611 mutex_enter(&mi->mi_lock); 3612 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 3613 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len; 3614 mutex_exit(&mi->mi_lock); 3615 } 3616 lwp_stat_update(LWP_STAT_INBLK, 1); 3617 is_eof = res.array[1].nfs_resop4_u.opread.eof; 3618 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 3619 3620 } while (count && !is_eof); 3621 3622 *residp = count; 3623 3624 nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov); 3625 3626 return (e.error); 3627 } 3628 3629 /* ARGSUSED */ 3630 static int 3631 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp, 3632 caller_context_t *ct) 3633 { 3634 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3635 return (EIO); 3636 switch (cmd) { 3637 case _FIODIRECTIO: 3638 return (nfs4_directio(vp, (int)arg, cr)); 3639 default: 3640 return (ENOTTY); 3641 } 3642 } 3643 3644 /* ARGSUSED */ 
3645 int 3646 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3647 caller_context_t *ct) 3648 { 3649 int error; 3650 rnode4_t *rp = VTOR4(vp); 3651 3652 if (nfs_zone() != VTOMI4(vp)->mi_zone) 3653 return (EIO); 3654 /* 3655 * If it has been specified that the return value will 3656 * just be used as a hint, and we are only being asked 3657 * for size, fsid or rdevid, then return the client's 3658 * notion of these values without checking to make sure 3659 * that the attribute cache is up to date. 3660 * The whole point is to avoid an over the wire GETATTR 3661 * call. 3662 */ 3663 if (flags & ATTR_HINT) { 3664 if (!(vap->va_mask & ~(AT_SIZE | AT_FSID | AT_RDEV))) { 3665 mutex_enter(&rp->r_statelock); 3666 if (vap->va_mask & AT_SIZE) 3667 vap->va_size = rp->r_size; 3668 if (vap->va_mask & AT_FSID) 3669 vap->va_fsid = rp->r_attr.va_fsid; 3670 if (vap->va_mask & AT_RDEV) 3671 vap->va_rdev = rp->r_attr.va_rdev; 3672 mutex_exit(&rp->r_statelock); 3673 return (0); 3674 } 3675 } 3676 3677 /* 3678 * Only need to flush pages if asking for the mtime 3679 * and if there any dirty pages or any outstanding 3680 * asynchronous (write) requests for this file. 
/*
 * Compare the mode reported by the server with the mode held on the
 * client, ignoring the setuid and setgid bits of the client's copy.
 *
 * Returns 0 (OK) when from_server equals on_client with S_ISUID and
 * S_ISGID cleared, i.e. those are the only bits the server may have
 * dropped; returns 1 (BAD) for any other difference.
 */
int
nfs4_compare_modes(mode_t from_server, mode_t on_client)
{
	mode_t expected = on_client & ~(S_ISUID|S_ISGID);

	return (expected != from_server);
}
3750 */ 3751 error = nfs4setattr(vp, vap, flags, cr, NULL); 3752 3753 if (error == 0 && (vap->va_mask & AT_SIZE) && vap->va_size == 0) 3754 vnevent_truncate(vp, ct); 3755 3756 return (error); 3757 } 3758 3759 /* 3760 * To replace the "guarded" version 3 setattr, we use two types of compound 3761 * setattr requests: 3762 * 1. The "normal" setattr, used when the size of the file isn't being 3763 * changed - { Putfh <fh>; Setattr; Getattr }/ 3764 * 2. If the size is changed, precede Setattr with: Getattr; Verify 3765 * with only ctime as the argument. If the server ctime differs from 3766 * what is cached on the client, the verify will fail, but we would 3767 * already have the ctime from the preceding getattr, so just set it 3768 * and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify; 3769 * Setattr; Getattr }. 3770 * 3771 * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in 3772 * this setattr and NULL if they are not. 3773 */ 3774 static int 3775 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr, 3776 vsecattr_t *vsap) 3777 { 3778 COMPOUND4args_clnt args; 3779 COMPOUND4res_clnt res, *resp = NULL; 3780 nfs4_ga_res_t *garp = NULL; 3781 int numops = 3; /* { Putfh; Setattr; Getattr } */ 3782 nfs_argop4 argop[5]; 3783 int verify_argop = -1; 3784 int setattr_argop = 1; 3785 nfs_resop4 *resop; 3786 vattr_t va; 3787 rnode4_t *rp; 3788 int doqueue = 1; 3789 uint_t mask = vap->va_mask; 3790 mode_t omode; 3791 vsecattr_t *vsp; 3792 timestruc_t ctime; 3793 bool_t needrecov = FALSE; 3794 nfs4_recov_state_t recov_state; 3795 nfs4_stateid_types_t sid_types; 3796 stateid4 stateid; 3797 hrtime_t t; 3798 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 3799 servinfo4_t *svp; 3800 bitmap4 supp_attrs; 3801 3802 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 3803 rp = VTOR4(vp); 3804 nfs4_init_stateid_types(&sid_types); 3805 3806 /* 3807 * Only need to flush pages if there are any pages and 3808 * if the file is marked as dirty in some 
fashion. The 3809 * file must be flushed so that we can accurately 3810 * determine the size of the file and the cached data 3811 * after the SETATTR returns. A file is considered to 3812 * be dirty if it is either marked with R4DIRTY, has 3813 * outstanding i/o's active, or is mmap'd. In this 3814 * last case, we can't tell whether there are dirty 3815 * pages, so we flush just to be sure. 3816 */ 3817 if (nfs4_has_pages(vp) && 3818 ((rp->r_flags & R4DIRTY) || 3819 rp->r_count > 0 || 3820 rp->r_mapcnt > 0)) { 3821 ASSERT(vp->v_type != VCHR); 3822 e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL); 3823 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 3824 mutex_enter(&rp->r_statelock); 3825 if (!rp->r_error) 3826 rp->r_error = e.error; 3827 mutex_exit(&rp->r_statelock); 3828 } 3829 } 3830 3831 if (mask & AT_SIZE) { 3832 /* 3833 * Verification setattr compound for non-deleg AT_SIZE: 3834 * { Putfh; Getattr; Verify; Setattr; Getattr } 3835 * Set ctime local here (outside the do_again label) 3836 * so that subsequent retries (after failed VERIFY) 3837 * will use ctime from GETATTR results (from failed 3838 * verify compound) as VERIFY arg. 3839 * If file has delegation, then VERIFY(time_metadata) 3840 * is of little added value, so don't bother. 
3841 */ 3842 mutex_enter(&rp->r_statev4_lock); 3843 if (rp->r_deleg_type == OPEN_DELEGATE_NONE || 3844 rp->r_deleg_return_pending) { 3845 numops = 5; 3846 ctime = rp->r_attr.va_ctime; 3847 } 3848 mutex_exit(&rp->r_statev4_lock); 3849 } 3850 3851 recov_state.rs_flags = 0; 3852 recov_state.rs_num_retry_despite_err = 0; 3853 3854 args.ctag = TAG_SETATTR; 3855 do_again: 3856 recov_retry: 3857 setattr_argop = numops - 2; 3858 3859 args.array = argop; 3860 args.array_len = numops; 3861 3862 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 3863 if (e.error) 3864 return (e.error); 3865 3866 3867 /* putfh target fh */ 3868 argop[0].argop = OP_CPUTFH; 3869 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 3870 3871 if (numops == 5) { 3872 /* 3873 * We only care about the ctime, but need to get mtime 3874 * and size for proper cache update. 3875 */ 3876 /* getattr */ 3877 argop[1].argop = OP_GETATTR; 3878 argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3879 argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3880 3881 /* verify - set later in loop */ 3882 verify_argop = 2; 3883 } 3884 3885 /* setattr */ 3886 svp = rp->r_server; 3887 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3888 supp_attrs = svp->sv_supp_attrs; 3889 nfs_rw_exit(&svp->sv_lock); 3890 3891 nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr, 3892 supp_attrs, &e.error, &sid_types); 3893 stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid; 3894 if (e.error) { 3895 /* req time field(s) overflow - return immediately */ 3896 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 3897 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 
3898 opsetattr.obj_attributes); 3899 return (e.error); 3900 } 3901 omode = rp->r_attr.va_mode; 3902 3903 /* getattr */ 3904 argop[numops-1].argop = OP_GETATTR; 3905 argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 3906 /* 3907 * If we are setting the ACL (indicated only by vsap != NULL), request 3908 * the ACL in this getattr. The ACL returned from this getattr will be 3909 * used in updating the ACL cache. 3910 */ 3911 if (vsap != NULL) 3912 argop[numops-1].nfs_argop4_u.opgetattr.attr_request |= 3913 FATTR4_ACL_MASK; 3914 argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 3915 3916 /* 3917 * setattr iterates if the object size is set and the cached ctime 3918 * does not match the file ctime. In that case, verify the ctime first. 3919 */ 3920 3921 do { 3922 if (verify_argop != -1) { 3923 /* 3924 * Verify that the ctime match before doing setattr. 3925 */ 3926 va.va_mask = AT_CTIME; 3927 va.va_ctime = ctime; 3928 svp = rp->r_server; 3929 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 3930 supp_attrs = svp->sv_supp_attrs; 3931 nfs_rw_exit(&svp->sv_lock); 3932 e.error = nfs4args_verify(&argop[verify_argop], &va, 3933 OP_VERIFY, supp_attrs); 3934 if (e.error) { 3935 /* req time field(s) overflow - return */ 3936 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3937 needrecov); 3938 break; 3939 } 3940 } 3941 3942 doqueue = 1; 3943 3944 t = gethrtime(); 3945 3946 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 3947 3948 /* 3949 * Purge the access cache and ACL cache if changing either the 3950 * owner of the file, the group owner, or the mode. These may 3951 * change the access permissions of the file, so purge old 3952 * information and start over again. 
3953 */ 3954 if (mask & (AT_UID | AT_GID | AT_MODE)) { 3955 (void) nfs4_access_purge_rp(rp); 3956 if (rp->r_secattr != NULL) { 3957 mutex_enter(&rp->r_statelock); 3958 vsp = rp->r_secattr; 3959 rp->r_secattr = NULL; 3960 mutex_exit(&rp->r_statelock); 3961 if (vsp != NULL) 3962 nfs4_acl_free_cache(vsp); 3963 } 3964 } 3965 3966 /* 3967 * If res.array_len == numops, then everything succeeded, 3968 * except for possibly the final getattr. If only the 3969 * last getattr failed, give up, and don't try recovery. 3970 */ 3971 if (res.array_len == numops) { 3972 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3973 needrecov); 3974 if (! e.error) 3975 resp = &res; 3976 break; 3977 } 3978 3979 /* 3980 * if either rpc call failed or completely succeeded - done 3981 */ 3982 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 3983 if (e.error) { 3984 PURGE_ATTRCACHE4(vp); 3985 if (!needrecov) { 3986 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 3987 needrecov); 3988 break; 3989 } 3990 } 3991 3992 /* 3993 * Do proper retry for OLD_STATEID outside of the normal 3994 * recovery framework. 3995 */ 3996 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 3997 sid_types.cur_sid_type != SPEC_SID && 3998 sid_types.cur_sid_type != NO_SID) { 3999 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 4000 needrecov); 4001 nfs4_save_stateid(&stateid, &sid_types); 4002 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4003 opsetattr.obj_attributes); 4004 if (verify_argop != -1) { 4005 nfs4args_verify_free(&argop[verify_argop]); 4006 verify_argop = -1; 4007 } 4008 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4009 goto recov_retry; 4010 } 4011 4012 if (needrecov) { 4013 bool_t abort; 4014 4015 abort = nfs4_start_recovery(&e, 4016 VTOMI4(vp), vp, NULL, NULL, NULL, 4017 OP_SETATTR, NULL, NULL, NULL); 4018 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 4019 needrecov); 4020 /* 4021 * Do not retry if we failed with OLD_STATEID using 4022 * a special stateid. 
This is done to avoid looping 4023 * with a broken server. 4024 */ 4025 if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID && 4026 (sid_types.cur_sid_type == SPEC_SID || 4027 sid_types.cur_sid_type == NO_SID)) 4028 abort = TRUE; 4029 if (!e.error) { 4030 if (res.status == NFS4ERR_BADOWNER) 4031 nfs4_log_badowner(VTOMI4(vp), 4032 OP_SETATTR); 4033 4034 e.error = geterrno4(res.status); 4035 (void) xdr_free(xdr_COMPOUND4res_clnt, 4036 (caddr_t)&res); 4037 } 4038 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4039 opsetattr.obj_attributes); 4040 if (verify_argop != -1) { 4041 nfs4args_verify_free(&argop[verify_argop]); 4042 verify_argop = -1; 4043 } 4044 if (abort == FALSE) { 4045 /* 4046 * Need to retry all possible stateids in 4047 * case the recovery error wasn't stateid 4048 * related or the stateids have become 4049 * stale (server reboot). 4050 */ 4051 nfs4_init_stateid_types(&sid_types); 4052 goto recov_retry; 4053 } 4054 return (e.error); 4055 } 4056 4057 /* 4058 * Need to call nfs4_end_op before nfs4getattr to 4059 * avoid potential nfs4_start_op deadlock. See RFE 4060 * 4777612. Calls to nfs4_invalidate_pages() and 4061 * nfs4_purge_stale_fh() might also generate over the 4062 * wire calls which my cause nfs4_start_op() deadlock. 4063 */ 4064 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 4065 4066 /* 4067 * Check to update lease. 4068 */ 4069 resp = &res; 4070 if (res.status == NFS4_OK) { 4071 break; 4072 } 4073 4074 /* 4075 * Check if verify failed to see if try again 4076 */ 4077 if ((verify_argop == -1) || (res.array_len != 3)) { 4078 /* 4079 * can't continue... 4080 */ 4081 if (res.status == NFS4ERR_BADOWNER) 4082 nfs4_log_badowner(VTOMI4(vp), OP_SETATTR); 4083 4084 e.error = geterrno4(res.status); 4085 } else { 4086 /* 4087 * When the verify request fails, the client ctime is 4088 * not in sync with the server. 
This is the same as 4089 * the version 3 "not synchronized" error, and we 4090 * handle it in a similar manner (XXX do we need to???). 4091 * Use the ctime returned in the first getattr for 4092 * the input to the next verify. 4093 * If we couldn't get the attributes, then we give up 4094 * because we can't complete the operation as required. 4095 */ 4096 garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res; 4097 } 4098 if (e.error) { 4099 PURGE_ATTRCACHE4(vp); 4100 nfs4_purge_stale_fh(e.error, vp, cr); 4101 } else { 4102 /* 4103 * retry with a new verify value 4104 */ 4105 ctime = garp->n4g_va.va_ctime; 4106 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4107 resp = NULL; 4108 } 4109 if (!e.error) { 4110 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4111 opsetattr.obj_attributes); 4112 if (verify_argop != -1) { 4113 nfs4args_verify_free(&argop[verify_argop]); 4114 verify_argop = -1; 4115 } 4116 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4117 goto do_again; 4118 } 4119 } while (!e.error); 4120 4121 if (e.error) { 4122 /* 4123 * If we are here, rfs4call has an irrecoverable error - return 4124 */ 4125 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4126 opsetattr.obj_attributes); 4127 if (verify_argop != -1) { 4128 nfs4args_verify_free(&argop[verify_argop]); 4129 verify_argop = -1; 4130 } 4131 if (resp) 4132 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4133 return (e.error); 4134 } 4135 4136 4137 4138 /* 4139 * If changing the size of the file, invalidate 4140 * any local cached data which is no longer part 4141 * of the file. We also possibly invalidate the 4142 * last page in the file. We could use 4143 * pvn_vpzero(), but this would mark the page as 4144 * modified and require it to be written back to 4145 * the server for no particularly good reason. 4146 * This way, if we access it, then we bring it 4147 * back in. A read should be cheaper than a 4148 * write. 
4149 */ 4150 if (mask & AT_SIZE) { 4151 nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr); 4152 } 4153 4154 /* either no error or one of the postop getattr failed */ 4155 4156 /* 4157 * XXX Perform a simplified version of wcc checking. Instead of 4158 * have another getattr to get pre-op, just purge cache if 4159 * any of the ops prior to and including the getattr failed. 4160 * If the getattr succeeded then update the attrcache accordingly. 4161 */ 4162 4163 garp = NULL; 4164 if (res.status == NFS4_OK) { 4165 /* 4166 * Last getattr 4167 */ 4168 resop = &res.array[numops - 1]; 4169 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4170 } 4171 /* 4172 * In certain cases, nfs4_update_attrcache() will purge the attrcache, 4173 * rather than filling it. See the function itself for details. 4174 */ 4175 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4176 if (garp != NULL) { 4177 if (garp->n4g_resbmap & FATTR4_ACL_MASK) { 4178 nfs4_acl_fill_cache(rp, &garp->n4g_vsa); 4179 vs_ace4_destroy(&garp->n4g_vsa); 4180 } else { 4181 if (vsap != NULL) { 4182 /* 4183 * The ACL was supposed to be set and to be 4184 * returned in the last getattr of this 4185 * compound, but for some reason the getattr 4186 * result doesn't contain the ACL. In this 4187 * case, purge the ACL cache. 4188 */ 4189 if (rp->r_secattr != NULL) { 4190 mutex_enter(&rp->r_statelock); 4191 vsp = rp->r_secattr; 4192 rp->r_secattr = NULL; 4193 mutex_exit(&rp->r_statelock); 4194 if (vsp != NULL) 4195 nfs4_acl_free_cache(vsp); 4196 } 4197 } 4198 } 4199 } 4200 4201 if (res.status == NFS4_OK && (mask & AT_SIZE)) { 4202 /* 4203 * Set the size, rather than relying on getting it updated 4204 * via a GETATTR. With delegations the client tries to 4205 * suppress GETATTR calls. 
4206 */ 4207 mutex_enter(&rp->r_statelock); 4208 rp->r_size = vap->va_size; 4209 mutex_exit(&rp->r_statelock); 4210 } 4211 4212 /* 4213 * Can free up request args and res 4214 */ 4215 nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u. 4216 opsetattr.obj_attributes); 4217 if (verify_argop != -1) { 4218 nfs4args_verify_free(&argop[verify_argop]); 4219 verify_argop = -1; 4220 } 4221 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4222 4223 /* 4224 * Some servers will change the mode to clear the setuid 4225 * and setgid bits when changing the uid or gid. The 4226 * client needs to compensate appropriately. 4227 */ 4228 if (mask & (AT_UID | AT_GID)) { 4229 int terror, do_setattr; 4230 4231 do_setattr = 0; 4232 va.va_mask = AT_MODE; 4233 terror = nfs4getattr(vp, &va, cr); 4234 if (!terror && 4235 (((mask & AT_MODE) && va.va_mode != vap->va_mode) || 4236 (!(mask & AT_MODE) && va.va_mode != omode))) { 4237 va.va_mask = AT_MODE; 4238 if (mask & AT_MODE) { 4239 /* 4240 * We asked the mode to be changed and what 4241 * we just got from the server in getattr is 4242 * not what we wanted it to be, so set it now. 4243 */ 4244 va.va_mode = vap->va_mode; 4245 do_setattr = 1; 4246 } else { 4247 /* 4248 * We did not ask the mode to be changed, 4249 * Check to see that the server just cleared 4250 * I_SUID and I_GUID from it. If not then 4251 * set mode to omode with UID/GID cleared. 
4252 */ 4253 if (nfs4_compare_modes(va.va_mode, omode)) { 4254 omode &= ~(S_ISUID|S_ISGID); 4255 va.va_mode = omode; 4256 do_setattr = 1; 4257 } 4258 } 4259 4260 if (do_setattr) 4261 (void) nfs4setattr(vp, &va, 0, cr, NULL); 4262 } 4263 } 4264 4265 return (e.error); 4266 } 4267 4268 /* ARGSUSED */ 4269 static int 4270 nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) 4271 { 4272 COMPOUND4args_clnt args; 4273 COMPOUND4res_clnt res; 4274 int doqueue; 4275 uint32_t acc, resacc, argacc; 4276 rnode4_t *rp; 4277 cred_t *cred, *ncr, *ncrfree = NULL; 4278 nfs4_access_type_t cacc; 4279 int num_ops; 4280 nfs_argop4 argop[3]; 4281 nfs_resop4 *resop; 4282 bool_t needrecov = FALSE, do_getattr; 4283 nfs4_recov_state_t recov_state; 4284 int rpc_error; 4285 hrtime_t t; 4286 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4287 mntinfo4_t *mi = VTOMI4(vp); 4288 4289 if (nfs_zone() != mi->mi_zone) 4290 return (EIO); 4291 4292 acc = 0; 4293 if (mode & VREAD) 4294 acc |= ACCESS4_READ; 4295 if (mode & VWRITE) { 4296 if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type)) 4297 return (EROFS); 4298 if (vp->v_type == VDIR) 4299 acc |= ACCESS4_DELETE; 4300 acc |= ACCESS4_MODIFY | ACCESS4_EXTEND; 4301 } 4302 if (mode & VEXEC) { 4303 if (vp->v_type == VDIR) 4304 acc |= ACCESS4_LOOKUP; 4305 else 4306 acc |= ACCESS4_EXECUTE; 4307 } 4308 4309 if (VTOR4(vp)->r_acache != NULL) { 4310 e.error = nfs4_validate_caches(vp, cr); 4311 if (e.error) 4312 return (e.error); 4313 } 4314 4315 rp = VTOR4(vp); 4316 if (vp->v_type == VDIR) 4317 argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY | 4318 ACCESS4_EXTEND | ACCESS4_LOOKUP; 4319 else 4320 argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND | 4321 ACCESS4_EXECUTE; 4322 recov_state.rs_flags = 0; 4323 recov_state.rs_num_retry_despite_err = 0; 4324 4325 cred = cr; 4326 /* 4327 * ncr and ncrfree both initially 4328 * point to the memory area returned 4329 * by crnetadjust(); 4330 * ncrfree not NULL when exiting 
means 4331 * that we need to release it 4332 */ 4333 ncr = crnetadjust(cred); 4334 ncrfree = ncr; 4335 4336 tryagain: 4337 cacc = nfs4_access_check(rp, acc, cred); 4338 if (cacc == NFS4_ACCESS_ALLOWED) { 4339 if (ncrfree != NULL) 4340 crfree(ncrfree); 4341 return (0); 4342 } 4343 if (cacc == NFS4_ACCESS_DENIED) { 4344 /* 4345 * If the cred can be adjusted, try again 4346 * with the new cred. 4347 */ 4348 if (ncr != NULL) { 4349 cred = ncr; 4350 ncr = NULL; 4351 goto tryagain; 4352 } 4353 if (ncrfree != NULL) 4354 crfree(ncrfree); 4355 return (EACCES); 4356 } 4357 4358 recov_retry: 4359 /* 4360 * Don't take with r_statev4_lock here. r_deleg_type could 4361 * change as soon as lock is released. Since it is an int, 4362 * there is no atomicity issue. 4363 */ 4364 do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE); 4365 num_ops = do_getattr ? 3 : 2; 4366 4367 args.ctag = TAG_ACCESS; 4368 4369 args.array_len = num_ops; 4370 args.array = argop; 4371 4372 if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS, 4373 &recov_state, NULL)) { 4374 if (ncrfree != NULL) 4375 crfree(ncrfree); 4376 return (e.error); 4377 } 4378 4379 /* putfh target fh */ 4380 argop[0].argop = OP_CPUTFH; 4381 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4382 4383 /* access */ 4384 argop[1].argop = OP_ACCESS; 4385 argop[1].nfs_argop4_u.opaccess.access = argacc; 4386 4387 /* getattr */ 4388 if (do_getattr) { 4389 argop[2].argop = OP_GETATTR; 4390 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4391 argop[2].nfs_argop4_u.opgetattr.mi = mi; 4392 } 4393 4394 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4395 "nfs4_access: %s call, rp %s", needrecov ? 
"recov" : "first", 4396 rnode4info(VTOR4(vp)))); 4397 4398 doqueue = 1; 4399 t = gethrtime(); 4400 rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e); 4401 rpc_error = e.error; 4402 4403 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4404 if (needrecov) { 4405 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4406 "nfs4_access: initiating recovery\n")); 4407 4408 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4409 NULL, OP_ACCESS, NULL, NULL, NULL) == FALSE) { 4410 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS, 4411 &recov_state, needrecov); 4412 if (!e.error) 4413 (void) xdr_free(xdr_COMPOUND4res_clnt, 4414 (caddr_t)&res); 4415 goto recov_retry; 4416 } 4417 } 4418 nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov); 4419 4420 if (e.error) 4421 goto out; 4422 4423 if (res.status) { 4424 e.error = geterrno4(res.status); 4425 /* 4426 * This might generate over the wire calls throught 4427 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4428 * here to avoid a deadlock. 4429 */ 4430 nfs4_purge_stale_fh(e.error, vp, cr); 4431 goto out; 4432 } 4433 resop = &res.array[1]; /* access res */ 4434 4435 resacc = resop->nfs_resop4_u.opaccess.access; 4436 4437 if (do_getattr) { 4438 resop++; /* getattr res */ 4439 nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res, 4440 t, cr, FALSE, NULL); 4441 } 4442 4443 if (!e.error) { 4444 nfs4_access_cache(rp, argacc, resacc, cred); 4445 /* 4446 * we just cached results with cred; if cred is the 4447 * adjusted credentials from crnetadjust, we do not want 4448 * to release them before exiting: hence setting ncrfree 4449 * to NULL 4450 */ 4451 if (cred != cr) 4452 ncrfree = NULL; 4453 /* XXX check the supported bits too? */ 4454 if ((acc & resacc) != acc) { 4455 /* 4456 * The following code implements the semantic 4457 * that a setuid root program has *at least* the 4458 * permissions of the user that is running the 4459 * program. 
See rfs3call() for more portions 4460 * of the implementation of this functionality. 4461 */ 4462 /* XXX-LP */ 4463 if (ncr != NULL) { 4464 (void) xdr_free(xdr_COMPOUND4res_clnt, 4465 (caddr_t)&res); 4466 cred = ncr; 4467 ncr = NULL; 4468 goto tryagain; 4469 } 4470 e.error = EACCES; 4471 } 4472 } 4473 4474 out: 4475 if (!rpc_error) 4476 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4477 4478 if (ncrfree != NULL) 4479 crfree(ncrfree); 4480 4481 return (e.error); 4482 } 4483 4484 /* ARGSUSED */ 4485 static int 4486 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct) 4487 { 4488 COMPOUND4args_clnt args; 4489 COMPOUND4res_clnt res; 4490 int doqueue; 4491 rnode4_t *rp; 4492 nfs_argop4 argop[3]; 4493 nfs_resop4 *resop; 4494 READLINK4res *lr_res; 4495 nfs4_ga_res_t *garp; 4496 uint_t len; 4497 char *linkdata; 4498 bool_t needrecov = FALSE; 4499 nfs4_recov_state_t recov_state; 4500 hrtime_t t; 4501 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4502 4503 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4504 return (EIO); 4505 /* 4506 * Can't readlink anything other than a symbolic link. 4507 */ 4508 if (vp->v_type != VLNK) 4509 return (EINVAL); 4510 4511 rp = VTOR4(vp); 4512 if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) { 4513 e.error = nfs4_validate_caches(vp, cr); 4514 if (e.error) 4515 return (e.error); 4516 mutex_enter(&rp->r_statelock); 4517 if (rp->r_symlink.contents != NULL) { 4518 e.error = uiomove(rp->r_symlink.contents, 4519 rp->r_symlink.len, UIO_READ, uiop); 4520 mutex_exit(&rp->r_statelock); 4521 return (e.error); 4522 } 4523 mutex_exit(&rp->r_statelock); 4524 } 4525 recov_state.rs_flags = 0; 4526 recov_state.rs_num_retry_despite_err = 0; 4527 4528 recov_retry: 4529 args.array_len = 3; 4530 args.array = argop; 4531 args.ctag = TAG_READLINK; 4532 4533 e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state); 4534 if (e.error) { 4535 return (e.error); 4536 } 4537 4538 /* 0. 
putfh symlink fh */ 4539 argop[0].argop = OP_CPUTFH; 4540 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 4541 4542 /* 1. readlink */ 4543 argop[1].argop = OP_READLINK; 4544 4545 /* 2. getattr */ 4546 argop[2].argop = OP_GETATTR; 4547 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 4548 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp); 4549 4550 doqueue = 1; 4551 4552 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 4553 "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first", 4554 rnode4info(VTOR4(vp)))); 4555 4556 t = gethrtime(); 4557 4558 rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e); 4559 4560 needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp); 4561 if (needrecov) { 4562 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 4563 "nfs4_readlink: initiating recovery\n")); 4564 4565 if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 4566 NULL, OP_READLINK, NULL, NULL, NULL) == FALSE) { 4567 if (!e.error) 4568 (void) xdr_free(xdr_COMPOUND4res_clnt, 4569 (caddr_t)&res); 4570 4571 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, 4572 needrecov); 4573 goto recov_retry; 4574 } 4575 } 4576 4577 nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov); 4578 4579 if (e.error) 4580 return (e.error); 4581 4582 /* 4583 * There is an path in the code below which calls 4584 * nfs4_purge_stale_fh(), which may generate otw calls through 4585 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 4586 * here to avoid nfs4_start_op() deadlock. 
4587 */ 4588 4589 if (res.status && (res.array_len < args.array_len)) { 4590 /* 4591 * either Putfh or Link failed 4592 */ 4593 e.error = geterrno4(res.status); 4594 nfs4_purge_stale_fh(e.error, vp, cr); 4595 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4596 return (e.error); 4597 } 4598 4599 resop = &res.array[1]; /* readlink res */ 4600 lr_res = &resop->nfs_resop4_u.opreadlink; 4601 4602 /* 4603 * treat symlink names as data 4604 */ 4605 linkdata = utf8_to_str((utf8string *)&lr_res->link, &len, NULL); 4606 if (linkdata != NULL) { 4607 int uio_len = len - 1; 4608 /* len includes null byte, which we won't uiomove */ 4609 e.error = uiomove(linkdata, uio_len, UIO_READ, uiop); 4610 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 4611 mutex_enter(&rp->r_statelock); 4612 if (rp->r_symlink.contents == NULL) { 4613 rp->r_symlink.contents = linkdata; 4614 rp->r_symlink.len = uio_len; 4615 rp->r_symlink.size = len; 4616 mutex_exit(&rp->r_statelock); 4617 } else { 4618 mutex_exit(&rp->r_statelock); 4619 kmem_free(linkdata, len); 4620 } 4621 } else { 4622 kmem_free(linkdata, len); 4623 } 4624 } 4625 if (res.status == NFS4_OK) { 4626 resop++; /* getattr res */ 4627 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 4628 } 4629 e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr); 4630 4631 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 4632 4633 /* 4634 * The over the wire error for attempting to readlink something 4635 * other than a symbolic link is ENXIO. However, we need to 4636 * return EINVAL instead of ENXIO, so we map it here. 4637 */ 4638 return (e.error == ENXIO ? EINVAL : e.error); 4639 } 4640 4641 /* 4642 * Flush local dirty pages to stable storage on the server. 4643 * 4644 * If FNODSYNC is specified, then there is nothing to do because 4645 * metadata changes are not cached on the client before being 4646 * sent to the server. 
4647 */ 4648 /* ARGSUSED */ 4649 static int 4650 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) 4651 { 4652 int error; 4653 4654 if ((syncflag & FNODSYNC) || IS_SWAPVP(vp)) 4655 return (0); 4656 if (nfs_zone() != VTOMI4(vp)->mi_zone) 4657 return (EIO); 4658 error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr); 4659 if (!error) 4660 error = VTOR4(vp)->r_error; 4661 return (error); 4662 } 4663 4664 /* 4665 * Weirdness: if the file was removed or the target of a rename 4666 * operation while it was open, it got renamed instead. Here we 4667 * remove the renamed file. 4668 */ 4669 /* ARGSUSED */ 4670 void 4671 nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) 4672 { 4673 rnode4_t *rp; 4674 4675 ASSERT(vp != DNLC_NO_VNODE); 4676 4677 rp = VTOR4(vp); 4678 4679 if (IS_SHADOW(vp, rp)) { 4680 sv_inactive(vp); 4681 return; 4682 } 4683 4684 /* 4685 * If this is coming from the wrong zone, we let someone in the right 4686 * zone take care of it asynchronously. We can get here due to 4687 * VN_RELE() being called from pageout() or fsflush(). This call may 4688 * potentially turn into an expensive no-op if, for instance, v_count 4689 * gets incremented in the meantime, but it's still correct. 4690 */ 4691 if (nfs_zone() != VTOMI4(vp)->mi_zone) { 4692 nfs4_async_inactive(vp, cr); 4693 return; 4694 } 4695 4696 /* 4697 * Some of the cleanup steps might require over-the-wire 4698 * operations. Since VOP_INACTIVE can get called as a result of 4699 * other over-the-wire operations (e.g., an attribute cache update 4700 * can lead to a DNLC purge), doing those steps now would lead to a 4701 * nested call to the recovery framework, which can deadlock. So 4702 * do any over-the-wire cleanups asynchronously, in a separate 4703 * thread. 
4704 */ 4705 4706 mutex_enter(&rp->r_os_lock); 4707 mutex_enter(&rp->r_statelock); 4708 mutex_enter(&rp->r_statev4_lock); 4709 4710 if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) { 4711 mutex_exit(&rp->r_statev4_lock); 4712 mutex_exit(&rp->r_statelock); 4713 mutex_exit(&rp->r_os_lock); 4714 nfs4_async_inactive(vp, cr); 4715 return; 4716 } 4717 4718 if (rp->r_deleg_type == OPEN_DELEGATE_READ || 4719 rp->r_deleg_type == OPEN_DELEGATE_WRITE) { 4720 mutex_exit(&rp->r_statev4_lock); 4721 mutex_exit(&rp->r_statelock); 4722 mutex_exit(&rp->r_os_lock); 4723 nfs4_async_inactive(vp, cr); 4724 return; 4725 } 4726 4727 if (rp->r_unldvp != NULL) { 4728 mutex_exit(&rp->r_statev4_lock); 4729 mutex_exit(&rp->r_statelock); 4730 mutex_exit(&rp->r_os_lock); 4731 nfs4_async_inactive(vp, cr); 4732 return; 4733 } 4734 mutex_exit(&rp->r_statev4_lock); 4735 mutex_exit(&rp->r_statelock); 4736 mutex_exit(&rp->r_os_lock); 4737 4738 rp4_addfree(rp, cr); 4739 } 4740 4741 /* 4742 * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up 4743 * various bits of state. The caller must not refer to vp after this call. 
4744 */ 4745 4746 void 4747 nfs4_inactive_otw(vnode_t *vp, cred_t *cr) 4748 { 4749 rnode4_t *rp = VTOR4(vp); 4750 nfs4_recov_state_t recov_state; 4751 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 4752 vnode_t *unldvp; 4753 char *unlname; 4754 cred_t *unlcred; 4755 COMPOUND4args_clnt args; 4756 COMPOUND4res_clnt res, *resp; 4757 nfs_argop4 argop[2]; 4758 int doqueue; 4759 #ifdef DEBUG 4760 char *name; 4761 #endif 4762 4763 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 4764 ASSERT(!IS_SHADOW(vp, rp)); 4765 4766 #ifdef DEBUG 4767 name = fn_name(VTOSV(vp)->sv_name); 4768 NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: " 4769 "release vnode %s", name)); 4770 kmem_free(name, MAXNAMELEN); 4771 #endif 4772 4773 if (vp->v_type == VREG) { 4774 bool_t recov_failed = FALSE; 4775 4776 e.error = nfs4close_all(vp, cr); 4777 if (e.error) { 4778 /* Check to see if recovery failed */ 4779 mutex_enter(&(VTOMI4(vp)->mi_lock)); 4780 if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL) 4781 recov_failed = TRUE; 4782 mutex_exit(&(VTOMI4(vp)->mi_lock)); 4783 if (!recov_failed) { 4784 mutex_enter(&rp->r_statelock); 4785 if (rp->r_flags & R4RECOVERR) 4786 recov_failed = TRUE; 4787 mutex_exit(&rp->r_statelock); 4788 } 4789 if (recov_failed) { 4790 NFS4_DEBUG(nfs4_client_recov_debug, 4791 (CE_NOTE, "nfs4_inactive_otw: " 4792 "close failed (recovery failure)")); 4793 } 4794 } 4795 } 4796 4797 redo: 4798 if (rp->r_unldvp == NULL) { 4799 rp4_addfree(rp, cr); 4800 return; 4801 } 4802 4803 /* 4804 * Save the vnode pointer for the directory where the 4805 * unlinked-open file got renamed, then set it to NULL 4806 * to prevent another thread from getting here before 4807 * we're done with the remove. While we have the 4808 * statelock, make local copies of the pertinent rnode 4809 * fields. If we weren't to do this in an atomic way, the 4810 * the unl* fields could become inconsistent with respect 4811 * to each other due to a race condition between this 4812 * code and nfs_remove(). 
See bug report 1034328. 4813 */ 4814 mutex_enter(&rp->r_statelock); 4815 if (rp->r_unldvp == NULL) { 4816 mutex_exit(&rp->r_statelock); 4817 rp4_addfree(rp, cr); 4818 return; 4819 } 4820 4821 unldvp = rp->r_unldvp; 4822 rp->r_unldvp = NULL; 4823 unlname = rp->r_unlname; 4824 rp->r_unlname = NULL; 4825 unlcred = rp->r_unlcred; 4826 rp->r_unlcred = NULL; 4827 mutex_exit(&rp->r_statelock); 4828 4829 /* 4830 * If there are any dirty pages left, then flush 4831 * them. This is unfortunate because they just 4832 * may get thrown away during the remove operation, 4833 * but we have to do this for correctness. 4834 */ 4835 if (nfs4_has_pages(vp) && 4836 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 4837 ASSERT(vp->v_type != VCHR); 4838 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL); 4839 if (e.error) { 4840 mutex_enter(&rp->r_statelock); 4841 if (!rp->r_error) 4842 rp->r_error = e.error; 4843 mutex_exit(&rp->r_statelock); 4844 } 4845 } 4846 4847 recov_state.rs_flags = 0; 4848 recov_state.rs_num_retry_despite_err = 0; 4849 recov_retry_remove: 4850 /* 4851 * Do the remove operation on the renamed file 4852 */ 4853 args.ctag = TAG_INACTIVE; 4854 4855 /* 4856 * Remove ops: putfh dir; remove 4857 */ 4858 args.array_len = 2; 4859 args.array = argop; 4860 4861 e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state); 4862 if (e.error) { 4863 kmem_free(unlname, MAXNAMELEN); 4864 crfree(unlcred); 4865 VN_RELE(unldvp); 4866 /* 4867 * Try again; this time around r_unldvp will be NULL, so we'll 4868 * just call rp4_addfree() and return. 4869 */ 4870 goto redo; 4871 } 4872 4873 /* putfh directory */ 4874 argop[0].argop = OP_CPUTFH; 4875 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh; 4876 4877 /* remove */ 4878 argop[1].argop = OP_CREMOVE; 4879 argop[1].nfs_argop4_u.opcremove.ctarget = unlname; 4880 4881 doqueue = 1; 4882 resp = &res; 4883 4884 #if 0 /* notyet */ 4885 /* 4886 * Can't do this yet. 
We may be being called from 4887 * dnlc_purge_XXX while that routine is holding a 4888 * mutex lock to the nc_rele list. The calls to 4889 * nfs3_cache_wcc_data may result in calls to 4890 * dnlc_purge_XXX. This will result in a deadlock. 4891 */ 4892 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4893 if (e.error) { 4894 PURGE_ATTRCACHE4(unldvp); 4895 resp = NULL; 4896 } else if (res.status) { 4897 e.error = geterrno4(res.status); 4898 PURGE_ATTRCACHE4(unldvp); 4899 /* 4900 * This code is inactive right now 4901 * but if made active there should 4902 * be a nfs4_end_op() call before 4903 * nfs4_purge_stale_fh to avoid start_op() 4904 * deadlock. See BugId: 4948726 4905 */ 4906 nfs4_purge_stale_fh(error, unldvp, cr); 4907 } else { 4908 nfs_resop4 *resop; 4909 REMOVE4res *rm_res; 4910 4911 resop = &res.array[1]; 4912 rm_res = &resop->nfs_resop4_u.opremove; 4913 /* 4914 * Update directory cache attribute, 4915 * readdir and dnlc caches. 4916 */ 4917 nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL); 4918 } 4919 #else 4920 rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e); 4921 4922 PURGE_ATTRCACHE4(unldvp); 4923 #endif 4924 4925 if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) { 4926 if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL, 4927 NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) { 4928 if (!e.error) 4929 (void) xdr_free(xdr_COMPOUND4res_clnt, 4930 (caddr_t)&res); 4931 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, 4932 &recov_state, TRUE); 4933 goto recov_retry_remove; 4934 } 4935 } 4936 nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE); 4937 4938 /* 4939 * Release stuff held for the remove 4940 */ 4941 VN_RELE(unldvp); 4942 if (!e.error && resp) 4943 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 4944 4945 kmem_free(unlname, MAXNAMELEN); 4946 crfree(unlcred); 4947 goto redo; 4948 } 4949 4950 /* 4951 * Remote file system operations having to do with directory manipulation. 
4952 */ 4953 /* ARGSUSED3 */ 4954 int 4955 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, 4956 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, 4957 int *direntflags, pathname_t *realpnp) 4958 { 4959 int error; 4960 vnode_t *vp, *avp = NULL; 4961 rnode4_t *drp; 4962 4963 *vpp = NULL; 4964 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 4965 return (EPERM); 4966 /* 4967 * if LOOKUP_XATTR, must replace dvp (object) with 4968 * object's attrdir before continuing with lookup 4969 */ 4970 if (flags & LOOKUP_XATTR) { 4971 error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr); 4972 if (error) 4973 return (error); 4974 4975 dvp = avp; 4976 4977 /* 4978 * If lookup is for "", just return dvp now. The attrdir 4979 * has already been activated (from nfs4lookup_xattr), and 4980 * the caller will RELE the original dvp -- not 4981 * the attrdir. So, set vpp and return. 4982 * Currently, when the LOOKUP_XATTR flag is 4983 * passed to VOP_LOOKUP, the name is always empty, and 4984 * shortcircuiting here avoids 3 unneeded lock/unlock 4985 * pairs. 4986 * 4987 * If a non-empty name was provided, then it is the 4988 * attribute name, and it will be looked up below. 4989 */ 4990 if (*nm == '\0') { 4991 *vpp = dvp; 4992 return (0); 4993 } 4994 4995 /* 4996 * The vfs layer never sends a name when asking for the 4997 * attrdir, so we should never get here (unless of course 4998 * name is passed at some time in future -- at which time 4999 * we'll blow up here). 5000 */ 5001 ASSERT(0); 5002 } 5003 5004 drp = VTOR4(dvp); 5005 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 5006 return (EINTR); 5007 5008 error = nfs4lookup(dvp, nm, vpp, cr, 0); 5009 nfs_rw_exit(&drp->r_rwlock); 5010 5011 /* 5012 * If vnode is a device, create special vnode. 
5013 */ 5014 if (!error && ISVDEV((*vpp)->v_type)) { 5015 vp = *vpp; 5016 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 5017 VN_RELE(vp); 5018 } 5019 5020 return (error); 5021 } 5022 5023 /* ARGSUSED */ 5024 static int 5025 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr) 5026 { 5027 int error; 5028 rnode4_t *drp; 5029 int cflag = ((flags & CREATE_XATTR_DIR) != 0); 5030 mntinfo4_t *mi; 5031 5032 mi = VTOMI4(dvp); 5033 if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) && 5034 !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS)) 5035 return (EINVAL); 5036 5037 drp = VTOR4(dvp); 5038 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) 5039 return (EINTR); 5040 5041 mutex_enter(&drp->r_statelock); 5042 /* 5043 * If the server doesn't support xattrs just return EINVAL 5044 */ 5045 if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) { 5046 mutex_exit(&drp->r_statelock); 5047 nfs_rw_exit(&drp->r_rwlock); 5048 return (EINVAL); 5049 } 5050 5051 /* 5052 * If there is a cached xattr directory entry, 5053 * use it as long as the attributes are valid. If the 5054 * attributes are not valid, take the simple approach and 5055 * free the cached value and re-fetch a new value. 5056 * 5057 * We don't negative entry cache for now, if we did we 5058 * would need to check if the file has changed on every 5059 * lookup. But xattrs don't exist very often and failing 5060 * an openattr is not much more expensive than and NVERIFY or GETATTR 5061 * so do an openattr over the wire for now. 
5062 */ 5063 if (drp->r_xattr_dir != NULL) { 5064 if (ATTRCACHE4_VALID(dvp)) { 5065 VN_HOLD(drp->r_xattr_dir); 5066 *vpp = drp->r_xattr_dir; 5067 mutex_exit(&drp->r_statelock); 5068 nfs_rw_exit(&drp->r_rwlock); 5069 return (0); 5070 } 5071 VN_RELE(drp->r_xattr_dir); 5072 drp->r_xattr_dir = NULL; 5073 } 5074 mutex_exit(&drp->r_statelock); 5075 5076 error = nfs4openattr(dvp, vpp, cflag, cr); 5077 5078 nfs_rw_exit(&drp->r_rwlock); 5079 5080 return (error); 5081 } 5082 5083 static int 5084 nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc) 5085 { 5086 int error; 5087 rnode4_t *drp; 5088 5089 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5090 5091 /* 5092 * If lookup is for "", just return dvp. Don't need 5093 * to send it over the wire, look it up in the dnlc, 5094 * or perform any access checks. 5095 */ 5096 if (*nm == '\0') { 5097 VN_HOLD(dvp); 5098 *vpp = dvp; 5099 return (0); 5100 } 5101 5102 /* 5103 * Can't do lookups in non-directories. 5104 */ 5105 if (dvp->v_type != VDIR) 5106 return (ENOTDIR); 5107 5108 /* 5109 * If lookup is for ".", just return dvp. Don't need 5110 * to send it over the wire or look it up in the dnlc, 5111 * just need to check access. 5112 */ 5113 if (nm[0] == '.' && nm[1] == '\0') { 5114 error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5115 if (error) 5116 return (error); 5117 VN_HOLD(dvp); 5118 *vpp = dvp; 5119 return (0); 5120 } 5121 5122 drp = VTOR4(dvp); 5123 if (!(drp->r_flags & R4LOOKUP)) { 5124 mutex_enter(&drp->r_statelock); 5125 drp->r_flags |= R4LOOKUP; 5126 mutex_exit(&drp->r_statelock); 5127 } 5128 5129 *vpp = NULL; 5130 /* 5131 * Lookup this name in the DNLC. If there is no entry 5132 * lookup over the wire. 5133 */ 5134 if (!skipdnlc) 5135 *vpp = dnlc_lookup(dvp, nm); 5136 if (*vpp == NULL) { 5137 /* 5138 * We need to go over the wire to lookup the name. 
5139 */ 5140 return (nfs4lookupnew_otw(dvp, nm, vpp, cr)); 5141 } 5142 5143 /* 5144 * We hit on the dnlc 5145 */ 5146 if (*vpp != DNLC_NO_VNODE || 5147 (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) { 5148 /* 5149 * But our attrs may not be valid. 5150 */ 5151 if (ATTRCACHE4_VALID(dvp)) { 5152 error = nfs4_waitfor_purge_complete(dvp); 5153 if (error) { 5154 VN_RELE(*vpp); 5155 *vpp = NULL; 5156 return (error); 5157 } 5158 5159 /* 5160 * If after the purge completes, check to make sure 5161 * our attrs are still valid. 5162 */ 5163 if (ATTRCACHE4_VALID(dvp)) { 5164 /* 5165 * If we waited for a purge we may have 5166 * lost our vnode so look it up again. 5167 */ 5168 VN_RELE(*vpp); 5169 *vpp = dnlc_lookup(dvp, nm); 5170 if (*vpp == NULL) 5171 return (nfs4lookupnew_otw(dvp, 5172 nm, vpp, cr)); 5173 5174 /* 5175 * The access cache should almost always hit 5176 */ 5177 error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5178 5179 if (error) { 5180 VN_RELE(*vpp); 5181 *vpp = NULL; 5182 return (error); 5183 } 5184 if (*vpp == DNLC_NO_VNODE) { 5185 VN_RELE(*vpp); 5186 *vpp = NULL; 5187 return (ENOENT); 5188 } 5189 return (0); 5190 } 5191 } 5192 } 5193 5194 ASSERT(*vpp != NULL); 5195 5196 /* 5197 * We may have gotten here we have one of the following cases: 5198 * 1) vpp != DNLC_NO_VNODE, our attrs have timed out so we 5199 * need to validate them. 5200 * 2) vpp == DNLC_NO_VNODE, a negative entry that we always 5201 * must validate. 5202 * 5203 * Go to the server and check if the directory has changed, if 5204 * it hasn't we are done and can use the dnlc entry. 5205 */ 5206 return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr)); 5207 } 5208 5209 /* 5210 * Go to the server and check if the directory has changed, if 5211 * it hasn't we are done and can use the dnlc entry. If it 5212 * has changed we get a new copy of its attributes and check 5213 * the access for VEXEC, then relookup the filename and 5214 * get its filehandle and attributes. 
5215 * 5216 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR 5217 * if the NVERIFY failed we must 5218 * purge the caches 5219 * cache new attributes (will set r_time_attr_inval) 5220 * cache new access 5221 * recheck VEXEC access 5222 * add name to dnlc, possibly negative 5223 * if LOOKUP succeeded 5224 * cache new attributes 5225 * else 5226 * set a new r_time_attr_inval for dvp 5227 * check to make sure we have access 5228 * 5229 * The vpp returned is the vnode passed in if the directory is valid, 5230 * a new vnode if successful lookup, or NULL on error. 5231 */ 5232 static int 5233 nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5234 { 5235 COMPOUND4args_clnt args; 5236 COMPOUND4res_clnt res; 5237 fattr4 *ver_fattr; 5238 fattr4_change dchange; 5239 int32_t *ptr; 5240 int argoplist_size = 7 * sizeof (nfs_argop4); 5241 nfs_argop4 *argop; 5242 int doqueue; 5243 mntinfo4_t *mi; 5244 nfs4_recov_state_t recov_state; 5245 hrtime_t t; 5246 int isdotdot; 5247 vnode_t *nvp; 5248 nfs_fh4 *fhp; 5249 nfs4_sharedfh_t *sfhp; 5250 nfs4_access_type_t cacc; 5251 rnode4_t *nrp; 5252 rnode4_t *drp = VTOR4(dvp); 5253 nfs4_ga_res_t *garp = NULL; 5254 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5255 5256 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5257 ASSERT(nm != NULL); 5258 ASSERT(nm[0] != '\0'); 5259 ASSERT(dvp->v_type == VDIR); 5260 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5261 ASSERT(*vpp != NULL); 5262 5263 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') { 5264 isdotdot = 1; 5265 args.ctag = TAG_LOOKUP_VPARENT; 5266 } else { 5267 /* 5268 * If dvp were a stub, it should have triggered and caused 5269 * a mount for us to get this far. 
5270 */ 5271 ASSERT(!RP_ISSTUB(VTOR4(dvp))); 5272 5273 isdotdot = 0; 5274 args.ctag = TAG_LOOKUP_VALID; 5275 } 5276 5277 mi = VTOMI4(dvp); 5278 recov_state.rs_flags = 0; 5279 recov_state.rs_num_retry_despite_err = 0; 5280 5281 nvp = NULL; 5282 5283 /* Save the original mount point security information */ 5284 (void) save_mnt_secinfo(mi->mi_curr_serv); 5285 5286 recov_retry: 5287 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5288 &recov_state, NULL); 5289 if (e.error) { 5290 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5291 VN_RELE(*vpp); 5292 *vpp = NULL; 5293 return (e.error); 5294 } 5295 5296 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5297 5298 /* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */ 5299 args.array_len = 7; 5300 args.array = argop; 5301 5302 /* 0. putfh file */ 5303 argop[0].argop = OP_CPUTFH; 5304 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5305 5306 /* 1. nverify the change info */ 5307 argop[1].argop = OP_NVERIFY; 5308 ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes; 5309 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5310 ver_fattr->attrlist4 = (char *)&dchange; 5311 ptr = (int32_t *)&dchange; 5312 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5313 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5314 5315 /* 2. getattr directory */ 5316 argop[2].argop = OP_GETATTR; 5317 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5318 argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5319 5320 /* 3. access directory */ 5321 argop[3].argop = OP_ACCESS; 5322 argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5323 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5324 5325 /* 4. lookup name */ 5326 if (isdotdot) { 5327 argop[4].argop = OP_LOOKUPP; 5328 } else { 5329 argop[4].argop = OP_CLOOKUP; 5330 argop[4].nfs_argop4_u.opclookup.cname = nm; 5331 } 5332 5333 /* 5. resulting file handle */ 5334 argop[5].argop = OP_GETFH; 5335 5336 /* 6. 
resulting file attributes */ 5337 argop[6].argop = OP_GETATTR; 5338 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5339 argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5340 5341 doqueue = 1; 5342 t = gethrtime(); 5343 5344 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5345 5346 if (!isdotdot && res.status == NFS4ERR_MOVED) { 5347 e.error = nfs4_setup_referral(dvp, nm, vpp, cr); 5348 if (e.error != 0 && *vpp != NULL) 5349 VN_RELE(*vpp); 5350 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5351 &recov_state, FALSE); 5352 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5353 kmem_free(argop, argoplist_size); 5354 return (e.error); 5355 } 5356 5357 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5358 /* 5359 * For WRONGSEC of a non-dotdot case, send secinfo directly 5360 * from this thread, do not go thru the recovery thread since 5361 * we need the nm information. 5362 * 5363 * Not doing dotdot case because there is no specification 5364 * for (PUTFH, SECINFO "..") yet. 
5365 */ 5366 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5367 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) 5368 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5369 &recov_state, FALSE); 5370 else 5371 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5372 &recov_state, TRUE); 5373 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5374 kmem_free(argop, argoplist_size); 5375 if (!e.error) 5376 goto recov_retry; 5377 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5378 VN_RELE(*vpp); 5379 *vpp = NULL; 5380 return (e.error); 5381 } 5382 5383 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5384 OP_LOOKUP, NULL, NULL, NULL) == FALSE) { 5385 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5386 &recov_state, TRUE); 5387 5388 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5389 kmem_free(argop, argoplist_size); 5390 goto recov_retry; 5391 } 5392 } 5393 5394 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5395 5396 if (e.error || res.array_len == 0) { 5397 /* 5398 * If e.error isn't set, then reply has no ops (or we couldn't 5399 * be here). The only legal way to reply without an op array 5400 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5401 * be in the reply for all other status values. 5402 * 5403 * For valid replies without an ops array, return ENOTSUP 5404 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5405 * return EIO -- don't trust status. 5406 */ 5407 if (e.error == 0) 5408 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5409 ENOTSUP : EIO; 5410 VN_RELE(*vpp); 5411 *vpp = NULL; 5412 kmem_free(argop, argoplist_size); 5413 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5414 return (e.error); 5415 } 5416 5417 if (res.status != NFS4ERR_SAME) { 5418 e.error = geterrno4(res.status); 5419 5420 /* 5421 * The NVERIFY "failed" so the directory has changed 5422 * First make sure PUTFH succeeded and NVERIFY "failed" 5423 * cleanly. 
5424 */ 5425 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5426 (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) { 5427 nfs4_purge_stale_fh(e.error, dvp, cr); 5428 VN_RELE(*vpp); 5429 *vpp = NULL; 5430 goto exit; 5431 } 5432 5433 /* 5434 * We know the NVERIFY "failed" so we must: 5435 * purge the caches (access and indirectly dnlc if needed) 5436 */ 5437 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5438 5439 if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5440 nfs4_purge_stale_fh(e.error, dvp, cr); 5441 VN_RELE(*vpp); 5442 *vpp = NULL; 5443 goto exit; 5444 } 5445 5446 /* 5447 * Install new cached attributes for the directory 5448 */ 5449 nfs4_attr_cache(dvp, 5450 &res.array[2].nfs_resop4_u.opgetattr.ga_res, 5451 t, cr, FALSE, NULL); 5452 5453 if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) { 5454 nfs4_purge_stale_fh(e.error, dvp, cr); 5455 VN_RELE(*vpp); 5456 *vpp = NULL; 5457 e.error = geterrno4(res.status); 5458 goto exit; 5459 } 5460 5461 /* 5462 * Now we know the directory is valid, 5463 * cache new directory access 5464 */ 5465 nfs4_access_cache(drp, 5466 args.array[3].nfs_argop4_u.opaccess.access, 5467 res.array[3].nfs_resop4_u.opaccess.access, cr); 5468 5469 /* 5470 * recheck VEXEC access 5471 */ 5472 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr); 5473 if (cacc != NFS4_ACCESS_ALLOWED) { 5474 /* 5475 * Directory permissions might have been revoked 5476 */ 5477 if (cacc == NFS4_ACCESS_DENIED) { 5478 e.error = EACCES; 5479 VN_RELE(*vpp); 5480 *vpp = NULL; 5481 goto exit; 5482 } 5483 5484 /* 5485 * Somehow we must not have asked for enough 5486 * so try a singleton ACCESS, should never happen. 
5487 */ 5488 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5489 if (e.error) { 5490 VN_RELE(*vpp); 5491 *vpp = NULL; 5492 goto exit; 5493 } 5494 } 5495 5496 e.error = geterrno4(res.status); 5497 if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) { 5498 /* 5499 * The lookup failed, probably no entry 5500 */ 5501 if (e.error == ENOENT && nfs4_lookup_neg_cache) { 5502 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5503 } else { 5504 /* 5505 * Might be some other error, so remove 5506 * the dnlc entry to make sure we start all 5507 * over again, next time. 5508 */ 5509 dnlc_remove(dvp, nm); 5510 } 5511 VN_RELE(*vpp); 5512 *vpp = NULL; 5513 goto exit; 5514 } 5515 5516 if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5517 /* 5518 * The file exists but we can't get its fh for 5519 * some unknown reason. Remove it from the dnlc 5520 * and error out to be safe. 5521 */ 5522 dnlc_remove(dvp, nm); 5523 VN_RELE(*vpp); 5524 *vpp = NULL; 5525 goto exit; 5526 } 5527 fhp = &res.array[5].nfs_resop4_u.opgetfh.object; 5528 if (fhp->nfs_fh4_len == 0) { 5529 /* 5530 * The file exists but a bogus fh 5531 * some unknown reason. Remove it from the dnlc 5532 * and error out to be safe. 5533 */ 5534 e.error = ENOENT; 5535 dnlc_remove(dvp, nm); 5536 VN_RELE(*vpp); 5537 *vpp = NULL; 5538 goto exit; 5539 } 5540 sfhp = sfh4_get(fhp, mi); 5541 5542 if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK) 5543 garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 5544 5545 /* 5546 * Make the new rnode 5547 */ 5548 if (isdotdot) { 5549 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 5550 if (e.error) { 5551 sfh4_rele(&sfhp); 5552 VN_RELE(*vpp); 5553 *vpp = NULL; 5554 goto exit; 5555 } 5556 /* 5557 * XXX if nfs4_make_dotdot uses an existing rnode 5558 * XXX it doesn't update the attributes. 
5559 * XXX for now just save them again to save an OTW 5560 */ 5561 nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL); 5562 } else { 5563 nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr, 5564 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 5565 /* 5566 * If v_type == VNON, then garp was NULL because 5567 * the last op in the compound failed and makenfs4node 5568 * could not find the vnode for sfhp. It created 5569 * a new vnode, so we have nothing to purge here. 5570 */ 5571 if (nvp->v_type == VNON) { 5572 vattr_t vattr; 5573 5574 vattr.va_mask = AT_TYPE; 5575 /* 5576 * N.B. We've already called nfs4_end_fop above. 5577 */ 5578 e.error = nfs4getattr(nvp, &vattr, cr); 5579 if (e.error) { 5580 sfh4_rele(&sfhp); 5581 VN_RELE(*vpp); 5582 *vpp = NULL; 5583 VN_RELE(nvp); 5584 goto exit; 5585 } 5586 nvp->v_type = vattr.va_type; 5587 } 5588 } 5589 sfh4_rele(&sfhp); 5590 5591 nrp = VTOR4(nvp); 5592 mutex_enter(&nrp->r_statev4_lock); 5593 if (!nrp->created_v4) { 5594 mutex_exit(&nrp->r_statev4_lock); 5595 dnlc_update(dvp, nm, nvp); 5596 } else 5597 mutex_exit(&nrp->r_statev4_lock); 5598 5599 VN_RELE(*vpp); 5600 *vpp = nvp; 5601 } else { 5602 hrtime_t now; 5603 hrtime_t delta = 0; 5604 5605 e.error = 0; 5606 5607 /* 5608 * Because the NVERIFY "succeeded" we know that the 5609 * directory attributes are still valid 5610 * so update r_time_attr_inval 5611 */ 5612 now = gethrtime(); 5613 mutex_enter(&drp->r_statelock); 5614 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5615 delta = now - drp->r_time_attr_saved; 5616 if (delta < mi->mi_acdirmin) 5617 delta = mi->mi_acdirmin; 5618 else if (delta > mi->mi_acdirmax) 5619 delta = mi->mi_acdirmax; 5620 } 5621 drp->r_time_attr_inval = now + delta; 5622 mutex_exit(&drp->r_statelock); 5623 dnlc_update(dvp, nm, *vpp); 5624 5625 /* 5626 * Even though we have a valid directory attr cache 5627 * and dnlc entry, we may not have access. 5628 * This should almost always hit the cache. 
5629 */ 5630 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5631 if (e.error) { 5632 VN_RELE(*vpp); 5633 *vpp = NULL; 5634 } 5635 5636 if (*vpp == DNLC_NO_VNODE) { 5637 VN_RELE(*vpp); 5638 *vpp = NULL; 5639 e.error = ENOENT; 5640 } 5641 } 5642 5643 exit: 5644 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5645 kmem_free(argop, argoplist_size); 5646 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5647 return (e.error); 5648 } 5649 5650 /* 5651 * We need to go over the wire to lookup the name, but 5652 * while we are there verify the directory has not 5653 * changed but if it has, get new attributes and check access 5654 * 5655 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH 5656 * NVERIFY GETATTR ACCESS 5657 * 5658 * With the results: 5659 * if the NVERIFY failed we must purge the caches, add new attributes, 5660 * and cache new access. 5661 * set a new r_time_attr_inval 5662 * add name to dnlc, possibly negative 5663 * if LOOKUP succeeded 5664 * cache new attributes 5665 */ 5666 static int 5667 nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr) 5668 { 5669 COMPOUND4args_clnt args; 5670 COMPOUND4res_clnt res; 5671 fattr4 *ver_fattr; 5672 fattr4_change dchange; 5673 int32_t *ptr; 5674 nfs4_ga_res_t *garp = NULL; 5675 int argoplist_size = 9 * sizeof (nfs_argop4); 5676 nfs_argop4 *argop; 5677 int doqueue; 5678 mntinfo4_t *mi; 5679 nfs4_recov_state_t recov_state; 5680 hrtime_t t; 5681 int isdotdot; 5682 vnode_t *nvp; 5683 nfs_fh4 *fhp; 5684 nfs4_sharedfh_t *sfhp; 5685 nfs4_access_type_t cacc; 5686 rnode4_t *nrp; 5687 rnode4_t *drp = VTOR4(dvp); 5688 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 5689 5690 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 5691 ASSERT(nm != NULL); 5692 ASSERT(nm[0] != '\0'); 5693 ASSERT(dvp->v_type == VDIR); 5694 ASSERT(nm[0] != '.' || nm[1] != '\0'); 5695 ASSERT(*vpp == NULL); 5696 5697 if (nm[0] == '.' && nm[1] == '.' 
&& nm[2] == '\0') { 5698 isdotdot = 1; 5699 args.ctag = TAG_LOOKUP_PARENT; 5700 } else { 5701 /* 5702 * If dvp were a stub, it should have triggered and caused 5703 * a mount for us to get this far. 5704 */ 5705 ASSERT(!RP_ISSTUB(VTOR4(dvp))); 5706 5707 isdotdot = 0; 5708 args.ctag = TAG_LOOKUP; 5709 } 5710 5711 mi = VTOMI4(dvp); 5712 recov_state.rs_flags = 0; 5713 recov_state.rs_num_retry_despite_err = 0; 5714 5715 nvp = NULL; 5716 5717 /* Save the original mount point security information */ 5718 (void) save_mnt_secinfo(mi->mi_curr_serv); 5719 5720 recov_retry: 5721 e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP, 5722 &recov_state, NULL); 5723 if (e.error) { 5724 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5725 return (e.error); 5726 } 5727 5728 argop = kmem_alloc(argoplist_size, KM_SLEEP); 5729 5730 /* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */ 5731 args.array_len = 9; 5732 args.array = argop; 5733 5734 /* 0. putfh file */ 5735 argop[0].argop = OP_CPUTFH; 5736 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh; 5737 5738 /* 1. savefh for the nverify */ 5739 argop[1].argop = OP_SAVEFH; 5740 5741 /* 2. lookup name */ 5742 if (isdotdot) { 5743 argop[2].argop = OP_LOOKUPP; 5744 } else { 5745 argop[2].argop = OP_CLOOKUP; 5746 argop[2].nfs_argop4_u.opclookup.cname = nm; 5747 } 5748 5749 /* 3. resulting file handle */ 5750 argop[3].argop = OP_GETFH; 5751 5752 /* 4. resulting file attributes */ 5753 argop[4].argop = OP_GETATTR; 5754 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5755 argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5756 5757 /* 5. restorefh back the directory for the nverify */ 5758 argop[5].argop = OP_RESTOREFH; 5759 5760 /* 6. 
nverify the change info */ 5761 argop[6].argop = OP_NVERIFY; 5762 ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes; 5763 ver_fattr->attrmask = FATTR4_CHANGE_MASK; 5764 ver_fattr->attrlist4 = (char *)&dchange; 5765 ptr = (int32_t *)&dchange; 5766 IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change); 5767 ver_fattr->attrlist4_len = sizeof (fattr4_change); 5768 5769 /* 7. getattr directory */ 5770 argop[7].argop = OP_GETATTR; 5771 argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 5772 argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 5773 5774 /* 8. access directory */ 5775 argop[8].argop = OP_ACCESS; 5776 argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE | 5777 ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP; 5778 5779 doqueue = 1; 5780 t = gethrtime(); 5781 5782 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 5783 5784 if (!isdotdot && res.status == NFS4ERR_MOVED) { 5785 e.error = nfs4_setup_referral(dvp, nm, vpp, cr); 5786 if (e.error != 0 && *vpp != NULL) 5787 VN_RELE(*vpp); 5788 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5789 &recov_state, FALSE); 5790 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5791 kmem_free(argop, argoplist_size); 5792 return (e.error); 5793 } 5794 5795 if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) { 5796 /* 5797 * For WRONGSEC of a non-dotdot case, send secinfo directly 5798 * from this thread, do not go thru the recovery thread since 5799 * we need the nm information. 5800 * 5801 * Not doing dotdot case because there is no specification 5802 * for (PUTFH, SECINFO "..") yet. 
5803 */ 5804 if (!isdotdot && res.status == NFS4ERR_WRONGSEC) { 5805 if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr))) 5806 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5807 &recov_state, FALSE); 5808 else 5809 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5810 &recov_state, TRUE); 5811 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5812 kmem_free(argop, argoplist_size); 5813 if (!e.error) 5814 goto recov_retry; 5815 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5816 return (e.error); 5817 } 5818 5819 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 5820 OP_LOOKUP, NULL, NULL, NULL) == FALSE) { 5821 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, 5822 &recov_state, TRUE); 5823 5824 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 5825 kmem_free(argop, argoplist_size); 5826 goto recov_retry; 5827 } 5828 } 5829 5830 nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE); 5831 5832 if (e.error || res.array_len == 0) { 5833 /* 5834 * If e.error isn't set, then reply has no ops (or we couldn't 5835 * be here). The only legal way to reply without an op array 5836 * is via NFS4ERR_MINOR_VERS_MISMATCH. An ops array should 5837 * be in the reply for all other status values. 5838 * 5839 * For valid replies without an ops array, return ENOTSUP 5840 * (geterrno4 xlation of VERS_MISMATCH). For illegal replies, 5841 * return EIO -- don't trust status. 5842 */ 5843 if (e.error == 0) 5844 e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ? 5845 ENOTSUP : EIO; 5846 5847 kmem_free(argop, argoplist_size); 5848 (void) check_mnt_secinfo(mi->mi_curr_serv, nvp); 5849 return (e.error); 5850 } 5851 5852 e.error = geterrno4(res.status); 5853 5854 /* 5855 * The PUTFH and SAVEFH may have failed. 
5856 */ 5857 if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) || 5858 (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) { 5859 nfs4_purge_stale_fh(e.error, dvp, cr); 5860 goto exit; 5861 } 5862 5863 /* 5864 * Check if the file exists, if it does delay entering 5865 * into the dnlc until after we update the directory 5866 * attributes so we don't cause it to get purged immediately. 5867 */ 5868 if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) { 5869 /* 5870 * The lookup failed, probably no entry 5871 */ 5872 if (e.error == ENOENT && nfs4_lookup_neg_cache) 5873 dnlc_update(dvp, nm, DNLC_NO_VNODE); 5874 goto exit; 5875 } 5876 5877 if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) { 5878 /* 5879 * The file exists but we can't get its fh for 5880 * some unknown reason. Error out to be safe. 5881 */ 5882 goto exit; 5883 } 5884 5885 fhp = &res.array[3].nfs_resop4_u.opgetfh.object; 5886 if (fhp->nfs_fh4_len == 0) { 5887 /* 5888 * The file exists but a bogus fh 5889 * some unknown reason. Error out to be safe. 5890 */ 5891 e.error = EIO; 5892 goto exit; 5893 } 5894 sfhp = sfh4_get(fhp, mi); 5895 5896 if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5897 sfh4_rele(&sfhp); 5898 goto exit; 5899 } 5900 garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 5901 5902 /* 5903 * The RESTOREFH may have failed 5904 */ 5905 if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) { 5906 sfh4_rele(&sfhp); 5907 e.error = EIO; 5908 goto exit; 5909 } 5910 5911 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) { 5912 /* 5913 * First make sure the NVERIFY failed as we expected, 5914 * if it didn't then be conservative and error out 5915 * as we can't trust the directory. 
5916 */ 5917 if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) { 5918 sfh4_rele(&sfhp); 5919 e.error = EIO; 5920 goto exit; 5921 } 5922 5923 /* 5924 * We know the NVERIFY "failed" so the directory has changed, 5925 * so we must: 5926 * purge the caches (access and indirectly dnlc if needed) 5927 */ 5928 nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE); 5929 5930 if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) { 5931 sfh4_rele(&sfhp); 5932 goto exit; 5933 } 5934 nfs4_attr_cache(dvp, 5935 &res.array[7].nfs_resop4_u.opgetattr.ga_res, 5936 t, cr, FALSE, NULL); 5937 5938 if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) { 5939 nfs4_purge_stale_fh(e.error, dvp, cr); 5940 sfh4_rele(&sfhp); 5941 e.error = geterrno4(res.status); 5942 goto exit; 5943 } 5944 5945 /* 5946 * Now we know the directory is valid, 5947 * cache new directory access 5948 */ 5949 nfs4_access_cache(drp, 5950 args.array[8].nfs_argop4_u.opaccess.access, 5951 res.array[8].nfs_resop4_u.opaccess.access, cr); 5952 5953 /* 5954 * recheck VEXEC access 5955 */ 5956 cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr); 5957 if (cacc != NFS4_ACCESS_ALLOWED) { 5958 /* 5959 * Directory permissions might have been revoked 5960 */ 5961 if (cacc == NFS4_ACCESS_DENIED) { 5962 sfh4_rele(&sfhp); 5963 e.error = EACCES; 5964 goto exit; 5965 } 5966 5967 /* 5968 * Somehow we must not have asked for enough 5969 * so try a singleton ACCESS should never happen 5970 */ 5971 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 5972 if (e.error) { 5973 sfh4_rele(&sfhp); 5974 goto exit; 5975 } 5976 } 5977 5978 e.error = geterrno4(res.status); 5979 } else { 5980 hrtime_t now; 5981 hrtime_t delta = 0; 5982 5983 e.error = 0; 5984 5985 /* 5986 * Because the NVERIFY "succeeded" we know that the 5987 * directory attributes are still valid 5988 * so update r_time_attr_inval 5989 */ 5990 now = gethrtime(); 5991 mutex_enter(&drp->r_statelock); 5992 if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) { 5993 
delta = now - drp->r_time_attr_saved; 5994 if (delta < mi->mi_acdirmin) 5995 delta = mi->mi_acdirmin; 5996 else if (delta > mi->mi_acdirmax) 5997 delta = mi->mi_acdirmax; 5998 } 5999 drp->r_time_attr_inval = now + delta; 6000 mutex_exit(&drp->r_statelock); 6001 6002 /* 6003 * Even though we have a valid directory attr cache, 6004 * we may not have access. 6005 * This should almost always hit the cache. 6006 */ 6007 e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL); 6008 if (e.error) { 6009 sfh4_rele(&sfhp); 6010 goto exit; 6011 } 6012 } 6013 6014 /* 6015 * Now we have successfully completed the lookup, if the 6016 * directory has changed we now have the valid attributes. 6017 * We also know we have directory access. 6018 * Create the new rnode and insert it in the dnlc. 6019 */ 6020 if (isdotdot) { 6021 e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1); 6022 if (e.error) { 6023 sfh4_rele(&sfhp); 6024 goto exit; 6025 } 6026 /* 6027 * XXX if nfs4_make_dotdot uses an existing rnode 6028 * XXX it doesn't update the attributes. 
#ifdef DEBUG
/*
 * nfs4lookup_dump_compound - DEBUG-only helper that logs, one CE_NOTE line
 * per op, the contents of a lookup compound's argument array.
 *
 *	where	 tag printed in the heading line (normally the caller's name).
 *	argbase	 first element of the nfs_argop4 array to dump.
 *	argcnt	 number of elements in argbase; a value <= 0 dumps nothing
 *		 beyond the heading.
 *
 * The index is a signed int so that it matches both the signed argcnt
 * bound and the "%d" conversions below.
 */
void
nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt)
{
	int i;
	uint_t len;
	zoneid_t zoneid = getzoneid();
	char *s;

	zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where);
	for (i = 0; i < argcnt; i++) {
		nfs_argop4 *op = &argbase[i];
		switch (op->argop) {
		case OP_CPUTFH:
		case OP_PUTFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i);
			break;
		case OP_PUTROOTFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i);
			break;
		case OP_CLOOKUP:
			/* the client-side lookup op carries a plain C string */
			s = op->nfs_argop4_u.opclookup.cname;
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
			break;
		case OP_LOOKUP:
			/*
			 * utf8_to_str() hands back a buffer and stores its
			 * size in len; it is freed below with that size.
			 */
			s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname,
			    &len, NULL);
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
			kmem_free(s, len);
			break;
		case OP_LOOKUPP:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i);
			break;
		case OP_GETFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i);
			break;
		case OP_GETATTR:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i);
			break;
		case OP_OPENATTR:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i);
			break;
		default:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", i,
			    op->argop);
			break;
		}
	}
}
#endif
%d, opcode %d", i, 6098 op->argop); 6099 break; 6100 } 6101 } 6102 } 6103 #endif 6104 6105 /* 6106 * nfs4lookup_setup - constructs a multi-lookup compound request. 6107 * 6108 * Given the path "nm1/nm2/.../nmn", the following compound requests 6109 * may be created: 6110 * 6111 * Note: Getfh is not be needed because filehandle attr is mandatory, but it 6112 * is faster, for now. 6113 * 6114 * l4_getattrs indicates the type of compound requested. 6115 * 6116 * LKP4_NO_ATTRIBUTE - no attributes (used by secinfo): 6117 * 6118 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn} } 6119 * 6120 * total number of ops is n + 1. 6121 * 6122 * LKP4_LAST_NAMED_ATTR - multi-component path for a named 6123 * attribute: create lookups plus one OPENATTR/GETFH/GETATTR 6124 * before the last component, and only get attributes 6125 * for the last component. Note that the second-to-last 6126 * pathname component is XATTR_RPATH, which does NOT go 6127 * over-the-wire as a lookup. 6128 * 6129 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2}; 6130 * Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr } 6131 * 6132 * and total number of ops is n + 5. 6133 * 6134 * LKP4_LAST_ATTRDIR - multi-component path for the hidden named 6135 * attribute directory: create lookups plus an OPENATTR 6136 * replacing the last lookup. Note that the last pathname 6137 * component is XATTR_RPATH, which does NOT go over-the-wire 6138 * as a lookup. 6139 * 6140 * compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr; 6141 * Openattr; Getfh; Getattr } 6142 * 6143 * and total number of ops is n + 5. 6144 * 6145 * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate 6146 * nodes too. 6147 * 6148 * compound { Put*fh; Lookup {nm1}; Getfh; Getattr; 6149 * Lookup {nm2}; ... Lookup {nmn}; Getfh; Getattr } 6150 * 6151 * and total number of ops is 3*n + 1. 
6152 * 6153 * All cases: returns the index in the arg array of the final LOOKUP op, or 6154 * -1 if no LOOKUPs were used. 6155 */ 6156 int 6157 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh) 6158 { 6159 enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs; 6160 nfs_argop4 *argbase, *argop; 6161 int arglen, argcnt; 6162 int n = 1; /* number of components */ 6163 int nga = 1; /* number of Getattr's in request */ 6164 char c = '\0', *s, *p; 6165 int lookup_idx = -1; 6166 int argoplist_size; 6167 6168 /* set lookuparg response result to 0 */ 6169 lookupargp->resp->status = NFS4_OK; 6170 6171 /* skip leading "/" or "." e.g. ".//./" if there is */ 6172 for (; ; nm++) { 6173 if (*nm != '/' && *nm != '.') 6174 break; 6175 6176 /* ".." is counted as 1 component */ 6177 if (*nm == '.' && *(nm + 1) != '/') 6178 break; 6179 } 6180 6181 /* 6182 * Find n = number of components - nm must be null terminated 6183 * Skip "." components. 6184 */ 6185 if (*nm != '\0') 6186 for (n = 1, s = nm; *s != '\0'; s++) { 6187 if ((*s == '/') && (*(s + 1) != '/') && 6188 (*(s + 1) != '\0') && 6189 !(*(s + 1) == '.' && (*(s + 2) == '/' || 6190 *(s + 2) == '\0'))) 6191 n++; 6192 } 6193 else 6194 n = 0; 6195 6196 /* 6197 * nga is number of components that need Getfh+Getattr 6198 */ 6199 switch (l4_getattrs) { 6200 case LKP4_NO_ATTRIBUTES: 6201 nga = 0; 6202 break; 6203 case LKP4_ALL_ATTRIBUTES: 6204 nga = n; 6205 /* 6206 * Always have at least 1 getfh, getattr pair 6207 */ 6208 if (nga == 0) 6209 nga++; 6210 break; 6211 case LKP4_LAST_ATTRDIR: 6212 case LKP4_LAST_NAMED_ATTR: 6213 nga = n+1; 6214 break; 6215 } 6216 6217 /* 6218 * If change to use the filehandle attr instead of getfh 6219 * the following line can be deleted. 
6220 */ 6221 nga *= 2; 6222 6223 /* 6224 * calculate number of ops in request as 6225 * header + trailer + lookups + getattrs 6226 */ 6227 arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga; 6228 6229 argoplist_size = arglen * sizeof (nfs_argop4); 6230 argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP); 6231 lookupargp->argsp->array = argop; 6232 6233 argcnt = lookupargp->header_len; 6234 argop += argcnt; 6235 6236 /* 6237 * loop and create a lookup op and possibly getattr/getfh for 6238 * each component. Skip "." components. 6239 */ 6240 for (s = nm; *s != '\0'; s = p) { 6241 /* 6242 * Set up a pathname struct for each component if needed 6243 */ 6244 while (*s == '/') 6245 s++; 6246 if (*s == '\0') 6247 break; 6248 6249 for (p = s; (*p != '/') && (*p != '\0'); p++) 6250 ; 6251 c = *p; 6252 *p = '\0'; 6253 6254 if (s[0] == '.' && s[1] == '\0') { 6255 *p = c; 6256 continue; 6257 } 6258 if (l4_getattrs == LKP4_LAST_ATTRDIR && 6259 strcmp(s, XATTR_RPATH) == 0) { 6260 /* getfh XXX may not be needed in future */ 6261 argop->argop = OP_GETFH; 6262 argop++; 6263 argcnt++; 6264 6265 /* getattr */ 6266 argop->argop = OP_GETATTR; 6267 argop->nfs_argop4_u.opgetattr.attr_request = 6268 lookupargp->ga_bits; 6269 argop->nfs_argop4_u.opgetattr.mi = 6270 lookupargp->mi; 6271 argop++; 6272 argcnt++; 6273 6274 /* openattr */ 6275 argop->argop = OP_OPENATTR; 6276 } else if (l4_getattrs == LKP4_LAST_NAMED_ATTR && 6277 strcmp(s, XATTR_RPATH) == 0) { 6278 /* openattr */ 6279 argop->argop = OP_OPENATTR; 6280 argop++; 6281 argcnt++; 6282 6283 /* getfh XXX may not be needed in future */ 6284 argop->argop = OP_GETFH; 6285 argop++; 6286 argcnt++; 6287 6288 /* getattr */ 6289 argop->argop = OP_GETATTR; 6290 argop->nfs_argop4_u.opgetattr.attr_request = 6291 lookupargp->ga_bits; 6292 argop->nfs_argop4_u.opgetattr.mi = 6293 lookupargp->mi; 6294 argop++; 6295 argcnt++; 6296 *p = c; 6297 continue; 6298 } else if (s[0] == '.' && s[1] == '.' 
&& s[2] == '\0') { 6299 /* lookupp */ 6300 argop->argop = OP_LOOKUPP; 6301 } else { 6302 /* lookup */ 6303 argop->argop = OP_LOOKUP; 6304 (void) str_to_utf8(s, 6305 &argop->nfs_argop4_u.oplookup.objname); 6306 } 6307 lookup_idx = argcnt; 6308 argop++; 6309 argcnt++; 6310 6311 *p = c; 6312 6313 if (l4_getattrs == LKP4_ALL_ATTRIBUTES) { 6314 /* getfh XXX may not be needed in future */ 6315 argop->argop = OP_GETFH; 6316 argop++; 6317 argcnt++; 6318 6319 /* getattr */ 6320 argop->argop = OP_GETATTR; 6321 argop->nfs_argop4_u.opgetattr.attr_request = 6322 lookupargp->ga_bits; 6323 argop->nfs_argop4_u.opgetattr.mi = 6324 lookupargp->mi; 6325 argop++; 6326 argcnt++; 6327 } 6328 } 6329 6330 if ((l4_getattrs != LKP4_NO_ATTRIBUTES) && 6331 ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) { 6332 if (needgetfh) { 6333 /* stick in a post-lookup getfh */ 6334 argop->argop = OP_GETFH; 6335 argcnt++; 6336 argop++; 6337 } 6338 /* post-lookup getattr */ 6339 argop->argop = OP_GETATTR; 6340 argop->nfs_argop4_u.opgetattr.attr_request = 6341 lookupargp->ga_bits; 6342 argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi; 6343 argcnt++; 6344 } 6345 argcnt += lookupargp->trailer_len; /* actual op count */ 6346 lookupargp->argsp->array_len = argcnt; 6347 lookupargp->arglen = arglen; 6348 6349 #ifdef DEBUG 6350 if (nfs4_client_lookup_debug) 6351 nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt); 6352 #endif 6353 6354 return (lookup_idx); 6355 } 6356 6357 static int 6358 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr) 6359 { 6360 COMPOUND4args_clnt args; 6361 COMPOUND4res_clnt res; 6362 GETFH4res *gf_res = NULL; 6363 nfs_argop4 argop[4]; 6364 nfs_resop4 *resop = NULL; 6365 nfs4_sharedfh_t *sfhp; 6366 hrtime_t t; 6367 nfs4_error_t e; 6368 6369 rnode4_t *drp; 6370 int doqueue = 1; 6371 vnode_t *vp; 6372 int needrecov = 0; 6373 nfs4_recov_state_t recov_state; 6374 6375 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 6376 6377 *avp = NULL; 6378 
recov_state.rs_flags = 0; 6379 recov_state.rs_num_retry_despite_err = 0; 6380 6381 recov_retry: 6382 /* COMPOUND: putfh, openattr, getfh, getattr */ 6383 args.array_len = 4; 6384 args.array = argop; 6385 args.ctag = TAG_OPENATTR; 6386 6387 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 6388 if (e.error) 6389 return (e.error); 6390 6391 drp = VTOR4(dvp); 6392 6393 /* putfh */ 6394 argop[0].argop = OP_CPUTFH; 6395 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6396 6397 /* openattr */ 6398 argop[1].argop = OP_OPENATTR; 6399 argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE); 6400 6401 /* getfh */ 6402 argop[2].argop = OP_GETFH; 6403 6404 /* getattr */ 6405 argop[3].argop = OP_GETATTR; 6406 argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6407 argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp); 6408 6409 NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE, 6410 "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first", 6411 rnode4info(drp))); 6412 6413 t = gethrtime(); 6414 6415 rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e); 6416 6417 needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp); 6418 if (needrecov) { 6419 bool_t abort; 6420 6421 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 6422 "nfs4openattr: initiating recovery\n")); 6423 6424 abort = nfs4_start_recovery(&e, 6425 VTOMI4(dvp), dvp, NULL, NULL, NULL, 6426 OP_OPENATTR, NULL, NULL, NULL); 6427 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6428 if (!e.error) { 6429 e.error = geterrno4(res.status); 6430 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6431 } 6432 if (abort == FALSE) 6433 goto recov_retry; 6434 return (e.error); 6435 } 6436 6437 if (e.error) { 6438 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6439 return (e.error); 6440 } 6441 6442 if (res.status) { 6443 /* 6444 * If OTW errro is NOTSUPP, then it should be 6445 * translated to EINVAL. 
All Solaris file system 6446 * implementations return EINVAL to the syscall layer 6447 * when the attrdir cannot be created due to an 6448 * implementation restriction or noxattr mount option. 6449 */ 6450 if (res.status == NFS4ERR_NOTSUPP) { 6451 mutex_enter(&drp->r_statelock); 6452 if (drp->r_xattr_dir) 6453 VN_RELE(drp->r_xattr_dir); 6454 VN_HOLD(NFS4_XATTR_DIR_NOTSUPP); 6455 drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP; 6456 mutex_exit(&drp->r_statelock); 6457 6458 e.error = EINVAL; 6459 } else { 6460 e.error = geterrno4(res.status); 6461 } 6462 6463 if (e.error) { 6464 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6465 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 6466 needrecov); 6467 return (e.error); 6468 } 6469 } 6470 6471 resop = &res.array[0]; /* putfh res */ 6472 ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK); 6473 6474 resop = &res.array[1]; /* openattr res */ 6475 ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK); 6476 6477 resop = &res.array[2]; /* getfh res */ 6478 gf_res = &resop->nfs_resop4_u.opgetfh; 6479 if (gf_res->object.nfs_fh4_len == 0) { 6480 *avp = NULL; 6481 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6482 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6483 return (ENOENT); 6484 } 6485 6486 sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp)); 6487 vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res, 6488 dvp->v_vfsp, t, cr, dvp, 6489 fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp)); 6490 sfh4_rele(&sfhp); 6491 6492 if (e.error) 6493 PURGE_ATTRCACHE4(vp); 6494 6495 mutex_enter(&vp->v_lock); 6496 vp->v_flag |= V_XATTRDIR; 6497 mutex_exit(&vp->v_lock); 6498 6499 *avp = vp; 6500 6501 mutex_enter(&drp->r_statelock); 6502 if (drp->r_xattr_dir) 6503 VN_RELE(drp->r_xattr_dir); 6504 VN_HOLD(vp); 6505 drp->r_xattr_dir = vp; 6506 6507 /* 6508 * Invalidate pathconf4 cache because r_xattr_dir is no longer 6509 * NULL. 
xattrs could be created at any time, and we have no 6510 * way to update pc4_xattr_exists in the base object if/when 6511 * it happens. 6512 */ 6513 drp->r_pathconf.pc4_xattr_valid = 0; 6514 6515 mutex_exit(&drp->r_statelock); 6516 6517 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 6518 6519 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 6520 6521 return (0); 6522 } 6523 6524 /* ARGSUSED */ 6525 static int 6526 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 6527 int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct, 6528 vsecattr_t *vsecp) 6529 { 6530 int error; 6531 vnode_t *vp = NULL; 6532 rnode4_t *rp; 6533 struct vattr vattr; 6534 rnode4_t *drp; 6535 vnode_t *tempvp; 6536 enum createmode4 createmode; 6537 bool_t must_trunc = FALSE; 6538 int truncating = 0; 6539 6540 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 6541 return (EPERM); 6542 if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) { 6543 return (EINVAL); 6544 } 6545 6546 /* . and .. have special meaning in the protocol, reject them. */ 6547 6548 if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0'))) 6549 return (EISDIR); 6550 6551 drp = VTOR4(dvp); 6552 6553 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 6554 return (EINTR); 6555 6556 top: 6557 /* 6558 * We make a copy of the attributes because the caller does not 6559 * expect us to change what va points to. 6560 */ 6561 vattr = *va; 6562 6563 /* 6564 * If the pathname is "", then dvp is the root vnode of 6565 * a remote file mounted over a local directory. 6566 * All that needs to be done is access 6567 * checking and truncation. Note that we avoid doing 6568 * open w/ create because the parent directory might 6569 * be in pseudo-fs and the open would fail. 
6570 */ 6571 if (*nm == '\0') { 6572 error = 0; 6573 VN_HOLD(dvp); 6574 vp = dvp; 6575 must_trunc = TRUE; 6576 } else { 6577 /* 6578 * We need to go over the wire, just to be sure whether the 6579 * file exists or not. Using the DNLC can be dangerous in 6580 * this case when making a decision regarding existence. 6581 */ 6582 error = nfs4lookup(dvp, nm, &vp, cr, 1); 6583 } 6584 6585 if (exclusive) 6586 createmode = EXCLUSIVE4; 6587 else 6588 createmode = GUARDED4; 6589 6590 /* 6591 * error would be set if the file does not exist on the 6592 * server, so lets go create it. 6593 */ 6594 if (error) { 6595 goto create_otw; 6596 } 6597 6598 /* 6599 * File does exist on the server 6600 */ 6601 if (exclusive == EXCL) 6602 error = EEXIST; 6603 else if (vp->v_type == VDIR && (mode & VWRITE)) 6604 error = EISDIR; 6605 else { 6606 /* 6607 * If vnode is a device, create special vnode. 6608 */ 6609 if (ISVDEV(vp->v_type)) { 6610 tempvp = vp; 6611 vp = specvp(vp, vp->v_rdev, vp->v_type, cr); 6612 VN_RELE(tempvp); 6613 } 6614 if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) { 6615 if ((vattr.va_mask & AT_SIZE) && 6616 vp->v_type == VREG) { 6617 rp = VTOR4(vp); 6618 /* 6619 * Check here for large file handled 6620 * by LF-unaware process (as 6621 * ufs_create() does) 6622 */ 6623 if (!(flags & FOFFMAX)) { 6624 mutex_enter(&rp->r_statelock); 6625 if (rp->r_size > MAXOFF32_T) 6626 error = EOVERFLOW; 6627 mutex_exit(&rp->r_statelock); 6628 } 6629 6630 /* if error is set then we need to return */ 6631 if (error) { 6632 nfs_rw_exit(&drp->r_rwlock); 6633 VN_RELE(vp); 6634 return (error); 6635 } 6636 6637 if (must_trunc) { 6638 vattr.va_mask = AT_SIZE; 6639 error = nfs4setattr(vp, &vattr, 0, cr, 6640 NULL); 6641 } else { 6642 /* 6643 * we know we have a regular file that already 6644 * exists and we may end up truncating the file 6645 * as a result of the open_otw, so flush out 6646 * any dirty pages for this file first. 
6647 */ 6648 if (nfs4_has_pages(vp) && 6649 ((rp->r_flags & R4DIRTY) || 6650 rp->r_count > 0 || 6651 rp->r_mapcnt > 0)) { 6652 error = nfs4_putpage(vp, 6653 (offset_t)0, 0, 0, cr, ct); 6654 if (error && (error == ENOSPC || 6655 error == EDQUOT)) { 6656 mutex_enter( 6657 &rp->r_statelock); 6658 if (!rp->r_error) 6659 rp->r_error = 6660 error; 6661 mutex_exit( 6662 &rp->r_statelock); 6663 } 6664 } 6665 vattr.va_mask = (AT_SIZE | 6666 AT_TYPE | AT_MODE); 6667 vattr.va_type = VREG; 6668 createmode = UNCHECKED4; 6669 truncating = 1; 6670 goto create_otw; 6671 } 6672 } 6673 } 6674 } 6675 nfs_rw_exit(&drp->r_rwlock); 6676 if (error) { 6677 VN_RELE(vp); 6678 } else { 6679 vnode_t *tvp; 6680 rnode4_t *trp; 6681 tvp = vp; 6682 if (vp->v_type == VREG) { 6683 trp = VTOR4(vp); 6684 if (IS_SHADOW(vp, trp)) 6685 tvp = RTOV4(trp); 6686 } 6687 6688 if (must_trunc) { 6689 /* 6690 * existing file got truncated, notify. 6691 */ 6692 vnevent_create(tvp, ct); 6693 } 6694 6695 *vpp = vp; 6696 } 6697 return (error); 6698 6699 create_otw: 6700 dnlc_remove(dvp, nm); 6701 6702 ASSERT(vattr.va_mask & AT_TYPE); 6703 6704 /* 6705 * If not a regular file let nfs4mknod() handle it. 6706 */ 6707 if (vattr.va_type != VREG) { 6708 error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr); 6709 nfs_rw_exit(&drp->r_rwlock); 6710 return (error); 6711 } 6712 6713 /* 6714 * It _is_ a regular file. 6715 */ 6716 ASSERT(vattr.va_mask & AT_MODE); 6717 if (MANDMODE(vattr.va_mode)) { 6718 nfs_rw_exit(&drp->r_rwlock); 6719 return (EACCES); 6720 } 6721 6722 /* 6723 * If this happens to be a mknod of a regular file, then flags will 6724 * have neither FREAD or FWRITE. However, we must set at least one 6725 * for the call to nfs4open_otw. If it's open(O_CREAT) driving 6726 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been 6727 * set (based on openmode specified by app). 
6728 */ 6729 if ((flags & (FREAD|FWRITE)) == 0) 6730 flags |= (FREAD|FWRITE); 6731 6732 error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0); 6733 6734 if (vp != NULL) { 6735 /* if create was successful, throw away the file's pages */ 6736 if (!error && (vattr.va_mask & AT_SIZE)) 6737 nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK), 6738 cr); 6739 /* release the lookup hold */ 6740 VN_RELE(vp); 6741 vp = NULL; 6742 } 6743 6744 /* 6745 * validate that we opened a regular file. This handles a misbehaving 6746 * server that returns an incorrect FH. 6747 */ 6748 if ((error == 0) && *vpp && (*vpp)->v_type != VREG) { 6749 error = EISDIR; 6750 VN_RELE(*vpp); 6751 } 6752 6753 /* 6754 * If this is not an exclusive create, then the CREATE 6755 * request will be made with the GUARDED mode set. This 6756 * means that the server will return EEXIST if the file 6757 * exists. The file could exist because of a retransmitted 6758 * request. In this case, we recover by starting over and 6759 * checking to see whether the file exists. This second 6760 * time through it should and a CREATE request will not be 6761 * sent. 6762 * 6763 * This handles the problem of a dangling CREATE request 6764 * which contains attributes which indicate that the file 6765 * should be truncated. This retransmitted request could 6766 * possibly truncate valid data in the file if not caught 6767 * by the duplicate request mechanism on the server or if 6768 * not caught by other means. The scenario is: 6769 * 6770 * Client transmits CREATE request with size = 0 6771 * Client times out, retransmits request. 6772 * Response to the first request arrives from the server 6773 * and the client proceeds on. 6774 * Client writes data to the file. 6775 * The server now processes retransmitted CREATE request 6776 * and truncates file. 
6777 * 6778 * The use of the GUARDED CREATE request prevents this from 6779 * happening because the retransmitted CREATE would fail 6780 * with EEXIST and would not truncate the file. 6781 */ 6782 if (error == EEXIST && exclusive == NONEXCL) { 6783 #ifdef DEBUG 6784 nfs4_create_misses++; 6785 #endif 6786 goto top; 6787 } 6788 nfs_rw_exit(&drp->r_rwlock); 6789 if (truncating && !error && *vpp) { 6790 vnode_t *tvp; 6791 rnode4_t *trp; 6792 /* 6793 * existing file got truncated, notify. 6794 */ 6795 tvp = *vpp; 6796 trp = VTOR4(tvp); 6797 if (IS_SHADOW(tvp, trp)) 6798 tvp = RTOV4(trp); 6799 vnevent_create(tvp, ct); 6800 } 6801 return (error); 6802 } 6803 6804 /* 6805 * Create compound (for mkdir, mknod, symlink): 6806 * { Putfh <dfh>; Create; Getfh; Getattr } 6807 * It's okay if setattr failed to set gid - this is not considered 6808 * an error, but purge attrs in that case. 6809 */ 6810 static int 6811 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va, 6812 vnode_t **vpp, cred_t *cr, nfs_ftype4 type) 6813 { 6814 int need_end_op = FALSE; 6815 COMPOUND4args_clnt args; 6816 COMPOUND4res_clnt res, *resp = NULL; 6817 nfs_argop4 *argop; 6818 nfs_resop4 *resop; 6819 int doqueue; 6820 mntinfo4_t *mi; 6821 rnode4_t *drp = VTOR4(dvp); 6822 change_info4 *cinfo; 6823 GETFH4res *gf_res; 6824 struct vattr vattr; 6825 vnode_t *vp; 6826 fattr4 *crattr; 6827 bool_t needrecov = FALSE; 6828 nfs4_recov_state_t recov_state; 6829 nfs4_sharedfh_t *sfhp = NULL; 6830 hrtime_t t; 6831 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 6832 int numops, argoplist_size, setgid_flag, idx_create, idx_fattr; 6833 dirattr_info_t dinfo, *dinfop; 6834 servinfo4_t *svp; 6835 bitmap4 supp_attrs; 6836 6837 ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK || 6838 type == NF4CHR || type == NF4SOCK || type == NF4FIFO); 6839 6840 mi = VTOMI4(dvp); 6841 6842 /* 6843 * Make sure we properly deal with setting the right gid 6844 * on a new directory to reflect the parent's setgid bit 
6845 */ 6846 setgid_flag = 0; 6847 if (type == NF4DIR) { 6848 struct vattr dva; 6849 6850 va->va_mode &= ~VSGID; 6851 dva.va_mask = AT_MODE | AT_GID; 6852 if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) { 6853 6854 /* 6855 * If the parent's directory has the setgid bit set 6856 * _and_ the client was able to get a valid mapping 6857 * for the parent dir's owner_group, we want to 6858 * append NVERIFY(owner_group == dva.va_gid) and 6859 * SETTATTR to the CREATE compound. 6860 */ 6861 if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) { 6862 setgid_flag = 1; 6863 va->va_mode |= VSGID; 6864 if (dva.va_gid != GID_NOBODY) { 6865 va->va_mask |= AT_GID; 6866 va->va_gid = dva.va_gid; 6867 } 6868 } 6869 } 6870 } 6871 6872 /* 6873 * Create ops: 6874 * 0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new) 6875 * 5:restorefh(dir) 6:getattr(dir) 6876 * 6877 * if (setgid) 6878 * 0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new) 6879 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new) 6880 * 8:nverify 9:setattr 6881 */ 6882 if (setgid_flag) { 6883 numops = 10; 6884 idx_create = 1; 6885 idx_fattr = 3; 6886 } else { 6887 numops = 7; 6888 idx_create = 2; 6889 idx_fattr = 4; 6890 } 6891 6892 ASSERT(nfs_zone() == mi->mi_zone); 6893 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) { 6894 return (EINTR); 6895 } 6896 recov_state.rs_flags = 0; 6897 recov_state.rs_num_retry_despite_err = 0; 6898 6899 argoplist_size = numops * sizeof (nfs_argop4); 6900 argop = kmem_alloc(argoplist_size, KM_SLEEP); 6901 6902 recov_retry: 6903 if (type == NF4LNK) 6904 args.ctag = TAG_SYMLINK; 6905 else if (type == NF4DIR) 6906 args.ctag = TAG_MKDIR; 6907 else 6908 args.ctag = TAG_MKNOD; 6909 6910 args.array_len = numops; 6911 args.array = argop; 6912 6913 if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) { 6914 nfs_rw_exit(&drp->r_rwlock); 6915 kmem_free(argop, argoplist_size); 6916 return (e.error); 6917 } 6918 need_end_op = TRUE; 6919 6920 6921 /* 0: putfh directory */ 
6922 argop[0].argop = OP_CPUTFH; 6923 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6924 6925 /* 1/2: Create object */ 6926 argop[idx_create].argop = OP_CCREATE; 6927 argop[idx_create].nfs_argop4_u.opccreate.cname = nm; 6928 argop[idx_create].nfs_argop4_u.opccreate.type = type; 6929 if (type == NF4LNK) { 6930 /* 6931 * symlink, treat name as data 6932 */ 6933 ASSERT(data != NULL); 6934 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata = 6935 (char *)data; 6936 } 6937 if (type == NF4BLK || type == NF4CHR) { 6938 ASSERT(data != NULL); 6939 argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata = 6940 *((specdata4 *)data); 6941 } 6942 6943 crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs; 6944 6945 svp = drp->r_server; 6946 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 6947 supp_attrs = svp->sv_supp_attrs; 6948 nfs_rw_exit(&svp->sv_lock); 6949 6950 if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) { 6951 nfs_rw_exit(&drp->r_rwlock); 6952 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 6953 e.error = EINVAL; 6954 kmem_free(argop, argoplist_size); 6955 return (e.error); 6956 } 6957 6958 /* 2/3: getfh fh of created object */ 6959 ASSERT(idx_create + 1 == idx_fattr - 1); 6960 argop[idx_create + 1].argop = OP_GETFH; 6961 6962 /* 3/4: getattr of new object */ 6963 argop[idx_fattr].argop = OP_GETATTR; 6964 argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6965 argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi; 6966 6967 if (setgid_flag) { 6968 vattr_t _v; 6969 6970 argop[4].argop = OP_SAVEFH; 6971 6972 argop[5].argop = OP_CPUTFH; 6973 argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 6974 6975 argop[6].argop = OP_GETATTR; 6976 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 6977 argop[6].nfs_argop4_u.opgetattr.mi = mi; 6978 6979 argop[7].argop = OP_RESTOREFH; 6980 6981 /* 6982 * nverify 6983 * 6984 * XXX - Revisit the last argument to nfs4_end_op() 6985 * once 5020486 is fixed. 
6986 */ 6987 _v.va_mask = AT_GID; 6988 _v.va_gid = va->va_gid; 6989 if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY, 6990 supp_attrs)) { 6991 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 6992 nfs_rw_exit(&drp->r_rwlock); 6993 nfs4_fattr4_free(crattr); 6994 kmem_free(argop, argoplist_size); 6995 return (e.error); 6996 } 6997 6998 /* 6999 * setattr 7000 * 7001 * We _know_ we're not messing with AT_SIZE or AT_XTIME, 7002 * so no need for stateid or flags. Also we specify NULL 7003 * rp since we're only interested in setting owner_group 7004 * attributes. 7005 */ 7006 nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs, 7007 &e.error, 0); 7008 7009 if (e.error) { 7010 nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE); 7011 nfs_rw_exit(&drp->r_rwlock); 7012 nfs4_fattr4_free(crattr); 7013 nfs4args_verify_free(&argop[8]); 7014 kmem_free(argop, argoplist_size); 7015 return (e.error); 7016 } 7017 } else { 7018 argop[1].argop = OP_SAVEFH; 7019 7020 argop[5].argop = OP_RESTOREFH; 7021 7022 argop[6].argop = OP_GETATTR; 7023 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7024 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7025 } 7026 7027 dnlc_remove(dvp, nm); 7028 7029 doqueue = 1; 7030 t = gethrtime(); 7031 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7032 7033 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7034 if (e.error) { 7035 PURGE_ATTRCACHE4(dvp); 7036 if (!needrecov) 7037 goto out; 7038 } 7039 7040 if (needrecov) { 7041 if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL, 7042 OP_CREATE, NULL, NULL, NULL) == FALSE) { 7043 nfs4_end_op(mi, dvp, NULL, &recov_state, 7044 needrecov); 7045 need_end_op = FALSE; 7046 nfs4_fattr4_free(crattr); 7047 if (setgid_flag) { 7048 nfs4args_verify_free(&argop[8]); 7049 nfs4args_setattr_free(&argop[9]); 7050 } 7051 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 7052 goto recov_retry; 7053 } 7054 } 7055 7056 resp = &res; 7057 7058 if (res.status != NFS4_OK && res.array_len <= idx_fattr 
+ 1) { 7059 7060 if (res.status == NFS4ERR_BADOWNER) 7061 nfs4_log_badowner(mi, OP_CREATE); 7062 7063 e.error = geterrno4(res.status); 7064 7065 /* 7066 * This check is left over from when create was implemented 7067 * using a setattr op (instead of createattrs). If the 7068 * putfh/create/getfh failed, the error was returned. If 7069 * setattr/getattr failed, we keep going. 7070 * 7071 * It might be better to get rid of the GETFH also, and just 7072 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory. 7073 * Then if any of the operations failed, we could return the 7074 * error now, and remove much of the error code below. 7075 */ 7076 if (res.array_len <= idx_fattr) { 7077 /* 7078 * Either Putfh, Create or Getfh failed. 7079 */ 7080 PURGE_ATTRCACHE4(dvp); 7081 /* 7082 * nfs4_purge_stale_fh() may generate otw calls through 7083 * nfs4_invalidate_pages. Hence the need to call 7084 * nfs4_end_op() here to avoid nfs4_start_op() deadlock. 7085 */ 7086 nfs4_end_op(mi, dvp, NULL, &recov_state, 7087 needrecov); 7088 need_end_op = FALSE; 7089 nfs4_purge_stale_fh(e.error, dvp, cr); 7090 goto out; 7091 } 7092 } 7093 7094 resop = &res.array[idx_create]; /* create res */ 7095 cinfo = &resop->nfs_resop4_u.opcreate.cinfo; 7096 7097 resop = &res.array[idx_create + 1]; /* getfh res */ 7098 gf_res = &resop->nfs_resop4_u.opgetfh; 7099 7100 sfhp = sfh4_get(&gf_res->object, mi); 7101 if (e.error) { 7102 *vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp, 7103 fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 7104 if (vp->v_type == VNON) { 7105 vattr.va_mask = AT_TYPE; 7106 /* 7107 * Need to call nfs4_end_op before nfs4getattr to avoid 7108 * potential nfs4_start_op deadlock. See RFE 4777612. 
7109 */ 7110 nfs4_end_op(mi, dvp, NULL, &recov_state, 7111 needrecov); 7112 need_end_op = FALSE; 7113 e.error = nfs4getattr(vp, &vattr, cr); 7114 if (e.error) { 7115 VN_RELE(vp); 7116 *vpp = NULL; 7117 goto out; 7118 } 7119 vp->v_type = vattr.va_type; 7120 } 7121 e.error = 0; 7122 } else { 7123 *vpp = vp = makenfs4node(sfhp, 7124 &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res, 7125 dvp->v_vfsp, t, cr, 7126 dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp)); 7127 } 7128 7129 /* 7130 * If compound succeeded, then update dir attrs 7131 */ 7132 if (res.status == NFS4_OK) { 7133 dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res; 7134 dinfo.di_cred = cr; 7135 dinfo.di_time_call = t; 7136 dinfop = &dinfo; 7137 } else 7138 dinfop = NULL; 7139 7140 /* Update directory cache attribute, readdir and dnlc caches */ 7141 nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop); 7142 7143 out: 7144 if (sfhp != NULL) 7145 sfh4_rele(&sfhp); 7146 nfs_rw_exit(&drp->r_rwlock); 7147 nfs4_fattr4_free(crattr); 7148 if (setgid_flag) { 7149 nfs4args_verify_free(&argop[8]); 7150 nfs4args_setattr_free(&argop[9]); 7151 } 7152 if (resp) 7153 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7154 if (need_end_op) 7155 nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov); 7156 7157 kmem_free(argop, argoplist_size); 7158 return (e.error); 7159 } 7160 7161 /* ARGSUSED */ 7162 static int 7163 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive, 7164 int mode, vnode_t **vpp, cred_t *cr) 7165 { 7166 int error; 7167 vnode_t *vp; 7168 nfs_ftype4 type; 7169 specdata4 spec, *specp = NULL; 7170 7171 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 7172 7173 switch (va->va_type) { 7174 case VCHR: 7175 case VBLK: 7176 type = (va->va_type == VCHR) ? 
NF4CHR : NF4BLK; 7177 spec.specdata1 = getmajor(va->va_rdev); 7178 spec.specdata2 = getminor(va->va_rdev); 7179 specp = &spec; 7180 break; 7181 7182 case VFIFO: 7183 type = NF4FIFO; 7184 break; 7185 case VSOCK: 7186 type = NF4SOCK; 7187 break; 7188 7189 default: 7190 return (EINVAL); 7191 } 7192 7193 error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type); 7194 if (error) { 7195 return (error); 7196 } 7197 7198 /* 7199 * This might not be needed any more; special case to deal 7200 * with problematic v2/v3 servers. Since create was unable 7201 * to set group correctly, not sure what hope setattr has. 7202 */ 7203 if (va->va_gid != VTOR4(vp)->r_attr.va_gid) { 7204 va->va_mask = AT_GID; 7205 (void) nfs4setattr(vp, va, 0, cr, NULL); 7206 } 7207 7208 /* 7209 * If vnode is a device create special vnode 7210 */ 7211 if (ISVDEV(vp->v_type)) { 7212 *vpp = specvp(vp, vp->v_rdev, vp->v_type, cr); 7213 VN_RELE(vp); 7214 } else { 7215 *vpp = vp; 7216 } 7217 return (error); 7218 } 7219 7220 /* 7221 * Remove requires that the current fh be the target directory. 7222 * After the operation, the current fh is unchanged. 7223 * The compound op structure is: 7224 * PUTFH(targetdir), REMOVE 7225 * 7226 * Weirdness: if the vnode to be removed is open 7227 * we rename it instead of removing it and nfs_inactive 7228 * will remove the new name. 
7229 */ 7230 /* ARGSUSED */ 7231 static int 7232 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags) 7233 { 7234 COMPOUND4args_clnt args; 7235 COMPOUND4res_clnt res, *resp = NULL; 7236 REMOVE4res *rm_res; 7237 nfs_argop4 argop[3]; 7238 nfs_resop4 *resop; 7239 vnode_t *vp; 7240 char *tmpname; 7241 int doqueue; 7242 mntinfo4_t *mi; 7243 rnode4_t *rp; 7244 rnode4_t *drp; 7245 int needrecov = 0; 7246 nfs4_recov_state_t recov_state; 7247 int isopen; 7248 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7249 dirattr_info_t dinfo; 7250 7251 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 7252 return (EPERM); 7253 drp = VTOR4(dvp); 7254 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 7255 return (EINTR); 7256 7257 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 7258 if (e.error) { 7259 nfs_rw_exit(&drp->r_rwlock); 7260 return (e.error); 7261 } 7262 7263 if (vp->v_type == VDIR) { 7264 VN_RELE(vp); 7265 nfs_rw_exit(&drp->r_rwlock); 7266 return (EISDIR); 7267 } 7268 7269 /* 7270 * First just remove the entry from the name cache, as it 7271 * is most likely the only entry for this vp. 7272 */ 7273 dnlc_remove(dvp, nm); 7274 7275 rp = VTOR4(vp); 7276 7277 /* 7278 * For regular file types, check to see if the file is open by looking 7279 * at the open streams. 7280 * For all other types, check the reference count on the vnode. Since 7281 * they are not opened OTW they never have an open stream. 7282 * 7283 * If the file is open, rename it to .nfsXXXX. 7284 */ 7285 if (vp->v_type != VREG) { 7286 /* 7287 * If the file has a v_count > 1 then there may be more than one 7288 * entry in the name cache due multiple links or an open file, 7289 * but we don't have the real reference count so flush all 7290 * possible entries. 7291 */ 7292 if (vp->v_count > 1) 7293 dnlc_purge_vp(vp); 7294 7295 /* 7296 * Now we have the real reference count. 
7297 */ 7298 isopen = vp->v_count > 1; 7299 } else { 7300 mutex_enter(&rp->r_os_lock); 7301 isopen = list_head(&rp->r_open_streams) != NULL; 7302 mutex_exit(&rp->r_os_lock); 7303 } 7304 7305 mutex_enter(&rp->r_statelock); 7306 if (isopen && 7307 (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) { 7308 mutex_exit(&rp->r_statelock); 7309 tmpname = newname(); 7310 e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct); 7311 if (e.error) 7312 kmem_free(tmpname, MAXNAMELEN); 7313 else { 7314 mutex_enter(&rp->r_statelock); 7315 if (rp->r_unldvp == NULL) { 7316 VN_HOLD(dvp); 7317 rp->r_unldvp = dvp; 7318 if (rp->r_unlcred != NULL) 7319 crfree(rp->r_unlcred); 7320 crhold(cr); 7321 rp->r_unlcred = cr; 7322 rp->r_unlname = tmpname; 7323 } else { 7324 kmem_free(rp->r_unlname, MAXNAMELEN); 7325 rp->r_unlname = tmpname; 7326 } 7327 mutex_exit(&rp->r_statelock); 7328 } 7329 VN_RELE(vp); 7330 nfs_rw_exit(&drp->r_rwlock); 7331 return (e.error); 7332 } 7333 /* 7334 * Actually remove the file/dir 7335 */ 7336 mutex_exit(&rp->r_statelock); 7337 7338 /* 7339 * We need to flush any dirty pages which happen to 7340 * be hanging around before removing the file. 7341 * This shouldn't happen very often since in NFSv4 7342 * we should be close to open consistent. 
7343 */ 7344 if (nfs4_has_pages(vp) && 7345 ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) { 7346 e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct); 7347 if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) { 7348 mutex_enter(&rp->r_statelock); 7349 if (!rp->r_error) 7350 rp->r_error = e.error; 7351 mutex_exit(&rp->r_statelock); 7352 } 7353 } 7354 7355 mi = VTOMI4(dvp); 7356 7357 (void) nfs4delegreturn(rp, NFS4_DR_REOPEN); 7358 recov_state.rs_flags = 0; 7359 recov_state.rs_num_retry_despite_err = 0; 7360 7361 recov_retry: 7362 /* 7363 * Remove ops: putfh dir; remove 7364 */ 7365 args.ctag = TAG_REMOVE; 7366 args.array_len = 3; 7367 args.array = argop; 7368 7369 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 7370 if (e.error) { 7371 nfs_rw_exit(&drp->r_rwlock); 7372 VN_RELE(vp); 7373 return (e.error); 7374 } 7375 7376 /* putfh directory */ 7377 argop[0].argop = OP_CPUTFH; 7378 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 7379 7380 /* remove */ 7381 argop[1].argop = OP_CREMOVE; 7382 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 7383 7384 /* getattr dir */ 7385 argop[2].argop = OP_GETATTR; 7386 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7387 argop[2].nfs_argop4_u.opgetattr.mi = mi; 7388 7389 doqueue = 1; 7390 dinfo.di_time_call = gethrtime(); 7391 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 7392 7393 PURGE_ATTRCACHE4(vp); 7394 7395 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 7396 if (e.error) 7397 PURGE_ATTRCACHE4(dvp); 7398 7399 if (needrecov) { 7400 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, 7401 NULL, NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) { 7402 if (!e.error) 7403 (void) xdr_free(xdr_COMPOUND4res_clnt, 7404 (caddr_t)&res); 7405 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 7406 needrecov); 7407 goto recov_retry; 7408 } 7409 } 7410 7411 /* 7412 * Matching nfs4_end_op() for start_op() above. 
7413 * There is a path in the code below which calls 7414 * nfs4_purge_stale_fh(), which may generate otw calls through 7415 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op() 7416 * here to avoid nfs4_start_op() deadlock. 7417 */ 7418 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 7419 7420 if (!e.error) { 7421 resp = &res; 7422 7423 if (res.status) { 7424 e.error = geterrno4(res.status); 7425 PURGE_ATTRCACHE4(dvp); 7426 nfs4_purge_stale_fh(e.error, dvp, cr); 7427 } else { 7428 resop = &res.array[1]; /* remove res */ 7429 rm_res = &resop->nfs_resop4_u.opremove; 7430 7431 dinfo.di_garp = 7432 &res.array[2].nfs_resop4_u.opgetattr.ga_res; 7433 dinfo.di_cred = cr; 7434 7435 /* Update directory attr, readdir and dnlc caches */ 7436 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 7437 &dinfo); 7438 } 7439 } 7440 nfs_rw_exit(&drp->r_rwlock); 7441 if (resp) 7442 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7443 7444 if (e.error == 0) { 7445 vnode_t *tvp; 7446 rnode4_t *trp; 7447 trp = VTOR4(vp); 7448 tvp = vp; 7449 if (IS_SHADOW(vp, trp)) 7450 tvp = RTOV4(trp); 7451 vnevent_remove(tvp, dvp, nm, ct); 7452 } 7453 VN_RELE(vp); 7454 return (e.error); 7455 } 7456 7457 /* 7458 * Link requires that the current fh be the target directory and the 7459 * saved fh be the source fh. After the operation, the current fh is unchanged. 
7460 * Thus the compound op structure is: 7461 * PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, RESTOREFH, 7462 * GETATTR(file) 7463 */ 7464 /* ARGSUSED */ 7465 static int 7466 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr, 7467 caller_context_t *ct, int flags) 7468 { 7469 COMPOUND4args_clnt args; 7470 COMPOUND4res_clnt res, *resp = NULL; 7471 LINK4res *ln_res; 7472 int argoplist_size = 7 * sizeof (nfs_argop4); 7473 nfs_argop4 *argop; 7474 nfs_resop4 *resop; 7475 vnode_t *realvp, *nvp; 7476 int doqueue; 7477 mntinfo4_t *mi; 7478 rnode4_t *tdrp; 7479 bool_t needrecov = FALSE; 7480 nfs4_recov_state_t recov_state; 7481 hrtime_t t; 7482 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 7483 dirattr_info_t dinfo; 7484 7485 ASSERT(*tnm != '\0'); 7486 ASSERT(tdvp->v_type == VDIR); 7487 ASSERT(nfs4_consistent_type(tdvp)); 7488 ASSERT(nfs4_consistent_type(svp)); 7489 7490 if (nfs_zone() != VTOMI4(tdvp)->mi_zone) 7491 return (EPERM); 7492 if (VOP_REALVP(svp, &realvp, ct) == 0) { 7493 svp = realvp; 7494 ASSERT(nfs4_consistent_type(svp)); 7495 } 7496 7497 tdrp = VTOR4(tdvp); 7498 mi = VTOMI4(svp); 7499 7500 if (!(mi->mi_flags & MI4_LINK)) { 7501 return (EOPNOTSUPP); 7502 } 7503 recov_state.rs_flags = 0; 7504 recov_state.rs_num_retry_despite_err = 0; 7505 7506 if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp))) 7507 return (EINTR); 7508 7509 recov_retry: 7510 argop = kmem_alloc(argoplist_size, KM_SLEEP); 7511 7512 args.ctag = TAG_LINK; 7513 7514 /* 7515 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir); 7516 * restorefh; getattr(fl) 7517 */ 7518 args.array_len = 7; 7519 args.array = argop; 7520 7521 e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state); 7522 if (e.error) { 7523 kmem_free(argop, argoplist_size); 7524 nfs_rw_exit(&tdrp->r_rwlock); 7525 return (e.error); 7526 } 7527 7528 /* 0. putfh file */ 7529 argop[0].argop = OP_CPUTFH; 7530 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh; 7531 7532 /* 1. 
save current fh to free up the space for the dir */ 7533 argop[1].argop = OP_SAVEFH; 7534 7535 /* 2. putfh targetdir */ 7536 argop[2].argop = OP_CPUTFH; 7537 argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh; 7538 7539 /* 3. link: current_fh is targetdir, saved_fh is source */ 7540 argop[3].argop = OP_CLINK; 7541 argop[3].nfs_argop4_u.opclink.cnewname = tnm; 7542 7543 /* 4. Get attributes of dir */ 7544 argop[4].argop = OP_GETATTR; 7545 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7546 argop[4].nfs_argop4_u.opgetattr.mi = mi; 7547 7548 /* 5. If link was successful, restore current vp to file */ 7549 argop[5].argop = OP_RESTOREFH; 7550 7551 /* 6. Get attributes of linked object */ 7552 argop[6].argop = OP_GETATTR; 7553 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 7554 argop[6].nfs_argop4_u.opgetattr.mi = mi; 7555 7556 dnlc_remove(tdvp, tnm); 7557 7558 doqueue = 1; 7559 t = gethrtime(); 7560 7561 rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e); 7562 7563 needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp); 7564 if (e.error != 0 && !needrecov) { 7565 PURGE_ATTRCACHE4(tdvp); 7566 PURGE_ATTRCACHE4(svp); 7567 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7568 goto out; 7569 } 7570 7571 if (needrecov) { 7572 bool_t abort; 7573 7574 abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp, 7575 NULL, NULL, OP_LINK, NULL, NULL, NULL); 7576 if (abort == FALSE) { 7577 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, 7578 needrecov); 7579 kmem_free(argop, argoplist_size); 7580 if (!e.error) 7581 (void) xdr_free(xdr_COMPOUND4res_clnt, 7582 (caddr_t)&res); 7583 goto recov_retry; 7584 } else { 7585 if (e.error != 0) { 7586 PURGE_ATTRCACHE4(tdvp); 7587 PURGE_ATTRCACHE4(svp); 7588 nfs4_end_op(VTOMI4(svp), svp, tdvp, 7589 &recov_state, needrecov); 7590 goto out; 7591 } 7592 /* fall through for res.status case */ 7593 } 7594 } 7595 7596 nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov); 7597 7598 resp = &res; 
7599 if (res.status) { 7600 /* If link succeeded, then don't return error */ 7601 e.error = geterrno4(res.status); 7602 if (res.array_len <= 4) { 7603 /* 7604 * Either Putfh, Savefh, Putfh dir, or Link failed 7605 */ 7606 PURGE_ATTRCACHE4(svp); 7607 PURGE_ATTRCACHE4(tdvp); 7608 if (e.error == EOPNOTSUPP) { 7609 mutex_enter(&mi->mi_lock); 7610 mi->mi_flags &= ~MI4_LINK; 7611 mutex_exit(&mi->mi_lock); 7612 } 7613 /* Remap EISDIR to EPERM for non-root user for SVVS */ 7614 /* XXX-LP */ 7615 if (e.error == EISDIR && crgetuid(cr) != 0) 7616 e.error = EPERM; 7617 goto out; 7618 } 7619 } 7620 7621 /* either no error or one of the postop getattr failed */ 7622 7623 /* 7624 * XXX - if LINK succeeded, but no attrs were returned for link 7625 * file, purge its cache. 7626 * 7627 * XXX Perform a simplified version of wcc checking. Instead of 7628 * have another getattr to get pre-op, just purge cache if 7629 * any of the ops prior to and including the getattr failed. 7630 * If the getattr succeeded then update the attrcache accordingly. 7631 */ 7632 7633 /* 7634 * update cache with link file postattrs. 7635 * Note: at this point resop points to link res. 7636 */ 7637 resop = &res.array[3]; /* link res */ 7638 ln_res = &resop->nfs_resop4_u.oplink; 7639 if (res.status == NFS4_OK) 7640 e.error = nfs4_update_attrcache(res.status, 7641 &res.array[6].nfs_resop4_u.opgetattr.ga_res, 7642 t, svp, cr); 7643 7644 /* 7645 * Call makenfs4node to create the new shadow vp for tnm. 7646 * We pass NULL attrs because we just cached attrs for 7647 * the src object. All we're trying to accomplish is to 7648 * to create the new shadow vnode. 
7649 */ 7650 nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr, 7651 tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm, VTOR4(svp)->r_fh)); 7652 7653 /* Update target cache attribute, readdir and dnlc caches */ 7654 dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res; 7655 dinfo.di_time_call = t; 7656 dinfo.di_cred = cr; 7657 7658 nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo); 7659 ASSERT(nfs4_consistent_type(tdvp)); 7660 ASSERT(nfs4_consistent_type(svp)); 7661 ASSERT(nfs4_consistent_type(nvp)); 7662 VN_RELE(nvp); 7663 7664 if (!e.error) { 7665 vnode_t *tvp; 7666 rnode4_t *trp; 7667 /* 7668 * Notify the source file of this link operation. 7669 */ 7670 trp = VTOR4(svp); 7671 tvp = svp; 7672 if (IS_SHADOW(svp, trp)) 7673 tvp = RTOV4(trp); 7674 vnevent_link(tvp, ct); 7675 } 7676 out: 7677 kmem_free(argop, argoplist_size); 7678 if (resp) 7679 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 7680 7681 nfs_rw_exit(&tdrp->r_rwlock); 7682 7683 return (e.error); 7684 } 7685 7686 /* ARGSUSED */ 7687 static int 7688 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr, 7689 caller_context_t *ct, int flags) 7690 { 7691 vnode_t *realvp; 7692 7693 if (nfs_zone() != VTOMI4(odvp)->mi_zone) 7694 return (EPERM); 7695 if (VOP_REALVP(ndvp, &realvp, ct) == 0) 7696 ndvp = realvp; 7697 7698 return (nfs4rename(odvp, onm, ndvp, nnm, cr, ct)); 7699 } 7700 7701 /* 7702 * nfs4rename does the real work of renaming in NFS Version 4. 7703 * 7704 * A file handle is considered volatile for renaming purposes if either 7705 * of the volatile bits are turned on. However, the compound may differ 7706 * based on the likelihood of the filehandle to change during rename. 
*/
static int
nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	mntinfo4_t *mi;
	vnode_t *nvp = NULL;		/* existing target, if any */
	vnode_t *ovp = NULL;		/* object being renamed */
	char *tmpname = NULL;		/* temporary name for an active target */
	rnode4_t *rp;
	rnode4_t *odrp;
	rnode4_t *ndrp;
	int did_link = 0;
	int do_link = 1;
	nfsstat4 stat = NFS4_OK;

	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
	ASSERT(nfs4_consistent_type(odvp));
	ASSERT(nfs4_consistent_type(ndvp));

	/* Neither the source nor the target name may be "." or "..". */
	if (onm[0] == '.' && (onm[1] == '\0' ||
	    (onm[1] == '.' && onm[2] == '\0')))
		return (EINVAL);

	if (nnm[0] == '.' && (nnm[1] == '\0' ||
	    (nnm[1] == '.' && nnm[2] == '\0')))
		return (EINVAL);

	/*
	 * Take both directory rwlocks as writer, the rnode with the lower
	 * address first.
	 * NOTE(review): presumably the fixed order is there to avoid
	 * deadlock between concurrent renames -- confirm.
	 */
	odrp = VTOR4(odvp);
	ndrp = VTOR4(ndvp);
	if ((intptr_t)odrp < (intptr_t)ndrp) {
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) {
			nfs_rw_exit(&odrp->r_rwlock);
			return (EINTR);
		}
	} else {
		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp)))
			return (EINTR);
		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) {
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EINTR);
		}
	}

	/*
	 * Lookup the target file. If it exists, it needs to be
	 * checked to see whether it is a mount point and whether
	 * it is active (open).
	 */
	error = nfs4lookup(ndvp, nnm, &nvp, cr, 0);
	if (!error) {
		int isactive;

		ASSERT(nfs4_consistent_type(nvp));
		/*
		 * If this file has been mounted on, then just
		 * return busy because renaming to it would remove
		 * the mounted file system from the name space.
		 */
		if (vn_ismntpt(nvp)) {
			VN_RELE(nvp);
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			return (EBUSY);
		}

		/*
		 * First just remove the entry from the name cache, as it
		 * is most likely the only entry for this vp.
		 */
		dnlc_remove(ndvp, nnm);

		rp = VTOR4(nvp);

		if (nvp->v_type != VREG) {
			/*
			 * Purge the name cache of all references to this vnode
			 * so that we can check the reference count to infer
			 * whether it is active or not.
			 */
			if (nvp->v_count > 1)
				dnlc_purge_vp(nvp);

			isactive = nvp->v_count > 1;
		} else {
			/*
			 * A regular file counts as active when its rnode has
			 * at least one entry on the open-streams list.
			 */
			mutex_enter(&rp->r_os_lock);
			isactive = list_head(&rp->r_open_streams) != NULL;
			mutex_exit(&rp->r_os_lock);
		}

		/*
		 * If the vnode is active and is not a directory,
		 * arrange to rename it to a
		 * temporary file so that it will continue to be
		 * accessible. This implements the "unlink-open-file"
		 * semantics for the target of a rename operation.
		 * Before doing this though, make sure that the
		 * source and target files are not already the same.
		 */
		if (isactive && nvp->v_type != VDIR) {
			/*
			 * Lookup the source name.
			 */
			error = nfs4lookup(odvp, onm, &ovp, cr, 0);

			/*
			 * The source name *should* already exist.
			 */
			if (error) {
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}

			ASSERT(nfs4_consistent_type(ovp));

			/*
			 * Compare the two vnodes. If they are the same,
			 * just release all held vnodes and return success.
			 */
			if (VN_CMP(ovp, nvp)) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (0);
			}

			/*
			 * Can't mix and match directories and non-
			 * directories in rename operations. We already
			 * know that the target is not a directory. If
			 * the source is a directory, return an error.
			 */
			if (ovp->v_type == VDIR) {
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (ENOTDIR);
			}
link_call:
			/*
			 * The target file exists, is not the same as
			 * the source file, and is active. We first
			 * try to Link it to a temporary filename to
			 * avoid having the server removing the file
			 * completely (which could cause data loss to
			 * the user's POV in the event the Rename fails
			 * -- see bug 1165874).
			 */
			/*
			 * The do_link and did_link booleans are
			 * introduced in the event we get NFS4ERR_FILE_OPEN
			 * returned for the Rename. Some servers can
			 * not Rename over an Open file, so they return
			 * this error. The client needs to Remove the
			 * newly created Link and do two Renames, just
			 * as if the server didn't support LINK.
			 *
			 * This label is also re-entered from further down
			 * (with do_link cleared) when that error is seen.
			 */
			tmpname = newname();
			error = 0;

			if (do_link) {
				error = nfs4_link(ndvp, nvp, tmpname, cr,
				    NULL, 0);
			}
			/*
			 * Fall back to renaming the target to the temporary
			 * name when LINK is unsupported or was ruled out.
			 */
			if (error == EOPNOTSUPP || !do_link) {
				error = nfs4_rename(ndvp, nnm, ndvp, tmpname,
				    cr, NULL, 0);
				did_link = 0;
			} else {
				did_link = 1;
			}
			if (error) {
				kmem_free(tmpname, MAXNAMELEN);
				VN_RELE(ovp);
				VN_RELE(nvp);
				nfs_rw_exit(&odrp->r_rwlock);
				nfs_rw_exit(&ndrp->r_rwlock);
				return (error);
			}

			/*
			 * Record the temporary name (and, if not already
			 * recorded, its directory and credentials) on the
			 * target's rnode. Any previously recorded name is
			 * freed first.
			 * NOTE(review): presumably these r_unl* fields are
			 * consumed later to remove the temporary file; the
			 * consumer is not visible here.
			 */
			mutex_enter(&rp->r_statelock);
			if (rp->r_unldvp == NULL) {
				VN_HOLD(ndvp);
				rp->r_unldvp = ndvp;
				if (rp->r_unlcred != NULL)
					crfree(rp->r_unlcred);
				crhold(cr);
				rp->r_unlcred = cr;
				rp->r_unlname = tmpname;
			} else {
				if (rp->r_unlname)
					kmem_free(rp->r_unlname, MAXNAMELEN);
				rp->r_unlname = tmpname;
			}
			mutex_exit(&rp->r_statelock);
		}

		(void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN);

		ASSERT(nfs4_consistent_type(nvp));
	}

	if (ovp == NULL) {
		/*
		 * When renaming directories to be a subdirectory of a
		 * different parent, the dnlc entry for ".." will no
		 * longer be valid, so it must be removed.
		 *
		 * We do a lookup here to determine whether we are renaming
		 * a directory and we need to check if we are renaming
		 * an unlinked file. This might have already been done
		 * in previous code, so we check ovp == NULL to avoid
		 * doing it twice.
		 */
		error = nfs4lookup(odvp, onm, &ovp, cr, 0);
		/*
		 * The source name *should* already exist.
		 */
		if (error) {
			nfs_rw_exit(&odrp->r_rwlock);
			nfs_rw_exit(&ndrp->r_rwlock);
			if (nvp) {
				VN_RELE(nvp);
			}
			return (error);
		}
		ASSERT(ovp != NULL);
		ASSERT(nfs4_consistent_type(ovp));
	}

	/*
	 * Is the object being renamed a dir, and if so, is
	 * it being renamed to a child of itself? The underlying
	 * fs should ultimately return EINVAL for this case;
	 * however, buggy beta non-Solaris NFSv4 servers at
	 * interop testing events have allowed this behavior,
	 * and it caused our client to panic due to a recursive
	 * mutex_enter in fn_move.
	 *
	 * The tedious locking in fn_move could be changed to
	 * deal with this case, and the client could avoid the
	 * panic; however, the client would just confuse itself
	 * later and misbehave. A better way to handle the broken
	 * server is to detect this condition and return EINVAL
	 * without ever sending the bogus rename to the server.
	 * We know the rename is invalid -- just fail it now.
	 */
	if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) {
		VN_RELE(ovp);
		nfs_rw_exit(&odrp->r_rwlock);
		nfs_rw_exit(&ndrp->r_rwlock);
		if (nvp) {
			VN_RELE(nvp);
		}
		return (EINVAL);
	}

	(void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN);

	/*
	 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is
	 * possible for the filehandle to change due to the rename.
	 * If neither of these bits is set, but FH4_VOL_MIGRATION is set,
	 * the fh will not change because of the rename, but we still need
	 * to update its rnode entry with the new name for
	 * an eventual fh change due to migration. The FH4_NOEXPIRE_ON_OPEN
	 * has no effect on these for now, but for future improvements,
	 * we might want to use it too to simplify handling of files
	 * that are open with that flag on. (XXX)
	 */
	mi = VTOMI4(odvp);
	if (NFS4_VOLATILE_FH(mi))
		error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr,
		    &stat);
	else
		error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr,
		    &stat);

	ASSERT(nfs4_consistent_type(odvp));
	ASSERT(nfs4_consistent_type(ndvp));
	ASSERT(nfs4_consistent_type(ovp));

	/*
	 * The server refused to rename over an open file and we had used
	 * LINK for the temporary name: undo the link and retry from
	 * link_call using a rename to the temporary name instead.
	 */
	if (stat == NFS4ERR_FILE_OPEN && did_link) {
		do_link = 0;
		/*
		 * Before the 'link_call' code, we did a nfs4_lookup
		 * that puts a VN_HOLD on nvp. After the nfs4_link
		 * call we call VN_RELE to match that hold. We need
		 * to place an additional VN_HOLD here since we will
		 * be hitting that VN_RELE again.
		 */
		VN_HOLD(nvp);

		(void) nfs4_remove(ndvp, tmpname, cr, NULL, 0);

		/* Undo the unlinked file naming stuff we just did */
		mutex_enter(&rp->r_statelock);
		if (rp->r_unldvp) {
			VN_RELE(ndvp);
			rp->r_unldvp = NULL;
			if (rp->r_unlcred != NULL)
				crfree(rp->r_unlcred);
			rp->r_unlcred = NULL;
			/* rp->r_unlname points to tmpname */
			if (rp->r_unlname)
				kmem_free(rp->r_unlname, MAXNAMELEN);
			rp->r_unlname = NULL;
		}
		mutex_exit(&rp->r_statelock);

		if (nvp) {
			VN_RELE(nvp);
		}
		goto link_call;
	}

	if (error) {
		VN_RELE(ovp);
		nfs_rw_exit(&odrp->r_rwlock);
		nfs_rw_exit(&ndrp->r_rwlock);
		if (nvp) {
			VN_RELE(nvp);
		}
		return (error);
	}

	/*
	 * when renaming directories to be a subdirectory of a
	 * different parent, the dnlc entry for ".." will no
	 * longer be valid, so it must be removed
	 */
	rp = VTOR4(ovp);	/* from here on rp is the renamed object */
	if (ndvp != odvp) {
		if (ovp->v_type == VDIR) {
			dnlc_remove(ovp, "..");
			if (rp->r_dir != NULL)
				nfs4_purge_rddir_cache(ovp);
		}
	}

	/*
	 * If we are renaming the unlinked file, update the
	 * r_unldvp and r_unlname as needed.
	 */
	mutex_enter(&rp->r_statelock);
	if (rp->r_unldvp != NULL) {
		if (strcmp(rp->r_unlname, onm) == 0) {
			/* strncpy may not terminate; terminate explicitly. */
			(void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
			rp->r_unlname[MAXNAMELEN - 1] = '\0';
			if (ndvp != rp->r_unldvp) {
				VN_RELE(rp->r_unldvp);
				rp->r_unldvp = ndvp;
				VN_HOLD(ndvp);
			}
		}
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * Notify the rename vnevents to source vnode, and to the target
	 * vnode if it already existed.
	 */
	if (error == 0) {
		vnode_t *tvp;
		rnode4_t *trp;
		/*
		 * Notify the vnode. Each link is represented by
		 * a different vnode, in nfsv4.
		 */
		if (nvp) {
			trp = VTOR4(nvp);
			tvp = nvp;
			if (IS_SHADOW(nvp, trp))
				tvp = RTOV4(trp);
			vnevent_rename_dest(tvp, ndvp, nnm, ct);
		}

		/*
		 * if the source and destination directory are not the
		 * same notify the destination directory.
		 */
		if (VTOR4(odvp) != VTOR4(ndvp)) {
			trp = VTOR4(ndvp);
			tvp = ndvp;
			if (IS_SHADOW(ndvp, trp))
				tvp = RTOV4(trp);
			vnevent_rename_dest_dir(tvp, ct);
		}

		trp = VTOR4(ovp);
		tvp = ovp;
		if (IS_SHADOW(ovp, trp))
			tvp = RTOV4(trp);
		vnevent_rename_src(tvp, odvp, onm, ct);
	}

	if (nvp) {
		VN_RELE(nvp);
	}
	VN_RELE(ovp);

	nfs_rw_exit(&odrp->r_rwlock);
	nfs_rw_exit(&ndrp->r_rwlock);

	return (error);
}

/*
 * When the parent directory has changed, sv_dfh must be updated.
 *
 * Takes a hold on the shared filehandle of the new directory (the r_fh of
 * ndvp's rnode), stores it as the sv_dfh of vp's svnode, and then releases
 * the filehandle that sv_dfh previously referred to.
 */
static void
update_parentdir_sfh(vnode_t *vp, vnode_t *ndvp)
{
	svnode_t *sv = VTOSV(vp);
	nfs4_sharedfh_t *old_dfh = sv->sv_dfh;
	nfs4_sharedfh_t *new_dfh = VTOR4(ndvp)->r_fh;

	/* Hold the new handle before dropping the old one. */
	sfh4_hold(new_dfh);
	sv->sv_dfh = new_dfh;
	sfh4_rele(&old_dfh);
}

/*
 * nfs4rename_persistent does the otw portion of renaming in NFS Version 4,
 * when it is known that the filehandle is persistent
through rename. 8139 * 8140 * Rename requires that the current fh be the target directory and the 8141 * saved fh be the source directory. After the operation, the current fh 8142 * is unchanged. 8143 * The compound op structure for persistent fh rename is: 8144 * PUTFH(sourcdir), SAVEFH, PUTFH(targetdir), RENAME 8145 * Rather than bother with the directory postop args, we'll simply 8146 * update that a change occurred in the cache, so no post-op getattrs. 8147 */ 8148 static int 8149 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp, 8150 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 8151 { 8152 COMPOUND4args_clnt args; 8153 COMPOUND4res_clnt res, *resp = NULL; 8154 nfs_argop4 *argop; 8155 nfs_resop4 *resop; 8156 int doqueue, argoplist_size; 8157 mntinfo4_t *mi; 8158 rnode4_t *odrp = VTOR4(odvp); 8159 rnode4_t *ndrp = VTOR4(ndvp); 8160 RENAME4res *rn_res; 8161 bool_t needrecov; 8162 nfs4_recov_state_t recov_state; 8163 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8164 dirattr_info_t dinfo, *dinfop; 8165 8166 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 8167 8168 recov_state.rs_flags = 0; 8169 recov_state.rs_num_retry_despite_err = 0; 8170 8171 /* 8172 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir 8173 * 8174 * If source/target are different dirs, then append putfh(src); getattr 8175 */ 8176 args.array_len = (odvp == ndvp) ? 
5 : 7; 8177 argoplist_size = args.array_len * sizeof (nfs_argop4); 8178 args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP); 8179 8180 recov_retry: 8181 *statp = NFS4_OK; 8182 8183 /* No need to Lookup the file, persistent fh */ 8184 args.ctag = TAG_RENAME; 8185 8186 mi = VTOMI4(odvp); 8187 e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state); 8188 if (e.error) { 8189 kmem_free(argop, argoplist_size); 8190 return (e.error); 8191 } 8192 8193 /* 0: putfh source directory */ 8194 argop[0].argop = OP_CPUTFH; 8195 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8196 8197 /* 1: Save source fh to free up current for target */ 8198 argop[1].argop = OP_SAVEFH; 8199 8200 /* 2: putfh targetdir */ 8201 argop[2].argop = OP_CPUTFH; 8202 argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8203 8204 /* 3: current_fh is targetdir, saved_fh is sourcedir */ 8205 argop[3].argop = OP_CRENAME; 8206 argop[3].nfs_argop4_u.opcrename.coldname = onm; 8207 argop[3].nfs_argop4_u.opcrename.cnewname = nnm; 8208 8209 /* 4: getattr (targetdir) */ 8210 argop[4].argop = OP_GETATTR; 8211 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8212 argop[4].nfs_argop4_u.opgetattr.mi = mi; 8213 8214 if (ndvp != odvp) { 8215 8216 /* 5: putfh (sourcedir) */ 8217 argop[5].argop = OP_CPUTFH; 8218 argop[5].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8219 8220 /* 6: getattr (sourcedir) */ 8221 argop[6].argop = OP_GETATTR; 8222 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8223 argop[6].nfs_argop4_u.opgetattr.mi = mi; 8224 } 8225 8226 dnlc_remove(odvp, onm); 8227 dnlc_remove(ndvp, nnm); 8228 8229 doqueue = 1; 8230 dinfo.di_time_call = gethrtime(); 8231 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8232 8233 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8234 if (e.error) { 8235 PURGE_ATTRCACHE4(odvp); 8236 PURGE_ATTRCACHE4(ndvp); 8237 } else { 8238 *statp = res.status; 8239 } 8240 8241 if (needrecov) { 8242 if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 8243 
OP_RENAME, NULL, NULL, NULL) == FALSE) { 8244 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 8245 if (!e.error) 8246 (void) xdr_free(xdr_COMPOUND4res_clnt, 8247 (caddr_t)&res); 8248 goto recov_retry; 8249 } 8250 } 8251 8252 if (!e.error) { 8253 resp = &res; 8254 /* 8255 * as long as OP_RENAME 8256 */ 8257 if (res.status != NFS4_OK && res.array_len <= 4) { 8258 e.error = geterrno4(res.status); 8259 PURGE_ATTRCACHE4(odvp); 8260 PURGE_ATTRCACHE4(ndvp); 8261 /* 8262 * System V defines rename to return EEXIST, not 8263 * ENOTEMPTY if the target directory is not empty. 8264 * Over the wire, the error is NFSERR_ENOTEMPTY 8265 * which geterrno4 maps to ENOTEMPTY. 8266 */ 8267 if (e.error == ENOTEMPTY) 8268 e.error = EEXIST; 8269 } else { 8270 8271 resop = &res.array[3]; /* rename res */ 8272 rn_res = &resop->nfs_resop4_u.oprename; 8273 8274 if (res.status == NFS4_OK) { 8275 /* 8276 * Update target attribute, readdir and dnlc 8277 * caches. 8278 */ 8279 dinfo.di_garp = 8280 &res.array[4].nfs_resop4_u.opgetattr.ga_res; 8281 dinfo.di_cred = cr; 8282 dinfop = &dinfo; 8283 } else 8284 dinfop = NULL; 8285 8286 nfs4_update_dircaches(&rn_res->target_cinfo, 8287 ndvp, NULL, NULL, dinfop); 8288 8289 /* 8290 * Update source attribute, readdir and dnlc caches 8291 * 8292 */ 8293 if (ndvp != odvp) { 8294 update_parentdir_sfh(renvp, ndvp); 8295 8296 if (dinfop) 8297 dinfo.di_garp = 8298 &(res.array[6].nfs_resop4_u. 
8299 opgetattr.ga_res); 8300 8301 nfs4_update_dircaches(&rn_res->source_cinfo, 8302 odvp, NULL, NULL, dinfop); 8303 } 8304 8305 fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name, 8306 nnm); 8307 } 8308 } 8309 8310 if (resp) 8311 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8312 nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov); 8313 kmem_free(argop, argoplist_size); 8314 8315 return (e.error); 8316 } 8317 8318 /* 8319 * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when 8320 * it is possible for the filehandle to change due to the rename. 8321 * 8322 * The compound req in this case includes a post-rename lookup and getattr 8323 * to ensure that we have the correct fh and attributes for the object. 8324 * 8325 * Rename requires that the current fh be the target directory and the 8326 * saved fh be the source directory. After the operation, the current fh 8327 * is unchanged. 8328 * 8329 * We need the new filehandle (hence a LOOKUP and GETFH) so that we can 8330 * update the filehandle for the renamed object. We also get the old 8331 * filehandle for historical reasons; this should be taken out sometime. 8332 * This results in a rather cumbersome compound... 
8333 * 8334 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old), 8335 * PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR 8336 * 8337 */ 8338 static int 8339 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp, 8340 vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp) 8341 { 8342 COMPOUND4args_clnt args; 8343 COMPOUND4res_clnt res, *resp = NULL; 8344 int argoplist_size; 8345 nfs_argop4 *argop; 8346 nfs_resop4 *resop; 8347 int doqueue; 8348 mntinfo4_t *mi; 8349 rnode4_t *odrp = VTOR4(odvp); /* old directory */ 8350 rnode4_t *ndrp = VTOR4(ndvp); /* new directory */ 8351 rnode4_t *orp = VTOR4(ovp); /* object being renamed */ 8352 RENAME4res *rn_res; 8353 GETFH4res *ngf_res; 8354 bool_t needrecov; 8355 nfs4_recov_state_t recov_state; 8356 hrtime_t t; 8357 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8358 dirattr_info_t dinfo, *dinfop = &dinfo; 8359 8360 ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone); 8361 8362 recov_state.rs_flags = 0; 8363 recov_state.rs_num_retry_despite_err = 0; 8364 8365 recov_retry: 8366 *statp = NFS4_OK; 8367 8368 /* 8369 * There is a window between the RPC and updating the path and 8370 * filehandle stored in the rnode. Lock out the FHEXPIRED recovery 8371 * code, so that it doesn't try to use the old path during that 8372 * window. 8373 */ 8374 mutex_enter(&orp->r_statelock); 8375 while (orp->r_flags & R4RECEXPFH) { 8376 klwp_t *lwp = ttolwp(curthread); 8377 8378 if (lwp != NULL) 8379 lwp->lwp_nostop++; 8380 if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) { 8381 mutex_exit(&orp->r_statelock); 8382 if (lwp != NULL) 8383 lwp->lwp_nostop--; 8384 return (EINTR); 8385 } 8386 if (lwp != NULL) 8387 lwp->lwp_nostop--; 8388 } 8389 orp->r_flags |= R4RECEXPFH; 8390 mutex_exit(&orp->r_statelock); 8391 8392 mi = VTOMI4(odvp); 8393 8394 args.ctag = TAG_RENAME_VFH; 8395 args.array_len = (odvp == ndvp) ? 
10 : 12; 8396 argoplist_size = args.array_len * sizeof (nfs_argop4); 8397 argop = kmem_alloc(argoplist_size, KM_SLEEP); 8398 8399 /* 8400 * Rename ops: 8401 * PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old), 8402 * PUTFH(targetdir), RENAME, GETATTR(targetdir) 8403 * LOOKUP(trgt), GETFH(new), GETATTR, 8404 * 8405 * if (odvp != ndvp) 8406 * add putfh(sourcedir), getattr(sourcedir) } 8407 */ 8408 args.array = argop; 8409 8410 e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8411 &recov_state, NULL); 8412 if (e.error) { 8413 kmem_free(argop, argoplist_size); 8414 mutex_enter(&orp->r_statelock); 8415 orp->r_flags &= ~R4RECEXPFH; 8416 cv_broadcast(&orp->r_cv); 8417 mutex_exit(&orp->r_statelock); 8418 return (e.error); 8419 } 8420 8421 /* 0: putfh source directory */ 8422 argop[0].argop = OP_CPUTFH; 8423 argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh; 8424 8425 /* 1: Save source fh to free up current for target */ 8426 argop[1].argop = OP_SAVEFH; 8427 8428 /* 2: Lookup pre-rename fh of renamed object */ 8429 argop[2].argop = OP_CLOOKUP; 8430 argop[2].nfs_argop4_u.opclookup.cname = onm; 8431 8432 /* 3: getfh fh of renamed object (before rename) */ 8433 argop[3].argop = OP_GETFH; 8434 8435 /* 4: putfh targetdir */ 8436 argop[4].argop = OP_CPUTFH; 8437 argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8438 8439 /* 5: current_fh is targetdir, saved_fh is sourcedir */ 8440 argop[5].argop = OP_CRENAME; 8441 argop[5].nfs_argop4_u.opcrename.coldname = onm; 8442 argop[5].nfs_argop4_u.opcrename.cnewname = nnm; 8443 8444 /* 6: getattr of target dir (post op attrs) */ 8445 argop[6].argop = OP_GETATTR; 8446 argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8447 argop[6].nfs_argop4_u.opgetattr.mi = mi; 8448 8449 /* 7: Lookup post-rename fh of renamed object */ 8450 argop[7].argop = OP_CLOOKUP; 8451 argop[7].nfs_argop4_u.opclookup.cname = nnm; 8452 8453 /* 8: getfh fh of renamed object (after rename) */ 8454 argop[8].argop = OP_GETFH; 8455 8456 /* 9: getattr of 
renamed object */ 8457 argop[9].argop = OP_GETATTR; 8458 argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8459 argop[9].nfs_argop4_u.opgetattr.mi = mi; 8460 8461 /* 8462 * If source/target dirs are different, then get new post-op 8463 * attrs for source dir also. 8464 */ 8465 if (ndvp != odvp) { 8466 /* 10: putfh (sourcedir) */ 8467 argop[10].argop = OP_CPUTFH; 8468 argop[10].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh; 8469 8470 /* 11: getattr (sourcedir) */ 8471 argop[11].argop = OP_GETATTR; 8472 argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8473 argop[11].nfs_argop4_u.opgetattr.mi = mi; 8474 } 8475 8476 dnlc_remove(odvp, onm); 8477 dnlc_remove(ndvp, nnm); 8478 8479 doqueue = 1; 8480 t = gethrtime(); 8481 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8482 8483 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8484 if (e.error) { 8485 PURGE_ATTRCACHE4(odvp); 8486 PURGE_ATTRCACHE4(ndvp); 8487 if (!needrecov) { 8488 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8489 &recov_state, needrecov); 8490 goto out; 8491 } 8492 } else { 8493 *statp = res.status; 8494 } 8495 8496 if (needrecov) { 8497 bool_t abort; 8498 8499 abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL, 8500 OP_RENAME, NULL, NULL, NULL); 8501 if (abort == FALSE) { 8502 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8503 &recov_state, needrecov); 8504 kmem_free(argop, argoplist_size); 8505 if (!e.error) 8506 (void) xdr_free(xdr_COMPOUND4res_clnt, 8507 (caddr_t)&res); 8508 mutex_enter(&orp->r_statelock); 8509 orp->r_flags &= ~R4RECEXPFH; 8510 cv_broadcast(&orp->r_cv); 8511 mutex_exit(&orp->r_statelock); 8512 goto recov_retry; 8513 } else { 8514 if (e.error != 0) { 8515 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, 8516 &recov_state, needrecov); 8517 goto out; 8518 } 8519 /* fall through for res.status case */ 8520 } 8521 } 8522 8523 resp = &res; 8524 /* 8525 * If OP_RENAME (or any prev op) failed, then return an error. 
8526 * OP_RENAME is index 5, so if array len <= 6 we return an error. 8527 */ 8528 if ((res.status != NFS4_OK) && (res.array_len <= 6)) { 8529 /* 8530 * Error in an op other than last Getattr 8531 */ 8532 e.error = geterrno4(res.status); 8533 PURGE_ATTRCACHE4(odvp); 8534 PURGE_ATTRCACHE4(ndvp); 8535 /* 8536 * System V defines rename to return EEXIST, not 8537 * ENOTEMPTY if the target directory is not empty. 8538 * Over the wire, the error is NFSERR_ENOTEMPTY 8539 * which geterrno4 maps to ENOTEMPTY. 8540 */ 8541 if (e.error == ENOTEMPTY) 8542 e.error = EEXIST; 8543 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, 8544 needrecov); 8545 goto out; 8546 } 8547 8548 /* rename results */ 8549 rn_res = &res.array[5].nfs_resop4_u.oprename; 8550 8551 if (res.status == NFS4_OK) { 8552 /* Update target attribute, readdir and dnlc caches */ 8553 dinfo.di_garp = 8554 &res.array[6].nfs_resop4_u.opgetattr.ga_res; 8555 dinfo.di_cred = cr; 8556 dinfo.di_time_call = t; 8557 } else 8558 dinfop = NULL; 8559 8560 /* Update source cache attribute, readdir and dnlc caches */ 8561 nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop); 8562 8563 /* Update source cache attribute, readdir and dnlc caches */ 8564 if (ndvp != odvp) { 8565 update_parentdir_sfh(ovp, ndvp); 8566 8567 /* 8568 * If dinfop is non-NULL, then compound succeded, so 8569 * set di_garp to attrs for source dir. dinfop is only 8570 * set to NULL when compound fails. 8571 */ 8572 if (dinfop) 8573 dinfo.di_garp = 8574 &res.array[11].nfs_resop4_u.opgetattr.ga_res; 8575 nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL, 8576 dinfop); 8577 } 8578 8579 /* 8580 * Update the rnode with the new component name and args, 8581 * and if the file handle changed, also update it with the new fh. 8582 * This is only necessary if the target object has an rnode 8583 * entry and there is no need to create one for it. 
8584 */ 8585 resop = &res.array[8]; /* getfh new res */ 8586 ngf_res = &resop->nfs_resop4_u.opgetfh; 8587 8588 /* 8589 * Update the path and filehandle for the renamed object. 8590 */ 8591 nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm); 8592 8593 nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov); 8594 8595 if (res.status == NFS4_OK) { 8596 resop++; /* getattr res */ 8597 e.error = nfs4_update_attrcache(res.status, 8598 &resop->nfs_resop4_u.opgetattr.ga_res, 8599 t, ovp, cr); 8600 } 8601 8602 out: 8603 kmem_free(argop, argoplist_size); 8604 if (resp) 8605 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8606 mutex_enter(&orp->r_statelock); 8607 orp->r_flags &= ~R4RECEXPFH; 8608 cv_broadcast(&orp->r_cv); 8609 mutex_exit(&orp->r_statelock); 8610 8611 return (e.error); 8612 } 8613 8614 /* ARGSUSED */ 8615 static int 8616 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr, 8617 caller_context_t *ct, int flags, vsecattr_t *vsecp) 8618 { 8619 int error; 8620 vnode_t *vp; 8621 8622 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8623 return (EPERM); 8624 /* 8625 * As ".." has special meaning and rather than send a mkdir 8626 * over the wire to just let the server freak out, we just 8627 * short circuit it here and return EEXIST 8628 */ 8629 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8630 return (EEXIST); 8631 8632 /* 8633 * Decision to get the right gid and setgid bit of the 8634 * new directory is now made in call_nfs4_create_req. 8635 */ 8636 va->va_mask |= AT_MODE; 8637 error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR); 8638 if (error) 8639 return (error); 8640 8641 *vpp = vp; 8642 return (0); 8643 } 8644 8645 8646 /* 8647 * rmdir is using the same remove v4 op as does remove. 8648 * Remove requires that the current fh be the target directory. 8649 * After the operation, the current fh is unchanged. 
8650 * The compound op structure is: 8651 * PUTFH(targetdir), REMOVE 8652 */ 8653 /*ARGSUSED4*/ 8654 static int 8655 nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr, 8656 caller_context_t *ct, int flags) 8657 { 8658 int need_end_op = FALSE; 8659 COMPOUND4args_clnt args; 8660 COMPOUND4res_clnt res, *resp = NULL; 8661 REMOVE4res *rm_res; 8662 nfs_argop4 argop[3]; 8663 nfs_resop4 *resop; 8664 vnode_t *vp; 8665 int doqueue; 8666 mntinfo4_t *mi; 8667 rnode4_t *drp; 8668 bool_t needrecov = FALSE; 8669 nfs4_recov_state_t recov_state; 8670 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 8671 dirattr_info_t dinfo, *dinfop; 8672 8673 if (nfs_zone() != VTOMI4(dvp)->mi_zone) 8674 return (EPERM); 8675 /* 8676 * As ".." has special meaning and rather than send a rmdir 8677 * over the wire to just let the server freak out, we just 8678 * short circuit it here and return EEXIST 8679 */ 8680 if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') 8681 return (EEXIST); 8682 8683 drp = VTOR4(dvp); 8684 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) 8685 return (EINTR); 8686 8687 /* 8688 * Attempt to prevent a rmdir(".") from succeeding. 8689 */ 8690 e.error = nfs4lookup(dvp, nm, &vp, cr, 0); 8691 if (e.error) { 8692 nfs_rw_exit(&drp->r_rwlock); 8693 return (e.error); 8694 } 8695 if (vp == cdir) { 8696 VN_RELE(vp); 8697 nfs_rw_exit(&drp->r_rwlock); 8698 return (EINVAL); 8699 } 8700 8701 /* 8702 * Since nfsv4 remove op works on both files and directories, 8703 * check that the removed object is indeed a directory. 8704 */ 8705 if (vp->v_type != VDIR) { 8706 VN_RELE(vp); 8707 nfs_rw_exit(&drp->r_rwlock); 8708 return (ENOTDIR); 8709 } 8710 8711 /* 8712 * First just remove the entry from the name cache, as it 8713 * is most likely an entry for this vp. 8714 */ 8715 dnlc_remove(dvp, nm); 8716 8717 /* 8718 * If there vnode reference count is greater than one, then 8719 * there may be additional references in the DNLC which will 8720 * need to be purged. 
First, trying removing the entry for 8721 * the parent directory and see if that removes the additional 8722 * reference(s). If that doesn't do it, then use dnlc_purge_vp 8723 * to completely remove any references to the directory which 8724 * might still exist in the DNLC. 8725 */ 8726 if (vp->v_count > 1) { 8727 dnlc_remove(vp, ".."); 8728 if (vp->v_count > 1) 8729 dnlc_purge_vp(vp); 8730 } 8731 8732 mi = VTOMI4(dvp); 8733 recov_state.rs_flags = 0; 8734 recov_state.rs_num_retry_despite_err = 0; 8735 8736 recov_retry: 8737 args.ctag = TAG_RMDIR; 8738 8739 /* 8740 * Rmdir ops: putfh dir; remove 8741 */ 8742 args.array_len = 3; 8743 args.array = argop; 8744 8745 e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state); 8746 if (e.error) { 8747 nfs_rw_exit(&drp->r_rwlock); 8748 return (e.error); 8749 } 8750 need_end_op = TRUE; 8751 8752 /* putfh directory */ 8753 argop[0].argop = OP_CPUTFH; 8754 argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh; 8755 8756 /* remove */ 8757 argop[1].argop = OP_CREMOVE; 8758 argop[1].nfs_argop4_u.opcremove.ctarget = nm; 8759 8760 /* getattr (postop attrs for dir that contained removed dir) */ 8761 argop[2].argop = OP_GETATTR; 8762 argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 8763 argop[2].nfs_argop4_u.opgetattr.mi = mi; 8764 8765 dinfo.di_time_call = gethrtime(); 8766 doqueue = 1; 8767 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 8768 8769 PURGE_ATTRCACHE4(vp); 8770 8771 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 8772 if (e.error) { 8773 PURGE_ATTRCACHE4(dvp); 8774 } 8775 8776 if (needrecov) { 8777 if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL, 8778 NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) { 8779 if (!e.error) 8780 (void) xdr_free(xdr_COMPOUND4res_clnt, 8781 (caddr_t)&res); 8782 8783 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, 8784 needrecov); 8785 need_end_op = FALSE; 8786 goto recov_retry; 8787 } 8788 } 8789 8790 if (!e.error) { 8791 resp = &res; 8792 8793 /* 8794 * Only 
return error if first 2 ops (OP_REMOVE or earlier) 8795 * failed. 8796 */ 8797 if (res.status != NFS4_OK && res.array_len <= 2) { 8798 e.error = geterrno4(res.status); 8799 PURGE_ATTRCACHE4(dvp); 8800 nfs4_end_op(VTOMI4(dvp), dvp, NULL, 8801 &recov_state, needrecov); 8802 need_end_op = FALSE; 8803 nfs4_purge_stale_fh(e.error, dvp, cr); 8804 /* 8805 * System V defines rmdir to return EEXIST, not 8806 * ENOTEMPTY if the directory is not empty. Over 8807 * the wire, the error is NFSERR_ENOTEMPTY which 8808 * geterrno4 maps to ENOTEMPTY. 8809 */ 8810 if (e.error == ENOTEMPTY) 8811 e.error = EEXIST; 8812 } else { 8813 resop = &res.array[1]; /* remove res */ 8814 rm_res = &resop->nfs_resop4_u.opremove; 8815 8816 if (res.status == NFS4_OK) { 8817 resop = &res.array[2]; /* dir attrs */ 8818 dinfo.di_garp = 8819 &resop->nfs_resop4_u.opgetattr.ga_res; 8820 dinfo.di_cred = cr; 8821 dinfop = &dinfo; 8822 } else 8823 dinfop = NULL; 8824 8825 /* Update dir attribute, readdir and dnlc caches */ 8826 nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL, 8827 dinfop); 8828 8829 /* destroy rddir cache for dir that was removed */ 8830 if (VTOR4(vp)->r_dir != NULL) 8831 nfs4_purge_rddir_cache(vp); 8832 } 8833 } 8834 8835 if (need_end_op) 8836 nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov); 8837 8838 nfs_rw_exit(&drp->r_rwlock); 8839 8840 if (resp) 8841 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 8842 8843 if (e.error == 0) { 8844 vnode_t *tvp; 8845 rnode4_t *trp; 8846 trp = VTOR4(vp); 8847 tvp = vp; 8848 if (IS_SHADOW(vp, trp)) 8849 tvp = RTOV4(trp); 8850 vnevent_rmdir(tvp, dvp, nm, ct); 8851 } 8852 8853 VN_RELE(vp); 8854 8855 return (e.error); 8856 } 8857 8858 /* ARGSUSED */ 8859 static int 8860 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr, 8861 caller_context_t *ct, int flags) 8862 { 8863 int error; 8864 vnode_t *vp; 8865 rnode4_t *rp; 8866 char *contents; 8867 mntinfo4_t *mi = VTOMI4(dvp); 8868 8869 if (nfs_zone() != 
mi->mi_zone) 8870 return (EPERM); 8871 if (!(mi->mi_flags & MI4_SYMLINK)) 8872 return (EOPNOTSUPP); 8873 8874 error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK); 8875 if (error) 8876 return (error); 8877 8878 ASSERT(nfs4_consistent_type(vp)); 8879 rp = VTOR4(vp); 8880 if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) { 8881 8882 contents = kmem_alloc(MAXPATHLEN, KM_SLEEP); 8883 8884 if (contents != NULL) { 8885 mutex_enter(&rp->r_statelock); 8886 if (rp->r_symlink.contents == NULL) { 8887 rp->r_symlink.len = strlen(tnm); 8888 bcopy(tnm, contents, rp->r_symlink.len); 8889 rp->r_symlink.contents = contents; 8890 rp->r_symlink.size = MAXPATHLEN; 8891 mutex_exit(&rp->r_statelock); 8892 } else { 8893 mutex_exit(&rp->r_statelock); 8894 kmem_free((void *)contents, MAXPATHLEN); 8895 } 8896 } 8897 } 8898 VN_RELE(vp); 8899 8900 return (error); 8901 } 8902 8903 8904 /* 8905 * Read directory entries. 8906 * There are some weird things to look out for here. The uio_loffset 8907 * field is either 0 or it is the offset returned from a previous 8908 * readdir. It is an opaque value used by the server to find the 8909 * correct directory block to read. The count field is the number 8910 * of blocks to read on the server. This is advisory only, the server 8911 * may return only one block's worth of entries. Entries may be compressed 8912 * on the server. 8913 */ 8914 /* ARGSUSED */ 8915 static int 8916 nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp, 8917 caller_context_t *ct, int flags) 8918 { 8919 int error; 8920 uint_t count; 8921 rnode4_t *rp; 8922 rddir4_cache *rdc; 8923 rddir4_cache *rrdc; 8924 8925 if (nfs_zone() != VTOMI4(vp)->mi_zone) 8926 return (EIO); 8927 rp = VTOR4(vp); 8928 8929 ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER)); 8930 8931 /* 8932 * Make sure that the directory cache is valid. 
8933 */ 8934 if (rp->r_dir != NULL) { 8935 if (nfs_disable_rddir_cache != 0) { 8936 /* 8937 * Setting nfs_disable_rddir_cache in /etc/system 8938 * allows interoperability with servers that do not 8939 * properly update the attributes of directories. 8940 * Any cached information gets purged before an 8941 * access is made to it. 8942 */ 8943 nfs4_purge_rddir_cache(vp); 8944 } 8945 8946 error = nfs4_validate_caches(vp, cr); 8947 if (error) 8948 return (error); 8949 } 8950 8951 count = MIN(uiop->uio_iov->iov_len, MAXBSIZE); 8952 8953 /* 8954 * Short circuit last readdir which always returns 0 bytes. 8955 * This can be done after the directory has been read through 8956 * completely at least once. This will set r_direof which 8957 * can be used to find the value of the last cookie. 8958 */ 8959 mutex_enter(&rp->r_statelock); 8960 if (rp->r_direof != NULL && 8961 uiop->uio_loffset == rp->r_direof->nfs4_ncookie) { 8962 mutex_exit(&rp->r_statelock); 8963 #ifdef DEBUG 8964 nfs4_readdir_cache_shorts++; 8965 #endif 8966 if (eofp) 8967 *eofp = 1; 8968 return (0); 8969 } 8970 8971 /* 8972 * Look for a cache entry. Cache entries are identified 8973 * by the NFS cookie value and the byte count requested. 8974 */ 8975 rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count); 8976 8977 /* 8978 * If rdc is NULL then the lookup resulted in an unrecoverable error. 8979 */ 8980 if (rdc == NULL) { 8981 mutex_exit(&rp->r_statelock); 8982 return (EINTR); 8983 } 8984 8985 /* 8986 * Check to see if we need to fill this entry in. 8987 */ 8988 if (rdc->flags & RDDIRREQ) { 8989 rdc->flags &= ~RDDIRREQ; 8990 rdc->flags |= RDDIR; 8991 mutex_exit(&rp->r_statelock); 8992 8993 /* 8994 * Do the readdir. 
8995 */ 8996 nfs4readdir(vp, rdc, cr); 8997 8998 /* 8999 * Reacquire the lock, so that we can continue 9000 */ 9001 mutex_enter(&rp->r_statelock); 9002 /* 9003 * The entry is now complete 9004 */ 9005 rdc->flags &= ~RDDIR; 9006 } 9007 9008 ASSERT(!(rdc->flags & RDDIR)); 9009 9010 /* 9011 * If an error occurred while attempting 9012 * to fill the cache entry, mark the entry invalid and 9013 * just return the error. 9014 */ 9015 if (rdc->error) { 9016 error = rdc->error; 9017 rdc->flags |= RDDIRREQ; 9018 rddir4_cache_rele(rp, rdc); 9019 mutex_exit(&rp->r_statelock); 9020 return (error); 9021 } 9022 9023 /* 9024 * The cache entry is complete and good, 9025 * copyout the dirent structs to the calling 9026 * thread. 9027 */ 9028 error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop); 9029 9030 /* 9031 * If no error occurred during the copyout, 9032 * update the offset in the uio struct to 9033 * contain the value of the next NFS 4 cookie 9034 * and set the eof value appropriately. 9035 */ 9036 if (!error) { 9037 uiop->uio_loffset = rdc->nfs4_ncookie; 9038 if (eofp) 9039 *eofp = rdc->eof; 9040 } 9041 9042 /* 9043 * Decide whether to do readahead. Don't if we 9044 * have already read to the end of directory. 9045 */ 9046 if (rdc->eof) { 9047 /* 9048 * Make the entry the direof only if it is cached 9049 */ 9050 if (rdc->flags & RDDIRCACHED) 9051 rp->r_direof = rdc; 9052 rddir4_cache_rele(rp, rdc); 9053 mutex_exit(&rp->r_statelock); 9054 return (error); 9055 } 9056 9057 /* Determine if a readdir readahead should be done */ 9058 if (!(rp->r_flags & R4LOOKUP)) { 9059 rddir4_cache_rele(rp, rdc); 9060 mutex_exit(&rp->r_statelock); 9061 return (error); 9062 } 9063 9064 /* 9065 * Now look for a readahead entry. 9066 * 9067 * Check to see whether we found an entry for the readahead. 9068 * If so, we don't need to do anything further, so free the new 9069 * entry if one was allocated. 
Otherwise, allocate a new entry, add 9070 * it to the cache, and then initiate an asynchronous readdir 9071 * operation to fill it. 9072 */ 9073 rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count); 9074 9075 /* 9076 * A readdir cache entry could not be obtained for the readahead. In 9077 * this case we skip the readahead and return. 9078 */ 9079 if (rrdc == NULL) { 9080 rddir4_cache_rele(rp, rdc); 9081 mutex_exit(&rp->r_statelock); 9082 return (error); 9083 } 9084 9085 /* 9086 * Check to see if we need to fill this entry in. 9087 */ 9088 if (rrdc->flags & RDDIRREQ) { 9089 rrdc->flags &= ~RDDIRREQ; 9090 rrdc->flags |= RDDIR; 9091 rddir4_cache_rele(rp, rdc); 9092 mutex_exit(&rp->r_statelock); 9093 #ifdef DEBUG 9094 nfs4_readdir_readahead++; 9095 #endif 9096 /* 9097 * Do the readdir. 9098 */ 9099 nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir); 9100 return (error); 9101 } 9102 9103 rddir4_cache_rele(rp, rrdc); 9104 rddir4_cache_rele(rp, rdc); 9105 mutex_exit(&rp->r_statelock); 9106 return (error); 9107 } 9108 9109 static int 9110 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 9111 { 9112 int error; 9113 rnode4_t *rp; 9114 9115 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 9116 9117 rp = VTOR4(vp); 9118 9119 /* 9120 * Obtain the readdir results for the caller. 9121 */ 9122 nfs4readdir(vp, rdc, cr); 9123 9124 mutex_enter(&rp->r_statelock); 9125 /* 9126 * The entry is now complete 9127 */ 9128 rdc->flags &= ~RDDIR; 9129 9130 error = rdc->error; 9131 if (error) 9132 rdc->flags |= RDDIRREQ; 9133 rddir4_cache_rele(rp, rdc); 9134 mutex_exit(&rp->r_statelock); 9135 9136 return (error); 9137 } 9138 9139 /* 9140 * Read directory entries. 9141 * There are some weird things to look out for here. The uio_loffset 9142 * field is either 0 or it is the offset returned from a previous 9143 * readdir. It is an opaque value used by the server to find the 9144 * correct directory block to read. The count field is the number 9145 * of blocks to read on the server. 
This is advisory only, the server 9146 * may return only one block's worth of entries. Entries may be compressed 9147 * on the server. 9148 * 9149 * Generates the following compound request: 9150 * 1. If readdir offset is zero and no dnlc entry for parent exists, 9151 * must include a Lookupp as well. In this case, send: 9152 * { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr } 9153 * 2. Otherwise just do: { Putfh <fh>; Readdir } 9154 * 9155 * Get complete attributes and filehandles for entries if this is the 9156 * first read of the directory. Otherwise, just get fileid's. 9157 */ 9158 static void 9159 nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr) 9160 { 9161 COMPOUND4args_clnt args; 9162 COMPOUND4res_clnt res; 9163 READDIR4args *rargs; 9164 READDIR4res_clnt *rd_res; 9165 bitmap4 rd_bitsval; 9166 nfs_argop4 argop[5]; 9167 nfs_resop4 *resop; 9168 rnode4_t *rp = VTOR4(vp); 9169 mntinfo4_t *mi = VTOMI4(vp); 9170 int doqueue; 9171 u_longlong_t nodeid, pnodeid; /* id's of dir and its parents */ 9172 vnode_t *dvp; 9173 nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie; 9174 int num_ops, res_opcnt; 9175 bool_t needrecov = FALSE; 9176 nfs4_recov_state_t recov_state; 9177 hrtime_t t; 9178 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 9179 9180 ASSERT(nfs_zone() == mi->mi_zone); 9181 ASSERT(rdc->flags & RDDIR); 9182 ASSERT(rdc->entries == NULL); 9183 9184 /* 9185 * If rp were a stub, it should have triggered and caused 9186 * a mount for us to get this far. 9187 */ 9188 ASSERT(!RP_ISSTUB(rp)); 9189 9190 num_ops = 2; 9191 if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) { 9192 /* 9193 * Since nfsv4 readdir may not return entries for "." and "..", 9194 * the client must recreate them: 9195 * To find the correct nodeid, do the following: 9196 * For current node, get nodeid from dnlc. 9197 * - if current node is rootvp, set pnodeid to nodeid. 9198 * - else if parent is in the dnlc, get its nodeid from there. 9199 * - else add LOOKUPP+GETATTR to compound. 
9200 */ 9201 nodeid = rp->r_attr.va_nodeid; 9202 if (vp->v_flag & VROOT) { 9203 pnodeid = nodeid; /* root of mount point */ 9204 } else { 9205 dvp = dnlc_lookup(vp, ".."); 9206 if (dvp != NULL && dvp != DNLC_NO_VNODE) { 9207 /* parent in dnlc cache - no need for otw */ 9208 pnodeid = VTOR4(dvp)->r_attr.va_nodeid; 9209 } else { 9210 /* 9211 * parent not in dnlc cache, 9212 * do lookupp to get its id 9213 */ 9214 num_ops = 5; 9215 pnodeid = 0; /* set later by getattr parent */ 9216 } 9217 if (dvp) 9218 VN_RELE(dvp); 9219 } 9220 } 9221 recov_state.rs_flags = 0; 9222 recov_state.rs_num_retry_despite_err = 0; 9223 9224 /* Save the original mount point security flavor */ 9225 (void) save_mnt_secinfo(mi->mi_curr_serv); 9226 9227 recov_retry: 9228 args.ctag = TAG_READDIR; 9229 9230 args.array = argop; 9231 args.array_len = num_ops; 9232 9233 if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 9234 &recov_state, NULL)) { 9235 /* 9236 * If readdir a node that is a stub for a crossed mount point, 9237 * keep the original secinfo flavor for the current file 9238 * system, not the crossed one. 9239 */ 9240 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 9241 rdc->error = e.error; 9242 return; 9243 } 9244 9245 /* 9246 * Determine which attrs to request for dirents. This code 9247 * must be protected by nfs4_start/end_fop because of r_server 9248 * (which will change during failover recovery). 9249 * 9250 */ 9251 if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) { 9252 /* 9253 * Get all vattr attrs plus filehandle and rdattr_error 9254 */ 9255 rd_bitsval = NFS4_VATTR_MASK | 9256 FATTR4_RDATTR_ERROR_MASK | 9257 FATTR4_FILEHANDLE_MASK; 9258 9259 if (rp->r_flags & R4READDIRWATTR) { 9260 mutex_enter(&rp->r_statelock); 9261 rp->r_flags &= ~R4READDIRWATTR; 9262 mutex_exit(&rp->r_statelock); 9263 } 9264 } else { 9265 servinfo4_t *svp = rp->r_server; 9266 9267 /* 9268 * Already read directory. Use readdir with 9269 * no attrs (except for mounted_on_fileid) for updates. 
9270 */ 9271 rd_bitsval = FATTR4_RDATTR_ERROR_MASK; 9272 9273 /* 9274 * request mounted on fileid if supported, else request 9275 * fileid. maybe we should verify that fileid is supported 9276 * and request something else if not. 9277 */ 9278 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 9279 if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK) 9280 rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK; 9281 nfs_rw_exit(&svp->sv_lock); 9282 } 9283 9284 /* putfh directory fh */ 9285 argop[0].argop = OP_CPUTFH; 9286 argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh; 9287 9288 argop[1].argop = OP_READDIR; 9289 rargs = &argop[1].nfs_argop4_u.opreaddir; 9290 /* 9291 * 1 and 2 are reserved for client "." and ".." entry offset. 9292 * cookie 0 should be used over-the-wire to start reading at 9293 * the beginning of the directory excluding "." and "..". 9294 */ 9295 if (rdc->nfs4_cookie == 0 || 9296 rdc->nfs4_cookie == 1 || 9297 rdc->nfs4_cookie == 2) { 9298 rargs->cookie = (nfs_cookie4)0; 9299 rargs->cookieverf = 0; 9300 } else { 9301 rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie; 9302 mutex_enter(&rp->r_statelock); 9303 rargs->cookieverf = rp->r_cookieverf4; 9304 mutex_exit(&rp->r_statelock); 9305 } 9306 rargs->dircount = MIN(rdc->buflen, mi->mi_tsize); 9307 rargs->maxcount = mi->mi_tsize; 9308 rargs->attr_request = rd_bitsval; 9309 rargs->rdc = rdc; 9310 rargs->dvp = vp; 9311 rargs->mi = mi; 9312 rargs->cr = cr; 9313 9314 9315 /* 9316 * If count < than the minimum required, we return no entries 9317 * and fail with EINVAL 9318 */ 9319 if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) { 9320 rdc->error = EINVAL; 9321 goto out; 9322 } 9323 9324 if (args.array_len == 5) { 9325 /* 9326 * Add lookupp and getattr for parent nodeid. 
9327 */ 9328 argop[2].argop = OP_LOOKUPP; 9329 9330 argop[3].argop = OP_GETFH; 9331 9332 /* getattr parent */ 9333 argop[4].argop = OP_GETATTR; 9334 argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK; 9335 argop[4].nfs_argop4_u.opgetattr.mi = mi; 9336 } 9337 9338 doqueue = 1; 9339 9340 if (mi->mi_io_kstats) { 9341 mutex_enter(&mi->mi_lock); 9342 kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats)); 9343 mutex_exit(&mi->mi_lock); 9344 } 9345 9346 /* capture the time of this call */ 9347 rargs->t = t = gethrtime(); 9348 9349 rfs4call(mi, &args, &res, cr, &doqueue, 0, &e); 9350 9351 if (mi->mi_io_kstats) { 9352 mutex_enter(&mi->mi_lock); 9353 kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats)); 9354 mutex_exit(&mi->mi_lock); 9355 } 9356 9357 needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp); 9358 9359 /* 9360 * If RPC error occurred and it isn't an error that 9361 * triggers recovery, then go ahead and fail now. 9362 */ 9363 if (e.error != 0 && !needrecov) { 9364 rdc->error = e.error; 9365 goto out; 9366 } 9367 9368 if (needrecov) { 9369 bool_t abort; 9370 9371 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 9372 "nfs4readdir: initiating recovery.\n")); 9373 9374 abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 9375 NULL, OP_READDIR, NULL, NULL, NULL); 9376 if (abort == FALSE) { 9377 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 9378 &recov_state, needrecov); 9379 if (!e.error) 9380 (void) xdr_free(xdr_COMPOUND4res_clnt, 9381 (caddr_t)&res); 9382 if (rdc->entries != NULL) { 9383 kmem_free(rdc->entries, rdc->entlen); 9384 rdc->entries = NULL; 9385 } 9386 goto recov_retry; 9387 } 9388 9389 if (e.error != 0) { 9390 rdc->error = e.error; 9391 goto out; 9392 } 9393 9394 /* fall through for res.status case */ 9395 } 9396 9397 res_opcnt = res.array_len; 9398 9399 /* 9400 * If compound failed first 2 ops (PUTFH+READDIR), then return 9401 * failure here. 
Subsequent ops are for filling out dot-dot 9402 * dirent, and if they fail, we still want to give the caller 9403 * the dirents returned by (the successful) READDIR op, so we need 9404 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR). 9405 * 9406 * One example where PUTFH+READDIR ops would succeed but 9407 * LOOKUPP+GETATTR would fail would be a dir that has r perm 9408 * but lacks x. In this case, a POSIX server's VOP_READDIR 9409 * would succeed; however, VOP_LOOKUP(..) would fail since no 9410 * x perm. We need to come up with a non-vendor-specific way 9411 * for a POSIX server to return d_ino from dotdot's dirent if 9412 * client only requests mounted_on_fileid, and just say the 9413 * LOOKUPP succeeded and fill out the GETATTR. However, if 9414 * client requested any mandatory attrs, server would be required 9415 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR 9416 * for dotdot. 9417 */ 9418 9419 if (res.status) { 9420 if (res_opcnt <= 2) { 9421 e.error = geterrno4(res.status); 9422 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR, 9423 &recov_state, needrecov); 9424 nfs4_purge_stale_fh(e.error, vp, cr); 9425 rdc->error = e.error; 9426 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 9427 if (rdc->entries != NULL) { 9428 kmem_free(rdc->entries, rdc->entlen); 9429 rdc->entries = NULL; 9430 } 9431 /* 9432 * If readdir a node that is a stub for a 9433 * crossed mount point, keep the original 9434 * secinfo flavor for the current file system, 9435 * not the crossed one. 9436 */ 9437 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 9438 return; 9439 } 9440 } 9441 9442 resop = &res.array[1]; /* readdir res */ 9443 rd_res = &resop->nfs_resop4_u.opreaddirclnt; 9444 9445 mutex_enter(&rp->r_statelock); 9446 rp->r_cookieverf4 = rd_res->cookieverf; 9447 mutex_exit(&rp->r_statelock); 9448 9449 /* 9450 * For "." and ".." entries 9451 * e.g. 9452 * seek(cookie=0) -> "." entry with d_off = 1 9453 * seek(cookie=1) -> ".." 
entry with d_off = 2 9454 */ 9455 if (cookie == (nfs_cookie4) 0) { 9456 if (rd_res->dotp) 9457 rd_res->dotp->d_ino = nodeid; 9458 if (rd_res->dotdotp) 9459 rd_res->dotdotp->d_ino = pnodeid; 9460 } 9461 if (cookie == (nfs_cookie4) 1) { 9462 if (rd_res->dotdotp) 9463 rd_res->dotdotp->d_ino = pnodeid; 9464 } 9465 9466 9467 /* LOOKUPP+GETATTR attemped */ 9468 if (args.array_len == 5 && rd_res->dotdotp) { 9469 if (res.status == NFS4_OK && res_opcnt == 5) { 9470 nfs_fh4 *fhp; 9471 nfs4_sharedfh_t *sfhp; 9472 vnode_t *pvp; 9473 nfs4_ga_res_t *garp; 9474 9475 resop++; /* lookupp */ 9476 resop++; /* getfh */ 9477 fhp = &resop->nfs_resop4_u.opgetfh.object; 9478 9479 resop++; /* getattr of parent */ 9480 9481 /* 9482 * First, take care of finishing the 9483 * readdir results. 9484 */ 9485 garp = &resop->nfs_resop4_u.opgetattr.ga_res; 9486 /* 9487 * The d_ino of .. must be the inode number 9488 * of the mounted filesystem. 9489 */ 9490 if (garp->n4g_va.va_mask & AT_NODEID) 9491 rd_res->dotdotp->d_ino = 9492 garp->n4g_va.va_nodeid; 9493 9494 9495 /* 9496 * Next, create the ".." dnlc entry 9497 */ 9498 sfhp = sfh4_get(fhp, mi); 9499 if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) { 9500 dnlc_update(vp, "..", pvp); 9501 VN_RELE(pvp); 9502 } 9503 sfh4_rele(&sfhp); 9504 } 9505 } 9506 9507 if (mi->mi_io_kstats) { 9508 mutex_enter(&mi->mi_lock); 9509 KSTAT_IO_PTR(mi->mi_io_kstats)->reads++; 9510 KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen; 9511 mutex_exit(&mi->mi_lock); 9512 } 9513 9514 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 9515 9516 out: 9517 /* 9518 * If readdir a node that is a stub for a crossed mount point, 9519 * keep the original secinfo flavor for the current file system, 9520 * not the crossed one. 
9521 */ 9522 (void) check_mnt_secinfo(mi->mi_curr_serv, vp); 9523 9524 nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov); 9525 } 9526 9527 9528 static int 9529 nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead) 9530 { 9531 rnode4_t *rp = VTOR4(bp->b_vp); 9532 int count; 9533 int error; 9534 cred_t *cred_otw = NULL; 9535 offset_t offset; 9536 nfs4_open_stream_t *osp = NULL; 9537 bool_t first_time = TRUE; /* first time getting otw cred */ 9538 bool_t last_time = FALSE; /* last time getting otw cred */ 9539 9540 ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone); 9541 9542 DTRACE_IO1(start, struct buf *, bp); 9543 offset = ldbtob(bp->b_lblkno); 9544 9545 if (bp->b_flags & B_READ) { 9546 read_again: 9547 /* 9548 * Releases the osp, if it is provided. 9549 * Puts a hold on the cred_otw and the new osp (if found). 9550 */ 9551 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 9552 &first_time, &last_time); 9553 error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr, 9554 offset, bp->b_bcount, &bp->b_resid, cred_otw, 9555 readahead, NULL); 9556 crfree(cred_otw); 9557 if (!error) { 9558 if (bp->b_resid) { 9559 /* 9560 * Didn't get it all because we hit EOF, 9561 * zero all the memory beyond the EOF. 9562 */ 9563 /* bzero(rdaddr + */ 9564 bzero(bp->b_un.b_addr + 9565 bp->b_bcount - bp->b_resid, bp->b_resid); 9566 } 9567 mutex_enter(&rp->r_statelock); 9568 if (bp->b_resid == bp->b_bcount && 9569 offset >= rp->r_size) { 9570 /* 9571 * We didn't read anything at all as we are 9572 * past EOF. Return an error indicator back 9573 * but don't destroy the pages (yet). 9574 */ 9575 error = NFS_EOF; 9576 } 9577 mutex_exit(&rp->r_statelock); 9578 } else if (error == EACCES && last_time == FALSE) { 9579 goto read_again; 9580 } 9581 } else { 9582 if (!(rp->r_flags & R4STALE)) { 9583 write_again: 9584 /* 9585 * Releases the osp, if it is provided. 9586 * Puts a hold on the cred_otw and the new 9587 * osp (if found). 
9588 */ 9589 cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp, 9590 &first_time, &last_time); 9591 mutex_enter(&rp->r_statelock); 9592 count = MIN(bp->b_bcount, rp->r_size - offset); 9593 mutex_exit(&rp->r_statelock); 9594 if (count < 0) 9595 cmn_err(CE_PANIC, "nfs4_bio: write count < 0"); 9596 #ifdef DEBUG 9597 if (count == 0) { 9598 zoneid_t zoneid = getzoneid(); 9599 9600 zcmn_err(zoneid, CE_WARN, 9601 "nfs4_bio: zero length write at %lld", 9602 offset); 9603 zcmn_err(zoneid, CE_CONT, "flags=0x%x, " 9604 "b_bcount=%ld, file size=%lld", 9605 rp->r_flags, (long)bp->b_bcount, 9606 rp->r_size); 9607 sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh); 9608 if (nfs4_bio_do_stop) 9609 debug_enter("nfs4_bio"); 9610 } 9611 #endif 9612 error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset, 9613 count, cred_otw, stab_comm); 9614 if (error == EACCES && last_time == FALSE) { 9615 crfree(cred_otw); 9616 goto write_again; 9617 } 9618 bp->b_error = error; 9619 if (error && error != EINTR && 9620 !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) { 9621 /* 9622 * Don't print EDQUOT errors on the console. 9623 * Don't print asynchronous EACCES errors. 9624 * Don't print EFBIG errors. 9625 * Print all other write errors. 9626 */ 9627 if (error != EDQUOT && error != EFBIG && 9628 (error != EACCES || 9629 !(bp->b_flags & B_ASYNC))) 9630 nfs4_write_error(bp->b_vp, 9631 error, cred_otw); 9632 /* 9633 * Update r_error and r_flags as appropriate. 9634 * If the error was ESTALE, then mark the 9635 * rnode as not being writeable and save 9636 * the error status. Otherwise, save any 9637 * errors which occur from asynchronous 9638 * page invalidations. Any errors occurring 9639 * from other operations should be saved 9640 * by the caller. 
9641 */ 9642 mutex_enter(&rp->r_statelock); 9643 if (error == ESTALE) { 9644 rp->r_flags |= R4STALE; 9645 if (!rp->r_error) 9646 rp->r_error = error; 9647 } else if (!rp->r_error && 9648 (bp->b_flags & 9649 (B_INVAL|B_FORCE|B_ASYNC)) == 9650 (B_INVAL|B_FORCE|B_ASYNC)) { 9651 rp->r_error = error; 9652 } 9653 mutex_exit(&rp->r_statelock); 9654 } 9655 crfree(cred_otw); 9656 } else { 9657 error = rp->r_error; 9658 /* 9659 * A close may have cleared r_error, if so, 9660 * propagate ESTALE error return properly 9661 */ 9662 if (error == 0) 9663 error = ESTALE; 9664 } 9665 } 9666 9667 if (error != 0 && error != NFS_EOF) 9668 bp->b_flags |= B_ERROR; 9669 9670 if (osp) 9671 open_stream_rele(osp, rp); 9672 9673 DTRACE_IO1(done, struct buf *, bp); 9674 9675 return (error); 9676 } 9677 9678 /* ARGSUSED */ 9679 int 9680 nfs4_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) 9681 { 9682 return (EREMOTE); 9683 } 9684 9685 /* ARGSUSED2 */ 9686 int 9687 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9688 { 9689 rnode4_t *rp = VTOR4(vp); 9690 9691 if (!write_lock) { 9692 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9693 return (V_WRITELOCK_FALSE); 9694 } 9695 9696 if ((rp->r_flags & R4DIRECTIO) || 9697 (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) { 9698 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE); 9699 if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp)) 9700 return (V_WRITELOCK_FALSE); 9701 nfs_rw_exit(&rp->r_rwlock); 9702 } 9703 9704 (void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE); 9705 return (V_WRITELOCK_TRUE); 9706 } 9707 9708 /* ARGSUSED */ 9709 void 9710 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp) 9711 { 9712 rnode4_t *rp = VTOR4(vp); 9713 9714 nfs_rw_exit(&rp->r_rwlock); 9715 } 9716 9717 /* ARGSUSED */ 9718 static int 9719 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct) 9720 { 9721 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9722 return (EIO); 9723 9724 /* 9725 * Because we stuff the 
readdir cookie into the offset field 9726 * someone may attempt to do an lseek with the cookie which 9727 * we want to succeed. 9728 */ 9729 if (vp->v_type == VDIR) 9730 return (0); 9731 if (*noffp < 0) 9732 return (EINVAL); 9733 return (0); 9734 } 9735 9736 9737 /* 9738 * Return all the pages from [off..off+len) in file 9739 */ 9740 /* ARGSUSED */ 9741 static int 9742 nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, 9743 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 9744 enum seg_rw rw, cred_t *cr, caller_context_t *ct) 9745 { 9746 rnode4_t *rp; 9747 int error; 9748 mntinfo4_t *mi; 9749 9750 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9751 return (EIO); 9752 rp = VTOR4(vp); 9753 if (IS_SHADOW(vp, rp)) 9754 vp = RTOV4(rp); 9755 9756 if (vp->v_flag & VNOMAP) 9757 return (ENOSYS); 9758 9759 if (protp != NULL) 9760 *protp = PROT_ALL; 9761 9762 /* 9763 * Now validate that the caches are up to date. 9764 */ 9765 if (error = nfs4_validate_caches(vp, cr)) 9766 return (error); 9767 9768 mi = VTOMI4(vp); 9769 retry: 9770 mutex_enter(&rp->r_statelock); 9771 9772 /* 9773 * Don't create dirty pages faster than they 9774 * can be cleaned so that the system doesn't 9775 * get imbalanced. If the async queue is 9776 * maxed out, then wait for it to drain before 9777 * creating more dirty pages. Also, wait for 9778 * any threads doing pagewalks in the vop_getattr 9779 * entry points so that they don't block for 9780 * long periods. 9781 */ 9782 if (rw == S_CREATE) { 9783 while ((mi->mi_max_threads != 0 && 9784 rp->r_awcount > 2 * mi->mi_max_threads) || 9785 rp->r_gcount > 0) 9786 cv_wait(&rp->r_cv, &rp->r_statelock); 9787 } 9788 9789 /* 9790 * If we are getting called as a side effect of an nfs_write() 9791 * operation the local file size might not be extended yet. 9792 * In this case we want to be able to return pages of zeroes. 
9793 */ 9794 if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) { 9795 NFS4_DEBUG(nfs4_pageio_debug, 9796 (CE_NOTE, "getpage beyond EOF: off=%lld, " 9797 "len=%llu, size=%llu, attrsize =%llu", off, 9798 (u_longlong_t)len, rp->r_size, rp->r_attr.va_size)); 9799 mutex_exit(&rp->r_statelock); 9800 return (EFAULT); /* beyond EOF */ 9801 } 9802 9803 mutex_exit(&rp->r_statelock); 9804 9805 error = pvn_getpages(nfs4_getapage, vp, off, len, protp, 9806 pl, plsz, seg, addr, rw, cr); 9807 NFS4_DEBUG(nfs4_pageio_debug && error, 9808 (CE_NOTE, "getpages error %d; off=%lld, len=%lld", 9809 error, off, (u_longlong_t)len)); 9810 9811 switch (error) { 9812 case NFS_EOF: 9813 nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE); 9814 goto retry; 9815 case ESTALE: 9816 nfs4_purge_stale_fh(error, vp, cr); 9817 } 9818 9819 return (error); 9820 } 9821 9822 /* 9823 * Called from pvn_getpages to get a particular page. 9824 */ 9825 /* ARGSUSED */ 9826 static int 9827 nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp, 9828 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, 9829 enum seg_rw rw, cred_t *cr) 9830 { 9831 rnode4_t *rp; 9832 uint_t bsize; 9833 struct buf *bp; 9834 page_t *pp; 9835 u_offset_t lbn; 9836 u_offset_t io_off; 9837 u_offset_t blkoff; 9838 u_offset_t rablkoff; 9839 size_t io_len; 9840 uint_t blksize; 9841 int error; 9842 int readahead; 9843 int readahead_issued = 0; 9844 int ra_window; /* readahead window */ 9845 page_t *pagefound; 9846 page_t *savepp; 9847 9848 if (nfs_zone() != VTOMI4(vp)->mi_zone) 9849 return (EIO); 9850 9851 rp = VTOR4(vp); 9852 ASSERT(!IS_SHADOW(vp, rp)); 9853 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 9854 9855 reread: 9856 bp = NULL; 9857 pp = NULL; 9858 pagefound = NULL; 9859 9860 if (pl != NULL) 9861 pl[0] = NULL; 9862 9863 error = 0; 9864 lbn = off / bsize; 9865 blkoff = lbn * bsize; 9866 9867 /* 9868 * Queueing up the readahead before doing the synchronous read 9869 * results in a significant increase in 
read throughput because 9870 * of the increased parallelism between the async threads and 9871 * the process context. 9872 */ 9873 if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 && 9874 rw != S_CREATE && 9875 !(vp->v_flag & VNOCACHE)) { 9876 mutex_enter(&rp->r_statelock); 9877 9878 /* 9879 * Calculate the number of readaheads to do. 9880 * a) No readaheads at offset = 0. 9881 * b) Do maximum(nfs4_nra) readaheads when the readahead 9882 * window is closed. 9883 * c) Do readaheads between 1 to (nfs4_nra - 1) depending 9884 * upon how far the readahead window is open or close. 9885 * d) No readaheads if rp->r_nextr is not within the scope 9886 * of the readahead window (random i/o). 9887 */ 9888 9889 if (off == 0) 9890 readahead = 0; 9891 else if (blkoff == rp->r_nextr) 9892 readahead = nfs4_nra; 9893 else if (rp->r_nextr > blkoff && 9894 ((ra_window = (rp->r_nextr - blkoff) / bsize) 9895 <= (nfs4_nra - 1))) 9896 readahead = nfs4_nra - ra_window; 9897 else 9898 readahead = 0; 9899 9900 rablkoff = rp->r_nextr; 9901 while (readahead > 0 && rablkoff + bsize < rp->r_size) { 9902 mutex_exit(&rp->r_statelock); 9903 if (nfs4_async_readahead(vp, rablkoff + bsize, 9904 addr + (rablkoff + bsize - off), 9905 seg, cr, nfs4_readahead) < 0) { 9906 mutex_enter(&rp->r_statelock); 9907 break; 9908 } 9909 readahead--; 9910 rablkoff += bsize; 9911 /* 9912 * Indicate that we did a readahead so 9913 * readahead offset is not updated 9914 * by the synchronous read below. 9915 */ 9916 readahead_issued = 1; 9917 mutex_enter(&rp->r_statelock); 9918 /* 9919 * set readahead offset to 9920 * offset of last async readahead 9921 * request. 
9922 */ 9923 rp->r_nextr = rablkoff; 9924 } 9925 mutex_exit(&rp->r_statelock); 9926 } 9927 9928 again: 9929 if ((pagefound = page_exists(vp, off)) == NULL) { 9930 if (pl == NULL) { 9931 (void) nfs4_async_readahead(vp, blkoff, addr, seg, cr, 9932 nfs4_readahead); 9933 } else if (rw == S_CREATE) { 9934 /* 9935 * Block for this page is not allocated, or the offset 9936 * is beyond the current allocation size, or we're 9937 * allocating a swap slot and the page was not found, 9938 * so allocate it and return a zero page. 9939 */ 9940 if ((pp = page_create_va(vp, off, 9941 PAGESIZE, PG_WAIT, seg, addr)) == NULL) 9942 cmn_err(CE_PANIC, "nfs4_getapage: page_create"); 9943 io_len = PAGESIZE; 9944 mutex_enter(&rp->r_statelock); 9945 rp->r_nextr = off + PAGESIZE; 9946 mutex_exit(&rp->r_statelock); 9947 } else { 9948 /* 9949 * Need to go to server to get a block 9950 */ 9951 mutex_enter(&rp->r_statelock); 9952 if (blkoff < rp->r_size && 9953 blkoff + bsize > rp->r_size) { 9954 /* 9955 * If less than a block left in 9956 * file read less than a block. 9957 */ 9958 if (rp->r_size <= off) { 9959 /* 9960 * Trying to access beyond EOF, 9961 * set up to get at least one page. 9962 */ 9963 blksize = off + PAGESIZE - blkoff; 9964 } else 9965 blksize = rp->r_size - blkoff; 9966 } else if ((off == 0) || 9967 (off != rp->r_nextr && !readahead_issued)) { 9968 blksize = PAGESIZE; 9969 blkoff = off; /* block = page here */ 9970 } else 9971 blksize = bsize; 9972 mutex_exit(&rp->r_statelock); 9973 9974 pp = pvn_read_kluster(vp, off, seg, addr, &io_off, 9975 &io_len, blkoff, blksize, 0); 9976 9977 /* 9978 * Some other thread has entered the page, 9979 * so just use it. 9980 */ 9981 if (pp == NULL) 9982 goto again; 9983 9984 /* 9985 * Now round the request size up to page boundaries. 9986 * This ensures that the entire page will be 9987 * initialized to zeroes if EOF is encountered. 
9988 */ 9989 io_len = ptob(btopr(io_len)); 9990 9991 bp = pageio_setup(pp, io_len, vp, B_READ); 9992 ASSERT(bp != NULL); 9993 9994 /* 9995 * pageio_setup should have set b_addr to 0. This 9996 * is correct since we want to do I/O on a page 9997 * boundary. bp_mapin will use this addr to calculate 9998 * an offset, and then set b_addr to the kernel virtual 9999 * address it allocated for us. 10000 */ 10001 ASSERT(bp->b_un.b_addr == 0); 10002 10003 bp->b_edev = 0; 10004 bp->b_dev = 0; 10005 bp->b_lblkno = lbtodb(io_off); 10006 bp->b_file = vp; 10007 bp->b_offset = (offset_t)off; 10008 bp_mapin(bp); 10009 10010 /* 10011 * If doing a write beyond what we believe is EOF, 10012 * don't bother trying to read the pages from the 10013 * server, we'll just zero the pages here. We 10014 * don't check that the rw flag is S_WRITE here 10015 * because some implementations may attempt a 10016 * read access to the buffer before copying data. 10017 */ 10018 mutex_enter(&rp->r_statelock); 10019 if (io_off >= rp->r_size && seg == segkmap) { 10020 mutex_exit(&rp->r_statelock); 10021 bzero(bp->b_un.b_addr, io_len); 10022 } else { 10023 mutex_exit(&rp->r_statelock); 10024 error = nfs4_bio(bp, NULL, cr, FALSE); 10025 } 10026 10027 /* 10028 * Unmap the buffer before freeing it. 10029 */ 10030 bp_mapout(bp); 10031 pageio_done(bp); 10032 10033 savepp = pp; 10034 do { 10035 pp->p_fsdata = C_NOCOMMIT; 10036 } while ((pp = pp->p_next) != savepp); 10037 10038 if (error == NFS_EOF) { 10039 /* 10040 * If doing a write system call just return 10041 * zeroed pages, else user tried to get pages 10042 * beyond EOF, return error. We don't check 10043 * that the rw flag is S_WRITE here because 10044 * some implementations may attempt a read 10045 * access to the buffer before copying data. 
10046 */ 10047 if (seg == segkmap) 10048 error = 0; 10049 else 10050 error = EFAULT; 10051 } 10052 10053 if (!readahead_issued && !error) { 10054 mutex_enter(&rp->r_statelock); 10055 rp->r_nextr = io_off + io_len; 10056 mutex_exit(&rp->r_statelock); 10057 } 10058 } 10059 } 10060 10061 out: 10062 if (pl == NULL) 10063 return (error); 10064 10065 if (error) { 10066 if (pp != NULL) 10067 pvn_read_done(pp, B_ERROR); 10068 return (error); 10069 } 10070 10071 if (pagefound) { 10072 se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED); 10073 10074 /* 10075 * Page exists in the cache, acquire the appropriate lock. 10076 * If this fails, start all over again. 10077 */ 10078 if ((pp = page_lookup(vp, off, se)) == NULL) { 10079 #ifdef DEBUG 10080 nfs4_lostpage++; 10081 #endif 10082 goto reread; 10083 } 10084 pl[0] = pp; 10085 pl[1] = NULL; 10086 return (0); 10087 } 10088 10089 if (pp != NULL) 10090 pvn_plist_init(pp, pl, plsz, off, io_len, rw); 10091 10092 return (error); 10093 } 10094 10095 static void 10096 nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg, 10097 cred_t *cr) 10098 { 10099 int error; 10100 page_t *pp; 10101 u_offset_t io_off; 10102 size_t io_len; 10103 struct buf *bp; 10104 uint_t bsize, blksize; 10105 rnode4_t *rp = VTOR4(vp); 10106 page_t *savepp; 10107 10108 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10109 10110 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 10111 10112 mutex_enter(&rp->r_statelock); 10113 if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) { 10114 /* 10115 * If less than a block left in file read less 10116 * than a block. 
10117 */ 10118 blksize = rp->r_size - blkoff; 10119 } else 10120 blksize = bsize; 10121 mutex_exit(&rp->r_statelock); 10122 10123 pp = pvn_read_kluster(vp, blkoff, segkmap, addr, 10124 &io_off, &io_len, blkoff, blksize, 1); 10125 /* 10126 * The isra flag passed to the kluster function is 1, we may have 10127 * gotten a return value of NULL for a variety of reasons (# of free 10128 * pages < minfree, someone entered the page on the vnode etc). In all 10129 * cases, we want to punt on the readahead. 10130 */ 10131 if (pp == NULL) 10132 return; 10133 10134 /* 10135 * Now round the request size up to page boundaries. 10136 * This ensures that the entire page will be 10137 * initialized to zeroes if EOF is encountered. 10138 */ 10139 io_len = ptob(btopr(io_len)); 10140 10141 bp = pageio_setup(pp, io_len, vp, B_READ); 10142 ASSERT(bp != NULL); 10143 10144 /* 10145 * pageio_setup should have set b_addr to 0. This is correct since 10146 * we want to do I/O on a page boundary. bp_mapin() will use this addr 10147 * to calculate an offset, and then set b_addr to the kernel virtual 10148 * address it allocated for us. 10149 */ 10150 ASSERT(bp->b_un.b_addr == 0); 10151 10152 bp->b_edev = 0; 10153 bp->b_dev = 0; 10154 bp->b_lblkno = lbtodb(io_off); 10155 bp->b_file = vp; 10156 bp->b_offset = (offset_t)blkoff; 10157 bp_mapin(bp); 10158 10159 /* 10160 * If doing a write beyond what we believe is EOF, don't bother trying 10161 * to read the pages from the server, we'll just zero the pages here. 10162 * We don't check that the rw flag is S_WRITE here because some 10163 * implementations may attempt a read access to the buffer before 10164 * copying data. 
10165 */ 10166 mutex_enter(&rp->r_statelock); 10167 if (io_off >= rp->r_size && seg == segkmap) { 10168 mutex_exit(&rp->r_statelock); 10169 bzero(bp->b_un.b_addr, io_len); 10170 error = 0; 10171 } else { 10172 mutex_exit(&rp->r_statelock); 10173 error = nfs4_bio(bp, NULL, cr, TRUE); 10174 if (error == NFS_EOF) 10175 error = 0; 10176 } 10177 10178 /* 10179 * Unmap the buffer before freeing it. 10180 */ 10181 bp_mapout(bp); 10182 pageio_done(bp); 10183 10184 savepp = pp; 10185 do { 10186 pp->p_fsdata = C_NOCOMMIT; 10187 } while ((pp = pp->p_next) != savepp); 10188 10189 pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ); 10190 10191 /* 10192 * In case of error set readahead offset 10193 * to the lowest offset. 10194 * pvn_read_done() calls VN_DISPOSE to destroy the pages 10195 */ 10196 if (error && rp->r_nextr > io_off) { 10197 mutex_enter(&rp->r_statelock); 10198 if (rp->r_nextr > io_off) 10199 rp->r_nextr = io_off; 10200 mutex_exit(&rp->r_statelock); 10201 } 10202 } 10203 10204 /* 10205 * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE} 10206 * If len == 0, do from off to EOF. 10207 * 10208 * The normal cases should be len == 0 && off == 0 (entire vp list) or 10209 * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE 10210 * (from pageout). 10211 */ 10212 /* ARGSUSED */ 10213 static int 10214 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, 10215 caller_context_t *ct) 10216 { 10217 int error; 10218 rnode4_t *rp; 10219 10220 ASSERT(cr != NULL); 10221 10222 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 10223 return (EIO); 10224 10225 rp = VTOR4(vp); 10226 if (IS_SHADOW(vp, rp)) 10227 vp = RTOV4(rp); 10228 10229 /* 10230 * XXX - Why should this check be made here? 
10231 */ 10232 if (vp->v_flag & VNOMAP) 10233 return (ENOSYS); 10234 10235 if (len == 0 && !(flags & B_INVAL) && 10236 (vp->v_vfsp->vfs_flag & VFS_RDONLY)) 10237 return (0); 10238 10239 mutex_enter(&rp->r_statelock); 10240 rp->r_count++; 10241 mutex_exit(&rp->r_statelock); 10242 error = nfs4_putpages(vp, off, len, flags, cr); 10243 mutex_enter(&rp->r_statelock); 10244 rp->r_count--; 10245 cv_broadcast(&rp->r_cv); 10246 mutex_exit(&rp->r_statelock); 10247 10248 return (error); 10249 } 10250 10251 /* 10252 * Write out a single page, possibly klustering adjacent dirty pages. 10253 */ 10254 int 10255 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, 10256 int flags, cred_t *cr) 10257 { 10258 u_offset_t io_off; 10259 u_offset_t lbn_off; 10260 u_offset_t lbn; 10261 size_t io_len; 10262 uint_t bsize; 10263 int error; 10264 rnode4_t *rp; 10265 10266 ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY)); 10267 ASSERT(pp != NULL); 10268 ASSERT(cr != NULL); 10269 ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone); 10270 10271 rp = VTOR4(vp); 10272 ASSERT(rp->r_count > 0); 10273 ASSERT(!IS_SHADOW(vp, rp)); 10274 10275 bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE); 10276 lbn = pp->p_offset / bsize; 10277 lbn_off = lbn * bsize; 10278 10279 /* 10280 * Find a kluster that fits in one block, or in 10281 * one page if pages are bigger than blocks. If 10282 * there is less file space allocated than a whole 10283 * page, we'll shorten the i/o request below. 10284 */ 10285 pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off, 10286 roundup(bsize, PAGESIZE), flags); 10287 10288 /* 10289 * pvn_write_kluster shouldn't have returned a page with offset 10290 * behind the original page we were given. Verify that. 10291 */ 10292 ASSERT((pp->p_offset / bsize) >= lbn); 10293 10294 /* 10295 * Now pp will have the list of kept dirty pages marked for 10296 * write back. It will also handle invalidation and freeing 10297 * of pages that are not dirty. 
Check for page length rounding 10298 * problems. 10299 */ 10300 if (io_off + io_len > lbn_off + bsize) { 10301 ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE); 10302 io_len = lbn_off + bsize - io_off; 10303 } 10304 /* 10305 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10306 * consistent value of r_size. R4MODINPROGRESS is set in writerp4(). 10307 * When R4MODINPROGRESS is set it indicates that a uiomove() is in 10308 * progress and the r_size has not been made consistent with the 10309 * new size of the file. When the uiomove() completes the r_size is 10310 * updated and the R4MODINPROGRESS flag is cleared. 10311 * 10312 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a 10313 * consistent value of r_size. Without this handshaking, it is 10314 * possible that nfs4_bio() picks up the old value of r_size 10315 * before the uiomove() in writerp4() completes. This will result 10316 * in the write through nfs4_bio() being dropped. 10317 * 10318 * More precisely, there is a window between the time the uiomove() 10319 * completes and the time the r_size is updated. If a VOP_PUTPAGE() 10320 * operation intervenes in this window, the page will be picked up, 10321 * because it is dirty (it will be unlocked, unless it was 10322 * pagecreate'd). When the page is picked up as dirty, the dirty 10323 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is 10324 * checked. This will still be the old size. Therefore the page will 10325 * not be written out. When segmap_release() calls VOP_PUTPAGE(), 10326 * the page will be found to be clean and the write will be dropped. 10327 */ 10328 if (rp->r_flags & R4MODINPROGRESS) { 10329 mutex_enter(&rp->r_statelock); 10330 if ((rp->r_flags & R4MODINPROGRESS) && 10331 rp->r_modaddr + MAXBSIZE > io_off && 10332 rp->r_modaddr < io_off + io_len) { 10333 page_t *plist; 10334 /* 10335 * A write is in progress for this region of the file. 
10336 * If we did not detect R4MODINPROGRESS here then this 10337 * path through nfs_putapage() would eventually go to 10338 * nfs4_bio() and may not write out all of the data 10339 * in the pages. We end up losing data. So we decide 10340 * to set the modified bit on each page in the page 10341 * list and mark the rnode with R4DIRTY. This write 10342 * will be restarted at some later time. 10343 */ 10344 plist = pp; 10345 while (plist != NULL) { 10346 pp = plist; 10347 page_sub(&plist, pp); 10348 hat_setmod(pp); 10349 page_io_unlock(pp); 10350 page_unlock(pp); 10351 } 10352 rp->r_flags |= R4DIRTY; 10353 mutex_exit(&rp->r_statelock); 10354 if (offp) 10355 *offp = io_off; 10356 if (lenp) 10357 *lenp = io_len; 10358 return (0); 10359 } 10360 mutex_exit(&rp->r_statelock); 10361 } 10362 10363 if (flags & B_ASYNC) { 10364 error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr, 10365 nfs4_sync_putapage); 10366 } else 10367 error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr); 10368 10369 if (offp) 10370 *offp = io_off; 10371 if (lenp) 10372 *lenp = io_len; 10373 return (error); 10374 } 10375 10376 static int 10377 nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 10378 int flags, cred_t *cr) 10379 { 10380 int error; 10381 rnode4_t *rp; 10382 10383 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 10384 10385 flags |= B_WRITE; 10386 10387 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 10388 10389 rp = VTOR4(vp); 10390 10391 if ((error == ENOSPC || error == EDQUOT || error == EFBIG || 10392 error == EACCES) && 10393 (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) { 10394 if (!(rp->r_flags & R4OUTOFSPACE)) { 10395 mutex_enter(&rp->r_statelock); 10396 rp->r_flags |= R4OUTOFSPACE; 10397 mutex_exit(&rp->r_statelock); 10398 } 10399 flags |= B_ERROR; 10400 pvn_write_done(pp, flags); 10401 /* 10402 * If this was not an async thread, then try again to 10403 * write out the pages, but this time, also destroy 10404 * them whether or 
not the write is successful. This 10405 * will prevent memory from filling up with these 10406 * pages and destroying them is the only alternative 10407 * if they can't be written out. 10408 * 10409 * Don't do this if this is an async thread because 10410 * when the pages are unlocked in pvn_write_done, 10411 * some other thread could have come along, locked 10412 * them, and queued for an async thread. It would be 10413 * possible for all of the async threads to be tied 10414 * up waiting to lock the pages again and they would 10415 * all already be locked and waiting for an async 10416 * thread to handle them. Deadlock. 10417 */ 10418 if (!(flags & B_ASYNC)) { 10419 error = nfs4_putpage(vp, io_off, io_len, 10420 B_INVAL | B_FORCE, cr, NULL); 10421 } 10422 } else { 10423 if (error) 10424 flags |= B_ERROR; 10425 else if (rp->r_flags & R4OUTOFSPACE) { 10426 mutex_enter(&rp->r_statelock); 10427 rp->r_flags &= ~R4OUTOFSPACE; 10428 mutex_exit(&rp->r_statelock); 10429 } 10430 pvn_write_done(pp, flags); 10431 if (freemem < desfree) 10432 (void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr, 10433 NFS4_WRITE_NOWAIT); 10434 } 10435 10436 return (error); 10437 } 10438 10439 #ifdef DEBUG 10440 int nfs4_force_open_before_mmap = 0; 10441 #endif 10442 10443 /* ARGSUSED */ 10444 static int 10445 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, 10446 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 10447 caller_context_t *ct) 10448 { 10449 struct segvn_crargs vn_a; 10450 int error = 0; 10451 rnode4_t *rp = VTOR4(vp); 10452 mntinfo4_t *mi = VTOMI4(vp); 10453 10454 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10455 return (EIO); 10456 10457 if (vp->v_flag & VNOMAP) 10458 return (ENOSYS); 10459 10460 if (off < 0 || (off + len) < 0) 10461 return (ENXIO); 10462 10463 if (vp->v_type != VREG) 10464 return (ENODEV); 10465 10466 /* 10467 * If the file is delegated to the client don't do anything. 10468 * If the file is not delegated, then validate the data cache. 
10469 */ 10470 mutex_enter(&rp->r_statev4_lock); 10471 if (rp->r_deleg_type == OPEN_DELEGATE_NONE) { 10472 mutex_exit(&rp->r_statev4_lock); 10473 error = nfs4_validate_caches(vp, cr); 10474 if (error) 10475 return (error); 10476 } else { 10477 mutex_exit(&rp->r_statev4_lock); 10478 } 10479 10480 /* 10481 * Check to see if the vnode is currently marked as not cachable. 10482 * This means portions of the file are locked (through VOP_FRLOCK). 10483 * In this case the map request must be refused. We use 10484 * rp->r_lkserlock to avoid a race with concurrent lock requests. 10485 * 10486 * Atomically increment r_inmap after acquiring r_rwlock. The 10487 * idea here is to acquire r_rwlock to block read/write and 10488 * not to protect r_inmap. r_inmap will inform nfs4_read/write() 10489 * that we are in nfs4_map(). Now, r_rwlock is acquired in order 10490 * and we can prevent the deadlock that would have occurred 10491 * when nfs4_addmap() would have acquired it out of order. 10492 * 10493 * Since we are not protecting r_inmap by any lock, we do not 10494 * hold any lock when we decrement it. We atomically decrement 10495 * r_inmap after we release r_lkserlock. 10496 */ 10497 10498 if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR4(vp))) 10499 return (EINTR); 10500 atomic_inc_uint(&rp->r_inmap); 10501 nfs_rw_exit(&rp->r_rwlock); 10502 10503 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) { 10504 atomic_dec_uint(&rp->r_inmap); 10505 return (EINTR); 10506 } 10507 10508 10509 if (vp->v_flag & VNOCACHE) { 10510 error = EAGAIN; 10511 goto done; 10512 } 10513 10514 /* 10515 * Don't allow concurrent locks and mapping if mandatory locking is 10516 * enabled. 
10517 */ 10518 if (flk_has_remote_locks(vp)) { 10519 struct vattr va; 10520 va.va_mask = AT_MODE; 10521 error = nfs4getattr(vp, &va, cr); 10522 if (error != 0) 10523 goto done; 10524 if (MANDLOCK(vp, va.va_mode)) { 10525 error = EAGAIN; 10526 goto done; 10527 } 10528 } 10529 10530 /* 10531 * It is possible that the rnode has a lost lock request that we 10532 * are still trying to recover, and that the request conflicts with 10533 * this map request. 10534 * 10535 * An alternative approach would be for nfs4_safemap() to consider 10536 * queued lock requests when deciding whether to set or clear 10537 * VNOCACHE. This would require the frlock code path to call 10538 * nfs4_safemap() after enqueing a lost request. 10539 */ 10540 if (nfs4_map_lost_lock_conflict(vp)) { 10541 error = EAGAIN; 10542 goto done; 10543 } 10544 10545 as_rangelock(as); 10546 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); 10547 if (error != 0) { 10548 as_rangeunlock(as); 10549 goto done; 10550 } 10551 10552 if (vp->v_type == VREG) { 10553 /* 10554 * We need to retrieve the open stream 10555 */ 10556 nfs4_open_stream_t *osp = NULL; 10557 nfs4_open_owner_t *oop = NULL; 10558 10559 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10560 if (oop != NULL) { 10561 /* returns with 'os_sync_lock' held */ 10562 osp = find_open_stream(oop, rp); 10563 open_owner_rele(oop); 10564 } 10565 if (osp == NULL) { 10566 #ifdef DEBUG 10567 if (nfs4_force_open_before_mmap) { 10568 error = EIO; 10569 goto done; 10570 } 10571 #endif 10572 /* returns with 'os_sync_lock' held */ 10573 error = open_and_get_osp(vp, cr, &osp); 10574 if (osp == NULL) { 10575 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10576 "nfs4_map: we tried to OPEN the file " 10577 "but again no osp, so fail with EIO")); 10578 goto done; 10579 } 10580 } 10581 10582 if (osp->os_failed_reopen) { 10583 mutex_exit(&osp->os_sync_lock); 10584 open_stream_rele(osp, rp); 10585 NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, 10586 "nfs4_map: 
os_failed_reopen set on " 10587 "osp %p, cr %p, rp %s", (void *)osp, 10588 (void *)cr, rnode4info(rp))); 10589 error = EIO; 10590 goto done; 10591 } 10592 mutex_exit(&osp->os_sync_lock); 10593 open_stream_rele(osp, rp); 10594 } 10595 10596 vn_a.vp = vp; 10597 vn_a.offset = off; 10598 vn_a.type = (flags & MAP_TYPE); 10599 vn_a.prot = (uchar_t)prot; 10600 vn_a.maxprot = (uchar_t)maxprot; 10601 vn_a.flags = (flags & ~MAP_TYPE); 10602 vn_a.cred = cr; 10603 vn_a.amp = NULL; 10604 vn_a.szc = 0; 10605 vn_a.lgrp_mem_policy_flags = 0; 10606 10607 error = as_map(as, *addrp, len, segvn_create, &vn_a); 10608 as_rangeunlock(as); 10609 10610 done: 10611 nfs_rw_exit(&rp->r_lkserlock); 10612 atomic_dec_uint(&rp->r_inmap); 10613 return (error); 10614 } 10615 10616 /* 10617 * We're most likely dealing with a kernel module that likes to READ 10618 * and mmap without OPENing the file (ie: lookup/read/mmap), so lets 10619 * officially OPEN the file to create the necessary client state 10620 * for bookkeeping of os_mmap_read/write counts. 10621 * 10622 * Since VOP_MAP only passes in a pointer to the vnode rather than 10623 * a double pointer, we can't handle the case where nfs4open_otw() 10624 * returns a different vnode than the one passed into VOP_MAP (since 10625 * VOP_DELMAP will not see the vnode nfs4open_otw used). In this case, 10626 * we return NULL and let nfs4_map() fail. Note: the only case where 10627 * this should happen is if the file got removed and replaced with the 10628 * same name on the server (in addition to the fact that we're trying 10629 * to VOP_MAP withouth VOP_OPENing the file in the first place). 
10630 */ 10631 static int 10632 open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp) 10633 { 10634 rnode4_t *rp, *drp; 10635 vnode_t *dvp, *open_vp; 10636 char file_name[MAXNAMELEN]; 10637 int just_created; 10638 nfs4_open_stream_t *osp; 10639 nfs4_open_owner_t *oop; 10640 int error; 10641 10642 *ospp = NULL; 10643 open_vp = map_vp; 10644 10645 rp = VTOR4(open_vp); 10646 if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0) 10647 return (error); 10648 drp = VTOR4(dvp); 10649 10650 if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) { 10651 VN_RELE(dvp); 10652 return (EINTR); 10653 } 10654 10655 if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) { 10656 nfs_rw_exit(&drp->r_rwlock); 10657 VN_RELE(dvp); 10658 return (error); 10659 } 10660 10661 mutex_enter(&rp->r_statev4_lock); 10662 if (rp->created_v4) { 10663 rp->created_v4 = 0; 10664 mutex_exit(&rp->r_statev4_lock); 10665 10666 dnlc_update(dvp, file_name, open_vp); 10667 /* This is needed so we don't bump the open ref count */ 10668 just_created = 1; 10669 } else { 10670 mutex_exit(&rp->r_statev4_lock); 10671 just_created = 0; 10672 } 10673 10674 VN_HOLD(map_vp); 10675 10676 error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0, 10677 just_created); 10678 if (error) { 10679 nfs_rw_exit(&drp->r_rwlock); 10680 VN_RELE(dvp); 10681 VN_RELE(map_vp); 10682 return (error); 10683 } 10684 10685 nfs_rw_exit(&drp->r_rwlock); 10686 VN_RELE(dvp); 10687 10688 /* 10689 * If nfs4open_otw() returned a different vnode then "undo" 10690 * the open and return failure to the caller. 10691 */ 10692 if (!VN_CMP(open_vp, map_vp)) { 10693 nfs4_error_t e; 10694 10695 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10696 "open returned a different vnode")); 10697 /* 10698 * If there's an error, ignore it, 10699 * and let VOP_INACTIVE handle it. 
10700 */ 10701 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10702 CLOSE_NORM, 0, 0, 0); 10703 VN_RELE(map_vp); 10704 return (EIO); 10705 } 10706 10707 VN_RELE(map_vp); 10708 10709 oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp)); 10710 if (!oop) { 10711 nfs4_error_t e; 10712 10713 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: " 10714 "no open owner")); 10715 /* 10716 * If there's an error, ignore it, 10717 * and let VOP_INACTIVE handle it. 10718 */ 10719 (void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e, 10720 CLOSE_NORM, 0, 0, 0); 10721 return (EIO); 10722 } 10723 osp = find_open_stream(oop, rp); 10724 open_owner_rele(oop); 10725 *ospp = osp; 10726 return (0); 10727 } 10728 10729 /* 10730 * Please be aware that when this function is called, the address space write 10731 * a_lock is held. Do not put over the wire calls in this function. 10732 */ 10733 /* ARGSUSED */ 10734 static int 10735 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 10736 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr, 10737 caller_context_t *ct) 10738 { 10739 rnode4_t *rp; 10740 int error = 0; 10741 mntinfo4_t *mi; 10742 10743 mi = VTOMI4(vp); 10744 rp = VTOR4(vp); 10745 10746 if (nfs_zone() != mi->mi_zone) 10747 return (EIO); 10748 if (vp->v_flag & VNOMAP) 10749 return (ENOSYS); 10750 10751 /* 10752 * Don't need to update the open stream first, since this 10753 * mmap can't add any additional share access that isn't 10754 * already contained in the open stream (for the case where we 10755 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't 10756 * take into account os_mmap_read[write] counts). 10757 */ 10758 atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len)); 10759 10760 if (vp->v_type == VREG) { 10761 /* 10762 * We need to retrieve the open stream and update the counts. 10763 * If there is no open stream here, something is wrong. 
10764 */ 10765 nfs4_open_stream_t *osp = NULL; 10766 nfs4_open_owner_t *oop = NULL; 10767 10768 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 10769 if (oop != NULL) { 10770 /* returns with 'os_sync_lock' held */ 10771 osp = find_open_stream(oop, rp); 10772 open_owner_rele(oop); 10773 } 10774 if (osp == NULL) { 10775 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, 10776 "nfs4_addmap: we should have an osp" 10777 "but we don't, so fail with EIO")); 10778 error = EIO; 10779 goto out; 10780 } 10781 10782 NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p," 10783 " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot)); 10784 10785 /* 10786 * Update the map count in the open stream. 10787 * This is necessary in the case where we 10788 * open/mmap/close/, then the server reboots, and we 10789 * attempt to reopen. If the mmap doesn't add share 10790 * access then we send an invalid reopen with 10791 * access = NONE. 10792 * 10793 * We need to specifically check each PROT_* so a mmap 10794 * call of (PROT_WRITE | PROT_EXEC) will ensure us both 10795 * read and write access. A simple comparison of prot 10796 * to ~PROT_WRITE to determine read access is insufficient 10797 * since prot can be |= with PROT_USER, etc. 10798 */ 10799 10800 /* 10801 * Unless we're MAP_SHARED, no sense in adding os_mmap_write 10802 */ 10803 if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 10804 osp->os_mmap_write += btopr(len); 10805 if (maxprot & PROT_READ) 10806 osp->os_mmap_read += btopr(len); 10807 if (maxprot & PROT_EXEC) 10808 osp->os_mmap_read += btopr(len); 10809 /* 10810 * Ensure that os_mmap_read gets incremented, even if 10811 * maxprot were to look like PROT_NONE. 
10812 */ 10813 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 10814 !(maxprot & PROT_EXEC)) 10815 osp->os_mmap_read += btopr(len); 10816 osp->os_mapcnt += btopr(len); 10817 mutex_exit(&osp->os_sync_lock); 10818 open_stream_rele(osp, rp); 10819 } 10820 10821 out: 10822 /* 10823 * If we got an error, then undo our 10824 * incrementing of 'r_mapcnt'. 10825 */ 10826 10827 if (error) { 10828 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len)); 10829 ASSERT(rp->r_mapcnt >= 0); 10830 } 10831 return (error); 10832 } 10833 10834 /* ARGSUSED */ 10835 static int 10836 nfs4_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct) 10837 { 10838 10839 return (VTOR4(vp1) == VTOR4(vp2)); 10840 } 10841 10842 /* 10843 * Data structure for nfs4_lkserlock_callback() function. 10844 */ 10845 struct nfs4_lkserlock_callback_data { 10846 vnode_t *vp; 10847 int rc; 10848 }; 10849 10850 /* 10851 * Callback function for reclock(). 10852 */ 10853 static callb_cpr_t * 10854 nfs4_lkserlock_callback(flk_cb_when_t when, void *infop) 10855 { 10856 struct nfs4_lkserlock_callback_data *dp = 10857 (struct nfs4_lkserlock_callback_data *)infop; 10858 rnode4_t *rp = VTOR4(dp->vp); 10859 10860 if (when == FLK_BEFORE_SLEEP) 10861 nfs_rw_exit(&rp->r_lkserlock); 10862 else 10863 dp->rc = nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, 10864 INTR4(dp->vp)); 10865 10866 return (NULL); 10867 } 10868 10869 /* ARGSUSED */ 10870 static int 10871 nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 10872 offset_t offset, struct flk_callback *flk_cbp, cred_t *cr, 10873 caller_context_t *ct) 10874 { 10875 int rc = 0; 10876 rnode4_t *rp; 10877 int intr = INTR4(vp); 10878 nfs4_error_t e; 10879 int frcmd; 10880 struct lm_sysid *ls = NULL; 10881 10882 if (nfs_zone() != VTOMI4(vp)->mi_zone) 10883 return (EIO); 10884 10885 /* check for valid cmd parameter and set frcmd appropriately */ 10886 switch (cmd) { 10887 case F_GETLK: 10888 frcmd = 0; 10889 break; 10890 case F_SETLK: 10891 frcmd = SETFLCK; 
10892 break; 10893 case F_SETLKW: 10894 frcmd = SETFLCK | SLPFLCK; 10895 break; 10896 default: 10897 return (EINVAL); 10898 } 10899 10900 /* 10901 * If lock is relative to EOF, we need the newest length of the file. 10902 * Therefore invalidate the ATTR_CACHE. 10903 */ 10904 if (bfp->l_whence == 2) /* SEEK_END */ 10905 PURGE_ATTRCACHE4(vp); 10906 10907 /* 10908 * If the filesystem is mounted using local locking, pass the 10909 * request off to the local locking code. 10910 */ 10911 if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) { 10912 if (cmd == F_SETLK || cmd == F_SETLKW) { 10913 /* 10914 * For complete safety, we should be holding 10915 * r_lkserlock. However, we can't call 10916 * nfs4_safelock and then fs_frlock while 10917 * holding r_lkserlock, so just invoke 10918 * nfs4_safelock and expect that this will 10919 * catch enough of the cases. 10920 */ 10921 if (!nfs4_safelock(vp, bfp, cr)) 10922 return (EAGAIN); 10923 } 10924 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 10925 } 10926 10927 /* 10928 * Convert the offset. We need to do this to make sure our view of the 10929 * locking range is always the same through the rest of this function. 10930 * This is especially needed for bfp->l_whence == SEEK_END, because the 10931 * length of the file could change anytime and thus the locking range 10932 * would be a moving target for us. 10933 * 10934 * For the bfp->l_whence == SEEK_CUR case this is just a convenient 10935 * conversion to make the life easier for nfs4frlock(). 10936 */ 10937 rc = convoff(vp, bfp, 0, offset); 10938 if (rc != 0) 10939 return (rc); 10940 10941 if (bfp->l_type == F_UNLCK) { 10942 u_offset_t start, end; 10943 10944 /* 10945 * Shortcut for trivial case. 10946 */ 10947 if (cmd == F_GETLK) 10948 return (rc); 10949 10950 /* 10951 * For every lock or unlock request we need to do two steps: 10952 * (un)register the local lock, and (un)register the lock at 10953 * the NFSv4 server. 
It is essential to make sure the lock 10954 * status registered at the server and registered locally is 10955 * same and never goes out of sync. This means that if one 10956 * step fails, the other one needs to be either skipped, or 10957 * reverted. 10958 * 10959 * For lock requests the situation is easy since a lock 10960 * registration can be reverted without any risk of data 10961 * corruption. 10962 * 10963 * The unlock requests cannot be reverted because once a lock 10964 * is unregistered the race window is open and some other 10965 * process could grab a conflicting lock. This means that once 10966 * the first step (the first lock unregistration) succeeded, 10967 * the second step cannot fail. The second step for the unlock 10968 * request is the local lock unregistration by the reclock() 10969 * call. 10970 * 10971 * The only way how the reclock() call for an unlock request 10972 * could fail is the invalid unlock range so we check it here, 10973 * before the lock is unregistered at NFSv4 server. This 10974 * duplicates the check done in the reclock() function. 10975 */ 10976 rc = flk_convert_lock_data(vp, bfp, &start, &end, offset); 10977 if (rc != 0) 10978 return (rc); 10979 rc = flk_check_lock_data(start, end, MAXEND); 10980 if (rc != 0) 10981 return (rc); 10982 10983 intr = 0; 10984 } 10985 10986 /* 10987 * For F_SETLK and F_SETLKW we need to set sysid. 10988 */ 10989 if (cmd == F_SETLK || cmd == F_SETLKW) { 10990 rc = nfs4frlock_get_sysid(&ls, vp, bfp); 10991 if (rc != 0) 10992 return (rc); 10993 10994 /* 10995 * Client locks are registerred locally by oring the sysid with 10996 * LM_SYSID_CLIENT. The server registers locks locally using 10997 * just the sysid. We need to distinguish between the two to 10998 * avoid collision in a case one machine is used as both client 10999 * and server. 
11000 */ 11001 bfp->l_sysid |= LM_SYSID_CLIENT; 11002 } 11003 11004 bfp->l_pid = curproc->p_pid; 11005 11006 rp = VTOR4(vp); 11007 11008 /* 11009 * Check whether the given lock request can proceed, given the 11010 * current file mappings. 11011 */ 11012 if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr)) { 11013 if (ls != NULL) 11014 lm_rel_sysid(ls); 11015 return (EINTR); 11016 } 11017 if (cmd == F_SETLK || cmd == F_SETLKW) { 11018 if (!nfs4_safelock(vp, bfp, cr)) { 11019 rc = EAGAIN; 11020 goto done; 11021 } 11022 } 11023 11024 /* 11025 * For query we will try to find a conflicting local lock first by 11026 * calling reclock(). 11027 * 11028 * In a case this is a lock request we need to register it locally 11029 * first before we consult the NFSv4 server. 11030 */ 11031 if (cmd == F_GETLK || bfp->l_type != F_UNLCK) { 11032 /* 11033 * If we might sleep in reclock() we need to register a 11034 * callback to release the r_lkserlock during the sleep. 11035 */ 11036 if ((frcmd & SLPFLCK) == 0) { 11037 rc = reclock(vp, bfp, frcmd, flag, 0, flk_cbp); 11038 } else { 11039 flk_callback_t callback; 11040 struct nfs4_lkserlock_callback_data callback_data = 11041 {vp, 0}; 11042 11043 flk_add_callback(&callback, nfs4_lkserlock_callback, 11044 &callback_data, flk_cbp); 11045 rc = reclock(vp, bfp, frcmd, flag, 0, &callback); 11046 flk_del_callback(&callback); 11047 11048 if (callback_data.rc != 0) { 11049 /* 11050 * The nfs_rw_enter_sig() call in 11051 * nfs4_lkserlock_callback() failed. 11052 */ 11053 11054 if (rc == 0) { 11055 /* 11056 * The reclock() call above succeeded 11057 * so we need to revert it. 11058 */ 11059 bfp->l_type = F_UNLCK; 11060 rc = reclock(vp, bfp, frcmd, flag, 0, 11061 flk_cbp); 11062 /* The unlock cannot fail */ 11063 ASSERT(rc == 0); 11064 11065 /* 11066 * We are here because we failed to 11067 * acquire r_lkserlock in 11068 * nfs4_lkserlock_callback() due to a 11069 * signal. Return the appropriate 11070 * error. 
11071 */ 11072 rc = EINTR; 11073 } 11074 11075 ASSERT(ls != NULL); 11076 lm_rel_sysid(ls); 11077 11078 return (rc); 11079 } 11080 11081 /* 11082 * We possibly released r_lkserlock in reclock() so 11083 * make sure it is still safe to lock the file. 11084 */ 11085 if (!nfs4_safelock(vp, bfp, cr)) { 11086 rc = EAGAIN; 11087 goto revert; 11088 } 11089 11090 } 11091 11092 /* 11093 * If the reclock() call failed we are done and we will return 11094 * an error to the caller. Similarly, if we found a 11095 * conflicting lock registered locally we are done too. We do 11096 * not need to consult the server. 11097 */ 11098 if ((rc != 0) || (cmd == F_GETLK && bfp->l_type != F_UNLCK)) 11099 goto done; 11100 } 11101 11102 /* 11103 * Flush the cache after waiting for async I/O to finish. For new 11104 * locks, this is so that the process gets the latest bits from the 11105 * server. For unlocks, this is so that other clients see the 11106 * latest bits once the file has been unlocked. If currently dirty 11107 * pages can't be flushed, then don't allow a lock to be set. But 11108 * allow unlocks to succeed, to avoid having orphan locks on the 11109 * server. 
11110 */ 11111 if (cmd != F_GETLK) { 11112 mutex_enter(&rp->r_statelock); 11113 while (rp->r_count > 0) { 11114 if (intr) { 11115 klwp_t *lwp = ttolwp(curthread); 11116 11117 if (lwp != NULL) 11118 lwp->lwp_nostop++; 11119 if (cv_wait_sig(&rp->r_cv, 11120 &rp->r_statelock) == 0) { 11121 if (lwp != NULL) 11122 lwp->lwp_nostop--; 11123 rc = EINTR; 11124 break; 11125 } 11126 if (lwp != NULL) 11127 lwp->lwp_nostop--; 11128 } else 11129 cv_wait(&rp->r_cv, &rp->r_statelock); 11130 } 11131 mutex_exit(&rp->r_statelock); 11132 if (rc != 0) { 11133 ASSERT(bfp->l_type != F_UNLCK); 11134 11135 goto revert; 11136 } 11137 11138 rc = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct); 11139 if (rc != 0) { 11140 if (rc == ENOSPC || rc == EDQUOT) { 11141 mutex_enter(&rp->r_statelock); 11142 if (!rp->r_error) 11143 rp->r_error = rc; 11144 mutex_exit(&rp->r_statelock); 11145 } 11146 11147 /* 11148 * If this was a lock request, make sure it is 11149 * reverted. 11150 */ 11151 if (bfp->l_type != F_UNLCK) { 11152 rc = ENOLCK; 11153 goto revert; 11154 } 11155 } 11156 } 11157 11158 /* 11159 * Call the lock manager to do the real work of contacting 11160 * the server and obtaining the lock. 11161 */ 11162 nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, cr, &e, NULL, NULL); 11163 rc = e.error; 11164 11165 if (rc == 0) 11166 nfs4_lockcompletion(vp, cmd); 11167 11168 revert: 11169 /* 11170 * If this is either successful unlock request or a lock request that 11171 * failed we should unregister/revert the local lock now. 
11172 */ 11173 if ((rc == 0 && cmd != F_GETLK && bfp->l_type == F_UNLCK) || 11174 (rc != 0 && cmd != F_GETLK && bfp->l_type != F_UNLCK)) { 11175 int r; 11176 11177 bfp->l_type = F_UNLCK; 11178 r = reclock(vp, bfp, frcmd, flag, 0, flk_cbp); 11179 /* The unlock cannot fail */ 11180 ASSERT(r == 0); 11181 } 11182 11183 done: 11184 nfs_rw_exit(&rp->r_lkserlock); 11185 if (ls != NULL) 11186 lm_rel_sysid(ls); 11187 11188 return (rc); 11189 } 11190 11191 /* 11192 * Free storage space associated with the specified vnode. The portion 11193 * to be freed is specified by bfp->l_start and bfp->l_len (already 11194 * normalized to a "whence" of 0). 11195 * 11196 * This is an experimental facility whose continued existence is not 11197 * guaranteed. Currently, we only support the special case 11198 * of l_len == 0, meaning free to end of file. 11199 */ 11200 /* ARGSUSED */ 11201 static int 11202 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag, 11203 offset_t offset, cred_t *cr, caller_context_t *ct) 11204 { 11205 int error; 11206 11207 if (nfs_zone() != VTOMI4(vp)->mi_zone) 11208 return (EIO); 11209 ASSERT(vp->v_type == VREG); 11210 if (cmd != F_FREESP) 11211 return (EINVAL); 11212 11213 error = convoff(vp, bfp, 0, offset); 11214 if (!error) { 11215 ASSERT(bfp->l_start >= 0); 11216 if (bfp->l_len == 0) { 11217 struct vattr va; 11218 11219 va.va_mask = AT_SIZE; 11220 va.va_size = bfp->l_start; 11221 error = nfs4setattr(vp, &va, 0, cr, NULL); 11222 11223 if (error == 0 && bfp->l_start == 0) 11224 vnevent_truncate(vp, ct); 11225 } else 11226 error = EINVAL; 11227 } 11228 11229 return (error); 11230 } 11231 11232 /* ARGSUSED */ 11233 int 11234 nfs4_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct) 11235 { 11236 rnode4_t *rp; 11237 rp = VTOR4(vp); 11238 11239 if (vp->v_type == VREG && IS_SHADOW(vp, rp)) { 11240 vp = RTOV4(rp); 11241 } 11242 *vpp = vp; 11243 return (0); 11244 } 11245 11246 /* 11247 * Setup and add an address space callback to do the work of the 
delmap call. 11248 * The callback will (and must be) deleted in the actual callback function. 11249 * 11250 * This is done in order to take care of the problem that we have with holding 11251 * the address space's a_lock for a long period of time (e.g. if the NFS server 11252 * is down). Callbacks will be executed in the address space code while the 11253 * a_lock is not held. Holding the address space's a_lock causes things such 11254 * as ps and fork to hang because they are trying to acquire this lock as well. 11255 */ 11256 /* ARGSUSED */ 11257 static int 11258 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, 11259 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr, 11260 caller_context_t *ct) 11261 { 11262 int caller_found; 11263 int error; 11264 rnode4_t *rp; 11265 nfs4_delmap_args_t *dmapp; 11266 nfs4_delmapcall_t *delmap_call; 11267 11268 if (vp->v_flag & VNOMAP) 11269 return (ENOSYS); 11270 11271 /* 11272 * A process may not change zones if it has NFS pages mmap'ed 11273 * in, so we can't legitimately get here from the wrong zone. 11274 */ 11275 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11276 11277 rp = VTOR4(vp); 11278 11279 /* 11280 * The way that the address space of this process deletes its mapping 11281 * of this file is via the following call chains: 11282 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 11283 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap() 11284 * 11285 * With the use of address space callbacks we are allowed to drop the 11286 * address space lock, a_lock, while executing the NFS operations that 11287 * need to go over the wire. Returning EAGAIN to the caller of this 11288 * function is what drives the execution of the callback that we add 11289 * below. The callback will be executed by the address space code 11290 * after dropping the a_lock. 
When the callback is finished, since 11291 * we dropped the a_lock, it must be re-acquired and segvn_unmap() 11292 * is called again on the same segment to finish the rest of the work 11293 * that needs to happen during unmapping. 11294 * 11295 * This action of calling back into the segment driver causes 11296 * nfs4_delmap() to get called again, but since the callback was 11297 * already executed at this point, it already did the work and there 11298 * is nothing left for us to do. 11299 * 11300 * To Summarize: 11301 * - The first time nfs4_delmap is called by the current thread is when 11302 * we add the caller associated with this delmap to the delmap caller 11303 * list, add the callback, and return EAGAIN. 11304 * - The second time in this call chain when nfs4_delmap is called we 11305 * will find this caller in the delmap caller list and realize there 11306 * is no more work to do thus removing this caller from the list and 11307 * returning the error that was set in the callback execution. 11308 */ 11309 caller_found = nfs4_find_and_delete_delmapcall(rp, &error); 11310 if (caller_found) { 11311 /* 11312 * 'error' is from the actual delmap operations. To avoid 11313 * hangs, we need to handle the return of EAGAIN differently 11314 * since this is what drives the callback execution. 11315 * In this case, we don't want to return EAGAIN and do the 11316 * callback execution because there are none to execute. 
11317 */ 11318 if (error == EAGAIN) 11319 return (0); 11320 else 11321 return (error); 11322 } 11323 11324 /* current caller was not in the list */ 11325 delmap_call = nfs4_init_delmapcall(); 11326 11327 mutex_enter(&rp->r_statelock); 11328 list_insert_tail(&rp->r_indelmap, delmap_call); 11329 mutex_exit(&rp->r_statelock); 11330 11331 dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP); 11332 11333 dmapp->vp = vp; 11334 dmapp->off = off; 11335 dmapp->addr = addr; 11336 dmapp->len = len; 11337 dmapp->prot = prot; 11338 dmapp->maxprot = maxprot; 11339 dmapp->flags = flags; 11340 dmapp->cr = cr; 11341 dmapp->caller = delmap_call; 11342 11343 error = as_add_callback(as, nfs4_delmap_callback, dmapp, 11344 AS_UNMAP_EVENT, addr, len, KM_SLEEP); 11345 11346 return (error ? error : EAGAIN); 11347 } 11348 11349 static nfs4_delmapcall_t * 11350 nfs4_init_delmapcall() 11351 { 11352 nfs4_delmapcall_t *delmap_call; 11353 11354 delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP); 11355 delmap_call->call_id = curthread; 11356 delmap_call->error = 0; 11357 11358 return (delmap_call); 11359 } 11360 11361 static void 11362 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call) 11363 { 11364 kmem_free(delmap_call, sizeof (nfs4_delmapcall_t)); 11365 } 11366 11367 /* 11368 * Searches for the current delmap caller (based on curthread) in the list of 11369 * callers. If it is found, we remove it and free the delmap caller. 11370 * Returns: 11371 * 0 if the caller wasn't found 11372 * 1 if the caller was found, removed and freed. *errp will be set 11373 * to what the result of the delmap was. 11374 */ 11375 static int 11376 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp) 11377 { 11378 nfs4_delmapcall_t *delmap_call; 11379 11380 /* 11381 * If the list doesn't exist yet, we create it and return 11382 * that the caller wasn't found. No list = no callers. 
11383 */ 11384 mutex_enter(&rp->r_statelock); 11385 if (!(rp->r_flags & R4DELMAPLIST)) { 11386 /* The list does not exist */ 11387 list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t), 11388 offsetof(nfs4_delmapcall_t, call_node)); 11389 rp->r_flags |= R4DELMAPLIST; 11390 mutex_exit(&rp->r_statelock); 11391 return (0); 11392 } else { 11393 /* The list exists so search it */ 11394 for (delmap_call = list_head(&rp->r_indelmap); 11395 delmap_call != NULL; 11396 delmap_call = list_next(&rp->r_indelmap, delmap_call)) { 11397 if (delmap_call->call_id == curthread) { 11398 /* current caller is in the list */ 11399 *errp = delmap_call->error; 11400 list_remove(&rp->r_indelmap, delmap_call); 11401 mutex_exit(&rp->r_statelock); 11402 nfs4_free_delmapcall(delmap_call); 11403 return (1); 11404 } 11405 } 11406 } 11407 mutex_exit(&rp->r_statelock); 11408 return (0); 11409 } 11410 11411 /* 11412 * Remove some pages from an mmap'd vnode. Just update the 11413 * count of pages. If doing close-to-open, then flush and 11414 * commit all of the pages associated with this file. 11415 * Otherwise, start an asynchronous page flush to write out 11416 * any dirty pages. This will also associate a credential 11417 * with the rnode which can be used to write the pages. 11418 */ 11419 /* ARGSUSED */ 11420 static void 11421 nfs4_delmap_callback(struct as *as, void *arg, uint_t event) 11422 { 11423 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 11424 rnode4_t *rp; 11425 mntinfo4_t *mi; 11426 nfs4_delmap_args_t *dmapp = (nfs4_delmap_args_t *)arg; 11427 11428 rp = VTOR4(dmapp->vp); 11429 mi = VTOMI4(dmapp->vp); 11430 11431 atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len)); 11432 ASSERT(rp->r_mapcnt >= 0); 11433 11434 /* 11435 * Initiate a page flush and potential commit if there are 11436 * pages, the file system was not mounted readonly, the segment 11437 * was mapped shared, and the pages themselves were writeable. 
11438 */ 11439 if (nfs4_has_pages(dmapp->vp) && 11440 !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) && 11441 dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) { 11442 mutex_enter(&rp->r_statelock); 11443 rp->r_flags |= R4DIRTY; 11444 mutex_exit(&rp->r_statelock); 11445 e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off, 11446 dmapp->len, dmapp->cr); 11447 if (!e.error) { 11448 mutex_enter(&rp->r_statelock); 11449 e.error = rp->r_error; 11450 rp->r_error = 0; 11451 mutex_exit(&rp->r_statelock); 11452 } 11453 } else 11454 e.error = 0; 11455 11456 if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) 11457 (void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len, 11458 B_INVAL, dmapp->cr, NULL); 11459 11460 if (e.error) { 11461 e.stat = puterrno4(e.error); 11462 nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0, 11463 OP_COMMIT, FALSE, NULL, 0, dmapp->vp); 11464 dmapp->caller->error = e.error; 11465 } 11466 11467 /* Check to see if we need to close the file */ 11468 11469 if (dmapp->vp->v_type == VREG) { 11470 nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e, 11471 CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags); 11472 11473 if (e.error != 0 || e.stat != NFS4_OK) { 11474 /* 11475 * Since it is possible that e.error == 0 and 11476 * e.stat != NFS4_OK (and vice versa), 11477 * we do the proper checking in order to get both 11478 * e.error and e.stat reporting the correct info. 
/*
 * Convert a maximum file size into a bit count: the result is the
 * 1-based position of the most significant set bit of ll, i.e. the
 * number of bits needed to hold ll as an unsigned value.  A size of 0
 * yields 0.
 */
static uint_t
fattr4_maxfilesize_to_bits(uint64_t ll)
{
	uint_t nbits = 0;

	/* Shift the value out one bit at a time, counting as we go. */
	while (ll != 0) {
		nbits++;
		ll >>= 1;
	}

	return (nbits);
}
determining whether generic user attributes exists. 11570 * The attribute directory could only be a transient directory 11571 * used for Solaris sysattr support. Do a small readdir 11572 * to verify if the only entries are sysattrs or not. 11573 * 11574 * pc4_xattr_valid can be only be trusted when r_xattr_dir 11575 * is NULL. Once the xadir vp exists, we can create xattrs, 11576 * and we don't have any way to update the "base" object's 11577 * pc4_xattr_exists from the xattr or xadir. Maybe FEM 11578 * could help out. 11579 */ 11580 if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid && 11581 rp->r_xattr_dir == NULL) { 11582 return (nfs4_have_xattrs(vp, valp, cr)); 11583 } 11584 } else { /* OLD CODE */ 11585 if (ATTRCACHE4_VALID(vp)) { 11586 mutex_enter(&rp->r_statelock); 11587 if (rp->r_pathconf.pc4_cache_valid) { 11588 error = 0; 11589 switch (cmd) { 11590 case _PC_FILESIZEBITS: 11591 *valp = 11592 rp->r_pathconf.pc4_filesizebits; 11593 break; 11594 case _PC_LINK_MAX: 11595 *valp = 11596 rp->r_pathconf.pc4_link_max; 11597 break; 11598 case _PC_NAME_MAX: 11599 *valp = 11600 rp->r_pathconf.pc4_name_max; 11601 break; 11602 case _PC_CHOWN_RESTRICTED: 11603 *valp = 11604 rp->r_pathconf.pc4_chown_restricted; 11605 break; 11606 case _PC_NO_TRUNC: 11607 *valp = 11608 rp->r_pathconf.pc4_no_trunc; 11609 break; 11610 default: 11611 error = EINVAL; 11612 break; 11613 } 11614 mutex_exit(&rp->r_statelock); 11615 #ifdef DEBUG 11616 nfs4_pathconf_cache_hits++; 11617 #endif 11618 return (error); 11619 } 11620 mutex_exit(&rp->r_statelock); 11621 } 11622 } 11623 #ifdef DEBUG 11624 nfs4_pathconf_cache_misses++; 11625 #endif 11626 11627 t = gethrtime(); 11628 11629 error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr); 11630 11631 if (error) { 11632 mutex_enter(&rp->r_statelock); 11633 rp->r_pathconf.pc4_cache_valid = FALSE; 11634 rp->r_pathconf.pc4_xattr_valid = FALSE; 11635 mutex_exit(&rp->r_statelock); 11636 return (error); 11637 } 11638 11639 /* interpret 
the max filesize */ 11640 gar.n4g_ext_res->n4g_pc4.pc4_filesizebits = 11641 fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize); 11642 11643 /* Store the attributes we just received */ 11644 nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL); 11645 11646 switch (cmd) { 11647 case _PC_FILESIZEBITS: 11648 *valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits; 11649 break; 11650 case _PC_LINK_MAX: 11651 *valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max; 11652 break; 11653 case _PC_NAME_MAX: 11654 *valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max; 11655 break; 11656 case _PC_CHOWN_RESTRICTED: 11657 *valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted; 11658 break; 11659 case _PC_NO_TRUNC: 11660 *valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc; 11661 break; 11662 case _PC_XATTR_EXISTS: 11663 if (gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists) { 11664 if (error = nfs4_have_xattrs(vp, valp, cr)) 11665 return (error); 11666 } 11667 break; 11668 default: 11669 return (EINVAL); 11670 } 11671 11672 return (0); 11673 } 11674 11675 /* 11676 * Called by async thread to do synchronous pageio. Do the i/o, wait 11677 * for it to complete, and cleanup the page list when done. 11678 */ 11679 static int 11680 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11681 int flags, cred_t *cr) 11682 { 11683 int error; 11684 11685 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 11686 11687 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11688 if (flags & B_READ) 11689 pvn_read_done(pp, (error ? B_ERROR : 0) | flags); 11690 else 11691 pvn_write_done(pp, (error ? 
B_ERROR : 0) | flags); 11692 return (error); 11693 } 11694 11695 /* ARGSUSED */ 11696 static int 11697 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len, 11698 int flags, cred_t *cr, caller_context_t *ct) 11699 { 11700 int error; 11701 rnode4_t *rp; 11702 11703 if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone) 11704 return (EIO); 11705 11706 if (pp == NULL) 11707 return (EINVAL); 11708 11709 rp = VTOR4(vp); 11710 mutex_enter(&rp->r_statelock); 11711 rp->r_count++; 11712 mutex_exit(&rp->r_statelock); 11713 11714 if (flags & B_ASYNC) { 11715 error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr, 11716 nfs4_sync_pageio); 11717 } else 11718 error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr); 11719 mutex_enter(&rp->r_statelock); 11720 rp->r_count--; 11721 cv_broadcast(&rp->r_cv); 11722 mutex_exit(&rp->r_statelock); 11723 return (error); 11724 } 11725 11726 /* ARGSUSED */ 11727 static void 11728 nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr, 11729 caller_context_t *ct) 11730 { 11731 int error; 11732 rnode4_t *rp; 11733 page_t *plist; 11734 page_t *pptr; 11735 offset3 offset; 11736 count3 len; 11737 k_sigset_t smask; 11738 11739 /* 11740 * We should get called with fl equal to either B_FREE or 11741 * B_INVAL. Any other value is illegal. 11742 * 11743 * The page that we are either supposed to free or destroy 11744 * should be exclusive locked and its io lock should not 11745 * be held. 11746 */ 11747 ASSERT(fl == B_FREE || fl == B_INVAL); 11748 ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr); 11749 11750 rp = VTOR4(vp); 11751 11752 /* 11753 * If the page doesn't need to be committed or we shouldn't 11754 * even bother attempting to commit it, then just make sure 11755 * that the p_fsdata byte is clear and then either free or 11756 * destroy the page as appropriate. 
/*
 * Dispose of the page pp of vnode vp: free it (fl == B_FREE) or destroy
 * it (fl == B_INVAL); 'dn' is passed through to page_free() /
 * page_destroy().
 *
 * A page whose p_fsdata is C_NOCOMMIT, a page of a stale rnode, or a page
 * beyond an in-progress truncation is disposed of immediately.  Otherwise
 * the page still has to be committed: this routine becomes the committer
 * for the rnode (R4COMMIT), gathers every other committable page with
 * nfs4_get_commit() and issues a single nfs4_commit() for the whole range
 * -- or hands the list to nfs4_async_commit() when running as pageout or
 * fsflush, or from a zone other than the mount's.
 *
 * In several cases (another committer already waiting, a delayed-commit
 * page, a pending signal, a failed commit, a page modified again) the
 * page is merely unlocked and left to be dealt with later.
 */
/* ARGSUSED */
static void
nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
    caller_context_t *ct)
{
	int error;
	rnode4_t *rp;
	page_t *plist;
	page_t *pptr;
	offset3 offset;
	count3 len;
	k_sigset_t smask;

	/*
	 * We should get called with fl equal to either B_FREE or
	 * B_INVAL.  Any other value is illegal.
	 *
	 * The page that we are either supposed to free or destroy
	 * should be exclusive locked and its io lock should not
	 * be held.
	 */
	ASSERT(fl == B_FREE || fl == B_INVAL);
	ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);

	rp = VTOR4(vp);

	/*
	 * If the page doesn't need to be committed or we shouldn't
	 * even bother attempting to commit it, then just make sure
	 * that the p_fsdata byte is clear and then either free or
	 * destroy the page as appropriate.
	 */
	if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) {
		pp->p_fsdata = C_NOCOMMIT;
		if (fl == B_FREE)
			page_free(pp, dn);
		else
			page_destroy(pp, dn);
		return;
	}

	/*
	 * If there is a page invalidation operation going on, then
	 * if this is one of the pages being destroyed, then just
	 * clear the p_fsdata byte and then either free or destroy
	 * the page as appropriate.
	 */
	mutex_enter(&rp->r_statelock);
	if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
		mutex_exit(&rp->r_statelock);
		pp->p_fsdata = C_NOCOMMIT;
		if (fl == B_FREE)
			page_free(pp, dn);
		else
			page_destroy(pp, dn);
		return;
	}

	/*
	 * If we are freeing this page and someone else is already
	 * waiting to do a commit, then just unlock the page and
	 * return.  That other thread will take care of committing
	 * this page.  The page can be freed sometime after the
	 * commit has finished.  Otherwise, if the page is marked
	 * as delay commit, then we may be getting called from
	 * pvn_write_done, one page at a time.   This could result
	 * in one commit per page, so we end up doing lots of small
	 * commits instead of fewer larger commits.  This is bad,
	 * we want to do as few commits as possible.
	 */
	if (fl == B_FREE) {
		if (rp->r_flags & R4COMMITWAIT) {
			page_unlock(pp);
			mutex_exit(&rp->r_statelock);
			return;
		}
		if (pp->p_fsdata == C_DELAYCOMMIT) {
			/* Promote to "needs commit" and leave it for later. */
			pp->p_fsdata = C_COMMIT;
			page_unlock(pp);
			mutex_exit(&rp->r_statelock);
			return;
		}
	}

	/*
	 * Check to see if there is a signal which would prevent an
	 * attempt to commit the pages from being successful.  If so,
	 * then don't bother with all of the work to gather pages and
	 * generate the unsuccessful RPC.  Just return from here and
	 * let the page be committed at some later time.
	 */
	sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
	if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
		sigunintr(&smask);
		page_unlock(pp);
		mutex_exit(&rp->r_statelock);
		return;
	}
	sigunintr(&smask);

	/*
	 * We are starting to need to commit pages, so let's try
	 * to commit as many as possible at once to reduce the
	 * overhead.
	 *
	 * Set the `commit inprogress' state bit.  We must
	 * first wait until any current one finishes.  Then
	 * we initialize the c_pages list with this page.
	 */
	while (rp->r_flags & R4COMMIT) {
		rp->r_flags |= R4COMMITWAIT;
		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
		rp->r_flags &= ~R4COMMITWAIT;
	}
	rp->r_flags |= R4COMMIT;
	mutex_exit(&rp->r_statelock);
	/* From here on R4COMMIT, not r_statelock, guards r_commit. */
	ASSERT(rp->r_commit.c_pages == NULL);
	rp->r_commit.c_pages = pp;
	rp->r_commit.c_commbase = (offset3)pp->p_offset;
	rp->r_commit.c_commlen = PAGESIZE;

	/*
	 * Gather together all other pages which can be committed.
	 * They will all be chained off r_commit.c_pages.
	 */
	nfs4_get_commit(vp);

	/*
	 * Clear the `commit inprogress' status and disconnect
	 * the list of pages to be committed from the rnode.
	 * At this same time, we also save the starting offset
	 * and length of data to be committed on the server.
	 */
	plist = rp->r_commit.c_pages;
	rp->r_commit.c_pages = NULL;
	offset = rp->r_commit.c_commbase;
	len = rp->r_commit.c_commlen;
	mutex_enter(&rp->r_statelock);
	rp->r_flags &= ~R4COMMIT;
	cv_broadcast(&rp->r_commit.c_cv);
	mutex_exit(&rp->r_statelock);

	/*
	 * pageout, fsflush and callers from another zone do not issue the
	 * commit themselves; the page list is handed to the async commit
	 * path instead.
	 */
	if (curproc == proc_pageout || curproc == proc_fsflush ||
	    nfs_zone() != VTOMI4(vp)->mi_zone) {
		nfs4_async_commit(vp, plist, offset, len,
		    cr, do_nfs4_async_commit);
		return;
	}

	/*
	 * Actually generate the COMMIT op over the wire operation.
	 */
	error = nfs4_commit(vp, (offset4)offset, (count4)len, cr);

	/*
	 * If we got an error during the commit, just unlock all
	 * of the pages.  The pages will get retransmitted to the
	 * server during a putpage operation.
	 */
	if (error) {
		while (plist != NULL) {
			pptr = plist;
			page_sub(&plist, pptr);
			page_unlock(pptr);
		}
		return;
	}

	/*
	 * We've tried as hard as we can to commit the data to stable
	 * storage on the server.  We just unlock the rest of the pages
	 * and clear the commit required state.  They will be put
	 * onto the tail of the cachelist if they are no longer
	 * mapped.
	 */
	while (plist != pp) {
		pptr = plist;
		page_sub(&plist, pptr);
		pptr->p_fsdata = C_NOCOMMIT;
		page_unlock(pptr);
	}

	/*
	 * It is possible that nfs4_commit didn't return error but
	 * some other thread has modified the page we are going
	 * to free/destroy.
	 * In this case we need to rewrite the page.  Do an explicit check
	 * before attempting to free/destroy the page.  If modified, it needs
	 * to be rewritten, so unlock the page and return.
	 */
	if (hat_ismod(pp)) {
		pp->p_fsdata = C_NOCOMMIT;
		page_unlock(pp);
		return;
	}

	/*
	 * Now, as appropriate, either free or destroy the page
	 * that we were called with.
	 */
	pp->p_fsdata = C_NOCOMMIT;
	if (fl == B_FREE)
		page_free(pp, dn);
	else
		page_destroy(pp, dn);
}
/*
 * Commit requires that the current fh be the file written to.
 * The compound op structure is:
 *      PUTFH(file), COMMIT
 *
 * Sends a COMMIT for 'count' bytes starting at 'offset' of vp.
 *
 * Returns 0 when the commit succeeded and the write verifier in the reply
 * matches the one saved in the rnode.  Returns NFS_VERF_MISMATCH when the
 * verifier differs; in that case the new verifier is saved and
 * nfs4_set_mod() is called on the vnode.  Any other value is an errno
 * from starting the operation, from the call itself, or derived from the
 * reply status; reply-status errors are also latched in rp->r_error if
 * none is recorded yet, and ESTALE additionally marks the rnode R4STALE.
 *
 * An EACCES failure is retried with the next credential handed out by
 * nfs4_get_otw_cred_by_osp() until that routine reports the last one.
 */
static int
nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	COMMIT4res *cm_res;
	nfs_argop4 argop[2];
	nfs_resop4 *resop;
	int doqueue;
	mntinfo4_t *mi;
	rnode4_t *rp;
	cred_t *cred_otw = NULL;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_open_stream_t *osp = NULL;
	bool_t first_time = TRUE;	/* first time getting OTW cred */
	bool_t last_time = FALSE;	/* last time getting OTW cred */
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	rp = VTOR4(vp);

	mi = VTOMI4(vp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;
get_commit_cred:
	/*
	 * Releases the osp, if a valid open stream is provided.
	 * Puts a hold on the cred_otw and the new osp (if found).
	 */
	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
	    &first_time, &last_time);
	args.ctag = TAG_COMMIT;
recov_retry:
	/*
	 * Commit ops: putfh file; commit
	 */
	args.array_len = 2;
	args.array = argop;

	e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
	    &recov_state, NULL);
	if (e.error) {
		/* Could not start the operation: drop our holds and bail. */
		crfree(cred_otw);
		if (osp != NULL)
			open_stream_rele(osp, rp);
		return (e.error);
	}

	/* putfh file (the rnode's own file handle) */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/* commit */
	argop[1].argop = OP_COMMIT;
	argop[1].nfs_argop4_u.opcommit.offset = offset;
	argop[1].nfs_argop4_u.opcommit.count = count;

	doqueue = 1;
	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e);

	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	if (!needrecov && e.error) {
		/*
		 * Hard failure without recovery.  On EACCES try again with
		 * the next credential unless we already used the last one.
		 */
		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state,
		    needrecov);
		crfree(cred_otw);
		if (e.error == EACCES && last_time == FALSE)
			goto get_commit_cred;
		if (osp != NULL)
			open_stream_rele(osp, rp);
		return (e.error);
	}

	if (needrecov) {
		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
		    NULL, OP_COMMIT, NULL, NULL, NULL) == FALSE) {
			/*
			 * Reissue the compound with the same credential.  The
			 * results are freed only when the call itself did not
			 * fail (e.error == 0).
			 */
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			if (!e.error)
				(void) xdr_free(xdr_COMPOUND4res_clnt,
				    (caddr_t)&res);
			goto recov_retry;
		}
		if (e.error) {
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			crfree(cred_otw);
			if (osp != NULL)
				open_stream_rele(osp, rp);
			return (e.error);
		}
		/* fall through for res.status case */
	}

	if (res.status) {
		e.error = geterrno4(res.status);
		if (e.error == EACCES && last_time == FALSE) {
			crfree(cred_otw);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			goto get_commit_cred;
		}
		/*
		 * Can't do a nfs4_purge_stale_fh here because this
		 * can cause a deadlock.  nfs4_commit can
		 * be called from nfs4_dispose which can be called
		 * indirectly via pvn_vplist_dirty.  nfs4_purge_stale_fh
		 * can call back to pvn_vplist_dirty.
		 */
		if (e.error == ESTALE) {
			mutex_enter(&rp->r_statelock);
			rp->r_flags |= R4STALE;
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
			PURGE_ATTRCACHE4(vp);
		} else {
			/* Remember the first error seen on this rnode. */
			mutex_enter(&rp->r_statelock);
			if (!rp->r_error)
				rp->r_error = e.error;
			mutex_exit(&rp->r_statelock);
		}
	} else {
		ASSERT(rp->r_flags & R4HAVEVERF);
		resop = &res.array[1];	/* commit res */
		cm_res = &resop->nfs_resop4_u.opcommit;
		mutex_enter(&rp->r_statelock);
		if (cm_res->writeverf == rp->r_writeverf) {
			/* Verifier unchanged: the commit is good. */
			mutex_exit(&rp->r_statelock);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
			    &recov_state, needrecov);
			crfree(cred_otw);
			if (osp != NULL)
				open_stream_rele(osp, rp);
			return (0);
		}
		/*
		 * The verifier changed: call nfs4_set_mod() on the vnode,
		 * adopt the new verifier and tell the caller about the
		 * mismatch.
		 */
		nfs4_set_mod(vp);
		rp->r_writeverf = cm_res->writeverf;
		mutex_exit(&rp->r_statelock);
		e.error = NFS_VERF_MISMATCH;
	}

	/* Common exit: free the results and drop everything we hold. */
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov);
	crfree(cred_otw);
	if (osp != NULL)
		open_stream_rele(osp, rp);

	return (e.error);
}
12037 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 12038 &recov_state, needrecov); 12039 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12040 goto get_commit_cred; 12041 } 12042 /* 12043 * Can't do a nfs4_purge_stale_fh here because this 12044 * can cause a deadlock. nfs4_commit can 12045 * be called from nfs4_dispose which can be called 12046 * indirectly via pvn_vplist_dirty. nfs4_purge_stale_fh 12047 * can call back to pvn_vplist_dirty. 12048 */ 12049 if (e.error == ESTALE) { 12050 mutex_enter(&rp->r_statelock); 12051 rp->r_flags |= R4STALE; 12052 if (!rp->r_error) 12053 rp->r_error = e.error; 12054 mutex_exit(&rp->r_statelock); 12055 PURGE_ATTRCACHE4(vp); 12056 } else { 12057 mutex_enter(&rp->r_statelock); 12058 if (!rp->r_error) 12059 rp->r_error = e.error; 12060 mutex_exit(&rp->r_statelock); 12061 } 12062 } else { 12063 ASSERT(rp->r_flags & R4HAVEVERF); 12064 resop = &res.array[1]; /* commit res */ 12065 cm_res = &resop->nfs_resop4_u.opcommit; 12066 mutex_enter(&rp->r_statelock); 12067 if (cm_res->writeverf == rp->r_writeverf) { 12068 mutex_exit(&rp->r_statelock); 12069 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12070 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, 12071 &recov_state, needrecov); 12072 crfree(cred_otw); 12073 if (osp != NULL) 12074 open_stream_rele(osp, rp); 12075 return (0); 12076 } 12077 nfs4_set_mod(vp); 12078 rp->r_writeverf = cm_res->writeverf; 12079 mutex_exit(&rp->r_statelock); 12080 e.error = NFS_VERF_MISMATCH; 12081 } 12082 12083 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 12084 nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov); 12085 crfree(cred_otw); 12086 if (osp != NULL) 12087 open_stream_rele(osp, rp); 12088 12089 return (e.error); 12090 } 12091 12092 static void 12093 nfs4_set_mod(vnode_t *vp) 12094 { 12095 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12096 12097 /* make sure we're looking at the master vnode, not a shadow */ 12098 pvn_vplist_setdirty(RTOV4(VTOR4(vp)), 
/*
 * This function is used to gather a page list of the pages which
 * can be committed on the server.
 *
 * The calling thread must have set R4COMMIT.  This bit is used to
 * serialize access to the commit structure in the rnode.  As long
 * as the thread has set R4COMMIT, then it can manipulate the commit
 * structure without requiring any other locks.
 *
 * When this function is called from nfs4_dispose() the page passed
 * into nfs4_dispose() will be SE_EXCL locked, and so this function
 * will skip it.  This is not a problem since we initially add the
 * page to the r_commit page list.
 *
 * Every page gathered is left SE_EXCL locked on r_commit.c_pages, and
 * r_commit.c_commbase / c_commlen are widened so that the range covers
 * each page added.
 */
static void
nfs4_get_commit(vnode_t *vp)
{
	rnode4_t *rp;
	page_t *pp;
	kmutex_t *vphm;

	rp = VTOR4(vp);

	ASSERT(rp->r_flags & R4COMMIT);

	/* make sure we're looking at the master vnode, not a shadow */

	if (IS_SHADOW(vp, rp))
		vp = RTOV4(rp);

	/* The vnode's page list is walked under its page mutex. */
	vphm = page_vnode_mutex(vp);
	mutex_enter(vphm);

	/*
	 * If there are no pages associated with this vnode, then
	 * just return.
	 */
	if ((pp = vp->v_pages) == NULL) {
		mutex_exit(vphm);
		return;
	}

	/*
	 * Step through all of the pages associated with this vnode
	 * looking for pages which need to be committed.
	 *
	 * Note that each `continue' below jumps to the loop condition,
	 * which is what advances pp to the next page on the list.
	 */
	do {
		/* Skip marker pages. */
		if (pp->p_hash == PVN_VPLIST_HASH_TAG)
			continue;

		/*
		 * First short-cut everything (without the page_lock)
		 * and see if this page does not need to be committed
		 * or is modified; if so, then we'll just skip it.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
			continue;

		/*
		 * Attempt to lock the page.  If we can't, then
		 * someone else is messing with it or we have been
		 * called from nfs4_dispose and this is the page that
		 * nfs4_dispose was called with; either way just skip it.
		 */
		if (!page_trylock(pp, SE_EXCL))
			continue;

		/*
		 * Let's check again now that we have the page lock.
		 */
		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
			page_unlock(pp);
			continue;
		}

		/* this had better not be a free page */
		ASSERT(PP_ISFREE(pp) == 0);

		/*
		 * The page needs to be committed and we locked it.
		 * Update the base and length parameters and add it
		 * to r_commit.c_pages.
		 *
		 * A page below the current base moves the base down and
		 * grows the length; a page at or beyond the current end
		 * extends the length; a page inside the range changes
		 * neither.
		 */
		if (rp->r_commit.c_pages == NULL) {
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
			rp->r_commit.c_commlen = PAGESIZE;
		} else if (pp->p_offset < rp->r_commit.c_commbase) {
			rp->r_commit.c_commlen = rp->r_commit.c_commbase -
			    (offset3)pp->p_offset + rp->r_commit.c_commlen;
			rp->r_commit.c_commbase = (offset3)pp->p_offset;
		} else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
		    <= pp->p_offset) {
			rp->r_commit.c_commlen = (offset3)pp->p_offset -
			    rp->r_commit.c_commbase + PAGESIZE;
		}
		page_add(&rp->r_commit.c_pages, pp);
	} while ((pp = pp->p_vpnext) != vp->v_pages);

	mutex_exit(vphm);
}
12213 */ 12214 static void 12215 nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len) 12216 { 12217 12218 rnode4_t *rp; 12219 page_t *pp; 12220 u_offset_t end; 12221 u_offset_t off; 12222 ASSERT(len != 0); 12223 rp = VTOR4(vp); 12224 ASSERT(rp->r_flags & R4COMMIT); 12225 12226 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12227 12228 /* make sure we're looking at the master vnode, not a shadow */ 12229 12230 if (IS_SHADOW(vp, rp)) 12231 vp = RTOV4(rp); 12232 12233 /* 12234 * If there are no pages associated with this vnode, then 12235 * just return. 12236 */ 12237 if ((pp = vp->v_pages) == NULL) 12238 return; 12239 /* 12240 * Calculate the ending offset. 12241 */ 12242 end = soff + len; 12243 for (off = soff; off < end; off += PAGESIZE) { 12244 /* 12245 * Lookup each page by vp, offset. 12246 */ 12247 if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL) 12248 continue; 12249 /* 12250 * If this page does not need to be committed or is 12251 * modified, then just skip it. 12252 */ 12253 if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) { 12254 page_unlock(pp); 12255 continue; 12256 } 12257 12258 ASSERT(PP_ISFREE(pp) == 0); 12259 /* 12260 * The page needs to be committed and we locked it. 12261 * Update the base and length parameters and add it 12262 * to r_pages. 12263 */ 12264 if (rp->r_commit.c_pages == NULL) { 12265 rp->r_commit.c_commbase = (offset3)pp->p_offset; 12266 rp->r_commit.c_commlen = PAGESIZE; 12267 } else { 12268 rp->r_commit.c_commlen = (offset3)pp->p_offset - 12269 rp->r_commit.c_commbase + PAGESIZE; 12270 } 12271 page_add(&rp->r_commit.c_pages, pp); 12272 } 12273 } 12274 12275 /* 12276 * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap(). 12277 * Flushes and commits data to the server. 
12278 */ 12279 static int 12280 nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr) 12281 { 12282 int error; 12283 verifier4 write_verf; 12284 rnode4_t *rp = VTOR4(vp); 12285 12286 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12287 12288 /* 12289 * Flush the data portion of the file and then commit any 12290 * portions which need to be committed. This may need to 12291 * be done twice if the server has changed state since 12292 * data was last written. The data will need to be 12293 * rewritten to the server and then a new commit done. 12294 * 12295 * In fact, this may need to be done several times if the 12296 * server is having problems and crashing while we are 12297 * attempting to do this. 12298 */ 12299 12300 top: 12301 /* 12302 * Do a flush based on the poff and plen arguments. This 12303 * will synchronously write out any modified pages in the 12304 * range specified by (poff, plen). This starts all of the 12305 * i/o operations which will be waited for in the next 12306 * call to nfs4_putpage 12307 */ 12308 12309 mutex_enter(&rp->r_statelock); 12310 write_verf = rp->r_writeverf; 12311 mutex_exit(&rp->r_statelock); 12312 12313 error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr, NULL); 12314 if (error == EAGAIN) 12315 error = 0; 12316 12317 /* 12318 * Do a flush based on the poff and plen arguments. This 12319 * will synchronously write out any modified pages in the 12320 * range specified by (poff, plen) and wait until all of 12321 * the asynchronous i/o's in that range are done as well. 12322 */ 12323 if (!error) 12324 error = nfs4_putpage(vp, poff, plen, 0, cr, NULL); 12325 12326 if (error) 12327 return (error); 12328 12329 mutex_enter(&rp->r_statelock); 12330 if (rp->r_writeverf != write_verf) { 12331 mutex_exit(&rp->r_statelock); 12332 goto top; 12333 } 12334 mutex_exit(&rp->r_statelock); 12335 12336 /* 12337 * Now commit any pages which might need to be committed. 
12338 * If the error, NFS_VERF_MISMATCH, is returned, then 12339 * start over with the flush operation. 12340 */ 12341 error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT); 12342 12343 if (error == NFS_VERF_MISMATCH) 12344 goto top; 12345 12346 return (error); 12347 } 12348 12349 /* 12350 * nfs4_commit_vp() will wait for other pending commits and 12351 * will either commit the whole file or a range, plen dictates 12352 * if we commit whole file. a value of zero indicates the whole 12353 * file. Called from nfs4_putpage_commit() or nfs4_sync_putapage() 12354 */ 12355 static int 12356 nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen, 12357 cred_t *cr, int wait_on_writes) 12358 { 12359 rnode4_t *rp; 12360 page_t *plist; 12361 offset3 offset; 12362 count3 len; 12363 12364 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12365 12366 rp = VTOR4(vp); 12367 12368 /* 12369 * before we gather commitable pages make 12370 * sure there are no outstanding async writes 12371 */ 12372 if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) { 12373 mutex_enter(&rp->r_statelock); 12374 while (rp->r_count > 0) { 12375 cv_wait(&rp->r_cv, &rp->r_statelock); 12376 } 12377 mutex_exit(&rp->r_statelock); 12378 } 12379 12380 /* 12381 * Set the `commit inprogress' state bit. We must 12382 * first wait until any current one finishes. 12383 */ 12384 mutex_enter(&rp->r_statelock); 12385 while (rp->r_flags & R4COMMIT) { 12386 rp->r_flags |= R4COMMITWAIT; 12387 cv_wait(&rp->r_commit.c_cv, &rp->r_statelock); 12388 rp->r_flags &= ~R4COMMITWAIT; 12389 } 12390 rp->r_flags |= R4COMMIT; 12391 mutex_exit(&rp->r_statelock); 12392 12393 /* 12394 * Gather all of the pages which need to be 12395 * committed. 12396 */ 12397 if (plen == 0) 12398 nfs4_get_commit(vp); 12399 else 12400 nfs4_get_commit_range(vp, poff, plen); 12401 12402 /* 12403 * Clear the `commit inprogress' bit and disconnect the 12404 * page list which was gathered by nfs4_get_commit. 
12405 */ 12406 plist = rp->r_commit.c_pages; 12407 rp->r_commit.c_pages = NULL; 12408 offset = rp->r_commit.c_commbase; 12409 len = rp->r_commit.c_commlen; 12410 mutex_enter(&rp->r_statelock); 12411 rp->r_flags &= ~R4COMMIT; 12412 cv_broadcast(&rp->r_commit.c_cv); 12413 mutex_exit(&rp->r_statelock); 12414 12415 /* 12416 * If any pages need to be committed, commit them and 12417 * then unlock them so that they can be freed some 12418 * time later. 12419 */ 12420 if (plist == NULL) 12421 return (0); 12422 12423 /* 12424 * No error occurred during the flush portion 12425 * of this operation, so now attempt to commit 12426 * the data to stable storage on the server. 12427 * 12428 * This will unlock all of the pages on the list. 12429 */ 12430 return (nfs4_sync_commit(vp, plist, offset, len, cr)); 12431 } 12432 12433 static int 12434 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 12435 cred_t *cr) 12436 { 12437 int error; 12438 page_t *pp; 12439 12440 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12441 12442 error = nfs4_commit(vp, (offset4)offset, (count3)count, cr); 12443 12444 /* 12445 * If we got an error, then just unlock all of the pages 12446 * on the list. 12447 */ 12448 if (error) { 12449 while (plist != NULL) { 12450 pp = plist; 12451 page_sub(&plist, pp); 12452 page_unlock(pp); 12453 } 12454 return (error); 12455 } 12456 /* 12457 * We've tried as hard as we can to commit the data to stable 12458 * storage on the server. We just unlock the pages and clear 12459 * the commit required state. They will get freed later. 
12460 */ 12461 while (plist != NULL) { 12462 pp = plist; 12463 page_sub(&plist, pp); 12464 pp->p_fsdata = C_NOCOMMIT; 12465 page_unlock(pp); 12466 } 12467 12468 return (error); 12469 } 12470 12471 static void 12472 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count, 12473 cred_t *cr) 12474 { 12475 12476 (void) nfs4_sync_commit(vp, plist, offset, count, cr); 12477 } 12478 12479 /*ARGSUSED*/ 12480 static int 12481 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 12482 caller_context_t *ct) 12483 { 12484 int error = 0; 12485 mntinfo4_t *mi; 12486 vattr_t va; 12487 vsecattr_t nfsace4_vsap; 12488 12489 mi = VTOMI4(vp); 12490 if (nfs_zone() != mi->mi_zone) 12491 return (EIO); 12492 if (mi->mi_flags & MI4_ACL) { 12493 /* if we have a delegation, return it */ 12494 if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE) 12495 (void) nfs4delegreturn(VTOR4(vp), 12496 NFS4_DR_REOPEN|NFS4_DR_PUSH); 12497 12498 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, 12499 NFS4_ACL_SET); 12500 if (error) /* EINVAL */ 12501 return (error); 12502 12503 if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) { 12504 /* 12505 * These are aclent_t type entries. 12506 */ 12507 error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap, 12508 vp->v_type == VDIR, FALSE); 12509 if (error) 12510 return (error); 12511 } else { 12512 /* 12513 * These are ace_t type entries. 
12514 */ 12515 error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap, 12516 FALSE); 12517 if (error) 12518 return (error); 12519 } 12520 bzero(&va, sizeof (va)); 12521 error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap); 12522 vs_ace4_destroy(&nfsace4_vsap); 12523 return (error); 12524 } 12525 return (ENOSYS); 12526 } 12527 12528 /* ARGSUSED */ 12529 int 12530 nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr, 12531 caller_context_t *ct) 12532 { 12533 int error; 12534 mntinfo4_t *mi; 12535 nfs4_ga_res_t gar; 12536 rnode4_t *rp = VTOR4(vp); 12537 12538 mi = VTOMI4(vp); 12539 if (nfs_zone() != mi->mi_zone) 12540 return (EIO); 12541 12542 bzero(&gar, sizeof (gar)); 12543 gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask; 12544 12545 /* 12546 * vsecattr->vsa_mask holds the original acl request mask. 12547 * This is needed when determining what to return. 12548 * (See: nfs4_create_getsecattr_return()) 12549 */ 12550 error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET); 12551 if (error) /* EINVAL */ 12552 return (error); 12553 12554 /* 12555 * If this is a referral stub, don't try to go OTW for an ACL 12556 */ 12557 if (RP_ISSTUB_REFERRAL(VTOR4(vp))) 12558 return (fs_fab_acl(vp, vsecattr, flag, cr, ct)); 12559 12560 if (mi->mi_flags & MI4_ACL) { 12561 /* 12562 * Check if the data is cached and the cache is valid. If it 12563 * is we don't go over the wire. 12564 */ 12565 if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) { 12566 mutex_enter(&rp->r_statelock); 12567 if (rp->r_secattr != NULL) { 12568 error = nfs4_create_getsecattr_return( 12569 rp->r_secattr, vsecattr, rp->r_attr.va_uid, 12570 rp->r_attr.va_gid, 12571 vp->v_type == VDIR); 12572 if (!error) { /* error == 0 - Success! 
*/ 12573 mutex_exit(&rp->r_statelock); 12574 return (error); 12575 } 12576 } 12577 mutex_exit(&rp->r_statelock); 12578 } 12579 12580 /* 12581 * The getattr otw call will always get both the acl, in 12582 * the form of a list of nfsace4's, and the number of acl 12583 * entries; independent of the value of gar.n4g_va.va_mask. 12584 */ 12585 error = nfs4_getattr_otw(vp, &gar, cr, 1); 12586 if (error) { 12587 vs_ace4_destroy(&gar.n4g_vsa); 12588 if (error == ENOTSUP || error == EOPNOTSUPP) 12589 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12590 return (error); 12591 } 12592 12593 if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) { 12594 /* 12595 * No error was returned, but according to the response 12596 * bitmap, neither was an acl. 12597 */ 12598 vs_ace4_destroy(&gar.n4g_vsa); 12599 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12600 return (error); 12601 } 12602 12603 /* 12604 * Update the cache with the ACL. 12605 */ 12606 nfs4_acl_fill_cache(rp, &gar.n4g_vsa); 12607 12608 error = nfs4_create_getsecattr_return(&gar.n4g_vsa, 12609 vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid, 12610 vp->v_type == VDIR); 12611 vs_ace4_destroy(&gar.n4g_vsa); 12612 if ((error) && (vsecattr->vsa_mask & 12613 (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) && 12614 (error != EACCES)) { 12615 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12616 } 12617 return (error); 12618 } 12619 error = fs_fab_acl(vp, vsecattr, flag, cr, ct); 12620 return (error); 12621 } 12622 12623 /* 12624 * The function returns: 12625 * - 0 (zero) if the passed in "acl_mask" is a valid request. 12626 * - EINVAL if the passed in "acl_mask" is an invalid request. 12627 * 12628 * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if: 12629 * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE) 12630 * 12631 * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if: 12632 * - We have a mixture of ACE and ACL requests (e.g. 
VSA_ACL | VSA_ACE) 12633 * - We have a count field set without the corresponding acl field set. (e.g. - 12634 * VSA_ACECNT is set, but VSA_ACE is not) 12635 */ 12636 static int 12637 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op) 12638 { 12639 /* Shortcut the masks that are always valid. */ 12640 if (acl_mask == (VSA_ACE | VSA_ACECNT)) 12641 return (0); 12642 if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) 12643 return (0); 12644 12645 if (acl_mask & (VSA_ACE | VSA_ACECNT)) { 12646 /* 12647 * We can't have any VSA_ACL type stuff in the mask now. 12648 */ 12649 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12650 VSA_DFACLCNT)) 12651 return (EINVAL); 12652 12653 if (op == NFS4_ACL_SET) { 12654 if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE)) 12655 return (EINVAL); 12656 } 12657 } 12658 12659 if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) { 12660 /* 12661 * We can't have any VSA_ACE type stuff in the mask now. 12662 */ 12663 if (acl_mask & (VSA_ACE | VSA_ACECNT)) 12664 return (EINVAL); 12665 12666 if (op == NFS4_ACL_SET) { 12667 if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL)) 12668 return (EINVAL); 12669 12670 if ((acl_mask & VSA_DFACLCNT) && 12671 !(acl_mask & VSA_DFACL)) 12672 return (EINVAL); 12673 } 12674 } 12675 return (0); 12676 } 12677 12678 /* 12679 * The theory behind creating the correct getsecattr return is simply this: 12680 * "Don't return anything that the caller is not expecting to have to free." 12681 */ 12682 static int 12683 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap, 12684 uid_t uid, gid_t gid, int isdir) 12685 { 12686 int error = 0; 12687 /* Save the mask since the translators modify it. 
*/ 12688 uint_t orig_mask = vsap->vsa_mask; 12689 12690 if (orig_mask & (VSA_ACE | VSA_ACECNT)) { 12691 error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, FALSE); 12692 12693 if (error) 12694 return (error); 12695 12696 /* 12697 * If the caller only asked for the ace count (VSA_ACECNT) 12698 * don't give them the full acl (VSA_ACE), free it. 12699 */ 12700 if (!orig_mask & VSA_ACE) { 12701 if (vsap->vsa_aclentp != NULL) { 12702 kmem_free(vsap->vsa_aclentp, 12703 vsap->vsa_aclcnt * sizeof (ace_t)); 12704 vsap->vsa_aclentp = NULL; 12705 } 12706 } 12707 vsap->vsa_mask = orig_mask; 12708 12709 } else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | 12710 VSA_DFACLCNT)) { 12711 error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid, 12712 isdir, FALSE); 12713 12714 if (error) 12715 return (error); 12716 12717 /* 12718 * If the caller only asked for the acl count (VSA_ACLCNT) 12719 * and/or the default acl count (VSA_DFACLCNT) don't give them 12720 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it. 
12721 */ 12722 if (!orig_mask & VSA_ACL) { 12723 if (vsap->vsa_aclentp != NULL) { 12724 kmem_free(vsap->vsa_aclentp, 12725 vsap->vsa_aclcnt * sizeof (aclent_t)); 12726 vsap->vsa_aclentp = NULL; 12727 } 12728 } 12729 12730 if (!orig_mask & VSA_DFACL) { 12731 if (vsap->vsa_dfaclentp != NULL) { 12732 kmem_free(vsap->vsa_dfaclentp, 12733 vsap->vsa_dfaclcnt * sizeof (aclent_t)); 12734 vsap->vsa_dfaclentp = NULL; 12735 } 12736 } 12737 vsap->vsa_mask = orig_mask; 12738 } 12739 return (0); 12740 } 12741 12742 /* ARGSUSED */ 12743 int 12744 nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr, 12745 caller_context_t *ct) 12746 { 12747 int error; 12748 12749 if (nfs_zone() != VTOMI4(vp)->mi_zone) 12750 return (EIO); 12751 /* 12752 * check for valid cmd parameter 12753 */ 12754 if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS) 12755 return (EINVAL); 12756 12757 /* 12758 * Check access permissions 12759 */ 12760 if ((cmd & F_SHARE) && 12761 (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) || 12762 (shr->s_access == F_WRACC && (flag & FWRITE) == 0))) 12763 return (EBADF); 12764 12765 /* 12766 * If the filesystem is mounted using local locking, pass the 12767 * request off to the local share code. 12768 */ 12769 if (VTOMI4(vp)->mi_flags & MI4_LLOCK) 12770 return (fs_shrlock(vp, cmd, shr, flag, cr, ct)); 12771 12772 switch (cmd) { 12773 case F_SHARE: 12774 case F_UNSHARE: 12775 /* 12776 * This will be properly implemented later, 12777 * see RFE: 4823948 . 
12778 */ 12779 error = EAGAIN; 12780 break; 12781 12782 case F_HASREMOTELOCKS: 12783 /* 12784 * NFS client can't store remote locks itself 12785 */ 12786 shr->s_access = 0; 12787 error = 0; 12788 break; 12789 12790 default: 12791 error = EINVAL; 12792 break; 12793 } 12794 12795 return (error); 12796 } 12797 12798 /* 12799 * Common code called by directory ops to update the attrcache 12800 */ 12801 static int 12802 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp, 12803 hrtime_t t, vnode_t *vp, cred_t *cr) 12804 { 12805 int error = 0; 12806 12807 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12808 12809 if (status != NFS4_OK) { 12810 /* getattr not done or failed */ 12811 PURGE_ATTRCACHE4(vp); 12812 return (error); 12813 } 12814 12815 if (garp) { 12816 nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL); 12817 } else { 12818 PURGE_ATTRCACHE4(vp); 12819 } 12820 return (error); 12821 } 12822 12823 /* 12824 * Update directory caches for directory modification ops (link, rename, etc.) 12825 * When dinfo is NULL, manage dircaches in the old way. 12826 */ 12827 static void 12828 nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm, 12829 dirattr_info_t *dinfo) 12830 { 12831 rnode4_t *drp = VTOR4(dvp); 12832 12833 ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone); 12834 12835 /* Purge rddir cache for dir since it changed */ 12836 if (drp->r_dir != NULL) 12837 nfs4_purge_rddir_cache(dvp); 12838 12839 /* 12840 * If caller provided dinfo, then use it to manage dir caches. 
12841 */ 12842 if (dinfo != NULL) { 12843 if (vp != NULL) { 12844 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12845 if (!VTOR4(vp)->created_v4) { 12846 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12847 dnlc_update(dvp, nm, vp); 12848 } else { 12849 /* 12850 * XXX don't update if the created_v4 flag is 12851 * set 12852 */ 12853 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12854 NFS4_DEBUG(nfs4_client_state_debug, 12855 (CE_NOTE, "nfs4_update_dircaches: " 12856 "don't update dnlc: created_v4 flag")); 12857 } 12858 } 12859 12860 nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call, 12861 dinfo->di_cred, FALSE, cinfo); 12862 12863 return; 12864 } 12865 12866 /* 12867 * Caller didn't provide dinfo, then check change_info4 to update DNLC. 12868 * Since caller modified dir but didn't receive post-dirmod-op dir 12869 * attrs, the dir's attrs must be purged. 12870 * 12871 * XXX this check and dnlc update/purge should really be atomic, 12872 * XXX but can't use rnode statelock because it'll deadlock in 12873 * XXX dnlc_purge_vp, however, the risk is minimal even if a race 12874 * XXX does occur. 12875 * 12876 * XXX We also may want to check that atomic is true in the 12877 * XXX change_info struct. If it is not, the change_info may 12878 * XXX reflect changes by more than one clients which means that 12879 * XXX our cache may not be valid. 
12880 */ 12881 PURGE_ATTRCACHE4(dvp); 12882 if (drp->r_change == cinfo->before) { 12883 /* no changes took place in the directory prior to our link */ 12884 if (vp != NULL) { 12885 mutex_enter(&VTOR4(vp)->r_statev4_lock); 12886 if (!VTOR4(vp)->created_v4) { 12887 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12888 dnlc_update(dvp, nm, vp); 12889 } else { 12890 /* 12891 * XXX dont' update if the created_v4 flag 12892 * is set 12893 */ 12894 mutex_exit(&VTOR4(vp)->r_statev4_lock); 12895 NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, 12896 "nfs4_update_dircaches: don't" 12897 " update dnlc: created_v4 flag")); 12898 } 12899 } 12900 } else { 12901 /* Another client modified directory - purge its dnlc cache */ 12902 dnlc_purge_vp(dvp); 12903 } 12904 } 12905 12906 /* 12907 * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a 12908 * file. 12909 * 12910 * The 'reopening_file' boolean should be set to TRUE if we are reopening this 12911 * file (ie: client recovery) and otherwise set to FALSE. 12912 * 12913 * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery 12914 * initiated) calling functions. 12915 * 12916 * 'resend' is set to TRUE if this is a OPEN_CONFIRM issued as a result 12917 * of resending a 'lost' open request. 12918 * 12919 * 'num_bseqid_retryp' makes sure we don't loop forever on a broken 12920 * server that hands out BAD_SEQID on open confirm. 12921 * 12922 * Errors are returned via the nfs4_error_t parameter. 
12923 */ 12924 void 12925 nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr, 12926 bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop, 12927 bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp) 12928 { 12929 COMPOUND4args_clnt args; 12930 COMPOUND4res_clnt res; 12931 nfs_argop4 argop[2]; 12932 nfs_resop4 *resop; 12933 int doqueue = 1; 12934 mntinfo4_t *mi; 12935 OPEN_CONFIRM4args *open_confirm_args; 12936 int needrecov; 12937 12938 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 12939 #if DEBUG 12940 mutex_enter(&oop->oo_lock); 12941 ASSERT(oop->oo_seqid_inuse); 12942 mutex_exit(&oop->oo_lock); 12943 #endif 12944 12945 recov_retry_confirm: 12946 nfs4_error_zinit(ep); 12947 *retry_open = FALSE; 12948 12949 if (resend) 12950 args.ctag = TAG_OPEN_CONFIRM_LOST; 12951 else 12952 args.ctag = TAG_OPEN_CONFIRM; 12953 12954 args.array_len = 2; 12955 args.array = argop; 12956 12957 /* putfh target fh */ 12958 argop[0].argop = OP_CPUTFH; 12959 argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh; 12960 12961 argop[1].argop = OP_OPEN_CONFIRM; 12962 open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm; 12963 12964 (*seqid) += 1; 12965 open_confirm_args->seqid = *seqid; 12966 open_confirm_args->open_stateid = *stateid; 12967 12968 mi = VTOMI4(vp); 12969 12970 rfs4call(mi, &args, &res, cr, &doqueue, 0, ep); 12971 12972 if (!ep->error && nfs4_need_to_bump_seqid(&res)) { 12973 nfs4_set_open_seqid((*seqid), oop, args.ctag); 12974 } 12975 12976 needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp); 12977 if (!needrecov && ep->error) 12978 return; 12979 12980 if (needrecov) { 12981 bool_t abort = FALSE; 12982 12983 if (reopening_file == FALSE) { 12984 nfs4_bseqid_entry_t *bsep = NULL; 12985 12986 if (!ep->error && res.status == NFS4ERR_BAD_SEQID) 12987 bsep = nfs4_create_bseqid_entry(oop, NULL, 12988 vp, 0, args.ctag, 12989 open_confirm_args->seqid); 12990 12991 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, 12992 NULL, NULL, 
OP_OPEN_CONFIRM, bsep, NULL, NULL); 12993 if (bsep) { 12994 kmem_free(bsep, sizeof (*bsep)); 12995 if (num_bseqid_retryp && 12996 --(*num_bseqid_retryp) == 0) 12997 abort = TRUE; 12998 } 12999 } 13000 if ((ep->error == ETIMEDOUT || 13001 res.status == NFS4ERR_RESOURCE) && 13002 abort == FALSE && resend == FALSE) { 13003 if (!ep->error) 13004 (void) xdr_free(xdr_COMPOUND4res_clnt, 13005 (caddr_t)&res); 13006 13007 delay(SEC_TO_TICK(confirm_retry_sec)); 13008 goto recov_retry_confirm; 13009 } 13010 /* State may have changed so retry the entire OPEN op */ 13011 if (abort == FALSE) 13012 *retry_open = TRUE; 13013 else 13014 *retry_open = FALSE; 13015 if (!ep->error) 13016 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 13017 return; 13018 } 13019 13020 if (res.status) { 13021 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 13022 return; 13023 } 13024 13025 resop = &res.array[1]; /* open confirm res */ 13026 bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid, 13027 stateid, sizeof (*stateid)); 13028 13029 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res); 13030 } 13031 13032 /* 13033 * Return the credentials associated with a client state object. The 13034 * caller is responsible for freeing the credentials. 13035 */ 13036 13037 static cred_t * 13038 state_to_cred(nfs4_open_stream_t *osp) 13039 { 13040 cred_t *cr; 13041 13042 /* 13043 * It's ok to not lock the open stream and open owner to get 13044 * the oo_cred since this is only written once (upon creation) 13045 * and will not change. 13046 */ 13047 cr = osp->os_open_owner->oo_cred; 13048 crhold(cr); 13049 13050 return (cr); 13051 } 13052 13053 /* 13054 * nfs4_find_sysid 13055 * 13056 * Find the sysid for the knetconfig associated with the given mi. 
13057 */ 13058 static struct lm_sysid * 13059 nfs4_find_sysid(mntinfo4_t *mi) 13060 { 13061 ASSERT(nfs_zone() == mi->mi_zone); 13062 13063 /* 13064 * Switch from RDMA knconf to original mount knconf 13065 */ 13066 return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr, 13067 mi->mi_curr_serv->sv_hostname, NULL)); 13068 } 13069 13070 #ifdef DEBUG 13071 /* 13072 * Return a string version of the call type for easy reading. 13073 */ 13074 static char * 13075 nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype) 13076 { 13077 switch (ctype) { 13078 case NFS4_LCK_CTYPE_NORM: 13079 return ("NORMAL"); 13080 case NFS4_LCK_CTYPE_RECLAIM: 13081 return ("RECLAIM"); 13082 case NFS4_LCK_CTYPE_RESEND: 13083 return ("RESEND"); 13084 case NFS4_LCK_CTYPE_REINSTATE: 13085 return ("REINSTATE"); 13086 default: 13087 cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal " 13088 "type %d", ctype); 13089 return (""); 13090 } 13091 } 13092 #endif 13093 13094 /* 13095 * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type 13096 * Unlock requests don't have an over-the-wire locktype, so we just return 13097 * something non-threatening. 13098 */ 13099 13100 static nfs_lock_type4 13101 flk_to_locktype(int cmd, int l_type) 13102 { 13103 ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK); 13104 13105 switch (l_type) { 13106 case F_UNLCK: 13107 return (READ_LT); 13108 case F_RDLCK: 13109 if (cmd == F_SETLK) 13110 return (READ_LT); 13111 else 13112 return (READW_LT); 13113 case F_WRLCK: 13114 if (cmd == F_SETLK) 13115 return (WRITE_LT); 13116 else 13117 return (WRITEW_LT); 13118 } 13119 panic("flk_to_locktype"); 13120 /*NOTREACHED*/ 13121 } 13122 13123 /* 13124 * Set the flock64's lm_sysid for nfs4frlock. 
13125 */ 13126 static int 13127 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk) 13128 { 13129 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13130 13131 /* Find the lm_sysid */ 13132 *lspp = nfs4_find_sysid(VTOMI4(vp)); 13133 13134 if (*lspp == NULL) { 13135 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13136 "nfs4frlock_get_sysid: no sysid, return ENOLCK")); 13137 return (ENOLCK); 13138 } 13139 13140 flk->l_sysid = lm_sysidt(*lspp); 13141 13142 return (0); 13143 } 13144 13145 /* 13146 * Do the remaining preliminary setup for nfs4frlock. 13147 */ 13148 static void 13149 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep, 13150 vnode_t *vp, cred_t *search_cr, cred_t **cred_otw) 13151 { 13152 /* 13153 * set tick_delay to the base delay time. 13154 * (nfs4_base_wait_time is in msecs) 13155 */ 13156 13157 *tick_delayp = drv_usectohz(nfs4_base_wait_time * 1000); 13158 13159 recov_statep->rs_flags = 0; 13160 recov_statep->rs_num_retry_despite_err = 0; 13161 *cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL); 13162 } 13163 13164 /* 13165 * Initialize and allocate the data structures necessary for 13166 * the nfs4frlock call. 13167 * Allocates argsp's op array, frees up the saved_rqstpp if there is one. 
13168 */ 13169 static void 13170 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp, 13171 nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd, 13172 bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp, 13173 bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp) 13174 { 13175 int argoplist_size; 13176 int num_ops = 2; 13177 13178 *retry = FALSE; 13179 *did_start_fop = FALSE; 13180 *skip_get_err = FALSE; 13181 lost_rqstp->lr_op = 0; 13182 argoplist_size = num_ops * sizeof (nfs_argop4); 13183 /* fill array with zero */ 13184 *argopp = kmem_zalloc(argoplist_size, KM_SLEEP); 13185 13186 *argspp = argsp; 13187 *respp = NULL; 13188 13189 argsp->array_len = num_ops; 13190 argsp->array = *argopp; 13191 13192 /* initialize in case of error; will get real value down below */ 13193 argsp->ctag = TAG_NONE; 13194 13195 if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) 13196 *op_hintp = OH_LOCKU; 13197 else 13198 *op_hintp = OH_OTHER; 13199 } 13200 13201 /* 13202 * Call the nfs4_start_fop() for nfs4frlock, if necessary. Assign 13203 * the proper nfs4_server_t for this instance of nfs4frlock. 13204 * Returns 0 (success) or an errno value. 13205 */ 13206 static int 13207 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp, 13208 nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep, 13209 bool_t *did_start_fop, bool_t *startrecovp) 13210 { 13211 int error = 0; 13212 rnode4_t *rp; 13213 13214 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13215 13216 if (ctype == NFS4_LCK_CTYPE_NORM) { 13217 error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint, 13218 recov_statep, startrecovp); 13219 if (error) 13220 return (error); 13221 *did_start_fop = TRUE; 13222 } else { 13223 *did_start_fop = FALSE; 13224 *startrecovp = FALSE; 13225 } 13226 13227 if (!error) { 13228 rp = VTOR4(vp); 13229 13230 /* If the file failed recovery, just quit. 
*/ 13231 mutex_enter(&rp->r_statelock); 13232 if (rp->r_flags & R4RECOVERR) { 13233 error = EIO; 13234 } 13235 mutex_exit(&rp->r_statelock); 13236 } 13237 13238 return (error); 13239 } 13240 13241 /* 13242 * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request. A 13243 * resend nfs4frlock call is initiated by the recovery framework. 13244 * Acquires the lop and oop seqid synchronization. 13245 */ 13246 static void 13247 nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp, 13248 COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp, 13249 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13250 LOCK4args **lock_argsp, LOCKU4args **locku_argsp) 13251 { 13252 mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp); 13253 int error; 13254 13255 NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug), 13256 (CE_NOTE, 13257 "nfs4frlock_setup_resend_lock_args: have lost lock to resend")); 13258 ASSERT(resend_rqstp != NULL); 13259 ASSERT(resend_rqstp->lr_op == OP_LOCK || 13260 resend_rqstp->lr_op == OP_LOCKU); 13261 13262 *oopp = resend_rqstp->lr_oop; 13263 if (resend_rqstp->lr_oop) { 13264 open_owner_hold(resend_rqstp->lr_oop); 13265 error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi); 13266 ASSERT(error == 0); /* recov thread always succeeds */ 13267 } 13268 13269 /* Must resend this lost lock/locku request. 
*/ 13270 ASSERT(resend_rqstp->lr_lop != NULL); 13271 *lopp = resend_rqstp->lr_lop; 13272 lock_owner_hold(resend_rqstp->lr_lop); 13273 error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi); 13274 ASSERT(error == 0); /* recov thread always succeeds */ 13275 13276 *ospp = resend_rqstp->lr_osp; 13277 if (*ospp) 13278 open_stream_hold(resend_rqstp->lr_osp); 13279 13280 if (resend_rqstp->lr_op == OP_LOCK) { 13281 LOCK4args *lock_args; 13282 13283 argop->argop = OP_LOCK; 13284 *lock_argsp = lock_args = &argop->nfs_argop4_u.oplock; 13285 lock_args->locktype = resend_rqstp->lr_locktype; 13286 lock_args->reclaim = 13287 (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM); 13288 lock_args->offset = resend_rqstp->lr_flk->l_start; 13289 lock_args->length = resend_rqstp->lr_flk->l_len; 13290 if (lock_args->length == 0) 13291 lock_args->length = ~lock_args->length; 13292 nfs4_setup_lock_args(*lopp, *oopp, *ospp, 13293 mi2clientid(mi), &lock_args->locker); 13294 13295 switch (resend_rqstp->lr_ctype) { 13296 case NFS4_LCK_CTYPE_RESEND: 13297 argsp->ctag = TAG_LOCK_RESEND; 13298 break; 13299 case NFS4_LCK_CTYPE_REINSTATE: 13300 argsp->ctag = TAG_LOCK_REINSTATE; 13301 break; 13302 case NFS4_LCK_CTYPE_RECLAIM: 13303 argsp->ctag = TAG_LOCK_RECLAIM; 13304 break; 13305 default: 13306 argsp->ctag = TAG_LOCK_UNKNOWN; 13307 break; 13308 } 13309 } else { 13310 LOCKU4args *locku_args; 13311 nfs4_lock_owner_t *lop = resend_rqstp->lr_lop; 13312 13313 argop->argop = OP_LOCKU; 13314 *locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku; 13315 locku_args->locktype = READ_LT; 13316 locku_args->seqid = lop->lock_seqid + 1; 13317 mutex_enter(&lop->lo_lock); 13318 locku_args->lock_stateid = lop->lock_stateid; 13319 mutex_exit(&lop->lo_lock); 13320 locku_args->offset = resend_rqstp->lr_flk->l_start; 13321 locku_args->length = resend_rqstp->lr_flk->l_len; 13322 if (locku_args->length == 0) 13323 locku_args->length = ~locku_args->length; 13324 13325 switch (resend_rqstp->lr_ctype) { 13326 case 
NFS4_LCK_CTYPE_RESEND: 13327 argsp->ctag = TAG_LOCKU_RESEND; 13328 break; 13329 case NFS4_LCK_CTYPE_REINSTATE: 13330 argsp->ctag = TAG_LOCKU_REINSTATE; 13331 break; 13332 default: 13333 argsp->ctag = TAG_LOCK_UNKNOWN; 13334 break; 13335 } 13336 } 13337 } 13338 13339 /* 13340 * Setup the LOCKT4 arguments. 13341 */ 13342 static void 13343 nfs4frlock_setup_lockt_args(nfs_argop4 *argop, LOCKT4args **lockt_argsp, 13344 COMPOUND4args_clnt *argsp, flock64_t *flk, rnode4_t *rp) 13345 { 13346 LOCKT4args *lockt_args; 13347 13348 ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone); 13349 argop->argop = OP_LOCKT; 13350 argsp->ctag = TAG_LOCKT; 13351 lockt_args = &argop->nfs_argop4_u.oplockt; 13352 13353 /* 13354 * The locktype will be READ_LT unless it's 13355 * a write lock. We do this because the Solaris 13356 * system call allows the combination of 13357 * F_UNLCK and F_GETLK* and so in that case the 13358 * unlock is mapped to a read. 13359 */ 13360 if (flk->l_type == F_WRLCK) 13361 lockt_args->locktype = WRITE_LT; 13362 else 13363 lockt_args->locktype = READ_LT; 13364 13365 lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp))); 13366 /* set the lock owner4 args */ 13367 nfs4_setlockowner_args(&lockt_args->owner, rp, flk->l_pid); 13368 lockt_args->offset = flk->l_start; 13369 lockt_args->length = flk->l_len; 13370 if (flk->l_len == 0) 13371 lockt_args->length = ~lockt_args->length; 13372 13373 *lockt_argsp = lockt_args; 13374 } 13375 13376 /* 13377 * If the client is holding a delegation, and the open stream to be used 13378 * with this lock request is a delegation open stream, then re-open the stream. 13379 * Sets the nfs4_error_t to all zeros unless the open stream has already 13380 * failed a reopen or we couldn't find the open stream. NFS4ERR_DELAY 13381 * means the caller should retry (like a recovery retry). 
13382 */ 13383 static void 13384 nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt) 13385 { 13386 open_delegation_type4 dt; 13387 bool_t reopen_needed, force; 13388 nfs4_open_stream_t *osp; 13389 open_claim_type4 oclaim; 13390 rnode4_t *rp = VTOR4(vp); 13391 mntinfo4_t *mi = VTOMI4(vp); 13392 13393 ASSERT(nfs_zone() == mi->mi_zone); 13394 13395 nfs4_error_zinit(ep); 13396 13397 mutex_enter(&rp->r_statev4_lock); 13398 dt = rp->r_deleg_type; 13399 mutex_exit(&rp->r_statev4_lock); 13400 13401 if (dt != OPEN_DELEGATE_NONE) { 13402 nfs4_open_owner_t *oop; 13403 13404 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 13405 if (!oop) { 13406 ep->stat = NFS4ERR_IO; 13407 return; 13408 } 13409 /* returns with 'os_sync_lock' held */ 13410 osp = find_open_stream(oop, rp); 13411 if (!osp) { 13412 open_owner_rele(oop); 13413 ep->stat = NFS4ERR_IO; 13414 return; 13415 } 13416 13417 if (osp->os_failed_reopen) { 13418 NFS4_DEBUG((nfs4_open_stream_debug || 13419 nfs4_client_lock_debug), (CE_NOTE, 13420 "nfs4frlock_check_deleg: os_failed_reopen set " 13421 "for osp %p, cr %p, rp %s", (void *)osp, 13422 (void *)cr, rnode4info(rp))); 13423 mutex_exit(&osp->os_sync_lock); 13424 open_stream_rele(osp, rp); 13425 open_owner_rele(oop); 13426 ep->stat = NFS4ERR_IO; 13427 return; 13428 } 13429 13430 /* 13431 * Determine whether a reopen is needed. If this 13432 * is a delegation open stream, then send the open 13433 * to the server to give visibility to the open owner. 13434 * Even if it isn't a delegation open stream, we need 13435 * to check if the previous open CLAIM_DELEGATE_CUR 13436 * was sufficient. 
13437 */ 13438 13439 reopen_needed = osp->os_delegation || 13440 ((lt == F_RDLCK && 13441 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) || 13442 (lt == F_WRLCK && 13443 !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE))); 13444 13445 mutex_exit(&osp->os_sync_lock); 13446 open_owner_rele(oop); 13447 13448 if (reopen_needed) { 13449 /* 13450 * Always use CLAIM_PREVIOUS after server reboot. 13451 * The server will reject CLAIM_DELEGATE_CUR if 13452 * it is used during the grace period. 13453 */ 13454 mutex_enter(&mi->mi_lock); 13455 if (mi->mi_recovflags & MI4R_SRV_REBOOT) { 13456 oclaim = CLAIM_PREVIOUS; 13457 force = TRUE; 13458 } else { 13459 oclaim = CLAIM_DELEGATE_CUR; 13460 force = FALSE; 13461 } 13462 mutex_exit(&mi->mi_lock); 13463 13464 nfs4_reopen(vp, osp, ep, oclaim, force, FALSE); 13465 if (ep->error == EAGAIN) { 13466 nfs4_error_zinit(ep); 13467 ep->stat = NFS4ERR_DELAY; 13468 } 13469 } 13470 open_stream_rele(osp, rp); 13471 osp = NULL; 13472 } 13473 } 13474 13475 /* 13476 * Setup the LOCKU4 arguments. 13477 * Returns errors via the nfs4_error_t. 13478 * NFS4_OK no problems. *go_otwp is TRUE if call should go 13479 * over-the-wire. The caller must release the 13480 * reference on *lopp. 13481 * NFS4ERR_DELAY caller should retry (like recovery retry) 13482 * (other) unrecoverable error. 
13483 */ 13484 static void 13485 nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop, 13486 LOCKU4args **locku_argsp, flock64_t *flk, 13487 nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp, 13488 vnode_t *vp, cred_t *cr, bool_t *skip_get_err, bool_t *go_otwp) 13489 { 13490 nfs4_lock_owner_t *lop = NULL; 13491 LOCKU4args *locku_args; 13492 pid_t pid = flk->l_pid; 13493 bool_t is_spec = FALSE; 13494 rnode4_t *rp = VTOR4(vp); 13495 13496 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13497 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13498 13499 nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK); 13500 if (ep->error || ep->stat) 13501 return; 13502 13503 argop->argop = OP_LOCKU; 13504 if (ctype == NFS4_LCK_CTYPE_REINSTATE) 13505 argsp->ctag = TAG_LOCKU_REINSTATE; 13506 else 13507 argsp->ctag = TAG_LOCKU; 13508 locku_args = &argop->nfs_argop4_u.oplocku; 13509 *locku_argsp = locku_args; 13510 13511 /* 13512 * XXX what should locku_args->locktype be? 13513 * setting to ALWAYS be READ_LT so at least 13514 * it is a valid locktype. 13515 */ 13516 13517 locku_args->locktype = READ_LT; 13518 13519 /* 13520 * Get the lock owner stateid. If no lock owner 13521 * exists, return success. 13522 */ 13523 lop = find_lock_owner(rp, pid, LOWN_ANY); 13524 *lopp = lop; 13525 if (lop && CLNT_ISSPECIAL(&lop->lock_stateid)) 13526 is_spec = TRUE; 13527 if (!lop || is_spec) { 13528 /* 13529 * No lock owner so no locks to unlock. 13530 * Return success. 13531 * 13532 * If the lockowner is using a special stateid, 13533 * then the original lock request (that created 13534 * this lockowner) was never successful, so we 13535 * have no lock to undo OTW. 13536 */ 13537 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 13538 "nfs4frlock_setup_locku_args: LOCKU: no lock owner " 13539 "(%ld) so return success", (long)pid)); 13540 13541 /* 13542 * Release our hold and NULL out so final_cleanup 13543 * doesn't try to end a lock seqid sync we 13544 * never started. 
13545 */ 13546 if (is_spec) { 13547 lock_owner_rele(lop); 13548 *lopp = NULL; 13549 } 13550 *skip_get_err = TRUE; 13551 *go_otwp = FALSE; 13552 return; 13553 } 13554 13555 ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp)); 13556 if (ep->error == EAGAIN) { 13557 lock_owner_rele(lop); 13558 *lopp = NULL; 13559 return; 13560 } 13561 13562 mutex_enter(&lop->lo_lock); 13563 locku_args->lock_stateid = lop->lock_stateid; 13564 mutex_exit(&lop->lo_lock); 13565 locku_args->seqid = lop->lock_seqid + 1; 13566 13567 /* leave the ref count on lop, rele after RPC call */ 13568 13569 locku_args->offset = flk->l_start; 13570 locku_args->length = flk->l_len; 13571 if (flk->l_len == 0) 13572 locku_args->length = ~locku_args->length; 13573 13574 *go_otwp = TRUE; 13575 } 13576 13577 /* 13578 * Setup the LOCK4 arguments. 13579 * 13580 * Returns errors via the nfs4_error_t. 13581 * NFS4_OK no problems 13582 * NFS4ERR_DELAY caller should retry (like recovery retry) 13583 * (other) unrecoverable error 13584 */ 13585 static void 13586 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp, 13587 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13588 nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp, 13589 flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep) 13590 { 13591 LOCK4args *lock_args; 13592 nfs4_open_owner_t *oop = NULL; 13593 nfs4_open_stream_t *osp = NULL; 13594 nfs4_lock_owner_t *lop = NULL; 13595 pid_t pid = flk->l_pid; 13596 rnode4_t *rp = VTOR4(vp); 13597 13598 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13599 13600 nfs4frlock_check_deleg(vp, ep, cr, flk->l_type); 13601 if (ep->error || ep->stat != NFS4_OK) 13602 return; 13603 13604 argop->argop = OP_LOCK; 13605 if (ctype == NFS4_LCK_CTYPE_NORM) 13606 argsp->ctag = TAG_LOCK; 13607 else if (ctype == NFS4_LCK_CTYPE_RECLAIM) 13608 argsp->ctag = TAG_RELOCK; 13609 else 13610 argsp->ctag = TAG_LOCK_REINSTATE; 13611 lock_args = &argop->nfs_argop4_u.oplock; 13612 
lock_args->locktype = flk_to_locktype(cmd, flk->l_type); 13613 lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0; 13614 /* 13615 * Get the lock owner. If no lock owner exists, 13616 * create a 'temporary' one and grab the open seqid 13617 * synchronization (which puts a hold on the open 13618 * owner and open stream). 13619 * This also grabs the lock seqid synchronization. 13620 */ 13621 ep->stat = 13622 nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop); 13623 13624 if (ep->stat != NFS4_OK) 13625 goto out; 13626 13627 nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)), 13628 &lock_args->locker); 13629 13630 lock_args->offset = flk->l_start; 13631 lock_args->length = flk->l_len; 13632 if (flk->l_len == 0) 13633 lock_args->length = ~lock_args->length; 13634 *lock_argsp = lock_args; 13635 out: 13636 *oopp = oop; 13637 *ospp = osp; 13638 *lopp = lop; 13639 } 13640 13641 /* 13642 * After we get the reply from the server, record the proper information 13643 * for possible resend lock requests. 13644 * 13645 * Allocates memory for the saved_rqstp if we have a lost lock to save. 
13646 */ 13647 static void 13648 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error, 13649 nfs_lock_type4 locktype, nfs4_open_owner_t *oop, 13650 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk, 13651 nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp) 13652 { 13653 bool_t unlock = (flk->l_type == F_UNLCK); 13654 13655 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13656 ASSERT(ctype == NFS4_LCK_CTYPE_NORM || 13657 ctype == NFS4_LCK_CTYPE_REINSTATE); 13658 13659 if (error != 0 && !unlock) { 13660 NFS4_DEBUG((nfs4_lost_rqst_debug || 13661 nfs4_client_lock_debug), (CE_NOTE, 13662 "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 " 13663 " for lop %p", (void *)lop)); 13664 ASSERT(lop != NULL); 13665 mutex_enter(&lop->lo_lock); 13666 lop->lo_pending_rqsts = 1; 13667 mutex_exit(&lop->lo_lock); 13668 } 13669 13670 lost_rqstp->lr_putfirst = FALSE; 13671 lost_rqstp->lr_op = 0; 13672 13673 /* 13674 * For lock/locku requests, we treat EINTR as ETIMEDOUT for 13675 * recovery purposes so that the lock request that was sent 13676 * can be saved and re-issued later. Ditto for EIO from a forced 13677 * unmount. This is done to have the client's local locking state 13678 * match the v4 server's state; that is, the request was 13679 * potentially received and accepted by the server but the client 13680 * thinks it was not. 13681 */ 13682 if (error == ETIMEDOUT || error == EINTR || 13683 NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) { 13684 NFS4_DEBUG((nfs4_lost_rqst_debug || 13685 nfs4_client_lock_debug), (CE_NOTE, 13686 "nfs4frlock_save_lost_rqst: got a lost %s lock for " 13687 "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK", 13688 (void *)lop, (void *)oop, (void *)osp)); 13689 if (unlock) 13690 lost_rqstp->lr_op = OP_LOCKU; 13691 else { 13692 lost_rqstp->lr_op = OP_LOCK; 13693 lost_rqstp->lr_locktype = locktype; 13694 } 13695 /* 13696 * Objects are held and rele'd via the recovery code. 13697 * See nfs4_save_lost_rqst. 
13698 */ 13699 lost_rqstp->lr_vp = vp; 13700 lost_rqstp->lr_dvp = NULL; 13701 lost_rqstp->lr_oop = oop; 13702 lost_rqstp->lr_osp = osp; 13703 lost_rqstp->lr_lop = lop; 13704 lost_rqstp->lr_cr = cr; 13705 switch (ctype) { 13706 case NFS4_LCK_CTYPE_NORM: 13707 lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND; 13708 break; 13709 case NFS4_LCK_CTYPE_REINSTATE: 13710 lost_rqstp->lr_putfirst = TRUE; 13711 lost_rqstp->lr_ctype = ctype; 13712 break; 13713 default: 13714 break; 13715 } 13716 lost_rqstp->lr_flk = flk; 13717 } 13718 } 13719 13720 /* 13721 * Update lop's seqid. Also update the seqid stored in a resend request, 13722 * if any. (Some recovery errors increment the seqid, and we may have to 13723 * send the resend request again.) 13724 */ 13725 13726 static void 13727 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args, 13728 nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type) 13729 { 13730 if (lock_args) { 13731 if (lock_args->locker.new_lock_owner == TRUE) 13732 nfs4_get_and_set_next_open_seqid(oop, tag_type); 13733 else { 13734 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13735 nfs4_set_lock_seqid(lop->lock_seqid + 1, lop); 13736 } 13737 } else if (locku_args) { 13738 ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE); 13739 nfs4_set_lock_seqid(lop->lock_seqid +1, lop); 13740 } 13741 } 13742 13743 /* 13744 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13745 * COMPOUND4 args/res for calls that need to retry. 13746 * Switches the *cred_otwp to base_cr. 
13747 */ 13748 static void 13749 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint, 13750 nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop, 13751 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error, 13752 nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp, 13753 nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp) 13754 { 13755 nfs4_open_owner_t *oop = *oopp; 13756 nfs4_open_stream_t *osp = *ospp; 13757 nfs4_lock_owner_t *lop = *lopp; 13758 nfs_argop4 *argop = (*argspp)->array; 13759 13760 if (*did_start_fop) { 13761 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13762 needrecov); 13763 *did_start_fop = FALSE; 13764 } 13765 ASSERT((*argspp)->array_len == 2); 13766 if (argop[1].argop == OP_LOCK) 13767 nfs4args_lock_free(&argop[1]); 13768 else if (argop[1].argop == OP_LOCKT) 13769 nfs4args_lockt_free(&argop[1]); 13770 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13771 if (!error) 13772 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13773 *argspp = NULL; 13774 *respp = NULL; 13775 13776 if (lop) { 13777 nfs4_end_lock_seqid_sync(lop); 13778 lock_owner_rele(lop); 13779 *lopp = NULL; 13780 } 13781 13782 /* need to free up the reference on osp for lock args */ 13783 if (osp != NULL) { 13784 open_stream_rele(osp, VTOR4(vp)); 13785 *ospp = NULL; 13786 } 13787 13788 /* need to free up the reference on oop for lock args */ 13789 if (oop != NULL) { 13790 nfs4_end_open_seqid_sync(oop); 13791 open_owner_rele(oop); 13792 *oopp = NULL; 13793 } 13794 13795 crfree(*cred_otwp); 13796 *cred_otwp = base_cr; 13797 crhold(*cred_otwp); 13798 } 13799 13800 /* 13801 * Function to process the client's recovery for nfs4frlock. 13802 * Returns TRUE if we should retry the lock request; FALSE otherwise. 13803 * 13804 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13805 * COMPOUND4 args/res for calls that need to retry. 13806 * 13807 * Note: the rp's r_lkserlock is *not* dropped during this path. 
13808 */ 13809 static bool_t 13810 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep, 13811 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13812 LOCK4args *lock_args, LOCKU4args *locku_args, 13813 nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp, 13814 nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp, 13815 nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint, 13816 bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk) 13817 { 13818 nfs4_open_owner_t *oop = *oopp; 13819 nfs4_open_stream_t *osp = *ospp; 13820 nfs4_lock_owner_t *lop = *lopp; 13821 13822 bool_t abort, retry; 13823 13824 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13825 ASSERT((*argspp) != NULL); 13826 ASSERT((*respp) != NULL); 13827 if (lock_args || locku_args) 13828 ASSERT(lop != NULL); 13829 13830 NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug), 13831 (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n")); 13832 13833 retry = TRUE; 13834 abort = FALSE; 13835 if (needrecov) { 13836 nfs4_bseqid_entry_t *bsep = NULL; 13837 nfs_opnum4 op; 13838 13839 op = lock_args ? OP_LOCK : locku_args ? OP_LOCKU : OP_LOCKT; 13840 13841 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) { 13842 seqid4 seqid; 13843 13844 if (lock_args) { 13845 if (lock_args->locker.new_lock_owner == TRUE) 13846 seqid = lock_args->locker.locker4_u. 13847 open_owner.open_seqid; 13848 else 13849 seqid = lock_args->locker.locker4_u. 13850 lock_owner.lock_seqid; 13851 } else if (locku_args) { 13852 seqid = locku_args->seqid; 13853 } else { 13854 seqid = 0; 13855 } 13856 13857 bsep = nfs4_create_bseqid_entry(oop, lop, vp, 13858 flk->l_pid, (*argspp)->ctag, seqid); 13859 } 13860 13861 abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL, 13862 (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK || 13863 lost_rqstp->lr_op == OP_LOCKU)) ? 
lost_rqstp : 13864 NULL, op, bsep, NULL, NULL); 13865 13866 if (bsep) 13867 kmem_free(bsep, sizeof (*bsep)); 13868 } 13869 13870 /* 13871 * Return that we do not want to retry the request for 3 cases: 13872 * 1. If we received EINTR or are bailing out because of a forced 13873 * unmount, we came into this code path just for the sake of 13874 * initiating recovery, we now need to return the error. 13875 * 2. If we have aborted recovery. 13876 * 3. We received NFS4ERR_BAD_SEQID. 13877 */ 13878 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) || 13879 abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID)) 13880 retry = FALSE; 13881 13882 if (*did_start_fop == TRUE) { 13883 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep, 13884 needrecov); 13885 *did_start_fop = FALSE; 13886 } 13887 13888 if (retry == TRUE) { 13889 nfs_argop4 *argop; 13890 13891 argop = (*argspp)->array; 13892 ASSERT((*argspp)->array_len == 2); 13893 13894 if (argop[1].argop == OP_LOCK) 13895 nfs4args_lock_free(&argop[1]); 13896 else if (argop[1].argop == OP_LOCKT) 13897 nfs4args_lockt_free(&argop[1]); 13898 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13899 if (!ep->error) 13900 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp); 13901 *respp = NULL; 13902 *argspp = NULL; 13903 } 13904 13905 if (lop != NULL) { 13906 nfs4_end_lock_seqid_sync(lop); 13907 lock_owner_rele(lop); 13908 } 13909 13910 *lopp = NULL; 13911 13912 /* need to free up the reference on osp for lock args */ 13913 if (osp != NULL) { 13914 open_stream_rele(osp, rp); 13915 *ospp = NULL; 13916 } 13917 13918 /* need to free up the reference on oop for lock args */ 13919 if (oop != NULL) { 13920 nfs4_end_open_seqid_sync(oop); 13921 open_owner_rele(oop); 13922 *oopp = NULL; 13923 } 13924 13925 return (retry); 13926 } 13927 13928 /* 13929 * Handle the DENIED reply from the server for nfs4frlock. 13930 * Returns TRUE if we should retry the request; FALSE otherwise. 
13931 * 13932 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 13933 * COMPOUND4 args/res for calls that need to retry. Can also 13934 * drop and regrab the r_lkserlock. 13935 */ 13936 static bool_t 13937 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args, 13938 LOCKT4args *lockt_args, nfs4_open_owner_t **oopp, 13939 nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd, 13940 vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint, 13941 nfs4_recov_state_t *recov_statep, int needrecov, 13942 COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, 13943 clock_t *tick_delayp, int *errorp, 13944 nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop, 13945 bool_t *skip_get_err) 13946 { 13947 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 13948 13949 if (lock_args) { 13950 nfs4_open_owner_t *oop = *oopp; 13951 nfs4_open_stream_t *osp = *ospp; 13952 nfs4_lock_owner_t *lop = *lopp; 13953 int intr; 13954 13955 /* 13956 * Blocking lock needs to sleep and retry from the request. 13957 * 13958 * Do not block and wait for 'resend' or 'reinstate' 13959 * lock requests, just return the error. 13960 * 13961 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW. 
13962 */ 13963 if (cmd == F_SETLKW) { 13964 rnode4_t *rp = VTOR4(vp); 13965 nfs_argop4 *argop = (*argspp)->array; 13966 13967 ASSERT(ctype == NFS4_LCK_CTYPE_NORM); 13968 13969 nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, 13970 recov_statep, needrecov); 13971 *did_start_fop = FALSE; 13972 ASSERT((*argspp)->array_len == 2); 13973 if (argop[1].argop == OP_LOCK) 13974 nfs4args_lock_free(&argop[1]); 13975 else if (argop[1].argop == OP_LOCKT) 13976 nfs4args_lockt_free(&argop[1]); 13977 kmem_free(argop, 2 * sizeof (nfs_argop4)); 13978 if (*respp) 13979 (void) xdr_free(xdr_COMPOUND4res_clnt, 13980 (caddr_t)*respp); 13981 *argspp = NULL; 13982 *respp = NULL; 13983 nfs4_end_lock_seqid_sync(lop); 13984 lock_owner_rele(lop); 13985 *lopp = NULL; 13986 if (osp != NULL) { 13987 open_stream_rele(osp, rp); 13988 *ospp = NULL; 13989 } 13990 if (oop != NULL) { 13991 nfs4_end_open_seqid_sync(oop); 13992 open_owner_rele(oop); 13993 *oopp = NULL; 13994 } 13995 13996 nfs_rw_exit(&rp->r_lkserlock); 13997 13998 intr = nfs4_block_and_wait(tick_delayp); 13999 14000 (void) nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, 14001 FALSE); 14002 14003 if (intr) { 14004 *errorp = EINTR; 14005 return (FALSE); 14006 } 14007 14008 /* 14009 * Make sure we are still safe to lock with 14010 * regards to mmapping. 
14011 */ 14012 if (!nfs4_safelock(vp, flk, cr)) { 14013 *errorp = EAGAIN; 14014 return (FALSE); 14015 } 14016 14017 return (TRUE); 14018 } 14019 if (ctype == NFS4_LCK_CTYPE_NORM) 14020 *errorp = EAGAIN; 14021 *skip_get_err = TRUE; 14022 flk->l_whence = 0; 14023 return (FALSE); 14024 } else if (lockt_args) { 14025 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14026 "nfs4frlock_results_denied: OP_LOCKT DENIED")); 14027 14028 denied_to_flk(&resop->nfs_resop4_u.oplockt.denied, 14029 flk, lockt_args); 14030 14031 /* according to NLM code */ 14032 *errorp = 0; 14033 *skip_get_err = TRUE; 14034 return (FALSE); 14035 } 14036 return (FALSE); 14037 } 14038 14039 /* 14040 * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock. 14041 */ 14042 static void 14043 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp) 14044 { 14045 switch (resp->status) { 14046 case NFS4ERR_ACCESS: 14047 case NFS4ERR_ADMIN_REVOKED: 14048 case NFS4ERR_BADHANDLE: 14049 case NFS4ERR_BAD_RANGE: 14050 case NFS4ERR_BAD_SEQID: 14051 case NFS4ERR_BAD_STATEID: 14052 case NFS4ERR_BADXDR: 14053 case NFS4ERR_DEADLOCK: 14054 case NFS4ERR_DELAY: 14055 case NFS4ERR_EXPIRED: 14056 case NFS4ERR_FHEXPIRED: 14057 case NFS4ERR_GRACE: 14058 case NFS4ERR_INVAL: 14059 case NFS4ERR_ISDIR: 14060 case NFS4ERR_LEASE_MOVED: 14061 case NFS4ERR_LOCK_NOTSUPP: 14062 case NFS4ERR_LOCK_RANGE: 14063 case NFS4ERR_MOVED: 14064 case NFS4ERR_NOFILEHANDLE: 14065 case NFS4ERR_NO_GRACE: 14066 case NFS4ERR_OLD_STATEID: 14067 case NFS4ERR_OPENMODE: 14068 case NFS4ERR_RECLAIM_BAD: 14069 case NFS4ERR_RECLAIM_CONFLICT: 14070 case NFS4ERR_RESOURCE: 14071 case NFS4ERR_SERVERFAULT: 14072 case NFS4ERR_STALE: 14073 case NFS4ERR_STALE_CLIENTID: 14074 case NFS4ERR_STALE_STATEID: 14075 return; 14076 default: 14077 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14078 "nfs4frlock_results_default: got unrecognizable " 14079 "res.status %d", resp->status)); 14080 *errorp = NFS4ERR_INVAL; 14081 } 14082 } 14083 14084 /* 14085 * 
The lock request was successful, so update the client's state. 14086 */ 14087 static void 14088 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args, 14089 LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop, 14090 vnode_t *vp, flock64_t *flk, cred_t *cr, 14091 nfs4_lost_rqst_t *resend_rqstp) 14092 { 14093 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14094 14095 if (lock_args) { 14096 LOCK4res *lock_res; 14097 14098 lock_res = &resop->nfs_resop4_u.oplock; 14099 /* update the stateid with server's response */ 14100 14101 if (lock_args->locker.new_lock_owner == TRUE) { 14102 mutex_enter(&lop->lo_lock); 14103 lop->lo_just_created = NFS4_PERM_CREATED; 14104 mutex_exit(&lop->lo_lock); 14105 } 14106 14107 nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid); 14108 14109 /* 14110 * If the lock was the result of a resending a lost 14111 * request, we've synched up the stateid and seqid 14112 * with the server, but now the server might be out of sync 14113 * with what the application thinks it has for locks. 14114 * Clean that up here. It's unclear whether we should do 14115 * this even if the filesystem has been forcibly unmounted. 14116 * For most servers, it's probably wasted effort, but 14117 * RFC3530 lets servers require that unlocks exactly match 14118 * the locks that are held. 
14119 */ 14120 if (resend_rqstp != NULL && 14121 resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) { 14122 nfs4_reinstitute_local_lock_state(vp, flk, cr, lop); 14123 } else { 14124 flk->l_whence = 0; 14125 } 14126 } else if (locku_args) { 14127 LOCKU4res *locku_res; 14128 14129 locku_res = &resop->nfs_resop4_u.oplocku; 14130 14131 /* Update the stateid with the server's response */ 14132 nfs4_set_lock_stateid(lop, locku_res->lock_stateid); 14133 } else if (lockt_args) { 14134 /* Switch the lock type to express success, see fcntl */ 14135 flk->l_type = F_UNLCK; 14136 flk->l_whence = 0; 14137 } 14138 } 14139 14140 /* 14141 * Do final cleanup before exiting nfs4frlock. 14142 * Calls nfs4_end_fop, drops the seqid syncs, and frees up the 14143 * COMPOUND4 args/res for calls that haven't already. 14144 */ 14145 static void 14146 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp, 14147 COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint, 14148 nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop, 14149 nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, 14150 int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args, 14151 bool_t did_start_fop, bool_t skip_get_err, 14152 cred_t *cred_otw, cred_t *cred) 14153 { 14154 mntinfo4_t *mi = VTOMI4(vp); 14155 rnode4_t *rp = VTOR4(vp); 14156 int error = *errorp; 14157 nfs_argop4 *argop; 14158 int do_flush_pages = 0; 14159 14160 ASSERT(nfs_zone() == mi->mi_zone); 14161 /* 14162 * The client recovery code wants the raw status information, 14163 * so don't map the NFS status code to an errno value for 14164 * non-normal call types. 
14165 */ 14166 if (ctype == NFS4_LCK_CTYPE_NORM) { 14167 if (*errorp == 0 && resp != NULL && skip_get_err == FALSE) 14168 *errorp = geterrno4(resp->status); 14169 if (did_start_fop == TRUE) 14170 nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep, 14171 needrecov); 14172 14173 /* 14174 * We've established a new lock on the server, so invalidate 14175 * the pages associated with the vnode to get the most up to 14176 * date pages from the server after acquiring the lock. We 14177 * want to be sure that the read operation gets the newest data. 14178 * 14179 * We flush the pages below after calling nfs4_end_fop above. 14180 * 14181 * The flush of the page cache must be done after 14182 * nfs4_end_open_seqid_sync() to avoid a 4-way hang. 14183 */ 14184 if (!error && resp && resp->status == NFS4_OK) 14185 do_flush_pages = 1; 14186 } 14187 if (argsp) { 14188 ASSERT(argsp->array_len == 2); 14189 argop = argsp->array; 14190 if (argop[1].argop == OP_LOCK) 14191 nfs4args_lock_free(&argop[1]); 14192 else if (argop[1].argop == OP_LOCKT) 14193 nfs4args_lockt_free(&argop[1]); 14194 kmem_free(argop, 2 * sizeof (nfs_argop4)); 14195 if (resp) 14196 (void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp); 14197 } 14198 14199 /* free the reference on the lock owner */ 14200 if (lop != NULL) { 14201 nfs4_end_lock_seqid_sync(lop); 14202 lock_owner_rele(lop); 14203 } 14204 14205 /* need to free up the reference on osp for lock args */ 14206 if (osp != NULL) 14207 open_stream_rele(osp, rp); 14208 14209 /* need to free up the reference on oop for lock args */ 14210 if (oop != NULL) { 14211 nfs4_end_open_seqid_sync(oop); 14212 open_owner_rele(oop); 14213 } 14214 14215 if (do_flush_pages) 14216 nfs4_flush_pages(vp, cred); 14217 14218 /* 14219 * Record debug information in the event we get EINVAL. 
14220 */ 14221 mutex_enter(&mi->mi_lock); 14222 if (*errorp == EINVAL && (lock_args || locku_args) && 14223 (!(mi->mi_flags & MI4_POSIX_LOCK))) { 14224 if (!(mi->mi_flags & MI4_LOCK_DEBUG)) { 14225 zcmn_err(getzoneid(), CE_NOTE, 14226 "%s operation failed with " 14227 "EINVAL probably since the server, %s," 14228 " doesn't support POSIX style locking", 14229 lock_args ? "LOCK" : "LOCKU", 14230 mi->mi_curr_serv->sv_hostname); 14231 mi->mi_flags |= MI4_LOCK_DEBUG; 14232 } 14233 } 14234 mutex_exit(&mi->mi_lock); 14235 14236 if (cred_otw) 14237 crfree(cred_otw); 14238 } 14239 14240 /* 14241 * This calls the server. 14242 * 14243 * Blocking lock requests will continually retry to acquire the lock 14244 * forever. 14245 * 14246 * The ctype is defined as follows: 14247 * NFS4_LCK_CTYPE_NORM: normal lock request. 14248 * 14249 * NFS4_LCK_CTYPE_RECLAIM: bypass the usual calls for synchronizing with client 14250 * recovery. 14251 * 14252 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition 14253 * that we will use the information passed in via resend_rqstp to setup the 14254 * lock/locku request. This resend is the exact same request as the 'lost 14255 * lock', and is initiated by the recovery framework. A successful resend 14256 * request can initiate one or more reinstate requests. 14257 * 14258 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it 14259 * does not trigger additional reinstate requests. This lock call type is 14260 * set for setting the v4 server's locking state back to match what the 14261 * client's local locking state is in the event of a received 'lost lock'. 14262 * 14263 * Errors are returned via the nfs4_error_t parameter. 
 */
/*
 * Argument notes, as used by the body below:
 *	cmd		F_GETLK and F_O_GETLK build a LOCKT request; F_SETLK
 *			and F_SETLKW build a LOCKU when flk->l_type is
 *			F_UNLCK and a LOCK otherwise.  Any other value fails
 *			with EINVAL.  For RESEND and REINSTATE calls the
 *			arguments are built from resend_rqstp instead.
 *	cr		the caller's credential.  The over-the-wire call is
 *			made with cred_otw, which is passed by reference to
 *			nfs4frlock_pre_setup() and nfs4frlock_check_access().
 *	did_reclaimp	may be NULL; when non-NULL it is set to 0 if a
 *			reclaim is retried as an ordinary lock after the
 *			server answered NFS4ERR_NO_GRACE.
 *
 * An EINTR result for an unlock request (F_SETLK/F_SETLKW with F_UNLCK)
 * is converted to success just before returning.
 */
void
nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk,
    cred_t *cr, nfs4_error_t *ep, nfs4_lost_rqst_t *resend_rqstp,
    int *did_reclaimp)
{
	COMPOUND4args_clnt args, *argsp = NULL;
	COMPOUND4res_clnt res, *resp = NULL;
	nfs_argop4 *argop;
	nfs_resop4 *resop;
	rnode4_t *rp;
	int doqueue = 1;
	clock_t tick_delay; /* delay in clock ticks */
	LOCK4args *lock_args = NULL;
	LOCKU4args *locku_args = NULL;
	LOCKT4args *lockt_args = NULL;
	nfs4_open_owner_t *oop = NULL;
	nfs4_open_stream_t *osp = NULL;
	nfs4_lock_owner_t *lop = NULL;
	bool_t needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	nfs4_op_hint_t op_hint;
	nfs4_lost_rqst_t lost_rqst;
	bool_t retry = FALSE;
	bool_t did_start_fop = FALSE;
	bool_t skip_get_err = FALSE;
	cred_t *cred_otw = NULL;
	bool_t recovonly; /* just queue request */
	int frc_no_reclaim = 0;	/* set once a reclaim got NFS4ERR_NO_GRACE */
#ifdef DEBUG
	char *name;
#endif

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

#ifdef DEBUG
	/* fn_name() hands back a MAXNAMELEN buffer that is freed below */
	name = fn_name(VTOSV(vp)->sv_name);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: "
	    "%s: cmd %d, type %d, start %"PRIx64", "
	    "length %"PRIu64", pid %d, sysid %d, call type %s, "
	    "resend request %s", name, cmd, flk->l_type, flk->l_start,
	    flk->l_len, flk->l_pid, flk->l_sysid,
	    nfs4frlock_get_call_type(ctype),
	    resend_rqstp ? "TRUE" : "FALSE"));
	kmem_free(name, MAXNAMELEN);
#endif

	nfs4_error_zinit(ep);

	nfs4frlock_pre_setup(&tick_delay, &recov_state, vp, cr, &cred_otw);

	rp = VTOR4(vp);

	/*
	 * Every retry (server delay, credential swap, recovery, denied
	 * blocking lock, reclaim fallback) restarts from this label, so
	 * nfs4frlock_call_init() runs again on each pass.
	 */
recov_retry:
	nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd,
	    &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst);

	ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state,
	    &did_start_fop, &recovonly);

	if (ep->error)
		goto out;

	if (recovonly) {
		/*
		 * Leave the request for the recovery system to deal with.
		 * The result is recorded as EINTR; the code after 'out'
		 * reports EINTR on an unlock request as success.
		 */
		ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
		ASSERT(cmd != F_GETLK);
		ASSERT(flk->l_type == F_UNLCK);

		nfs4_error_init(ep, EINTR);
		needrecov = TRUE;
		lop = find_lock_owner(rp, flk->l_pid, LOWN_ANY);
		if (lop != NULL) {
			nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT,
			    NULL, NULL, lop, flk, &lost_rqst, cr, vp);
			/*
			 * Only hand lost_rqst to recovery if the save
			 * above actually recorded a LOCK or LOCKU.
			 */
			(void) nfs4_start_recovery(ep,
			    VTOMI4(vp), vp, NULL, NULL,
			    (lost_rqst.lr_op == OP_LOCK ||
			    lost_rqst.lr_op == OP_LOCKU) ?
			    &lost_rqst : NULL, OP_LOCKU, NULL, NULL, NULL);
			/* drop the reference here so cleanup won't see lop */
			lock_owner_rele(lop);
			lop = NULL;
		}
		goto out;
	}

	/* putfh directory fh */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/*
	 * Set up the over-the-wire arguments and get references to the
	 * open owner, etc.
	 */

	if (ctype == NFS4_LCK_CTYPE_RESEND ||
	    ctype == NFS4_LCK_CTYPE_REINSTATE) {
		nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp,
		    &argop[1], &lop, &oop, &osp, &lock_args, &locku_args);
	} else {
		/*
		 * go_otw is handed only to the LOCKU setup routine; if it
		 * comes back FALSE the server call is skipped entirely.
		 */
		bool_t go_otw = TRUE;

		ASSERT(resend_rqstp == NULL);

		switch (cmd) {
		case F_GETLK:
		case F_O_GETLK:
			ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
			nfs4frlock_setup_lockt_args(&argop[1], &lockt_args,
			    argsp, flk, rp);
			break;
		case F_SETLKW:
		case F_SETLK:
			if (flk->l_type == F_UNLCK)
				nfs4frlock_setup_locku_args(ctype,
				    &argop[1], &locku_args, flk,
				    &lop, ep, argsp, vp, cr,
				    &skip_get_err, &go_otw);
			else
				nfs4frlock_setup_lock_args(ctype,
				    &lock_args, &oop, &osp, &lop, &argop[1],
				    argsp, flk, cmd, vp, cr, ep);

			/* the setup routines report failures through ep */
			if (ep->error)
				goto out;

			switch (ep->stat) {
			case NFS4_OK:
				break;
			case NFS4ERR_DELAY:
				/* recov thread never gets this error */
				ASSERT(resend_rqstp == NULL);
				ASSERT(did_start_fop);

				/*
				 * Undo the start_call and free the argument
				 * array by hand before starting over; argsp
				 * is cleared so nothing is freed twice.
				 */
				nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
				    &recov_state, TRUE);
				did_start_fop = FALSE;
				if (argop[1].argop == OP_LOCK)
					nfs4args_lock_free(&argop[1]);
				else if (argop[1].argop == OP_LOCKT)
					nfs4args_lockt_free(&argop[1]);
				kmem_free(argop, 2 * sizeof (nfs_argop4));
				argsp = NULL;
				goto recov_retry;
			default:
				ep->error = EIO;
				goto out;
			}
			break;
		default:
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4_frlock: invalid cmd %d", cmd));
			ep->error = EINVAL;
			goto out;
		}

		if (!go_otw)
			goto out;
	}

	/*
	 * Send the server the lock request.  Continually loop with a delay
	 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE.
	 */
	resp = &res;

	NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug),
	    (CE_NOTE,
	    "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(rp)));

	/*
	 * A previous pass got NFS4ERR_NO_GRACE for a reclaim: send this
	 * attempt as an ordinary lock and tell the caller no reclaim
	 * happened.
	 */
	if (lock_args && frc_no_reclaim) {
		ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock: frc_no_reclaim: clearing reclaim"));
		lock_args->reclaim = FALSE;
		if (did_reclaimp)
			*did_reclaimp = 0;
	}

	/*
	 * Do the OTW call.
	 */
	rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep);

	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: error %d, status %d", ep->error, resp->status));

	needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: needrecov %d", needrecov));

	if (ep->error == 0 && nfs4_need_to_bump_seqid(resp))
		nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop,
		    args.ctag);

	/*
	 * Check if one of these mutually exclusive error cases has
	 * happened:
	 *	need to swap credentials due to access error
	 *	recovery is needed
	 *	different error (only known case is missing Kerberos ticket)
	 */

	/*
	 * Access failure while cred_otw differs from the caller's cr:
	 * nfs4frlock_check_access() receives cred_otw by reference
	 * (presumably to switch credentials -- confirm in that routine)
	 * and the whole request is retried.
	 */
	if ((ep->error == EACCES ||
	    (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov,
		    &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp,
		    cr, &cred_otw);
		goto recov_retry;
	}

	if (needrecov) {
		/*
		 * LOCKT requests don't need to recover from lost
		 * requests since they don't create/modify state.
		 */
		if ((ep->error == EINTR ||
		    NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) &&
		    lockt_args)
			goto out;
		/*
		 * Do not attempt recovery for requests initiated by
		 * the recovery framework.  Let the framework redrive them.
		 */
		if (ctype != NFS4_LCK_CTYPE_NORM)
			goto out;
		else {
			ASSERT(resend_rqstp == NULL);
		}

		nfs4frlock_save_lost_rqst(ctype, ep->error,
		    flk_to_locktype(cmd, flk->l_type),
		    oop, osp, lop, flk, &lost_rqst, cred_otw, vp);

		/* F_GETLK never passes a lost request to recovery */
		retry = nfs4frlock_recovery(needrecov, ep, &argsp,
		    &resp, lock_args, locku_args, &oop, &osp, &lop,
		    rp, vp, &recov_state, op_hint, &did_start_fop,
		    cmd != F_GETLK ? &lost_rqst : NULL, flk);

		if (retry) {
			/* the recovery routine must have dropped these */
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		goto out;
	}

	/*
	 * Bail out if have reached this point with ep->error set. Can
	 * happen if (ep->error == EACCES && !needrecov && cred_otw == cr).
	 * This happens if Kerberos ticket has expired or has been
	 * destroyed.
	 */
	if (ep->error != 0)
		goto out;

	/*
	 * Process the reply.
	 */
	switch (resp->status) {
	case NFS4_OK:
		/* array[0] answers the PUTFH; array[1] is the lock op */
		resop = &resp->array[1];
		/*
		 * Have a successful lock operation, now update state.
		 */
		nfs4frlock_update_state(lock_args, locku_args, lockt_args,
		    resop, lop, vp, flk, cr, resend_rqstp);
		break;

	case NFS4ERR_DENIED:
		resop = &resp->array[1];
		retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args,
		    &oop, &osp, &lop, cmd, vp, flk, op_hint,
		    &recov_state, needrecov, &argsp, &resp,
		    &tick_delay, &ep->error, resop, cr,
		    &did_start_fop, &skip_get_err);

		if (retry) {
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		break;
	/*
	 * If the server won't let us reclaim, fall-back to trying to lock
	 * the file from scratch. Code elsewhere will check the changeinfo
	 * to ensure the file hasn't been changed.
	 */
	case NFS4ERR_NO_GRACE:
		if (lock_args && lock_args->reclaim == TRUE) {
			ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4frlock: reclaim: NFS4ERR_NO_GRACE"));
			frc_no_reclaim = 1;
			/* clean up before retrying */
			needrecov = 0;
			/*
			 * Called with needrecov == 0 and no lost request;
			 * the return value is deliberately ignored because
			 * the retry is unconditional.
			 */
			(void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp,
			    lock_args, locku_args, &oop, &osp, &lop, rp, vp,
			    &recov_state, op_hint, &did_start_fop, NULL, flk);
			goto recov_retry;
		}
		/* FALLTHROUGH */

	default:
		nfs4frlock_results_default(resp, &ep->error);
		break;
	}
out:
	/*
	 * Process and cleanup from error.  Make interrupted unlock
	 * requests look successful, since they will be handled by the
	 * client recovery code.
	 */
	nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state,
	    needrecov, oop, osp, lop, &ep->error,
	    lock_args, locku_args, did_start_fop,
	    skip_get_err, cred_otw, cr);

	if (ep->error == EINTR && flk->l_type == F_UNLCK &&
	    (cmd == F_SETLK || cmd == F_SETLKW))
		ep->error = 0;
}

/*
 * nfs4_safelock:
 *
 * Return non-zero if the given lock request can be handled without
 * violating the constraints on concurrent mapping and locking.
 *
 * The answer is 1 when the file is not mapped (r_mapcnt == 0).  For a
 * mapped file the request must cover the whole file (l_start and l_len
 * both zero) and MANDLOCK() must be false for the file's current mode.
 * A failure to fetch the mode is answered with 0.
 */

static int
nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr)
{
	rnode4_t *rp = VTOR4(vp);
	struct vattr va;
	int error;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
	ASSERT(rp->r_mapcnt >= 0);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: "
	    "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ?
	    "write" : bfp->l_type == F_RDLCK ? "read" : "unlock",
	    bfp->l_start, bfp->l_len, rp->r_mapcnt));

	if (rp->r_mapcnt == 0)
		return (1);		/* always safe if not mapped */

	/*
	 * If the file is already mapped and there are locks, then they
	 * should be all safe locks.  So adding or removing a lock is safe
	 * as long as the new request is safe (i.e., whole-file, meaning
	 * length and starting offset are both zero).
	 */

	if (bfp->l_start != 0 || bfp->l_len != 0) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
		    "cannot lock a memory mapped file unless locking the "
		    "entire file: start %"PRIx64", len %"PRIx64,
		    bfp->l_start, bfp->l_len));
		return (0);
	}

	/* mandatory locking and mapping don't mix */
	va.va_mask = AT_MODE;	/* only the mode is needed for MANDLOCK() */
	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
	if (error != 0) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
		    "getattr error %d", error));
		return (0);		/* treat errors conservatively */
	}
	if (MANDLOCK(vp, va.va_mode)) {
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
		    "cannot mandatory lock and mmap a file"));
		return (0);
	}

	return (1);
}

/*
 * nfs4_lockrelease:
 *
 * Release any locks on the given vnode that are held by the current
 * process.  Also removes the lock owner (if one exists) from the rnode's
 * list.
14655 */ 14656 static int 14657 nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr) 14658 { 14659 flock64_t ld; 14660 int ret, error; 14661 rnode4_t *rp; 14662 nfs4_lock_owner_t *lop; 14663 nfs4_recov_state_t recov_state; 14664 mntinfo4_t *mi; 14665 bool_t possible_orphan = FALSE; 14666 bool_t recovonly; 14667 14668 ASSERT((uintptr_t)vp > KERNELBASE); 14669 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14670 14671 rp = VTOR4(vp); 14672 mi = VTOMI4(vp); 14673 14674 /* 14675 * If we have not locked anything then we can 14676 * just return since we have no work to do. 14677 */ 14678 if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) { 14679 return (0); 14680 } 14681 14682 /* 14683 * We need to comprehend that another thread may 14684 * kick off recovery and the lock_owner we have stashed 14685 * in lop might be invalid so we should NOT cache it 14686 * locally! 14687 */ 14688 recov_state.rs_flags = 0; 14689 recov_state.rs_num_retry_despite_err = 0; 14690 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14691 &recovonly); 14692 if (error) { 14693 mutex_enter(&rp->r_statelock); 14694 rp->r_flags |= R4LODANGLERS; 14695 mutex_exit(&rp->r_statelock); 14696 return (error); 14697 } 14698 14699 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14700 14701 /* 14702 * Check if the lock owner might have a lock (request was sent but 14703 * no response was received). Also check if there are any remote 14704 * locks on the file. (In theory we shouldn't have to make this 14705 * second check if there's no lock owner, but for now we'll be 14706 * conservative and do it anyway.) If either condition is true, 14707 * send an unlock for the entire file to the server. 14708 * 14709 * Note that no explicit synchronization is needed here. At worst, 14710 * flk_has_remote_locks() will return a false positive, in which case 14711 * the unlock call wastes time but doesn't harm correctness. 
14712 */ 14713 14714 if (lop) { 14715 mutex_enter(&lop->lo_lock); 14716 possible_orphan = lop->lo_pending_rqsts; 14717 mutex_exit(&lop->lo_lock); 14718 lock_owner_rele(lop); 14719 } 14720 14721 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14722 14723 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14724 "nfs4_lockrelease: possible orphan %d, remote locks %d, for " 14725 "lop %p.", possible_orphan, flk_has_remote_locks(vp), 14726 (void *)lop)); 14727 14728 if (possible_orphan || flk_has_remote_locks(vp)) { 14729 ld.l_type = F_UNLCK; /* set to unlock entire file */ 14730 ld.l_whence = 0; /* unlock from start of file */ 14731 ld.l_start = 0; 14732 ld.l_len = 0; /* do entire file */ 14733 14734 ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL, 14735 cr, NULL); 14736 14737 if (ret != 0) { 14738 /* 14739 * If VOP_FRLOCK fails, make sure we unregister 14740 * local locks before we continue. 14741 */ 14742 struct lm_sysid *lmsid = nfs4_find_sysid(VTOMI4(vp)); 14743 14744 if (lmsid != NULL) { 14745 cleanlocks(vp, curproc->p_pid, 14746 lm_sysidt(lmsid) | LM_SYSID_CLIENT); 14747 lm_rel_sysid(lmsid); 14748 } 14749 14750 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 14751 "nfs4_lockrelease: lock release error on vp" 14752 " %p: error %d.\n", (void *)vp, ret)); 14753 } 14754 } 14755 14756 recov_state.rs_flags = 0; 14757 recov_state.rs_num_retry_despite_err = 0; 14758 error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 14759 &recovonly); 14760 if (error) { 14761 mutex_enter(&rp->r_statelock); 14762 rp->r_flags |= R4LODANGLERS; 14763 mutex_exit(&rp->r_statelock); 14764 return (error); 14765 } 14766 14767 /* 14768 * So, here we're going to need to retrieve the lock-owner 14769 * again (in case recovery has done a switch-a-roo) and 14770 * remove it because we can. 
14771 */ 14772 lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY); 14773 14774 if (lop) { 14775 nfs4_rnode_remove_lock_owner(rp, lop); 14776 lock_owner_rele(lop); 14777 } 14778 14779 nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0); 14780 return (0); 14781 } 14782 14783 /* 14784 * Wait for 'tick_delay' clock ticks. 14785 * Implement exponential backoff until hit the lease_time of this nfs4_server. 14786 * 14787 * The client should retry to acquire the lock faster than the lease period. 14788 * We use roughly half of the lease time to use a similar calculation as it is 14789 * used in nfs4_renew_lease_thread(). 14790 * 14791 * XXX For future improvements, should implement a waiting queue scheme. 14792 */ 14793 static int 14794 nfs4_block_and_wait(clock_t *tick_delay) 14795 { 14796 /* wait tick_delay clock ticks or siginteruptus */ 14797 if (delay_sig(*tick_delay)) { 14798 return (EINTR); 14799 } 14800 14801 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: " 14802 "reissue the lock request: blocked for %ld clock ticks: %ld " 14803 "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000)); 14804 14805 *tick_delay = MIN(drv_usectohz(nfs4_max_base_wait_time * 1000), 14806 *tick_delay * 1.5); 14807 return (0); 14808 } 14809 14810 void 14811 nfs4_vnops_init(void) 14812 { 14813 } 14814 14815 void 14816 nfs4_vnops_fini(void) 14817 { 14818 } 14819 14820 /* 14821 * Return a reference to the directory (parent) vnode for a given vnode, 14822 * using the saved pathname information and the directory file handle. The 14823 * caller is responsible for disposing of the reference. 14824 * Returns zero or an errno value. 14825 * 14826 * Caller should set need_start_op to FALSE if it is the recovery 14827 * thread, or if a start_fop has already been done. Otherwise, TRUE. 
14828 */ 14829 int 14830 vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op) 14831 { 14832 svnode_t *svnp; 14833 vnode_t *dvp = NULL; 14834 servinfo4_t *svp; 14835 nfs4_fname_t *mfname; 14836 int error; 14837 14838 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 14839 14840 if (vp->v_flag & VROOT) { 14841 nfs4_sharedfh_t *sfh; 14842 nfs_fh4 fh; 14843 mntinfo4_t *mi; 14844 14845 ASSERT(vp->v_type == VREG); 14846 14847 mi = VTOMI4(vp); 14848 svp = mi->mi_curr_serv; 14849 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14850 fh.nfs_fh4_len = svp->sv_pfhandle.fh_len; 14851 fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf; 14852 sfh = sfh4_get(&fh, VTOMI4(vp)); 14853 nfs_rw_exit(&svp->sv_lock); 14854 mfname = mi->mi_fname; 14855 fn_hold(mfname); 14856 dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0); 14857 sfh4_rele(&sfh); 14858 14859 if (dvp->v_type == VNON) 14860 dvp->v_type = VDIR; 14861 *dvpp = dvp; 14862 return (0); 14863 } 14864 14865 svnp = VTOSV(vp); 14866 14867 if (svnp == NULL) { 14868 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14869 "shadow node is NULL")); 14870 return (EINVAL); 14871 } 14872 14873 if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) { 14874 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14875 "shadow node name or dfh val == NULL")); 14876 return (EINVAL); 14877 } 14878 14879 error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp, 14880 (int)need_start_op); 14881 if (error != 0) { 14882 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14883 "nfs4_make_dotdot returned %d", error)); 14884 return (error); 14885 } 14886 if (!dvp) { 14887 NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: " 14888 "nfs4_make_dotdot returned a NULL dvp")); 14889 return (EIO); 14890 } 14891 if (dvp->v_type == VNON) 14892 dvp->v_type = VDIR; 14893 ASSERT(dvp->v_type == VDIR); 14894 if (VTOR4(vp)->r_flags & R4ISXATTR) { 14895 mutex_enter(&dvp->v_lock); 14896 dvp->v_flag |= V_XATTRDIR; 14897 
mutex_exit(&dvp->v_lock); 14898 } 14899 *dvpp = dvp; 14900 return (0); 14901 } 14902 14903 /* 14904 * Copy the (final) component name of vp to fnamep. maxlen is the maximum 14905 * length that fnamep can accept, including the trailing null. 14906 * Returns 0 if okay, returns an errno value if there was a problem. 14907 */ 14908 14909 int 14910 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen) 14911 { 14912 char *fn; 14913 int err = 0; 14914 servinfo4_t *svp; 14915 svnode_t *shvp; 14916 14917 /* 14918 * If the file being opened has VROOT set, then this is 14919 * a "file" mount. sv_name will not be interesting, so 14920 * go back to the servinfo4 to get the original mount 14921 * path and strip off all but the final edge. Otherwise 14922 * just return the name from the shadow vnode. 14923 */ 14924 14925 if (vp->v_flag & VROOT) { 14926 14927 svp = VTOMI4(vp)->mi_curr_serv; 14928 (void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0); 14929 14930 fn = strrchr(svp->sv_path, '/'); 14931 if (fn == NULL) 14932 err = EINVAL; 14933 else 14934 fn++; 14935 } else { 14936 shvp = VTOSV(vp); 14937 fn = fn_name(shvp->sv_name); 14938 } 14939 14940 if (err == 0) 14941 if (strlen(fn) < maxlen) 14942 (void) strcpy(fnamep, fn); 14943 else 14944 err = ENAMETOOLONG; 14945 14946 if (vp->v_flag & VROOT) 14947 nfs_rw_exit(&svp->sv_lock); 14948 else 14949 kmem_free(fn, MAXNAMELEN); 14950 14951 return (err); 14952 } 14953 14954 /* 14955 * Bookkeeping for a close that doesn't need to go over the wire. 14956 * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise 14957 * it is left at 1. 
14958 */ 14959 void 14960 nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp) 14961 { 14962 rnode4_t *rp; 14963 mntinfo4_t *mi; 14964 14965 mi = VTOMI4(vp); 14966 rp = VTOR4(vp); 14967 14968 NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: " 14969 "rp=%p osp=%p", (void *)rp, (void *)osp)); 14970 ASSERT(nfs_zone() == mi->mi_zone); 14971 ASSERT(mutex_owned(&osp->os_sync_lock)); 14972 ASSERT(*have_lockp); 14973 14974 if (!osp->os_valid || 14975 osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) { 14976 return; 14977 } 14978 14979 /* 14980 * This removes the reference obtained at OPEN; ie, 14981 * when the open stream structure was created. 14982 * 14983 * We don't have to worry about calling 'open_stream_rele' 14984 * since we our currently holding a reference to this 14985 * open stream which means the count can not go to 0 with 14986 * this decrement. 14987 */ 14988 ASSERT(osp->os_ref_count >= 2); 14989 osp->os_ref_count--; 14990 osp->os_valid = 0; 14991 mutex_exit(&osp->os_sync_lock); 14992 *have_lockp = 0; 14993 14994 nfs4_dec_state_ref_count(mi); 14995 } 14996 14997 /* 14998 * Close all remaining open streams on the rnode. These open streams 14999 * could be here because: 15000 * - The close attempted at either close or delmap failed 15001 * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE 15002 * - Someone did mknod on a regular file but never opened it 15003 */ 15004 int 15005 nfs4close_all(vnode_t *vp, cred_t *cr) 15006 { 15007 nfs4_open_stream_t *osp; 15008 int error; 15009 nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS }; 15010 rnode4_t *rp; 15011 15012 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 15013 15014 error = 0; 15015 rp = VTOR4(vp); 15016 15017 /* 15018 * At this point, all we know is that the last time 15019 * someone called vn_rele, the count was 1. Since then, 15020 * the vnode could have been re-activated. 
We want to 15021 * loop through the open streams and close each one, but 15022 * we have to be careful since once we release the rnode 15023 * hash bucket lock, someone else is free to come in and 15024 * re-activate the rnode and add new open streams. The 15025 * strategy is take the rnode hash bucket lock, verify that 15026 * the count is still 1, grab the open stream off the 15027 * head of the list and mark it invalid, then release the 15028 * rnode hash bucket lock and proceed with that open stream. 15029 * This is ok because nfs4close_one() will acquire the proper 15030 * open/create to close/destroy synchronization for open 15031 * streams, and will ensure that if someone has reopened 15032 * the open stream after we've dropped the hash bucket lock 15033 * then we'll just simply return without destroying the 15034 * open stream. 15035 * Repeat until the list is empty. 15036 */ 15037 15038 for (;;) { 15039 15040 /* make sure vnode hasn't been reactivated */ 15041 rw_enter(&rp->r_hashq->r_lock, RW_READER); 15042 mutex_enter(&vp->v_lock); 15043 if (vp->v_count > 1) { 15044 mutex_exit(&vp->v_lock); 15045 rw_exit(&rp->r_hashq->r_lock); 15046 break; 15047 } 15048 /* 15049 * Grabbing r_os_lock before releasing v_lock prevents 15050 * a window where the rnode/open stream could get 15051 * reactivated (and os_force_close set to 0) before we 15052 * had a chance to set os_force_close to 1. 
15053 */ 15054 mutex_enter(&rp->r_os_lock); 15055 mutex_exit(&vp->v_lock); 15056 15057 osp = list_head(&rp->r_open_streams); 15058 if (!osp) { 15059 /* nothing left to CLOSE OTW, so return */ 15060 mutex_exit(&rp->r_os_lock); 15061 rw_exit(&rp->r_hashq->r_lock); 15062 break; 15063 } 15064 15065 mutex_enter(&rp->r_statev4_lock); 15066 /* the file can't still be mem mapped */ 15067 ASSERT(rp->r_mapcnt == 0); 15068 if (rp->created_v4) 15069 rp->created_v4 = 0; 15070 mutex_exit(&rp->r_statev4_lock); 15071 15072 /* 15073 * Grab a ref on this open stream; nfs4close_one 15074 * will mark it as invalid 15075 */ 15076 mutex_enter(&osp->os_sync_lock); 15077 osp->os_ref_count++; 15078 osp->os_force_close = 1; 15079 mutex_exit(&osp->os_sync_lock); 15080 mutex_exit(&rp->r_os_lock); 15081 rw_exit(&rp->r_hashq->r_lock); 15082 15083 nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0); 15084 15085 /* Update error if it isn't already non-zero */ 15086 if (error == 0) { 15087 if (e.error) 15088 error = e.error; 15089 else if (e.stat) 15090 error = geterrno4(e.stat); 15091 } 15092 15093 #ifdef DEBUG 15094 nfs4close_all_cnt++; 15095 #endif 15096 /* Release the ref on osp acquired above. */ 15097 open_stream_rele(osp, rp); 15098 15099 /* Proceed to the next open stream, if any */ 15100 } 15101 return (error); 15102 } 15103 15104 /* 15105 * nfs4close_one - close one open stream for a file if needed. 15106 * 15107 * "close_type" indicates which close path this is: 15108 * CLOSE_NORM: close initiated via VOP_CLOSE. 15109 * CLOSE_DELMAP: close initiated via VOP_DELMAP. 15110 * CLOSE_FORCE: close initiated via VOP_INACTIVE. This path forces 15111 * the close and release of client state for this open stream 15112 * (unless someone else has the open stream open). 15113 * CLOSE_RESEND: indicates the request is a replay of an earlier request 15114 * (e.g., due to abort because of a signal). 15115 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN. 
15116 * 15117 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client 15118 * recovery. Instead, the caller is expected to deal with retries. 15119 * 15120 * The caller can either pass in the osp ('provided_osp') or not. 15121 * 15122 * 'access_bits' represents the access we are closing/downgrading. 15123 * 15124 * 'len', 'prot', and 'mmap_flags' are used for CLOSE_DELMAP. 'len' is the 15125 * number of bytes we are unmapping, 'maxprot' is the mmap protection, and 15126 * 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED). 15127 * 15128 * Errors are returned via the nfs4_error_t. 15129 */ 15130 void 15131 nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr, 15132 int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep, 15133 nfs4_close_type_t close_type, size_t len, uint_t maxprot, 15134 uint_t mmap_flags) 15135 { 15136 nfs4_open_owner_t *oop; 15137 nfs4_open_stream_t *osp = NULL; 15138 int retry = 0; 15139 int num_retries = NFS4_NUM_RECOV_RETRIES; 15140 rnode4_t *rp; 15141 mntinfo4_t *mi; 15142 nfs4_recov_state_t recov_state; 15143 cred_t *cred_otw = NULL; 15144 bool_t recovonly = FALSE; 15145 int isrecov; 15146 int force_close; 15147 int close_failed = 0; 15148 int did_dec_count = 0; 15149 int did_start_op = 0; 15150 int did_force_recovlock = 0; 15151 int did_start_seqid_sync = 0; 15152 int have_sync_lock = 0; 15153 15154 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 15155 15156 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, " 15157 "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x", 15158 (void *)vp, (void *)provided_osp, (void *)lrp, close_type, 15159 len, maxprot, mmap_flags, access_bits)); 15160 15161 nfs4_error_zinit(ep); 15162 rp = VTOR4(vp); 15163 mi = VTOMI4(vp); 15164 isrecov = (close_type == CLOSE_RESEND || 15165 close_type == CLOSE_AFTER_RESEND); 15166 15167 /* 15168 * First get the open owner. 
15169 */ 15170 if (!provided_osp) { 15171 oop = find_open_owner(cr, NFS4_PERM_CREATED, mi); 15172 } else { 15173 oop = provided_osp->os_open_owner; 15174 ASSERT(oop != NULL); 15175 open_owner_hold(oop); 15176 } 15177 15178 if (!oop) { 15179 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15180 "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, " 15181 "close type %d", (void *)rp, (void *)mi, (void *)cr, 15182 (void *)provided_osp, close_type)); 15183 ep->error = EIO; 15184 goto out; 15185 } 15186 15187 cred_otw = nfs4_get_otw_cred(cr, mi, oop); 15188 recov_retry: 15189 osp = NULL; 15190 close_failed = 0; 15191 force_close = (close_type == CLOSE_FORCE); 15192 retry = 0; 15193 did_start_op = 0; 15194 did_force_recovlock = 0; 15195 did_start_seqid_sync = 0; 15196 have_sync_lock = 0; 15197 recovonly = FALSE; 15198 recov_state.rs_flags = 0; 15199 recov_state.rs_num_retry_despite_err = 0; 15200 15201 /* 15202 * Second synchronize with recovery. 15203 */ 15204 if (!isrecov) { 15205 ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE, 15206 &recov_state, &recovonly); 15207 if (!ep->error) { 15208 did_start_op = 1; 15209 } else { 15210 close_failed = 1; 15211 /* 15212 * If we couldn't get start_fop, but have to 15213 * cleanup state, then at least acquire the 15214 * mi_recovlock so we can synchronize with 15215 * recovery. 15216 */ 15217 if (close_type == CLOSE_FORCE) { 15218 (void) nfs_rw_enter_sig(&mi->mi_recovlock, 15219 RW_READER, FALSE); 15220 did_force_recovlock = 1; 15221 } else 15222 goto out; 15223 } 15224 } 15225 15226 /* 15227 * We cannot attempt to get the open seqid sync if nfs4_start_fop 15228 * set 'recovonly' to TRUE since most likely this is due to 15229 * reovery being active (MI4_RECOV_ACTIV). If recovery is active, 15230 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us 15231 * to retry, causing us to loop until recovery finishes. 
Plus we 15232 * don't need protection over the open seqid since we're not going 15233 * OTW, hence don't need to use the seqid. 15234 */ 15235 if (recovonly == FALSE) { 15236 /* need to grab the open owner sync before 'os_sync_lock' */ 15237 ep->error = nfs4_start_open_seqid_sync(oop, mi); 15238 if (ep->error == EAGAIN) { 15239 ASSERT(!isrecov); 15240 if (did_start_op) 15241 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15242 &recov_state, TRUE); 15243 if (did_force_recovlock) 15244 nfs_rw_exit(&mi->mi_recovlock); 15245 goto recov_retry; 15246 } 15247 did_start_seqid_sync = 1; 15248 } 15249 15250 /* 15251 * Third get an open stream and acquire 'os_sync_lock' to 15252 * sychronize the opening/creating of an open stream with the 15253 * closing/destroying of an open stream. 15254 */ 15255 if (!provided_osp) { 15256 /* returns with 'os_sync_lock' held */ 15257 osp = find_open_stream(oop, rp); 15258 if (!osp) { 15259 ep->error = EIO; 15260 goto out; 15261 } 15262 } else { 15263 osp = provided_osp; 15264 open_stream_hold(osp); 15265 mutex_enter(&osp->os_sync_lock); 15266 } 15267 have_sync_lock = 1; 15268 15269 ASSERT(oop == osp->os_open_owner); 15270 15271 /* 15272 * Fourth, do any special pre-OTW CLOSE processing 15273 * based on the specific close type. 15274 */ 15275 if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) && 15276 !did_dec_count) { 15277 ASSERT(osp->os_open_ref_count > 0); 15278 osp->os_open_ref_count--; 15279 did_dec_count = 1; 15280 if (osp->os_open_ref_count == 0) 15281 osp->os_final_close = 1; 15282 } 15283 15284 if (close_type == CLOSE_FORCE) { 15285 /* see if somebody reopened the open stream. 
*/ 15286 if (!osp->os_force_close) { 15287 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, 15288 "nfs4close_one: skip CLOSE_FORCE as osp %p " 15289 "was reopened, vp %p", (void *)osp, (void *)vp)); 15290 ep->error = 0; 15291 ep->stat = NFS4_OK; 15292 goto out; 15293 } 15294 15295 if (!osp->os_final_close && !did_dec_count) { 15296 osp->os_open_ref_count--; 15297 did_dec_count = 1; 15298 } 15299 15300 /* 15301 * We can't depend on os_open_ref_count being 0 due to the 15302 * way executables are opened (VN_RELE to match a VOP_OPEN). 15303 */ 15304 #ifdef NOTYET 15305 ASSERT(osp->os_open_ref_count == 0); 15306 #endif 15307 if (osp->os_open_ref_count != 0) { 15308 NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, 15309 "nfs4close_one: should panic here on an " 15310 "ASSERT(osp->os_open_ref_count == 0). Ignoring " 15311 "since this is probably the exec problem.")); 15312 15313 osp->os_open_ref_count = 0; 15314 } 15315 15316 /* 15317 * There is the possibility that nfs4close_one() 15318 * for close_type == CLOSE_DELMAP couldn't find the 15319 * open stream, thus couldn't decrement its os_mapcnt; 15320 * therefore we can't use this ASSERT yet. 
15321 */ 15322 #ifdef NOTYET 15323 ASSERT(osp->os_mapcnt == 0); 15324 #endif 15325 osp->os_mapcnt = 0; 15326 } 15327 15328 if (close_type == CLOSE_DELMAP && !did_dec_count) { 15329 ASSERT(osp->os_mapcnt >= btopr(len)); 15330 15331 if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE)) 15332 osp->os_mmap_write -= btopr(len); 15333 if (maxprot & PROT_READ) 15334 osp->os_mmap_read -= btopr(len); 15335 if (maxprot & PROT_EXEC) 15336 osp->os_mmap_read -= btopr(len); 15337 /* mirror the PROT_NONE check in nfs4_addmap() */ 15338 if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) && 15339 !(maxprot & PROT_EXEC)) 15340 osp->os_mmap_read -= btopr(len); 15341 osp->os_mapcnt -= btopr(len); 15342 did_dec_count = 1; 15343 } 15344 15345 if (recovonly) { 15346 nfs4_lost_rqst_t lost_rqst; 15347 15348 /* request should not already be in recovery queue */ 15349 ASSERT(lrp == NULL); 15350 nfs4_error_init(ep, EINTR); 15351 nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop, 15352 osp, cred_otw, vp); 15353 mutex_exit(&osp->os_sync_lock); 15354 have_sync_lock = 0; 15355 (void) nfs4_start_recovery(ep, mi, vp, NULL, NULL, 15356 lost_rqst.lr_op == OP_CLOSE ? 15357 &lost_rqst : NULL, OP_CLOSE, NULL, NULL, NULL); 15358 close_failed = 1; 15359 force_close = 0; 15360 goto close_cleanup; 15361 } 15362 15363 /* 15364 * If a previous OTW call got NFS4ERR_BAD_SEQID, then 15365 * we stopped operating on the open owner's <old oo_name, old seqid> 15366 * space, which means we stopped operating on the open stream 15367 * too. So don't go OTW (as the seqid is likely bad, and the 15368 * stateid could be stale, potentially triggering a false 15369 * setclientid), and just clean up the client's internal state. 
15370 */ 15371 if (osp->os_orig_oo_name != oop->oo_name) { 15372 NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug, 15373 (CE_NOTE, "nfs4close_one: skip OTW close for osp %p " 15374 "oop %p due to bad seqid (orig oo_name %" PRIx64 " current " 15375 "oo_name %" PRIx64")", 15376 (void *)osp, (void *)oop, osp->os_orig_oo_name, 15377 oop->oo_name)); 15378 close_failed = 1; 15379 } 15380 15381 /* If the file failed recovery, just quit. */ 15382 mutex_enter(&rp->r_statelock); 15383 if (rp->r_flags & R4RECOVERR) { 15384 close_failed = 1; 15385 } 15386 mutex_exit(&rp->r_statelock); 15387 15388 /* 15389 * If the force close path failed to obtain start_fop 15390 * then skip the OTW close and just remove the state. 15391 */ 15392 if (close_failed) 15393 goto close_cleanup; 15394 15395 /* 15396 * Fifth, check to see if there are still mapped pages or other 15397 * opens using this open stream. If there are then we can't 15398 * close yet but we can see if an OPEN_DOWNGRADE is necessary. 15399 */ 15400 if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) { 15401 nfs4_lost_rqst_t new_lost_rqst; 15402 bool_t needrecov = FALSE; 15403 cred_t *odg_cred_otw = NULL; 15404 seqid4 open_dg_seqid = 0; 15405 15406 if (osp->os_delegation) { 15407 /* 15408 * If this open stream was never OPENed OTW then we 15409 * surely can't DOWNGRADE it (especially since the 15410 * osp->open_stateid is really a delegation stateid 15411 * when os_delegation is 1). 
15412 */ 15413 if (access_bits & FREAD) 15414 osp->os_share_acc_read--; 15415 if (access_bits & FWRITE) 15416 osp->os_share_acc_write--; 15417 osp->os_share_deny_none--; 15418 nfs4_error_zinit(ep); 15419 goto out; 15420 } 15421 nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr, 15422 lrp, ep, &odg_cred_otw, &open_dg_seqid); 15423 needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp); 15424 if (needrecov && !isrecov) { 15425 bool_t abort; 15426 nfs4_bseqid_entry_t *bsep = NULL; 15427 15428 if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) 15429 bsep = nfs4_create_bseqid_entry(oop, NULL, 15430 vp, 0, 15431 lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG, 15432 open_dg_seqid); 15433 15434 nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst, 15435 oop, osp, odg_cred_otw, vp, access_bits, 0); 15436 mutex_exit(&osp->os_sync_lock); 15437 have_sync_lock = 0; 15438 abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL, 15439 new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ? 15440 &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE, 15441 bsep, NULL, NULL); 15442 if (odg_cred_otw) 15443 crfree(odg_cred_otw); 15444 if (bsep) 15445 kmem_free(bsep, sizeof (*bsep)); 15446 15447 if (abort == TRUE) 15448 goto out; 15449 15450 if (did_start_seqid_sync) { 15451 nfs4_end_open_seqid_sync(oop); 15452 did_start_seqid_sync = 0; 15453 } 15454 open_stream_rele(osp, rp); 15455 15456 if (did_start_op) 15457 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15458 &recov_state, FALSE); 15459 if (did_force_recovlock) 15460 nfs_rw_exit(&mi->mi_recovlock); 15461 15462 goto recov_retry; 15463 } else { 15464 if (odg_cred_otw) 15465 crfree(odg_cred_otw); 15466 } 15467 goto out; 15468 } 15469 15470 /* 15471 * If this open stream was created as the results of an open 15472 * while holding a delegation, then just release it; no need 15473 * to do an OTW close. Otherwise do a "normal" OTW close. 
15474 */ 15475 if (osp->os_delegation) { 15476 nfs4close_notw(vp, osp, &have_sync_lock); 15477 nfs4_error_zinit(ep); 15478 goto out; 15479 } 15480 15481 /* 15482 * If this stream is not valid, we're done. 15483 */ 15484 if (!osp->os_valid) { 15485 nfs4_error_zinit(ep); 15486 goto out; 15487 } 15488 15489 /* 15490 * Last open or mmap ref has vanished, need to do an OTW close. 15491 * First check to see if a close is still necessary. 15492 */ 15493 if (osp->os_failed_reopen) { 15494 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15495 "don't close OTW osp %p since reopen failed.", 15496 (void *)osp)); 15497 /* 15498 * Reopen of the open stream failed, hence the 15499 * stateid of the open stream is invalid/stale, and 15500 * sending this OTW would incorrectly cause another 15501 * round of recovery. In this case, we need to set 15502 * the 'os_valid' bit to 0 so another thread doesn't 15503 * come in and re-open this open stream before 15504 * this "closing" thread cleans up state (decrementing 15505 * the nfs4_server_t's state_ref_count and decrementing 15506 * the os_ref_count). 15507 */ 15508 osp->os_valid = 0; 15509 /* 15510 * This removes the reference obtained at OPEN; ie, 15511 * when the open stream structure was created. 15512 * 15513 * We don't have to worry about calling 'open_stream_rele' 15514 * since we our currently holding a reference to this 15515 * open stream which means the count can not go to 0 with 15516 * this decrement. 15517 */ 15518 ASSERT(osp->os_ref_count >= 2); 15519 osp->os_ref_count--; 15520 nfs4_error_zinit(ep); 15521 close_failed = 0; 15522 goto close_cleanup; 15523 } 15524 15525 ASSERT(osp->os_ref_count > 1); 15526 15527 /* 15528 * Sixth, try the CLOSE OTW. 
15529 */ 15530 nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync, 15531 close_type, ep, &have_sync_lock); 15532 15533 if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) { 15534 /* 15535 * Let the recovery thread be responsible for 15536 * removing the state for CLOSE. 15537 */ 15538 close_failed = 1; 15539 force_close = 0; 15540 retry = 0; 15541 } 15542 15543 /* See if we need to retry with a different cred */ 15544 if ((ep->error == EACCES || 15545 (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) && 15546 cred_otw != cr) { 15547 crfree(cred_otw); 15548 cred_otw = cr; 15549 crhold(cred_otw); 15550 retry = 1; 15551 } 15552 15553 if (ep->error || ep->stat) 15554 close_failed = 1; 15555 15556 if (retry && !isrecov && num_retries-- > 0) { 15557 if (have_sync_lock) { 15558 mutex_exit(&osp->os_sync_lock); 15559 have_sync_lock = 0; 15560 } 15561 if (did_start_seqid_sync) { 15562 nfs4_end_open_seqid_sync(oop); 15563 did_start_seqid_sync = 0; 15564 } 15565 open_stream_rele(osp, rp); 15566 15567 if (did_start_op) 15568 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, 15569 &recov_state, FALSE); 15570 if (did_force_recovlock) 15571 nfs_rw_exit(&mi->mi_recovlock); 15572 NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, 15573 "nfs4close_one: need to retry the close " 15574 "operation")); 15575 goto recov_retry; 15576 } 15577 close_cleanup: 15578 /* 15579 * Seventh and lastly, process our results. 15580 */ 15581 if (close_failed && force_close) { 15582 /* 15583 * It's ok to drop and regrab the 'os_sync_lock' since 15584 * nfs4close_notw() will recheck to make sure the 15585 * "close"/removal of state should happen. 15586 */ 15587 if (!have_sync_lock) { 15588 mutex_enter(&osp->os_sync_lock); 15589 have_sync_lock = 1; 15590 } 15591 /* 15592 * This is last call, remove the ref on the open 15593 * stream created by open and clean everything up. 
15594 */ 15595 osp->os_pending_close = 0; 15596 nfs4close_notw(vp, osp, &have_sync_lock); 15597 nfs4_error_zinit(ep); 15598 } 15599 15600 if (!close_failed) { 15601 if (have_sync_lock) { 15602 osp->os_pending_close = 0; 15603 mutex_exit(&osp->os_sync_lock); 15604 have_sync_lock = 0; 15605 } else { 15606 mutex_enter(&osp->os_sync_lock); 15607 osp->os_pending_close = 0; 15608 mutex_exit(&osp->os_sync_lock); 15609 } 15610 if (did_start_op && recov_state.rs_sp != NULL) { 15611 mutex_enter(&recov_state.rs_sp->s_lock); 15612 nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi); 15613 mutex_exit(&recov_state.rs_sp->s_lock); 15614 } else { 15615 nfs4_dec_state_ref_count(mi); 15616 } 15617 nfs4_error_zinit(ep); 15618 } 15619 15620 out: 15621 if (have_sync_lock) 15622 mutex_exit(&osp->os_sync_lock); 15623 if (did_start_op) 15624 nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state, 15625 recovonly ? TRUE : FALSE); 15626 if (did_force_recovlock) 15627 nfs_rw_exit(&mi->mi_recovlock); 15628 if (cred_otw) 15629 crfree(cred_otw); 15630 if (osp) 15631 open_stream_rele(osp, rp); 15632 if (oop) { 15633 if (did_start_seqid_sync) 15634 nfs4_end_open_seqid_sync(oop); 15635 open_owner_rele(oop); 15636 } 15637 } 15638 15639 /* 15640 * Convert information returned by the server in the LOCK4denied 15641 * structure to the form required by fcntl. 15642 */ 15643 static void 15644 denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args) 15645 { 15646 nfs4_lo_name_t *lo; 15647 15648 #ifdef DEBUG 15649 if (denied_to_flk_debug) { 15650 lockt_denied_debug = lockt_denied; 15651 debug_enter("lockt_denied"); 15652 } 15653 #endif 15654 15655 flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK; 15656 flk->l_whence = 0; /* aka SEEK_SET */ 15657 flk->l_start = lockt_denied->offset; 15658 flk->l_len = lockt_denied->length; 15659 15660 /* 15661 * If the blocking clientid matches our client id, then we can 15662 * interpret the lockowner (since we built it). 
If not, then 15663 * fabricate a sysid and pid. Note that the l_sysid field 15664 * in *flk already has the local sysid. 15665 */ 15666 15667 if (lockt_denied->owner.clientid == lockt_args->owner.clientid) { 15668 15669 if (lockt_denied->owner.owner_len == sizeof (*lo)) { 15670 lo = (nfs4_lo_name_t *) 15671 lockt_denied->owner.owner_val; 15672 15673 flk->l_pid = lo->ln_pid; 15674 } else { 15675 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 15676 "denied_to_flk: bad lock owner length\n")); 15677 15678 flk->l_pid = lo_to_pid(&lockt_denied->owner); 15679 } 15680 } else { 15681 NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, 15682 "denied_to_flk: foreign clientid\n")); 15683 15684 /* 15685 * Construct a new sysid which should be different from 15686 * sysids of other systems. 15687 */ 15688 15689 flk->l_sysid++; 15690 flk->l_pid = lo_to_pid(&lockt_denied->owner); 15691 } 15692 } 15693 15694 static pid_t 15695 lo_to_pid(lock_owner4 *lop) 15696 { 15697 pid_t pid = 0; 15698 uchar_t *cp; 15699 int i; 15700 15701 cp = (uchar_t *)&lop->clientid; 15702 15703 for (i = 0; i < sizeof (lop->clientid); i++) 15704 pid += (pid_t)*cp++; 15705 15706 cp = (uchar_t *)lop->owner_val; 15707 15708 for (i = 0; i < lop->owner_len; i++) 15709 pid += (pid_t)*cp++; 15710 15711 return (pid); 15712 } 15713 15714 /* 15715 * Given a lock pointer, returns the length of that lock. 15716 * "end" is the last locked offset the "l_len" covers from 15717 * the start of the lock. 15718 */ 15719 static off64_t 15720 lock_to_end(flock64_t *lock) 15721 { 15722 off64_t lock_end; 15723 15724 if (lock->l_len == 0) 15725 lock_end = (off64_t)MAXEND; 15726 else 15727 lock_end = lock->l_start + lock->l_len - 1; 15728 15729 return (lock_end); 15730 } 15731 15732 /* 15733 * Given the end of a lock, it will return you the length "l_len" for that lock. 
15734 */ 15735 static off64_t 15736 end_to_len(off64_t start, off64_t end) 15737 { 15738 off64_t lock_len; 15739 15740 ASSERT(end >= start); 15741 if (end == MAXEND) 15742 lock_len = 0; 15743 else 15744 lock_len = end - start + 1; 15745 15746 return (lock_len); 15747 } 15748 15749 /* 15750 * On given end for a lock it determines if it is the last locked offset 15751 * or not, if so keeps it as is, else adds one to return the length for 15752 * valid start. 15753 */ 15754 static off64_t 15755 start_check(off64_t x) 15756 { 15757 if (x == MAXEND) 15758 return (x); 15759 else 15760 return (x + 1); 15761 } 15762 15763 /* 15764 * See if these two locks overlap, and if so return 1; 15765 * otherwise, return 0. 15766 */ 15767 static int 15768 locks_intersect(flock64_t *llfp, flock64_t *curfp) 15769 { 15770 off64_t llfp_end, curfp_end; 15771 15772 llfp_end = lock_to_end(llfp); 15773 curfp_end = lock_to_end(curfp); 15774 15775 if (((llfp_end >= curfp->l_start) && 15776 (llfp->l_start <= curfp->l_start)) || 15777 ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start))) 15778 return (1); 15779 return (0); 15780 } 15781 15782 /* 15783 * Determine what the intersecting lock region is, and add that to the 15784 * 'nl_llpp' locklist in increasing order (by l_start). 
15785 */ 15786 static void 15787 nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp, 15788 locklist_t **nl_llpp, vnode_t *vp) 15789 { 15790 locklist_t *intersect_llp, *tmp_fllp, *cur_fllp; 15791 off64_t lost_flp_end, local_flp_end, len, start; 15792 15793 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:")); 15794 15795 if (!locks_intersect(lost_flp, local_flp)) 15796 return; 15797 15798 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: " 15799 "locks intersect")); 15800 15801 lost_flp_end = lock_to_end(lost_flp); 15802 local_flp_end = lock_to_end(local_flp); 15803 15804 /* Find the starting point of the intersecting region */ 15805 if (local_flp->l_start > lost_flp->l_start) 15806 start = local_flp->l_start; 15807 else 15808 start = lost_flp->l_start; 15809 15810 /* Find the lenght of the intersecting region */ 15811 if (lost_flp_end < local_flp_end) 15812 len = end_to_len(start, lost_flp_end); 15813 else 15814 len = end_to_len(start, local_flp_end); 15815 15816 /* 15817 * Prepare the flock structure for the intersection found and insert 15818 * it into the new list in increasing l_start order. This list contains 15819 * intersections of locks registered by the client with the local host 15820 * and the lost lock. 15821 * The lock type of this lock is the same as that of the local_flp. 
15822 */ 15823 intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP); 15824 intersect_llp->ll_flock.l_start = start; 15825 intersect_llp->ll_flock.l_len = len; 15826 intersect_llp->ll_flock.l_type = local_flp->l_type; 15827 intersect_llp->ll_flock.l_pid = local_flp->l_pid; 15828 intersect_llp->ll_flock.l_sysid = local_flp->l_sysid; 15829 intersect_llp->ll_flock.l_whence = 0; /* aka SEEK_SET */ 15830 intersect_llp->ll_vp = vp; 15831 15832 tmp_fllp = *nl_llpp; 15833 cur_fllp = NULL; 15834 while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start < 15835 intersect_llp->ll_flock.l_start) { 15836 cur_fllp = tmp_fllp; 15837 tmp_fllp = tmp_fllp->ll_next; 15838 } 15839 if (cur_fllp == NULL) { 15840 /* first on the list */ 15841 intersect_llp->ll_next = *nl_llpp; 15842 *nl_llpp = intersect_llp; 15843 } else { 15844 intersect_llp->ll_next = cur_fllp->ll_next; 15845 cur_fllp->ll_next = intersect_llp; 15846 } 15847 15848 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: " 15849 "created lock region: start %"PRIx64" end %"PRIx64" : %s\n", 15850 intersect_llp->ll_flock.l_start, 15851 intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len, 15852 intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE")); 15853 } 15854 15855 /* 15856 * Our local locking current state is potentially different than 15857 * what the NFSv4 server thinks we have due to a lost lock that was 15858 * resent and then received. We need to reset our "NFSv4" locking 15859 * state to match the current local locking state for this pid since 15860 * that is what the user/application sees as what the world is. 15861 * 15862 * We cannot afford to drop the open/lock seqid sync since then we can 15863 * get confused about what the current local locking state "is" versus 15864 * "was". 15865 * 15866 * If we are unable to fix up the locks, we send SIGLOST to the affected 15867 * process. 
This is not done if the filesystem has been forcibly 15868 * unmounted, in case the process has already exited and a new process 15869 * exists with the same pid. 15870 */ 15871 static void 15872 nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr, 15873 nfs4_lock_owner_t *lop) 15874 { 15875 locklist_t *locks, *llp, *ri_llp, *tmp_llp; 15876 mntinfo4_t *mi = VTOMI4(vp); 15877 const int cmd = F_SETLK; 15878 off64_t cur_start, llp_ll_flock_end, lost_flp_end; 15879 flock64_t ul_fl; 15880 15881 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15882 "nfs4_reinstitute_local_lock_state")); 15883 15884 /* 15885 * Find active locks for this vp from the local locking code. 15886 * Scan through this list and find out the locks that intersect with 15887 * the lost lock. Once we find the lock that intersects, add the 15888 * intersection area as a new lock to a new list "ri_llp". The lock 15889 * type of the intersection region lock added to ri_llp is the same 15890 * as that found in the active lock list, "list". The intersecting 15891 * region locks are added to ri_llp in increasing l_start order. 15892 */ 15893 ASSERT(nfs_zone() == mi->mi_zone); 15894 15895 locks = flk_active_locks_for_vp(vp); 15896 ri_llp = NULL; 15897 15898 for (llp = locks; llp != NULL; llp = llp->ll_next) { 15899 ASSERT(llp->ll_vp == vp); 15900 /* 15901 * Pick locks that belong to this pid/lockowner 15902 */ 15903 if (llp->ll_flock.l_pid != lost_flp->l_pid) 15904 continue; 15905 15906 nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp); 15907 } 15908 15909 /* 15910 * Now we have the list of intersections with the lost lock. These are 15911 * the locks that were/are active before the server replied to the 15912 * last/lost lock. Issue these locks to the server here. Playing these 15913 * locks to the server will re-establish aur current local locking state 15914 * with the v4 server. 15915 * If we get an error, send SIGLOST to the application for that lock. 
15916 */ 15917 15918 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) { 15919 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15920 "nfs4_reinstitute_local_lock_state: need to issue " 15921 "flock: [%"PRIx64" - %"PRIx64"] : %s", 15922 llp->ll_flock.l_start, 15923 llp->ll_flock.l_start + llp->ll_flock.l_len, 15924 llp->ll_flock.l_type == F_RDLCK ? "READ" : 15925 llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID")); 15926 /* 15927 * No need to relock what we already have 15928 */ 15929 if (llp->ll_flock.l_type == lost_flp->l_type) 15930 continue; 15931 15932 push_reinstate(vp, cmd, &llp->ll_flock, cr, lop); 15933 } 15934 15935 /* 15936 * Now keeping the start of the lost lock as our reference parse the 15937 * newly created ri_llp locklist to find the ranges that we have locked 15938 * with the v4 server but not in the current local locking. We need 15939 * to unlock these ranges. 15940 * These ranges can also be reffered to as those ranges, where the lost 15941 * lock does not overlap with the locks in the ri_llp but are locked 15942 * since the server replied to the lost lock. 
15943 */ 15944 cur_start = lost_flp->l_start; 15945 lost_flp_end = lock_to_end(lost_flp); 15946 15947 ul_fl.l_type = F_UNLCK; 15948 ul_fl.l_whence = 0; /* aka SEEK_SET */ 15949 ul_fl.l_sysid = lost_flp->l_sysid; 15950 ul_fl.l_pid = lost_flp->l_pid; 15951 15952 for (llp = ri_llp; llp != NULL; llp = llp->ll_next) { 15953 llp_ll_flock_end = lock_to_end(&llp->ll_flock); 15954 15955 if (llp->ll_flock.l_start <= cur_start) { 15956 cur_start = start_check(llp_ll_flock_end); 15957 continue; 15958 } 15959 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15960 "nfs4_reinstitute_local_lock_state: " 15961 "UNLOCK [%"PRIx64" - %"PRIx64"]", 15962 cur_start, llp->ll_flock.l_start)); 15963 15964 ul_fl.l_start = cur_start; 15965 ul_fl.l_len = end_to_len(cur_start, 15966 (llp->ll_flock.l_start - 1)); 15967 15968 push_reinstate(vp, cmd, &ul_fl, cr, lop); 15969 cur_start = start_check(llp_ll_flock_end); 15970 } 15971 15972 /* 15973 * In the case where the lost lock ends after all intersecting locks, 15974 * unlock the last part of the lost lock range. 15975 */ 15976 if (cur_start != start_check(lost_flp_end)) { 15977 NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, 15978 "nfs4_reinstitute_local_lock_state: UNLOCK end of the " 15979 "lost lock region [%"PRIx64" - %"PRIx64"]", 15980 cur_start, lost_flp->l_start + lost_flp->l_len)); 15981 15982 ul_fl.l_start = cur_start; 15983 /* 15984 * Is it an to-EOF lock? 
if so unlock till the end 15985 */ 15986 if (lost_flp->l_len == 0) 15987 ul_fl.l_len = 0; 15988 else 15989 ul_fl.l_len = start_check(lost_flp_end) - cur_start; 15990 15991 push_reinstate(vp, cmd, &ul_fl, cr, lop); 15992 } 15993 15994 if (locks != NULL) 15995 flk_free_locklist(locks); 15996 15997 /* Free up our newly created locklist */ 15998 for (llp = ri_llp; llp != NULL; ) { 15999 tmp_llp = llp->ll_next; 16000 kmem_free(llp, sizeof (locklist_t)); 16001 llp = tmp_llp; 16002 } 16003 16004 /* 16005 * Now return back to the original calling nfs4frlock() 16006 * and let us naturally drop our seqid syncs. 16007 */ 16008 } 16009 16010 /* 16011 * Create a lost state record for the given lock reinstantiation request 16012 * and push it onto the lost state queue. 16013 */ 16014 static void 16015 push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr, 16016 nfs4_lock_owner_t *lop) 16017 { 16018 nfs4_lost_rqst_t req; 16019 nfs_lock_type4 locktype; 16020 nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS }; 16021 16022 ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone); 16023 16024 locktype = flk_to_locktype(cmd, flk->l_type); 16025 nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype, 16026 NULL, NULL, lop, flk, &req, cr, vp); 16027 (void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL, 16028 (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ? 16029 &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK, 16030 NULL, NULL, NULL); 16031 } 16032