xref: /titanic_50/usr/src/uts/common/fs/nfs/nfs4_vnops.c (revision 7014882c6a3672fd0e5d60200af8643ae53c5928)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
27  */
28 
29 /*
30  *	Copyright 1983,1984,1985,1986,1987,1988,1989 AT&T.
31  *	All Rights Reserved
32  */
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/systm.h>
37 #include <sys/cred.h>
38 #include <sys/time.h>
39 #include <sys/vnode.h>
40 #include <sys/vfs.h>
41 #include <sys/vfs_opreg.h>
42 #include <sys/file.h>
43 #include <sys/filio.h>
44 #include <sys/uio.h>
45 #include <sys/buf.h>
46 #include <sys/mman.h>
47 #include <sys/pathname.h>
48 #include <sys/dirent.h>
49 #include <sys/debug.h>
50 #include <sys/vmsystm.h>
51 #include <sys/fcntl.h>
52 #include <sys/flock.h>
53 #include <sys/swap.h>
54 #include <sys/errno.h>
55 #include <sys/strsubr.h>
56 #include <sys/sysmacros.h>
57 #include <sys/kmem.h>
58 #include <sys/cmn_err.h>
59 #include <sys/pathconf.h>
60 #include <sys/utsname.h>
61 #include <sys/dnlc.h>
62 #include <sys/acl.h>
63 #include <sys/systeminfo.h>
64 #include <sys/policy.h>
65 #include <sys/sdt.h>
66 #include <sys/list.h>
67 #include <sys/stat.h>
68 #include <sys/zone.h>
69 
70 #include <rpc/types.h>
71 #include <rpc/auth.h>
72 #include <rpc/clnt.h>
73 
74 #include <nfs/nfs.h>
75 #include <nfs/nfs_clnt.h>
76 #include <nfs/nfs_acl.h>
77 #include <nfs/lm.h>
78 #include <nfs/nfs4.h>
79 #include <nfs/nfs4_kprot.h>
80 #include <nfs/rnode4.h>
81 #include <nfs/nfs4_clnt.h>
82 
83 #include <vm/hat.h>
84 #include <vm/as.h>
85 #include <vm/page.h>
86 #include <vm/pvn.h>
87 #include <vm/seg.h>
88 #include <vm/seg_map.h>
89 #include <vm/seg_kpm.h>
90 #include <vm/seg_vn.h>
91 
92 #include <fs/fs_subr.h>
93 
94 #include <sys/ddi.h>
95 #include <sys/int_fmtio.h>
96 #include <sys/fs/autofs.h>
97 
/*
 * Bundle of directory attribute information; a pointer to one of these is
 * the last argument of nfs4_update_dircaches().
 *
 * NOTE(review): the per-field descriptions below are inferred from the
 * field names and types only -- confirm against the callers.
 */
typedef struct {
	nfs4_ga_res_t	*di_garp;	/* presumably GETATTR results */
	cred_t		*di_cred;	/* presumably cred used for the call */
	hrtime_t	di_time_call;	/* presumably time the call was made */
} dirattr_info_t;
103 
/*
 * Identifies which ACL operation is being performed.  A value of this type
 * is passed, together with a uint_t mask, to nfs4_is_acl_mask_valid().
 */
typedef enum nfs4_acl_op {
	NFS4_ACL_GET,
	NFS4_ACL_SET
} nfs4_acl_op_t;
108 
109 static struct lm_sysid *nfs4_find_sysid(mntinfo4_t *mi);
110 
111 static void	nfs4_update_dircaches(change_info4 *, vnode_t *, vnode_t *,
112 			char *, dirattr_info_t *);
113 
114 static void	nfs4close_otw(rnode4_t *, cred_t *, nfs4_open_owner_t *,
115 		    nfs4_open_stream_t *, int *, int *, nfs4_close_type_t,
116 		    nfs4_error_t *, int *);
117 static int	nfs4_rdwrlbn(vnode_t *, page_t *, u_offset_t, size_t, int,
118 			cred_t *);
119 static int	nfs4write(vnode_t *, caddr_t, u_offset_t, int, cred_t *,
120 			stable_how4 *);
121 static int	nfs4read(vnode_t *, caddr_t, offset_t, int, size_t *,
122 			cred_t *, bool_t, struct uio *);
123 static int	nfs4setattr(vnode_t *, struct vattr *, int, cred_t *,
124 			vsecattr_t *);
125 static int	nfs4openattr(vnode_t *, vnode_t **, int, cred_t *);
126 static int	nfs4lookup(vnode_t *, char *, vnode_t **, cred_t *, int);
127 static int	nfs4lookup_xattr(vnode_t *, char *, vnode_t **, int, cred_t *);
128 static int	nfs4lookupvalidate_otw(vnode_t *, char *, vnode_t **, cred_t *);
129 static int	nfs4lookupnew_otw(vnode_t *, char *, vnode_t **, cred_t *);
130 static int	nfs4mknod(vnode_t *, char *, struct vattr *, enum vcexcl,
131 			int, vnode_t **, cred_t *);
132 static int	nfs4open_otw(vnode_t *, char *, struct vattr *, vnode_t **,
133 			cred_t *, int, int, enum createmode4, int);
134 static int	nfs4rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
135 			caller_context_t *);
136 static int	nfs4rename_persistent_fh(vnode_t *, char *, vnode_t *,
137 			vnode_t *, char *, cred_t *, nfsstat4 *);
138 static int	nfs4rename_volatile_fh(vnode_t *, char *, vnode_t *,
139 			vnode_t *, char *, cred_t *, nfsstat4 *);
140 static int	do_nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
141 static void	nfs4readdir(vnode_t *, rddir4_cache *, cred_t *);
142 static int	nfs4_bio(struct buf *, stable_how4 *, cred_t *, bool_t);
143 static int	nfs4_getapage(vnode_t *, u_offset_t, size_t, uint_t *,
144 			page_t *[], size_t, struct seg *, caddr_t,
145 			enum seg_rw, cred_t *);
146 static void	nfs4_readahead(vnode_t *, u_offset_t, caddr_t, struct seg *,
147 			cred_t *);
148 static int	nfs4_sync_putapage(vnode_t *, page_t *, u_offset_t, size_t,
149 			int, cred_t *);
150 static int	nfs4_sync_pageio(vnode_t *, page_t *, u_offset_t, size_t,
151 			int, cred_t *);
152 static int	nfs4_commit(vnode_t *, offset4, count4, cred_t *);
153 static void	nfs4_set_mod(vnode_t *);
154 static void	nfs4_get_commit(vnode_t *);
155 static void	nfs4_get_commit_range(vnode_t *, u_offset_t, size_t);
156 static int	nfs4_putpage_commit(vnode_t *, offset_t, size_t, cred_t *);
157 static int	nfs4_commit_vp(vnode_t *, u_offset_t, size_t, cred_t *, int);
158 static int	nfs4_sync_commit(vnode_t *, page_t *, offset3, count3,
159 			cred_t *);
160 static void	do_nfs4_async_commit(vnode_t *, page_t *, offset3, count3,
161 			cred_t *);
162 static int	nfs4_update_attrcache(nfsstat4, nfs4_ga_res_t *,
163 			hrtime_t, vnode_t *, cred_t *);
164 static int	nfs4_open_non_reg_file(vnode_t **, int, cred_t *);
165 static int	nfs4_safelock(vnode_t *, const struct flock64 *, cred_t *);
166 static void	nfs4_register_lock_locally(vnode_t *, struct flock64 *, int,
167 			u_offset_t);
168 static int 	nfs4_lockrelease(vnode_t *, int, offset_t, cred_t *);
169 static int	nfs4_block_and_wait(clock_t *, rnode4_t *);
170 static cred_t  *state_to_cred(nfs4_open_stream_t *);
171 static void	denied_to_flk(LOCK4denied *, flock64_t *, LOCKT4args *);
172 static pid_t	lo_to_pid(lock_owner4 *);
173 static void	nfs4_reinstitute_local_lock_state(vnode_t *, flock64_t *,
174 			cred_t *, nfs4_lock_owner_t *);
175 static void	push_reinstate(vnode_t *, int, flock64_t *, cred_t *,
176 			nfs4_lock_owner_t *);
177 static int 	open_and_get_osp(vnode_t *, cred_t *, nfs4_open_stream_t **);
178 static void	nfs4_delmap_callback(struct as *, void *, uint_t);
179 static void	nfs4_free_delmapcall(nfs4_delmapcall_t *);
180 static nfs4_delmapcall_t	*nfs4_init_delmapcall();
181 static int	nfs4_find_and_delete_delmapcall(rnode4_t *, int *);
182 static int	nfs4_is_acl_mask_valid(uint_t, nfs4_acl_op_t);
183 static int	nfs4_create_getsecattr_return(vsecattr_t *, vsecattr_t *,
184 			uid_t, gid_t, int);
185 
186 /*
187  * Routines that implement the setting of v4 args for the misc. ops
188  */
189 static void	nfs4args_lock_free(nfs_argop4 *);
190 static void	nfs4args_lockt_free(nfs_argop4 *);
191 static void	nfs4args_setattr(nfs_argop4 *, vattr_t *, vsecattr_t *,
192 			int, rnode4_t *, cred_t *, bitmap4, int *,
193 			nfs4_stateid_types_t *);
194 static void	nfs4args_setattr_free(nfs_argop4 *);
195 static int	nfs4args_verify(nfs_argop4 *, vattr_t *, enum nfs_opnum4,
196 			bitmap4);
197 static void	nfs4args_verify_free(nfs_argop4 *);
198 static void	nfs4args_write(nfs_argop4 *, stable_how4, rnode4_t *, cred_t *,
199 			WRITE4args **, nfs4_stateid_types_t *);
200 
201 /*
202  * These are the vnode ops functions that implement the vnode interface to
203  * the networked file system.  See more comments below at nfs4_vnodeops.
204  */
205 static int	nfs4_open(vnode_t **, int, cred_t *, caller_context_t *);
206 static int	nfs4_close(vnode_t *, int, int, offset_t, cred_t *,
207 			caller_context_t *);
208 static int	nfs4_read(vnode_t *, struct uio *, int, cred_t *,
209 			caller_context_t *);
210 static int	nfs4_write(vnode_t *, struct uio *, int, cred_t *,
211 			caller_context_t *);
212 static int	nfs4_ioctl(vnode_t *, int, intptr_t, int, cred_t *, int *,
213 			caller_context_t *);
214 static int	nfs4_setattr(vnode_t *, struct vattr *, int, cred_t *,
215 			caller_context_t *);
216 static int	nfs4_access(vnode_t *, int, int, cred_t *, caller_context_t *);
217 static int	nfs4_readlink(vnode_t *, struct uio *, cred_t *,
218 			caller_context_t *);
219 static int	nfs4_fsync(vnode_t *, int, cred_t *, caller_context_t *);
220 static int	nfs4_create(vnode_t *, char *, struct vattr *, enum vcexcl,
221 			int, vnode_t **, cred_t *, int, caller_context_t *,
222 			vsecattr_t *);
223 static int	nfs4_remove(vnode_t *, char *, cred_t *, caller_context_t *,
224 			int);
225 static int	nfs4_link(vnode_t *, vnode_t *, char *, cred_t *,
226 			caller_context_t *, int);
227 static int	nfs4_rename(vnode_t *, char *, vnode_t *, char *, cred_t *,
228 			caller_context_t *, int);
229 static int	nfs4_mkdir(vnode_t *, char *, struct vattr *, vnode_t **,
230 			cred_t *, caller_context_t *, int, vsecattr_t *);
231 static int	nfs4_rmdir(vnode_t *, char *, vnode_t *, cred_t *,
232 			caller_context_t *, int);
233 static int	nfs4_symlink(vnode_t *, char *, struct vattr *, char *,
234 			cred_t *, caller_context_t *, int);
235 static int	nfs4_readdir(vnode_t *, struct uio *, cred_t *, int *,
236 			caller_context_t *, int);
237 static int	nfs4_seek(vnode_t *, offset_t, offset_t *, caller_context_t *);
238 static int	nfs4_getpage(vnode_t *, offset_t, size_t, uint_t *,
239 			page_t *[], size_t, struct seg *, caddr_t,
240 			enum seg_rw, cred_t *, caller_context_t *);
241 static int	nfs4_putpage(vnode_t *, offset_t, size_t, int, cred_t *,
242 			caller_context_t *);
243 static int	nfs4_map(vnode_t *, offset_t, struct as *, caddr_t *, size_t,
244 			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
245 static int	nfs4_addmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
246 			uchar_t, uchar_t, uint_t, cred_t *, caller_context_t *);
247 static int	nfs4_cmp(vnode_t *, vnode_t *, caller_context_t *);
248 static int	nfs4_frlock(vnode_t *, int, struct flock64 *, int, offset_t,
249 			struct flk_callback *, cred_t *, caller_context_t *);
250 static int	nfs4_space(vnode_t *, int, struct flock64 *, int, offset_t,
251 			cred_t *, caller_context_t *);
252 static int	nfs4_delmap(vnode_t *, offset_t, struct as *, caddr_t, size_t,
253 			uint_t, uint_t, uint_t, cred_t *, caller_context_t *);
254 static int	nfs4_pageio(vnode_t *, page_t *, u_offset_t, size_t, int,
255 			cred_t *, caller_context_t *);
256 static void	nfs4_dispose(vnode_t *, page_t *, int, int, cred_t *,
257 			caller_context_t *);
258 static int	nfs4_setsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
259 			caller_context_t *);
260 /*
261  * These vnode ops are required to be called from outside this source file,
262  * e.g. by ephemeral mount stub vnode ops, and so may not be declared
263  * as static.
264  */
265 int	nfs4_getattr(vnode_t *, struct vattr *, int, cred_t *,
266 	    caller_context_t *);
267 void	nfs4_inactive(vnode_t *, cred_t *, caller_context_t *);
268 int	nfs4_lookup(vnode_t *, char *, vnode_t **,
269 	    struct pathname *, int, vnode_t *, cred_t *,
270 	    caller_context_t *, int *, pathname_t *);
271 int	nfs4_fid(vnode_t *, fid_t *, caller_context_t *);
272 int	nfs4_rwlock(vnode_t *, int, caller_context_t *);
273 void	nfs4_rwunlock(vnode_t *, int, caller_context_t *);
274 int	nfs4_realvp(vnode_t *, vnode_t **, caller_context_t *);
275 int	nfs4_pathconf(vnode_t *, int, ulong_t *, cred_t *,
276 	    caller_context_t *);
277 int	nfs4_getsecattr(vnode_t *, vsecattr_t *, int, cred_t *,
278 	    caller_context_t *);
279 int	nfs4_shrlock(vnode_t *, int, struct shrlock *, int, cred_t *,
280 	    caller_context_t *);
281 
282 /*
283  * Used for nfs4_commit_vp() to indicate if we should
284  * wait on pending writes.
285  */
286 #define	NFS4_WRITE_NOWAIT	0
287 #define	NFS4_WRITE_WAIT		1
288 
289 #define	NFS4_BASE_WAIT_TIME 1	/* 1 second */
290 
291 /*
292  * Error flags used to pass information about certain special errors
293  * which need to be handled specially.
294  */
295 #define	NFS_EOF			-98
296 #define	NFS_VERF_MISMATCH	-97
297 
298 /*
299  * Flags used to differentiate between which operation drove the
300  * potential CLOSE OTW. (see nfs4_close_otw_if_necessary)
301  */
302 #define	NFS4_CLOSE_OP		0x1
303 #define	NFS4_DELMAP_OP		0x2
304 #define	NFS4_INACTIVE_OP	0x3
305 
306 #define	ISVDEV(t) ((t == VBLK) || (t == VCHR) || (t == VFIFO))
307 
/*
 * ALIGN64 aligns the given buffer and adjusts the buffer size to 64 bit.
 *
 * x is scratch storage that receives the number of bytes skipped (0 when
 * ptr is already aligned), ptr is advanced to the next uint64_t boundary,
 * and sz is reduced by the same amount.
 *
 * NOTE(review): this expands to two statements (an assignment followed by
 * an if block) and modifies all three arguments, so it must be used as a
 * standalone statement with plain lvalues -- never as the unbraced body of
 * an if/else or loop.
 */
#define	ALIGN64(x, ptr, sz)						\
	x = ((uintptr_t)(ptr)) & (sizeof (uint64_t) - 1);		\
	if (x) {							\
		x = sizeof (uint64_t) - (x);				\
		sz -= (x);						\
		ptr += (x);						\
	}
316 
317 #ifdef DEBUG
318 int nfs4_client_attr_debug = 0;
319 int nfs4_client_state_debug = 0;
320 int nfs4_client_shadow_debug = 0;
321 int nfs4_client_lock_debug = 0;
322 int nfs4_seqid_sync = 0;
323 int nfs4_client_map_debug = 0;
324 static int nfs4_pageio_debug = 0;
325 int nfs4_client_inactive_debug = 0;
326 int nfs4_client_recov_debug = 0;
327 int nfs4_client_failover_debug = 0;
328 int nfs4_client_call_debug = 0;
329 int nfs4_client_lookup_debug = 0;
330 int nfs4_client_zone_debug = 0;
331 int nfs4_lost_rqst_debug = 0;
332 int nfs4_rdattrerr_debug = 0;
333 int nfs4_open_stream_debug = 0;
334 
335 int nfs4read_error_inject;
336 
337 static int nfs4_create_misses = 0;
338 
339 static int nfs4_readdir_cache_shorts = 0;
340 static int nfs4_readdir_readahead = 0;
341 
342 static int nfs4_bio_do_stop = 0;
343 
344 static int nfs4_lostpage = 0;	/* number of times we lost original page */
345 
346 int nfs4_mmap_debug = 0;
347 
348 static int nfs4_pathconf_cache_hits = 0;
349 static int nfs4_pathconf_cache_misses = 0;
350 
351 int nfs4close_all_cnt;
352 int nfs4close_one_debug = 0;
353 int nfs4close_notw_debug = 0;
354 
355 int denied_to_flk_debug = 0;
356 void *lockt_denied_debug;
357 
358 #endif
359 
360 /*
361  * How long to wait before trying again if OPEN_CONFIRM gets ETIMEDOUT
362  * or NFS4ERR_RESOURCE.
363  */
364 static int confirm_retry_sec = 30;
365 
366 static int nfs4_lookup_neg_cache = 1;
367 
368 /*
369  * number of pages to read ahead
370  * optimized for 100 base-T.
371  */
372 static int nfs4_nra = 4;
373 
374 static int nfs4_do_symlink_cache = 1;
375 
376 static int nfs4_pathconf_disable_cache = 0;
377 
378 /*
379  * These are the vnode ops routines which implement the vnode interface to
380  * the networked file system.  These routines just take their parameters,
381  * make them look networkish by putting the right info into interface structs,
382  * and then calling the appropriate remote routine(s) to do the work.
383  *
384  * Note on directory name lookup caching:  If we detect a stale fhandle,
385  * we purge the directory cache relative to that vnode.  This way, the
386  * user won't get burned by the cache repeatedly.  See <nfs/rnode4.h> for
387  * more details on rnode locking.
388  */
389 
390 struct vnodeops *nfs4_vnodeops;
391 
/*
 * Table pairing each vnode operation name with the routine that implements
 * it for NFSv4.  The list is terminated by a NULL/NULL pair.  Two entries
 * do not use an nfs4_* routine: VOPNAME_DUMP is served by nfs_dump and
 * VOPNAME_VNEVENT by fs_vnevent_support.
 */
const fs_operation_def_t nfs4_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = nfs4_open },
	VOPNAME_CLOSE,		{ .vop_close = nfs4_close },
	VOPNAME_READ,		{ .vop_read = nfs4_read },
	VOPNAME_WRITE,		{ .vop_write = nfs4_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = nfs4_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = nfs4_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = nfs4_setattr },
	VOPNAME_ACCESS,		{ .vop_access = nfs4_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = nfs4_lookup },
	VOPNAME_CREATE,		{ .vop_create = nfs4_create },
	VOPNAME_REMOVE,		{ .vop_remove = nfs4_remove },
	VOPNAME_LINK,		{ .vop_link = nfs4_link },
	VOPNAME_RENAME,		{ .vop_rename = nfs4_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = nfs4_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = nfs4_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = nfs4_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = nfs4_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = nfs4_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = nfs4_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = nfs4_inactive },
	VOPNAME_FID,		{ .vop_fid = nfs4_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = nfs4_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = nfs4_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = nfs4_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = nfs4_frlock },
	VOPNAME_SPACE,		{ .vop_space = nfs4_space },
	VOPNAME_REALVP,		{ .vop_realvp = nfs4_realvp },
	VOPNAME_GETPAGE,	{ .vop_getpage = nfs4_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = nfs4_putpage },
	VOPNAME_MAP,		{ .vop_map = nfs4_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = nfs4_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = nfs4_delmap },
	/* no separate nfs4_dump */
	VOPNAME_DUMP,		{ .vop_dump = nfs_dump },
	VOPNAME_PATHCONF,	{ .vop_pathconf = nfs4_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = nfs4_pageio },
	VOPNAME_DISPOSE,	{ .vop_dispose = nfs4_dispose },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = nfs4_setsecattr },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = nfs4_getsecattr },
	VOPNAME_SHRLOCK,	{ .vop_shrlock = nfs4_shrlock },
	VOPNAME_VNEVENT, 	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
436 
437 /*
438  * The following are subroutines and definitions to set args or get res
439  * for the different nfsv4 ops
440  */
441 
442 void
443 nfs4args_lookup_free(nfs_argop4 *argop, int arglen)
444 {
445 	int		i;
446 
447 	for (i = 0; i < arglen; i++) {
448 		if (argop[i].argop == OP_LOOKUP) {
449 			kmem_free(
450 			    argop[i].nfs_argop4_u.oplookup.
451 			    objname.utf8string_val,
452 			    argop[i].nfs_argop4_u.oplookup.
453 			    objname.utf8string_len);
454 		}
455 	}
456 }
457 
458 static void
459 nfs4args_lock_free(nfs_argop4 *argop)
460 {
461 	locker4 *locker = &argop->nfs_argop4_u.oplock.locker;
462 
463 	if (locker->new_lock_owner == TRUE) {
464 		open_to_lock_owner4 *open_owner;
465 
466 		open_owner = &locker->locker4_u.open_owner;
467 		if (open_owner->lock_owner.owner_val != NULL) {
468 			kmem_free(open_owner->lock_owner.owner_val,
469 			    open_owner->lock_owner.owner_len);
470 		}
471 	}
472 }
473 
474 static void
475 nfs4args_lockt_free(nfs_argop4 *argop)
476 {
477 	lock_owner4 *lowner = &argop->nfs_argop4_u.oplockt.owner;
478 
479 	if (lowner->owner_val != NULL) {
480 		kmem_free(lowner->owner_val, lowner->owner_len);
481 	}
482 }
483 
484 static void
485 nfs4args_setattr(nfs_argop4 *argop, vattr_t *vap, vsecattr_t *vsap, int flags,
486     rnode4_t *rp, cred_t *cr, bitmap4 supp, int *error,
487     nfs4_stateid_types_t *sid_types)
488 {
489 	fattr4		*attr = &argop->nfs_argop4_u.opsetattr.obj_attributes;
490 	mntinfo4_t	*mi;
491 
492 	argop->argop = OP_SETATTR;
493 	/*
494 	 * The stateid is set to 0 if client is not modifying the size
495 	 * and otherwise to whatever nfs4_get_stateid() returns.
496 	 *
497 	 * XXX Note: nfs4_get_stateid() returns 0 if no lockowner and/or no
498 	 * state struct could be found for the process/file pair.  We may
499 	 * want to change this in the future (by OPENing the file).  See
500 	 * bug # 4474852.
501 	 */
502 	if (vap->va_mask & AT_SIZE) {
503 
504 		ASSERT(rp != NULL);
505 		mi = VTOMI4(RTOV4(rp));
506 
507 		argop->nfs_argop4_u.opsetattr.stateid =
508 		    nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
509 		    OP_SETATTR, sid_types, FALSE);
510 	} else {
511 		bzero(&argop->nfs_argop4_u.opsetattr.stateid,
512 		    sizeof (stateid4));
513 	}
514 
515 	*error = vattr_to_fattr4(vap, vsap, attr, flags, OP_SETATTR, supp);
516 	if (*error)
517 		bzero(attr, sizeof (*attr));
518 }
519 
520 static void
521 nfs4args_setattr_free(nfs_argop4 *argop)
522 {
523 	nfs4_fattr4_free(&argop->nfs_argop4_u.opsetattr.obj_attributes);
524 }
525 
526 static int
527 nfs4args_verify(nfs_argop4 *argop, vattr_t *vap, enum nfs_opnum4 op,
528     bitmap4 supp)
529 {
530 	fattr4 *attr;
531 	int error = 0;
532 
533 	argop->argop = op;
534 	switch (op) {
535 	case OP_VERIFY:
536 		attr = &argop->nfs_argop4_u.opverify.obj_attributes;
537 		break;
538 	case OP_NVERIFY:
539 		attr = &argop->nfs_argop4_u.opnverify.obj_attributes;
540 		break;
541 	default:
542 		return (EINVAL);
543 	}
544 	if (!error)
545 		error = vattr_to_fattr4(vap, NULL, attr, 0, op, supp);
546 	if (error)
547 		bzero(attr, sizeof (*attr));
548 	return (error);
549 }
550 
551 static void
552 nfs4args_verify_free(nfs_argop4 *argop)
553 {
554 	switch (argop->argop) {
555 	case OP_VERIFY:
556 		nfs4_fattr4_free(&argop->nfs_argop4_u.opverify.obj_attributes);
557 		break;
558 	case OP_NVERIFY:
559 		nfs4_fattr4_free(&argop->nfs_argop4_u.opnverify.obj_attributes);
560 		break;
561 	default:
562 		break;
563 	}
564 }
565 
566 static void
567 nfs4args_write(nfs_argop4 *argop, stable_how4 stable, rnode4_t *rp, cred_t *cr,
568     WRITE4args **wargs_pp, nfs4_stateid_types_t *sid_tp)
569 {
570 	WRITE4args *wargs = &argop->nfs_argop4_u.opwrite;
571 	mntinfo4_t *mi = VTOMI4(RTOV4(rp));
572 
573 	argop->argop = OP_WRITE;
574 	wargs->stable = stable;
575 	wargs->stateid = nfs4_get_w_stateid(cr, rp, curproc->p_pidp->pid_id,
576 	    mi, OP_WRITE, sid_tp);
577 	wargs->mblk = NULL;
578 	*wargs_pp = wargs;
579 }
580 
581 void
582 nfs4args_copen_free(OPEN4cargs *open_args)
583 {
584 	if (open_args->owner.owner_val) {
585 		kmem_free(open_args->owner.owner_val,
586 		    open_args->owner.owner_len);
587 	}
588 	if ((open_args->opentype == OPEN4_CREATE) &&
589 	    (open_args->mode != EXCLUSIVE4)) {
590 		nfs4_fattr4_free(&open_args->createhow4_u.createattrs);
591 	}
592 }
593 
594 /*
595  * XXX:  This is referenced in modstubs.s
596  */
597 struct vnodeops *
598 nfs4_getvnodeops(void)
599 {
600 	return (nfs4_vnodeops);
601 }
602 
603 /*
604  * The OPEN operation opens a regular file.
605  */
606 /*ARGSUSED3*/
607 static int
608 nfs4_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
609 {
610 	vnode_t *dvp = NULL;
611 	rnode4_t *rp, *drp;
612 	int error;
613 	int just_been_created;
614 	char fn[MAXNAMELEN];
615 
616 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4_open: "));
617 	if (nfs_zone() != VTOMI4(*vpp)->mi_zone)
618 		return (EIO);
619 	rp = VTOR4(*vpp);
620 
621 	/*
622 	 * Check to see if opening something besides a regular file;
623 	 * if so skip the OTW call
624 	 */
625 	if ((*vpp)->v_type != VREG) {
626 		error = nfs4_open_non_reg_file(vpp, flag, cr);
627 		return (error);
628 	}
629 
630 	/*
631 	 * XXX - would like a check right here to know if the file is
632 	 * executable or not, so as to skip OTW
633 	 */
634 
635 	if ((error = vtodv(*vpp, &dvp, cr, TRUE)) != 0)
636 		return (error);
637 
638 	drp = VTOR4(dvp);
639 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
640 		return (EINTR);
641 
642 	if ((error = vtoname(*vpp, fn, MAXNAMELEN)) != 0) {
643 		nfs_rw_exit(&drp->r_rwlock);
644 		return (error);
645 	}
646 
647 	/*
648 	 * See if this file has just been CREATEd.
649 	 * If so, clear the flag and update the dnlc, which was previously
650 	 * skipped in nfs4_create.
651 	 * XXX need better serilization on this.
652 	 * XXX move this into the nf4open_otw call, after we have
653 	 * XXX acquired the open owner seqid sync.
654 	 */
655 	mutex_enter(&rp->r_statev4_lock);
656 	if (rp->created_v4) {
657 		rp->created_v4 = 0;
658 		mutex_exit(&rp->r_statev4_lock);
659 
660 		dnlc_update(dvp, fn, *vpp);
661 		/* This is needed so we don't bump the open ref count */
662 		just_been_created = 1;
663 	} else {
664 		mutex_exit(&rp->r_statev4_lock);
665 		just_been_created = 0;
666 	}
667 
668 	/*
669 	 * If caller specified O_TRUNC/FTRUNC, then be sure to set
670 	 * FWRITE (to drive successful setattr(size=0) after open)
671 	 */
672 	if (flag & FTRUNC)
673 		flag |= FWRITE;
674 
675 	error = nfs4open_otw(dvp, fn, NULL, vpp, cr, 0, flag, 0,
676 	    just_been_created);
677 
678 	if (!error && !((*vpp)->v_flag & VROOT))
679 		dnlc_update(dvp, fn, *vpp);
680 
681 	nfs_rw_exit(&drp->r_rwlock);
682 
683 	/* release the hold from vtodv */
684 	VN_RELE(dvp);
685 
686 	/* exchange the shadow for the master vnode, if needed */
687 
688 	if (error == 0 && IS_SHADOW(*vpp, rp))
689 		sv_exchange(vpp);
690 
691 	return (error);
692 }
693 
694 /*
695  * See if there's a "lost open" request to be saved and recovered.
696  */
697 static void
698 nfs4open_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
699     nfs4_open_owner_t *oop, cred_t *cr, vnode_t *vp,
700     vnode_t *dvp, OPEN4cargs *open_args)
701 {
702 	vfs_t *vfsp;
703 	char *srccfp;
704 
705 	vfsp = (dvp ? dvp->v_vfsp : vp->v_vfsp);
706 
707 	if (error != ETIMEDOUT && error != EINTR &&
708 	    !NFS4_FRC_UNMT_ERR(error, vfsp)) {
709 		lost_rqstp->lr_op = 0;
710 		return;
711 	}
712 
713 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
714 	    "nfs4open_save_lost_rqst: error %d", error));
715 
716 	lost_rqstp->lr_op = OP_OPEN;
717 
718 	/*
719 	 * The vp (if it is not NULL) and dvp are held and rele'd via
720 	 * the recovery code.  See nfs4_save_lost_rqst.
721 	 */
722 	lost_rqstp->lr_vp = vp;
723 	lost_rqstp->lr_dvp = dvp;
724 	lost_rqstp->lr_oop = oop;
725 	lost_rqstp->lr_osp = NULL;
726 	lost_rqstp->lr_lop = NULL;
727 	lost_rqstp->lr_cr = cr;
728 	lost_rqstp->lr_flk = NULL;
729 	lost_rqstp->lr_oacc = open_args->share_access;
730 	lost_rqstp->lr_odeny = open_args->share_deny;
731 	lost_rqstp->lr_oclaim = open_args->claim;
732 	if (open_args->claim == CLAIM_DELEGATE_CUR) {
733 		lost_rqstp->lr_ostateid =
734 		    open_args->open_claim4_u.delegate_cur_info.delegate_stateid;
735 		srccfp = open_args->open_claim4_u.delegate_cur_info.cfile;
736 	} else {
737 		srccfp = open_args->open_claim4_u.cfile;
738 	}
739 	lost_rqstp->lr_ofile.utf8string_len = 0;
740 	lost_rqstp->lr_ofile.utf8string_val = NULL;
741 	(void) str_to_utf8(srccfp, &lost_rqstp->lr_ofile);
742 	lost_rqstp->lr_putfirst = FALSE;
743 }
744 
/*
 * Verifier used by nfs4open_otw() for EXCLUSIVE4 creates.  The two 32-bit
 * halves are filled from the zone hostid and newnum(), or from the current
 * time (tv_sec/tv_nsec) when the hostid is 0; the structure is then copied
 * as a single uint64_t into the OPEN arguments' createverf.
 */
struct nfs4_excl_time {
	uint32 seconds;
	uint32 nseconds;
};
749 
750 /*
751  * The OPEN operation creates and/or opens a regular file
752  *
753  * ARGSUSED
754  */
755 static int
756 nfs4open_otw(vnode_t *dvp, char *file_name, struct vattr *in_va,
757     vnode_t **vpp, cred_t *cr, int create_flag, int open_flag,
758     enum createmode4 createmode, int file_just_been_created)
759 {
760 	rnode4_t *rp;
761 	rnode4_t *drp = VTOR4(dvp);
762 	vnode_t *vp = NULL;
763 	vnode_t *vpi = *vpp;
764 	bool_t needrecov = FALSE;
765 
766 	int doqueue = 1;
767 
768 	COMPOUND4args_clnt args;
769 	COMPOUND4res_clnt res;
770 	nfs_argop4 *argop;
771 	nfs_resop4 *resop;
772 	int argoplist_size;
773 	int idx_open, idx_fattr;
774 
775 	GETFH4res *gf_res = NULL;
776 	OPEN4res *op_res = NULL;
777 	nfs4_ga_res_t *garp;
778 	fattr4 *attr = NULL;
779 	struct nfs4_excl_time verf;
780 	bool_t did_excl_setup = FALSE;
781 	int created_osp;
782 
783 	OPEN4cargs *open_args;
784 	nfs4_open_owner_t	*oop = NULL;
785 	nfs4_open_stream_t	*osp = NULL;
786 	seqid4 seqid = 0;
787 	bool_t retry_open = FALSE;
788 	nfs4_recov_state_t recov_state;
789 	nfs4_lost_rqst_t lost_rqst;
790 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
791 	hrtime_t t;
792 	int acc = 0;
793 	cred_t *cred_otw = NULL;	/* cred used to do the RPC call */
794 	cred_t *ncr = NULL;
795 
796 	nfs4_sharedfh_t *otw_sfh;
797 	nfs4_sharedfh_t *orig_sfh;
798 	int fh_differs = 0;
799 	int numops, setgid_flag;
800 	int num_bseqid_retry = NFS4_NUM_RETRY_BAD_SEQID + 1;
801 
802 	/*
803 	 * Make sure we properly deal with setting the right gid on
804 	 * a newly created file to reflect the parent's setgid bit
805 	 */
806 	setgid_flag = 0;
807 	if (create_flag && in_va) {
808 
809 		/*
810 		 * If there is grpid mount flag used or
811 		 * the parent's directory has the setgid bit set
812 		 * _and_ the client was able to get a valid mapping
813 		 * for the parent dir's owner_group, we want to
814 		 * append NVERIFY(owner_group == dva.va_gid) and
815 		 * SETATTR to the CREATE compound.
816 		 */
817 		mutex_enter(&drp->r_statelock);
818 		if ((VTOMI4(dvp)->mi_flags & MI4_GRPID ||
819 		    drp->r_attr.va_mode & VSGID) &&
820 		    drp->r_attr.va_gid != GID_NOBODY) {
821 			in_va->va_mask |= AT_GID;
822 			in_va->va_gid = drp->r_attr.va_gid;
823 			setgid_flag = 1;
824 		}
825 		mutex_exit(&drp->r_statelock);
826 	}
827 
828 	/*
829 	 * Normal/non-create compound:
830 	 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new)
831 	 *
832 	 * Open(create) compound no setgid:
833 	 * PUTFH(dfh) + SAVEFH + OPEN(create) + GETFH + GETATTR(new) +
834 	 * RESTOREFH + GETATTR
835 	 *
836 	 * Open(create) setgid:
837 	 * PUTFH(dfh) + OPEN(create) + GETFH + GETATTR(new) +
838 	 * SAVEFH + PUTFH(dfh) + GETATTR(dvp) + RESTOREFH +
839 	 * NVERIFY(grp) + SETATTR
840 	 */
841 	if (setgid_flag) {
842 		numops = 10;
843 		idx_open = 1;
844 		idx_fattr = 3;
845 	} else if (create_flag) {
846 		numops = 7;
847 		idx_open = 2;
848 		idx_fattr = 4;
849 	} else {
850 		numops = 4;
851 		idx_open = 1;
852 		idx_fattr = 3;
853 	}
854 
855 	args.array_len = numops;
856 	argoplist_size = numops * sizeof (nfs_argop4);
857 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
858 
859 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw: "
860 	    "open %s open flag 0x%x cred %p", file_name, open_flag,
861 	    (void *)cr));
862 
863 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
864 	if (create_flag) {
865 		/*
866 		 * We are to create a file.  Initialize the passed in vnode
867 		 * pointer.
868 		 */
869 		vpi = NULL;
870 	} else {
871 		/*
872 		 * Check to see if the client owns a read delegation and is
873 		 * trying to open for write.  If so, then return the delegation
874 		 * to avoid the server doing a cb_recall and returning DELAY.
875 		 * NB - we don't use the statev4_lock here because we'd have
876 		 * to drop the lock anyway and the result would be stale.
877 		 */
878 		if ((open_flag & FWRITE) &&
879 		    VTOR4(vpi)->r_deleg_type == OPEN_DELEGATE_READ)
880 			(void) nfs4delegreturn(VTOR4(vpi), NFS4_DR_REOPEN);
881 
882 		/*
883 		 * If the file has a delegation, then do an access check up
884 		 * front.  This avoids having to an access check later after
885 		 * front.  This avoids having to do an access check later after
886 		 */
887 		if (VTOR4(vpi)->r_deleg_type != OPEN_DELEGATE_NONE) {
888 			if (open_flag & FREAD &&
889 			    nfs4_access(vpi, VREAD, 0, cr, NULL) == 0)
890 				acc |= VREAD;
891 			if (open_flag & FWRITE &&
892 			    nfs4_access(vpi, VWRITE, 0, cr, NULL) == 0)
893 				acc |= VWRITE;
894 		}
895 	}
896 
897 	drp = VTOR4(dvp);
898 
899 	recov_state.rs_flags = 0;
900 	recov_state.rs_num_retry_despite_err = 0;
901 	cred_otw = cr;
902 
903 recov_retry:
904 	fh_differs = 0;
905 	nfs4_error_zinit(&e);
906 
907 	e.error = nfs4_start_op(VTOMI4(dvp), dvp, vpi, &recov_state);
908 	if (e.error) {
909 		if (ncr != NULL)
910 			crfree(ncr);
911 		kmem_free(argop, argoplist_size);
912 		return (e.error);
913 	}
914 
915 	args.ctag = TAG_OPEN;
916 	args.array_len = numops;
917 	args.array = argop;
918 
919 	/* putfh directory fh */
920 	argop[0].argop = OP_CPUTFH;
921 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
922 
923 	/* OPEN: either op 1 or op 2 depending upon create/setgid flags */
924 	argop[idx_open].argop = OP_COPEN;
925 	open_args = &argop[idx_open].nfs_argop4_u.opcopen;
926 	open_args->claim = CLAIM_NULL;
927 
928 	/* name of file */
929 	open_args->open_claim4_u.cfile = file_name;
930 	open_args->owner.owner_len = 0;
931 	open_args->owner.owner_val = NULL;
932 
933 	if (create_flag) {
934 		/* CREATE a file */
935 		open_args->opentype = OPEN4_CREATE;
936 		open_args->mode = createmode;
937 		if (createmode == EXCLUSIVE4) {
938 			if (did_excl_setup == FALSE) {
939 				verf.seconds = zone_get_hostid(NULL);
940 				if (verf.seconds != 0)
941 					verf.nseconds = newnum();
942 				else {
943 					timestruc_t now;
944 
945 					gethrestime(&now);
946 					verf.seconds = now.tv_sec;
947 					verf.nseconds = now.tv_nsec;
948 				}
949 				/*
950 				 * Since the server will use this value for the
951 				 * mtime, make sure that it can't overflow. Zero
952 				 * out the MSB. The actual value does not matter
953 				 * here, only its uniqueness.
954 				 */
955 				verf.seconds &= INT32_MAX;
956 				did_excl_setup = TRUE;
957 			}
958 
959 			/* Now copy over verifier to OPEN4args. */
960 			open_args->createhow4_u.createverf = *(uint64_t *)&verf;
961 		} else {
962 			int v_error;
963 			bitmap4 supp_attrs;
964 			servinfo4_t *svp;
965 
966 			attr = &open_args->createhow4_u.createattrs;
967 
968 			svp = drp->r_server;
969 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
970 			supp_attrs = svp->sv_supp_attrs;
971 			nfs_rw_exit(&svp->sv_lock);
972 
973 			/* GUARDED4 or UNCHECKED4 */
974 			v_error = vattr_to_fattr4(in_va, NULL, attr, 0, OP_OPEN,
975 			    supp_attrs);
976 			if (v_error) {
977 				bzero(attr, sizeof (*attr));
978 				nfs4args_copen_free(open_args);
979 				nfs4_end_op(VTOMI4(dvp), dvp, vpi,
980 				    &recov_state, FALSE);
981 				if (ncr != NULL)
982 					crfree(ncr);
983 				kmem_free(argop, argoplist_size);
984 				return (v_error);
985 			}
986 		}
987 	} else {
988 		/* NO CREATE */
989 		open_args->opentype = OPEN4_NOCREATE;
990 	}
991 
992 	if (recov_state.rs_sp != NULL) {
993 		mutex_enter(&recov_state.rs_sp->s_lock);
994 		open_args->owner.clientid = recov_state.rs_sp->clientid;
995 		mutex_exit(&recov_state.rs_sp->s_lock);
996 	} else {
997 		/* XXX should we just fail here? */
998 		open_args->owner.clientid = 0;
999 	}
1000 
1001 	/*
1002 	 * This increments oop's ref count or creates a temporary 'just_created'
1003 	 * open owner that will become valid when this OPEN/OPEN_CONFIRM call
1004 	 * completes.
1005 	 */
1006 	mutex_enter(&VTOMI4(dvp)->mi_lock);
1007 
1008 	/* See if a permanent or just created open owner exists */
1009 	oop = find_open_owner_nolock(cr, NFS4_JUST_CREATED, VTOMI4(dvp));
1010 	if (!oop) {
1011 		/*
1012 		 * This open owner does not exist so create a temporary
1013 		 * just created one.
1014 		 */
1015 		oop = create_open_owner(cr, VTOMI4(dvp));
1016 		ASSERT(oop != NULL);
1017 	}
1018 	mutex_exit(&VTOMI4(dvp)->mi_lock);
1019 
1020 	/* this length never changes, do alloc before seqid sync */
1021 	open_args->owner.owner_len = sizeof (oop->oo_name);
1022 	open_args->owner.owner_val =
1023 	    kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1024 
1025 	e.error = nfs4_start_open_seqid_sync(oop, VTOMI4(dvp));
1026 	if (e.error == EAGAIN) {
1027 		open_owner_rele(oop);
1028 		nfs4args_copen_free(open_args);
1029 		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1030 		if (ncr != NULL) {
1031 			crfree(ncr);
1032 			ncr = NULL;
1033 		}
1034 		goto recov_retry;
1035 	}
1036 
1037 	/* Check to see if we need to do the OTW call */
1038 	if (!create_flag) {
1039 		if (!nfs4_is_otw_open_necessary(oop, open_flag, vpi,
1040 		    file_just_been_created, &e.error, acc, &recov_state)) {
1041 
1042 			/*
1043 			 * The OTW open is not necessary.  Either
1044 			 * the open can succeed without it (eg.
1045 			 * delegation, error == 0) or the open
1046 			 * must fail due to an access failure
1047 			 * (error != 0).  In either case, tidy
1048 			 * up and return.
1049 			 */
1050 
1051 			nfs4_end_open_seqid_sync(oop);
1052 			open_owner_rele(oop);
1053 			nfs4args_copen_free(open_args);
1054 			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, FALSE);
1055 			if (ncr != NULL)
1056 				crfree(ncr);
1057 			kmem_free(argop, argoplist_size);
1058 			return (e.error);
1059 		}
1060 	}
1061 
1062 	bcopy(&oop->oo_name, open_args->owner.owner_val,
1063 	    open_args->owner.owner_len);
1064 
1065 	seqid = nfs4_get_open_seqid(oop) + 1;
1066 	open_args->seqid = seqid;
1067 	open_args->share_access = 0;
1068 	if (open_flag & FREAD)
1069 		open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1070 	if (open_flag & FWRITE)
1071 		open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1072 	open_args->share_deny = OPEN4_SHARE_DENY_NONE;
1073 
1074 
1075 
1076 	/*
1077 	 * getfh w/sanity check for idx_open/idx_fattr
1078 	 */
1079 	ASSERT((idx_open + 1) == (idx_fattr - 1));
1080 	argop[idx_open + 1].argop = OP_GETFH;
1081 
1082 	/* getattr */
1083 	argop[idx_fattr].argop = OP_GETATTR;
1084 	argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1085 	argop[idx_fattr].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1086 
1087 	if (setgid_flag) {
1088 		vattr_t	_v;
1089 		servinfo4_t *svp;
1090 		bitmap4	supp_attrs;
1091 
1092 		svp = drp->r_server;
1093 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
1094 		supp_attrs = svp->sv_supp_attrs;
1095 		nfs_rw_exit(&svp->sv_lock);
1096 
1097 		/*
1098 		 * For setgid case, we need to:
1099 		 * 4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
1100 		 */
1101 		argop[4].argop = OP_SAVEFH;
1102 
1103 		argop[5].argop = OP_CPUTFH;
1104 		argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
1105 
1106 		argop[6].argop = OP_GETATTR;
1107 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1108 		argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1109 
1110 		argop[7].argop = OP_RESTOREFH;
1111 
1112 		/*
1113 		 * nverify
1114 		 */
1115 		_v.va_mask = AT_GID;
1116 		_v.va_gid = in_va->va_gid;
1117 		if (!(e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
1118 		    supp_attrs))) {
1119 
1120 			/*
1121 			 * setattr
1122 			 *
1123 			 * We _know_ we're not messing with AT_SIZE or
1124 			 * AT_XTIME, so no need for stateid or flags.
1125 			 * Also we specify NULL rp since we're only
1126 			 * interested in setting owner_group attributes.
1127 			 */
1128 			nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr,
1129 			    supp_attrs, &e.error, 0);
1130 			if (e.error)
1131 				nfs4args_verify_free(&argop[8]);
1132 		}
1133 
1134 		if (e.error) {
1135 			/*
1136 			 * XXX - Revisit the last argument to nfs4_end_op()
1137 			 *	 once 5020486 is fixed.
1138 			 */
1139 			nfs4_end_open_seqid_sync(oop);
1140 			open_owner_rele(oop);
1141 			nfs4args_copen_free(open_args);
1142 			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, TRUE);
1143 			if (ncr != NULL)
1144 				crfree(ncr);
1145 			kmem_free(argop, argoplist_size);
1146 			return (e.error);
1147 		}
1148 	} else if (create_flag) {
1149 		argop[1].argop = OP_SAVEFH;
1150 
1151 		argop[5].argop = OP_RESTOREFH;
1152 
1153 		argop[6].argop = OP_GETATTR;
1154 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1155 		argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
1156 	}
1157 
1158 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
1159 	    "nfs4open_otw: %s call, nm %s, rp %s",
1160 	    needrecov ? "recov" : "first", file_name,
1161 	    rnode4info(VTOR4(dvp))));
1162 
1163 	t = gethrtime();
1164 
1165 	rfs4call(VTOMI4(dvp), &args, &res, cred_otw, &doqueue, 0, &e);
1166 
1167 	if (!e.error && nfs4_need_to_bump_seqid(&res))
1168 		nfs4_set_open_seqid(seqid, oop, args.ctag);
1169 
1170 	needrecov = nfs4_needs_recovery(&e, TRUE, dvp->v_vfsp);
1171 
1172 	if (e.error || needrecov) {
1173 		bool_t abort = FALSE;
1174 
1175 		if (needrecov) {
1176 			nfs4_bseqid_entry_t *bsep = NULL;
1177 
1178 			nfs4open_save_lost_rqst(e.error, &lost_rqst, oop,
1179 			    cred_otw, vpi, dvp, open_args);
1180 
1181 			if (!e.error && res.status == NFS4ERR_BAD_SEQID) {
1182 				bsep = nfs4_create_bseqid_entry(oop, NULL,
1183 				    vpi, 0, args.ctag, open_args->seqid);
1184 				num_bseqid_retry--;
1185 			}
1186 
1187 			abort = nfs4_start_recovery(&e, VTOMI4(dvp), dvp, vpi,
1188 			    NULL, lost_rqst.lr_op == OP_OPEN ?
1189 			    &lost_rqst : NULL, OP_OPEN, bsep, NULL, NULL);
1190 
1191 			if (bsep)
1192 				kmem_free(bsep, sizeof (*bsep));
1193 			/* give up if we keep getting BAD_SEQID */
1194 			if (num_bseqid_retry == 0)
1195 				abort = TRUE;
1196 			if (abort == TRUE && e.error == 0)
1197 				e.error = geterrno4(res.status);
1198 		}
1199 		nfs4_end_open_seqid_sync(oop);
1200 		open_owner_rele(oop);
1201 		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1202 		nfs4args_copen_free(open_args);
1203 		if (setgid_flag) {
1204 			nfs4args_verify_free(&argop[8]);
1205 			nfs4args_setattr_free(&argop[9]);
1206 		}
1207 		if (!e.error)
1208 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1209 		if (ncr != NULL) {
1210 			crfree(ncr);
1211 			ncr = NULL;
1212 		}
1213 		if (!needrecov || abort == TRUE || e.error == EINTR ||
1214 		    NFS4_FRC_UNMT_ERR(e.error, dvp->v_vfsp)) {
1215 			kmem_free(argop, argoplist_size);
1216 			return (e.error);
1217 		}
1218 		goto recov_retry;
1219 	}
1220 
1221 	/*
1222 	 * Will check and update lease after checking the rflag for
1223 	 * OPEN_CONFIRM in the successful OPEN call.
1224 	 */
1225 	if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
1226 
1227 		/*
1228 		 * XXX what if we're crossing mount points from server1:/drp
1229 		 * to server2:/drp/rp.
1230 		 */
1231 
1232 		/* Signal our end of use of the open seqid */
1233 		nfs4_end_open_seqid_sync(oop);
1234 
1235 		/*
1236 		 * This will destroy the open owner if it was just created,
1237 		 * and no one else has put a reference on it.
1238 		 */
1239 		open_owner_rele(oop);
1240 		if (create_flag && (createmode != EXCLUSIVE4) &&
1241 		    res.status == NFS4ERR_BADOWNER)
1242 			nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1243 
1244 		e.error = geterrno4(res.status);
1245 		nfs4args_copen_free(open_args);
1246 		if (setgid_flag) {
1247 			nfs4args_verify_free(&argop[8]);
1248 			nfs4args_setattr_free(&argop[9]);
1249 		}
1250 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1251 		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1252 		/*
1253 		 * If the reply is NFS4ERR_ACCESS, it may be because
1254 		 * we are root (no root net access).  If the real uid
1255 		 * is not root, then retry with the real uid instead.
1256 		 */
1257 		if (ncr != NULL) {
1258 			crfree(ncr);
1259 			ncr = NULL;
1260 		}
1261 		if (res.status == NFS4ERR_ACCESS &&
1262 		    (ncr = crnetadjust(cred_otw)) != NULL) {
1263 			cred_otw = ncr;
1264 			goto recov_retry;
1265 		}
1266 		kmem_free(argop, argoplist_size);
1267 		return (e.error);
1268 	}
1269 
1270 	resop = &res.array[idx_open];  /* open res */
1271 	op_res = &resop->nfs_resop4_u.opopen;
1272 
1273 #ifdef DEBUG
1274 	/*
1275 	 * verify attrset bitmap
1276 	 */
1277 	if (create_flag &&
1278 	    (createmode == UNCHECKED4 || createmode == GUARDED4)) {
1279 		/* make sure attrset returned is what we asked for */
1280 		/* XXX Ignore this 'error' for now */
1281 		if (attr->attrmask != op_res->attrset)
1282 			/* EMPTY */;
1283 	}
1284 #endif
1285 
1286 	if (op_res->rflags & OPEN4_RESULT_LOCKTYPE_POSIX) {
1287 		mutex_enter(&VTOMI4(dvp)->mi_lock);
1288 		VTOMI4(dvp)->mi_flags |= MI4_POSIX_LOCK;
1289 		mutex_exit(&VTOMI4(dvp)->mi_lock);
1290 	}
1291 
1292 	resop = &res.array[idx_open + 1];  /* getfh res */
1293 	gf_res = &resop->nfs_resop4_u.opgetfh;
1294 
1295 	otw_sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
1296 
1297 	/*
1298 	 * The open stateid has been updated on the server but not
1299 	 * on the client yet.  There is a path: makenfs4node->nfs4_attr_cache->
1300 	 * flush_pages->VOP_PUTPAGE->...->nfs4write where we will issue an OTW
1301 	 * WRITE call.  That, however, will use the old stateid, so go ahead
1302 	 * and update the open stateid now, before any call to makenfs4node.
1303 	 */
1304 	if (vpi) {
1305 		nfs4_open_stream_t	*tmp_osp;
1306 		rnode4_t		*tmp_rp = VTOR4(vpi);
1307 
1308 		tmp_osp = find_open_stream(oop, tmp_rp);
1309 		if (tmp_osp) {
1310 			tmp_osp->open_stateid = op_res->stateid;
1311 			mutex_exit(&tmp_osp->os_sync_lock);
1312 			open_stream_rele(tmp_osp, tmp_rp);
1313 		}
1314 
1315 		/*
1316 		 * We must determine if the file handle given by the otw open
1317 		 * is the same as the file handle which was passed in with
1318 		 * *vpp.  This case can be reached if the file we are trying
1319 		 * to open has been removed and another file has been created
1320 		 * having the same file name.  The passed in vnode is released
1321 		 * later.
1322 		 */
1323 		orig_sfh = VTOR4(vpi)->r_fh;
1324 		fh_differs = nfs4cmpfh(&orig_sfh->sfh_fh, &otw_sfh->sfh_fh);
1325 	}
1326 
1327 	garp = &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res;
1328 
1329 	if (create_flag || fh_differs) {
1330 		int rnode_err = 0;
1331 
1332 		vp = makenfs4node(otw_sfh, garp, dvp->v_vfsp, t, cr,
1333 		    dvp, fn_get(VTOSV(dvp)->sv_name, file_name, otw_sfh));
1334 
1335 		if (e.error)
1336 			PURGE_ATTRCACHE4(vp);
1337 		/*
1338 		 * For the newly created vp case, make sure the rnode
1339 		 * isn't bad before using it.
1340 		 */
1341 		mutex_enter(&(VTOR4(vp))->r_statelock);
1342 		if (VTOR4(vp)->r_flags & R4RECOVERR)
1343 			rnode_err = EIO;
1344 		mutex_exit(&(VTOR4(vp))->r_statelock);
1345 
1346 		if (rnode_err) {
1347 			nfs4_end_open_seqid_sync(oop);
1348 			nfs4args_copen_free(open_args);
1349 			if (setgid_flag) {
1350 				nfs4args_verify_free(&argop[8]);
1351 				nfs4args_setattr_free(&argop[9]);
1352 			}
1353 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1354 			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1355 			    needrecov);
1356 			open_owner_rele(oop);
1357 			VN_RELE(vp);
1358 			if (ncr != NULL)
1359 				crfree(ncr);
1360 			sfh4_rele(&otw_sfh);
1361 			kmem_free(argop, argoplist_size);
1362 			return (EIO);
1363 		}
1364 	} else {
1365 		vp = vpi;
1366 	}
1367 	sfh4_rele(&otw_sfh);
1368 
1369 	/*
1370 	 * It seems odd to get a full set of attrs and then not update
1371 	 * the object's attrcache in the non-create case.  Create case uses
1372 	 * the attrs since makenfs4node checks to see if the attrs need to
1373 	 * be updated (and then updates them).  The non-create case should
1374 	 * update attrs also.
1375 	 */
1376 	if (! create_flag && ! fh_differs && !e.error) {
1377 		nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
1378 	}
1379 
1380 	nfs4_error_zinit(&e);
1381 	if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
1382 		/* This does not do recovery for vp explicitly. */
1383 		nfs4open_confirm(vp, &seqid, &op_res->stateid, cred_otw, FALSE,
1384 		    &retry_open, oop, FALSE, &e, &num_bseqid_retry);
1385 
1386 		if (e.error || e.stat) {
1387 			nfs4_end_open_seqid_sync(oop);
1388 			nfs4args_copen_free(open_args);
1389 			if (setgid_flag) {
1390 				nfs4args_verify_free(&argop[8]);
1391 				nfs4args_setattr_free(&argop[9]);
1392 			}
1393 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1394 			nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state,
1395 			    needrecov);
1396 			open_owner_rele(oop);
1397 			if (create_flag || fh_differs) {
1398 				/* rele the makenfs4node */
1399 				VN_RELE(vp);
1400 			}
1401 			if (ncr != NULL) {
1402 				crfree(ncr);
1403 				ncr = NULL;
1404 			}
1405 			if (retry_open == TRUE) {
1406 				NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1407 				    "nfs4open_otw: retry the open since OPEN "
1408 				    "CONFIRM failed with error %d stat %d",
1409 				    e.error, e.stat));
1410 				if (create_flag && createmode == GUARDED4) {
1411 					NFS4_DEBUG(nfs4_client_recov_debug,
1412 					    (CE_NOTE, "nfs4open_otw: switch "
1413 					    "createmode from GUARDED4 to "
1414 					    "UNCHECKED4"));
1415 					createmode = UNCHECKED4;
1416 				}
1417 				goto recov_retry;
1418 			}
1419 			if (!e.error) {
1420 				if (create_flag && (createmode != EXCLUSIVE4) &&
1421 				    e.stat == NFS4ERR_BADOWNER)
1422 					nfs4_log_badowner(VTOMI4(dvp), OP_OPEN);
1423 
1424 				e.error = geterrno4(e.stat);
1425 			}
1426 			kmem_free(argop, argoplist_size);
1427 			return (e.error);
1428 		}
1429 	}
1430 
1431 	rp = VTOR4(vp);
1432 
1433 	mutex_enter(&rp->r_statev4_lock);
1434 	if (create_flag)
1435 		rp->created_v4 = 1;
1436 	mutex_exit(&rp->r_statev4_lock);
1437 
1438 	mutex_enter(&oop->oo_lock);
1439 	/* Doesn't matter if 'oo_just_created' already was set as this */
1440 	oop->oo_just_created = NFS4_PERM_CREATED;
1441 	if (oop->oo_cred_otw)
1442 		crfree(oop->oo_cred_otw);
1443 	oop->oo_cred_otw = cred_otw;
1444 	crhold(oop->oo_cred_otw);
1445 	mutex_exit(&oop->oo_lock);
1446 
1447 	/* returns with 'os_sync_lock' held */
1448 	osp = find_or_create_open_stream(oop, rp, &created_osp);
1449 	if (!osp) {
1450 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1451 		    "nfs4open_otw: failed to create an open stream"));
1452 		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4open_otw: "
1453 		    "signal our end of use of the open seqid"));
1454 
1455 		nfs4_end_open_seqid_sync(oop);
1456 		open_owner_rele(oop);
1457 		nfs4args_copen_free(open_args);
1458 		if (setgid_flag) {
1459 			nfs4args_verify_free(&argop[8]);
1460 			nfs4args_setattr_free(&argop[9]);
1461 		}
1462 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1463 		nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1464 		if (create_flag || fh_differs)
1465 			VN_RELE(vp);
1466 		if (ncr != NULL)
1467 			crfree(ncr);
1468 
1469 		kmem_free(argop, argoplist_size);
1470 		return (EINVAL);
1471 
1472 	}
1473 
1474 	osp->open_stateid = op_res->stateid;
1475 
1476 	if (open_flag & FREAD)
1477 		osp->os_share_acc_read++;
1478 	if (open_flag & FWRITE)
1479 		osp->os_share_acc_write++;
1480 	osp->os_share_deny_none++;
1481 
1482 	/*
1483 	 * Need to reset this bitfield for the possible case where we were
1484 	 * going to OTW CLOSE the file, got a non-recoverable error, and before
1485 	 * we could retry the CLOSE, OPENed the file again.
1486 	 */
1487 	ASSERT(osp->os_open_owner->oo_seqid_inuse);
1488 	osp->os_final_close = 0;
1489 	osp->os_force_close = 0;
1490 #ifdef DEBUG
1491 	if (osp->os_failed_reopen)
1492 		NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE, "nfs4open_otw:"
1493 		    " clearing os_failed_reopen for osp %p, cr %p, rp %s",
1494 		    (void *)osp, (void *)cr, rnode4info(rp)));
1495 #endif
1496 	osp->os_failed_reopen = 0;
1497 
1498 	mutex_exit(&osp->os_sync_lock);
1499 
1500 	nfs4_end_open_seqid_sync(oop);
1501 
1502 	if (created_osp && recov_state.rs_sp != NULL) {
1503 		mutex_enter(&recov_state.rs_sp->s_lock);
1504 		nfs4_inc_state_ref_count_nolock(recov_state.rs_sp, VTOMI4(dvp));
1505 		mutex_exit(&recov_state.rs_sp->s_lock);
1506 	}
1507 
1508 	/* get rid of our reference to find oop */
1509 	open_owner_rele(oop);
1510 
1511 	open_stream_rele(osp, rp);
1512 
1513 	/* accept delegation, if any */
1514 	nfs4_delegation_accept(rp, CLAIM_NULL, op_res, garp, cred_otw);
1515 
1516 	nfs4_end_op(VTOMI4(dvp), dvp, vpi, &recov_state, needrecov);
1517 
1518 	if (createmode == EXCLUSIVE4 &&
1519 	    (in_va->va_mask & ~(AT_GID | AT_SIZE))) {
1520 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4open_otw:"
1521 		    " EXCLUSIVE4: sending a SETATTR"));
1522 		/*
1523 		 * If doing an exclusive create, then generate
1524 		 * a SETATTR to set the initial attributes.
1525 		 * Try to set the mtime and the atime to the
1526 		 * server's current time.  It is somewhat
1527 		 * expected that these fields will be used to
1528 		 * store the exclusive create cookie.  If not,
1529 		 * server implementors will need to know that
1530 		 * a SETATTR will follow an exclusive create
1531 		 * and the cookie should be destroyed if
1532 		 * appropriate.
1533 		 *
1534 		 * The AT_GID and AT_SIZE bits are turned off
1535 		 * so that the SETATTR request will not attempt
1536 		 * to process these.  The gid will be set
1537 		 * separately if appropriate.  The size is turned
1538 		 * off because it is assumed that a new file will
1539 		 * be created empty and if the file wasn't empty,
1540 		 * then the exclusive create will have failed
1541 		 * because the file must have existed already.
1542 		 * Therefore, no truncate operation is needed.
1543 		 */
1544 		in_va->va_mask &= ~(AT_GID | AT_SIZE);
1545 		in_va->va_mask |= (AT_MTIME | AT_ATIME);
1546 
1547 		e.error = nfs4setattr(vp, in_va, 0, cr, NULL);
1548 		if (e.error) {
1549 			/*
1550 			 * Couldn't correct the attributes of
1551 			 * the newly created file and the
1552 			 * attributes are wrong.  Remove the
1553 			 * file and return an error to the
1554 			 * application.
1555 			 */
1556 			/* XXX will this take care of client state ? */
1557 			NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1558 			    "nfs4open_otw: EXCLUSIVE4: error %d on SETATTR:"
1559 			    " remove file", e.error));
1560 			VN_RELE(vp);
1561 			(void) nfs4_remove(dvp, file_name, cr, NULL, 0);
1562 			/*
1563 			 * Since we've reled the vnode and removed
1564 			 * the file we now need to return the error.
1565 			 * At this point we don't want to update the
1566 			 * dircaches, call nfs4_waitfor_purge_complete
1567 			 * or set vpp to vp so we need to skip these
1568 			 * as well.
1569 			 */
1570 			goto skip_update_dircaches;
1571 		}
1572 	}
1573 
1574 	/*
1575 	 * If we created or found the correct vnode, due to create_flag or
1576 	 * fh_differs being set, then update directory cache attribute, readdir
1577 	 * and dnlc caches.
1578 	 */
1579 	if (create_flag || fh_differs) {
1580 		dirattr_info_t dinfo, *dinfop;
1581 
1582 		/*
1583 		 * Make sure getattr succeeded before using results.
1584 		 * note: op 7 is getattr(dir) for both flavors of
1585 		 * open(create).
1586 		 */
1587 		if (create_flag && res.status == NFS4_OK) {
1588 			dinfo.di_time_call = t;
1589 			dinfo.di_cred = cr;
1590 			dinfo.di_garp =
1591 			    &res.array[6].nfs_resop4_u.opgetattr.ga_res;
1592 			dinfop = &dinfo;
1593 		} else {
1594 			dinfop = NULL;
1595 		}
1596 
1597 		nfs4_update_dircaches(&op_res->cinfo, dvp, vp, file_name,
1598 		    dinfop);
1599 	}
1600 
1601 	/*
1602 	 * If the page cache for this file was flushed from actions
1603 	 * above, it was done asynchronously and if that is true,
1604 	 * there is a need to wait here for it to complete.  This must
1605 	 * be done outside of start_fop/end_fop.
1606 	 */
1607 	(void) nfs4_waitfor_purge_complete(vp);
1608 
1609 	/*
1610 	 * It is implicit that we are in the open case (create_flag == 0) since
1611 	 * fh_differs can only be set to a non-zero value in the open case.
1612 	 */
1613 	if (fh_differs != 0 && vpi != NULL)
1614 		VN_RELE(vpi);
1615 
1616 	/*
1617 	 * Be sure to set *vpp to the correct value before returning.
1618 	 */
1619 	*vpp = vp;
1620 
1621 skip_update_dircaches:
1622 
1623 	nfs4args_copen_free(open_args);
1624 	if (setgid_flag) {
1625 		nfs4args_verify_free(&argop[8]);
1626 		nfs4args_setattr_free(&argop[9]);
1627 	}
1628 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1629 
1630 	if (ncr)
1631 		crfree(ncr);
1632 	kmem_free(argop, argoplist_size);
1633 	return (e.error);
1634 }
1635 
1636 /*
1637  * Reopen an open instance.  cf. nfs4open_otw().
1638  *
1639  * Errors are returned by the nfs4_error_t parameter.
1640  * - ep->error contains an errno value or zero.
1641  * - if it is zero, ep->stat is set to an NFS status code, if any.
1642  *   If the file could not be reopened, but the caller should continue, the
1643  *   file is marked dead and no error values are returned.  If the caller
1644  *   should stop recovering open files and start over, either the ep->error
1645  *   value or ep->stat will indicate an error (either something that requires
1646  *   recovery or EAGAIN).  Note that some recovery (e.g., expired volatile
1647  *   filehandles) may be handled silently by this routine.
1648  * - if it is EINTR, ETIMEDOUT, or NFS4_FRC_UNMT_ERR, recovery for lost state
1649  *   will be started, so the caller should not do it.
1650  *
1651  * Gotos:
1652  * - kill_file : reopen failed in a way that requires marking the file
1653  *   dead and setting the open stream's 'os_failed_reopen' to 1.  This
1654  *   is for cases where recovery is not possible.
1655  * - failed_reopen : same as above, except that the file has already been
1656  *   marked dead, so no need to do it again.
1657  * - bailout : reopen failed but we are able to recover and retry the reopen -
1658  *   either within this function immediately or via the calling function.
1659  */
1660 
1661 void
1662 nfs4_reopen(vnode_t *vp, nfs4_open_stream_t *osp, nfs4_error_t *ep,
1663     open_claim_type4 claim, bool_t frc_use_claim_previous,
1664     bool_t is_recov)
1665 {
1666 	COMPOUND4args_clnt args;
1667 	COMPOUND4res_clnt res;
1668 	nfs_argop4 argop[4];
1669 	nfs_resop4 *resop;
1670 	OPEN4res *op_res = NULL;
1671 	OPEN4cargs *open_args;
1672 	GETFH4res *gf_res;
1673 	rnode4_t *rp = VTOR4(vp);
1674 	int doqueue = 1;
1675 	cred_t *cr = NULL, *cred_otw = NULL;
1676 	nfs4_open_owner_t *oop = NULL;
1677 	seqid4 seqid;
1678 	nfs4_ga_res_t *garp;
1679 	char fn[MAXNAMELEN];
1680 	nfs4_recov_state_t recov = {NULL, 0};
1681 	nfs4_lost_rqst_t lost_rqst;
1682 	mntinfo4_t *mi = VTOMI4(vp);
1683 	bool_t abort;
1684 	char *failed_msg = "";
1685 	int fh_different;
1686 	hrtime_t t;
1687 	nfs4_bseqid_entry_t *bsep = NULL;
1688 
1689 	ASSERT(nfs4_consistent_type(vp));
1690 	ASSERT(nfs_zone() == mi->mi_zone);
1691 
1692 	nfs4_error_zinit(ep);
1693 
1694 	/* this is the cred used to find the open owner */
1695 	cr = state_to_cred(osp);
1696 	if (cr == NULL) {
1697 		failed_msg = "Couldn't reopen: no cred";
1698 		goto kill_file;
1699 	}
1700 	/* use this cred for OTW operations */
1701 	cred_otw = nfs4_get_otw_cred(cr, mi, osp->os_open_owner);
1702 
1703 top:
1704 	nfs4_error_zinit(ep);
1705 
1706 	if (mi->mi_vfsp->vfs_flag & VFS_UNMOUNTED) {
1707 		/* File system has been unmounted, quit */
1708 		ep->error = EIO;
1709 		failed_msg = "Couldn't reopen: file system has been unmounted";
1710 		goto kill_file;
1711 	}
1712 
1713 	oop = osp->os_open_owner;
1714 
1715 	ASSERT(oop != NULL);
1716 	if (oop == NULL) {	/* be defensive in non-DEBUG */
1717 		failed_msg = "can't reopen: no open owner";
1718 		goto kill_file;
1719 	}
1720 	open_owner_hold(oop);
1721 
1722 	ep->error = nfs4_start_open_seqid_sync(oop, mi);
1723 	if (ep->error) {
1724 		open_owner_rele(oop);
1725 		oop = NULL;
1726 		goto bailout;
1727 	}
1728 
1729 	/*
1730 	 * If the rnode has a delegation and the delegation has been
1731 	 * recovered and the server didn't request a recall and the caller
1732 	 * didn't specifically ask for CLAIM_PREVIOUS (nfs4frlock during
1733 	 * recovery) and the rnode hasn't been marked dead, then install
1734 	 * the delegation stateid in the open stream.  Otherwise, proceed
1735 	 * with a CLAIM_PREVIOUS or CLAIM_NULL OPEN.
1736 	 */
1737 	mutex_enter(&rp->r_statev4_lock);
1738 	if (rp->r_deleg_type != OPEN_DELEGATE_NONE &&
1739 	    !rp->r_deleg_return_pending &&
1740 	    (rp->r_deleg_needs_recovery == OPEN_DELEGATE_NONE) &&
1741 	    !rp->r_deleg_needs_recall &&
1742 	    claim != CLAIM_DELEGATE_CUR && !frc_use_claim_previous &&
1743 	    !(rp->r_flags & R4RECOVERR)) {
1744 		mutex_enter(&osp->os_sync_lock);
1745 		osp->os_delegation = 1;
1746 		osp->open_stateid = rp->r_deleg_stateid;
1747 		mutex_exit(&osp->os_sync_lock);
1748 		mutex_exit(&rp->r_statev4_lock);
1749 		goto bailout;
1750 	}
1751 	mutex_exit(&rp->r_statev4_lock);
1752 
1753 	/*
1754 	 * If the file failed recovery, just quit.  This failure need not
1755 	 * affect other reopens, so don't return an error.
1756 	 */
1757 	mutex_enter(&rp->r_statelock);
1758 	if (rp->r_flags & R4RECOVERR) {
1759 		mutex_exit(&rp->r_statelock);
1760 		ep->error = 0;
1761 		goto failed_reopen;
1762 	}
1763 	mutex_exit(&rp->r_statelock);
1764 
1765 	/*
1766 	 * argop is empty here
1767 	 *
1768 	 * PUTFH, OPEN, GETFH, GETATTR
1769 	 */
1770 	args.ctag = TAG_REOPEN;
1771 	args.array_len = 4;
1772 	args.array = argop;
1773 
1774 	NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
1775 	    "nfs4_reopen: file is type %d, id %s",
1776 	    vp->v_type, rnode4info(VTOR4(vp))));
1777 
1778 	argop[0].argop = OP_CPUTFH;
1779 
1780 	if (claim != CLAIM_PREVIOUS) {
1781 		/*
1782 		 * if this is a file mount then
1783 		 * use the mntinfo parentfh
1784 		 */
1785 		argop[0].nfs_argop4_u.opcputfh.sfh =
1786 		    (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
1787 		    VTOSV(vp)->sv_dfh;
1788 	} else {
1789 		/* putfh fh to reopen */
1790 		argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1791 	}
1792 
1793 	argop[1].argop = OP_COPEN;
1794 	open_args = &argop[1].nfs_argop4_u.opcopen;
1795 	open_args->claim = claim;
1796 
1797 	if (claim == CLAIM_NULL) {
1798 
1799 		if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1800 			nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1801 			    "failed for vp 0x%p for CLAIM_NULL with %m",
1802 			    (void *)vp);
1803 			failed_msg = "Couldn't reopen: vtoname failed for "
1804 			    "CLAIM_NULL";
1805 			/* nothing allocated yet */
1806 			goto kill_file;
1807 		}
1808 
1809 		open_args->open_claim4_u.cfile = fn;
1810 	} else if (claim == CLAIM_PREVIOUS) {
1811 
1812 		/*
1813 		 * We have two cases to deal with here:
1814 		 * 1) We're being called to reopen files in order to satisfy
1815 		 *    a lock operation request which requires us to explicitly
1816 		 *    reopen files which were opened under a delegation.  If
1817 		 *    we're in recovery, we *must* use CLAIM_PREVIOUS.  In
1818 		 *    that case, frc_use_claim_previous is TRUE and we must
1819 		 *    use the rnode's current delegation type (r_deleg_type).
1820 		 * 2) We're reopening files during some form of recovery.
1821 		 *    In this case, frc_use_claim_previous is FALSE and we
1822 		 *    use the delegation type appropriate for recovery
1823 		 *    (r_deleg_needs_recovery).
1824 		 */
1825 		mutex_enter(&rp->r_statev4_lock);
1826 		open_args->open_claim4_u.delegate_type =
1827 		    frc_use_claim_previous ?
1828 		    rp->r_deleg_type :
1829 		    rp->r_deleg_needs_recovery;
1830 		mutex_exit(&rp->r_statev4_lock);
1831 
1832 	} else if (claim == CLAIM_DELEGATE_CUR) {
1833 
1834 		if ((ep->error = vtoname(vp, fn, MAXNAMELEN)) != 0) {
1835 			nfs_cmn_err(ep->error, CE_WARN, "nfs4_reopen: vtoname "
1836 			    "failed for vp 0x%p for CLAIM_DELEGATE_CUR "
1837 			    "with %m", (void *)vp);
1838 			failed_msg = "Couldn't reopen: vtoname failed for "
1839 			    "CLAIM_DELEGATE_CUR";
1840 			/* nothing allocated yet */
1841 			goto kill_file;
1842 		}
1843 
1844 		mutex_enter(&rp->r_statev4_lock);
1845 		open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
1846 		    rp->r_deleg_stateid;
1847 		mutex_exit(&rp->r_statev4_lock);
1848 
1849 		open_args->open_claim4_u.delegate_cur_info.cfile = fn;
1850 	}
1851 	open_args->opentype = OPEN4_NOCREATE;
1852 	open_args->owner.clientid = mi2clientid(mi);
1853 	open_args->owner.owner_len = sizeof (oop->oo_name);
1854 	open_args->owner.owner_val =
1855 	    kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
1856 	bcopy(&oop->oo_name, open_args->owner.owner_val,
1857 	    open_args->owner.owner_len);
1858 	open_args->share_access = 0;
1859 	open_args->share_deny = 0;
1860 
1861 	mutex_enter(&osp->os_sync_lock);
1862 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE, "nfs4_reopen: osp %p rp "
1863 	    "%p: read acc %"PRIu64" write acc %"PRIu64": open ref count %d: "
1864 	    "mmap read %"PRIu64" mmap write %"PRIu64" claim %d ",
1865 	    (void *)osp, (void *)rp, osp->os_share_acc_read,
1866 	    osp->os_share_acc_write, osp->os_open_ref_count,
1867 	    osp->os_mmap_read, osp->os_mmap_write, claim));
1868 
1869 	if (osp->os_share_acc_read || osp->os_mmap_read)
1870 		open_args->share_access |= OPEN4_SHARE_ACCESS_READ;
1871 	if (osp->os_share_acc_write || osp->os_mmap_write)
1872 		open_args->share_access |= OPEN4_SHARE_ACCESS_WRITE;
1873 	if (osp->os_share_deny_read)
1874 		open_args->share_deny |= OPEN4_SHARE_DENY_READ;
1875 	if (osp->os_share_deny_write)
1876 		open_args->share_deny |= OPEN4_SHARE_DENY_WRITE;
1877 	mutex_exit(&osp->os_sync_lock);
1878 
1879 	seqid = nfs4_get_open_seqid(oop) + 1;
1880 	open_args->seqid = seqid;
1881 
1882 	/* Construct the getfh part of the compound */
1883 	argop[2].argop = OP_GETFH;
1884 
1885 	/* Construct the getattr part of the compound */
1886 	argop[3].argop = OP_GETATTR;
1887 	argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1888 	argop[3].nfs_argop4_u.opgetattr.mi = mi;
1889 
1890 	t = gethrtime();
1891 
1892 	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
1893 
1894 	if (ep->error) {
1895 		if (!is_recov && !frc_use_claim_previous &&
1896 		    (ep->error == EINTR || ep->error == ETIMEDOUT ||
1897 		    NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp))) {
1898 			nfs4open_save_lost_rqst(ep->error, &lost_rqst, oop,
1899 			    cred_otw, vp, NULL, open_args);
1900 			abort = nfs4_start_recovery(ep,
1901 			    VTOMI4(vp), vp, NULL, NULL,
1902 			    lost_rqst.lr_op == OP_OPEN ?
1903 			    &lost_rqst : NULL, OP_OPEN, NULL, NULL, NULL);
1904 			nfs4args_copen_free(open_args);
1905 			goto bailout;
1906 		}
1907 
1908 		nfs4args_copen_free(open_args);
1909 
1910 		if (ep->error == EACCES && cred_otw != cr) {
1911 			crfree(cred_otw);
1912 			cred_otw = cr;
1913 			crhold(cred_otw);
1914 			nfs4_end_open_seqid_sync(oop);
1915 			open_owner_rele(oop);
1916 			oop = NULL;
1917 			goto top;
1918 		}
1919 		if (ep->error == ETIMEDOUT)
1920 			goto bailout;
1921 		failed_msg = "Couldn't reopen: rpc error";
1922 		goto kill_file;
1923 	}
1924 
1925 	if (nfs4_need_to_bump_seqid(&res))
1926 		nfs4_set_open_seqid(seqid, oop, args.ctag);
1927 
1928 	switch (res.status) {
1929 	case NFS4_OK:
1930 		if (recov.rs_flags & NFS4_RS_DELAY_MSG) {
1931 			mutex_enter(&rp->r_statelock);
1932 			rp->r_delay_interval = 0;
1933 			mutex_exit(&rp->r_statelock);
1934 		}
1935 		break;
1936 	case NFS4ERR_BAD_SEQID:
1937 		bsep = nfs4_create_bseqid_entry(oop, NULL, vp, 0,
1938 		    args.ctag, open_args->seqid);
1939 
1940 		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
1941 		    NULL, lost_rqst.lr_op == OP_OPEN ? &lost_rqst :
1942 		    NULL, OP_OPEN, bsep, NULL, NULL);
1943 
1944 		nfs4args_copen_free(open_args);
1945 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1946 		nfs4_end_open_seqid_sync(oop);
1947 		open_owner_rele(oop);
1948 		oop = NULL;
1949 		kmem_free(bsep, sizeof (*bsep));
1950 
1951 		goto kill_file;
1952 	case NFS4ERR_NO_GRACE:
1953 		nfs4args_copen_free(open_args);
1954 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1955 		nfs4_end_open_seqid_sync(oop);
1956 		open_owner_rele(oop);
1957 		oop = NULL;
1958 		if (claim == CLAIM_PREVIOUS) {
1959 			/*
1960 			 * Retry as a plain open. We don't need to worry about
1961 			 * checking the changeinfo: it is acceptable for a
1962 			 * client to re-open a file and continue processing
1963 			 * (in the absence of locks).
1964 			 */
1965 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1966 			    "nfs4_reopen: CLAIM_PREVIOUS: NFS4ERR_NO_GRACE; "
1967 			    "will retry as CLAIM_NULL"));
1968 			claim = CLAIM_NULL;
1969 			nfs4_mi_kstat_inc_no_grace(mi);
1970 			goto top;
1971 		}
1972 		failed_msg =
1973 		    "Couldn't reopen: tried reclaim outside grace period. ";
1974 		goto kill_file;
1975 	case NFS4ERR_GRACE:
1976 		nfs4_set_grace_wait(mi);
1977 		nfs4args_copen_free(open_args);
1978 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1979 		nfs4_end_open_seqid_sync(oop);
1980 		open_owner_rele(oop);
1981 		oop = NULL;
1982 		ep->error = nfs4_wait_for_grace(mi, &recov);
1983 		if (ep->error != 0)
1984 			goto bailout;
1985 		goto top;
1986 	case NFS4ERR_DELAY:
1987 		nfs4_set_delay_wait(vp);
1988 		nfs4args_copen_free(open_args);
1989 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1990 		nfs4_end_open_seqid_sync(oop);
1991 		open_owner_rele(oop);
1992 		oop = NULL;
1993 		ep->error = nfs4_wait_for_delay(vp, &recov);
1994 		nfs4_mi_kstat_inc_delay(mi);
1995 		if (ep->error != 0)
1996 			goto bailout;
1997 		goto top;
1998 	case NFS4ERR_FHEXPIRED:
1999 		/* recover filehandle and retry */
2000 		abort = nfs4_start_recovery(ep,
2001 		    mi, vp, NULL, NULL, NULL, OP_OPEN, NULL, NULL, NULL);
2002 		nfs4args_copen_free(open_args);
2003 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2004 		nfs4_end_open_seqid_sync(oop);
2005 		open_owner_rele(oop);
2006 		oop = NULL;
2007 		if (abort == FALSE)
2008 			goto top;
2009 		failed_msg = "Couldn't reopen: recovery aborted";
2010 		goto kill_file;
2011 	case NFS4ERR_RESOURCE:
2012 	case NFS4ERR_STALE_CLIENTID:
2013 	case NFS4ERR_WRONGSEC:
2014 	case NFS4ERR_EXPIRED:
2015 		/*
2016 		 * Do not mark the file dead and let the calling
2017 		 * function initiate recovery.
2018 		 */
2019 		nfs4args_copen_free(open_args);
2020 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2021 		nfs4_end_open_seqid_sync(oop);
2022 		open_owner_rele(oop);
2023 		oop = NULL;
2024 		goto bailout;
2025 	case NFS4ERR_ACCESS:
2026 		if (cred_otw != cr) {
2027 			crfree(cred_otw);
2028 			cred_otw = cr;
2029 			crhold(cred_otw);
2030 			nfs4args_copen_free(open_args);
2031 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2032 			nfs4_end_open_seqid_sync(oop);
2033 			open_owner_rele(oop);
2034 			oop = NULL;
2035 			goto top;
2036 		}
2037 		/* fall through */
2038 	default:
2039 		NFS4_DEBUG(nfs4_client_failover_debug, (CE_NOTE,
2040 		    "nfs4_reopen: r_server 0x%p, mi_curr_serv 0x%p, rnode %s",
2041 		    (void*)VTOR4(vp)->r_server, (void*)mi->mi_curr_serv,
2042 		    rnode4info(VTOR4(vp))));
2043 		failed_msg = "Couldn't reopen: NFSv4 error";
2044 		nfs4args_copen_free(open_args);
2045 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2046 		goto kill_file;
2047 	}
2048 
2049 	resop = &res.array[1];  /* open res */
2050 	op_res = &resop->nfs_resop4_u.opopen;
2051 
2052 	garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2053 
2054 	/*
2055 	 * Check if the path we reopened really is the same
2056 	 * file. We could end up in a situation where the file
2057 	 * was removed and a new file created with the same name.
2058 	 */
2059 	resop = &res.array[2];
2060 	gf_res = &resop->nfs_resop4_u.opgetfh;
2061 	(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2062 	fh_different = (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2063 	if (fh_different) {
2064 		if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2065 		    mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2066 			/* Oops, we don't have the same file */
2067 			if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2068 				failed_msg = "Couldn't reopen: Persistent "
2069 				    "file handle changed";
2070 			else
2071 				failed_msg = "Couldn't reopen: Volatile "
2072 				    "(no expire on open) file handle changed";
2073 
2074 			nfs4args_copen_free(open_args);
2075 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2076 			nfs_rw_exit(&mi->mi_fh_lock);
2077 			goto kill_file;
2078 
2079 		} else {
2080 			/*
2081 			 * We have volatile file handles that don't compare.
2082 			 * If the fids are the same then we assume that the
2083 			 * file handle expired but the rnode still refers to
2084 			 * the same file object.
2085 			 *
2086 			 * First check that we have fids or not.
2087 			 * If we don't we have a dumb server so we will
2088 			 * just assume every thing is ok for now.
2089 			 */
2090 			if (!ep->error && garp->n4g_va.va_mask & AT_NODEID &&
2091 			    rp->r_attr.va_mask & AT_NODEID &&
2092 			    rp->r_attr.va_nodeid != garp->n4g_va.va_nodeid) {
2093 				/*
2094 				 * We have fids, but they don't
2095 				 * compare. So kill the file.
2096 				 */
2097 				failed_msg =
2098 				    "Couldn't reopen: file handle changed"
2099 				    " due to mismatched fids";
2100 				nfs4args_copen_free(open_args);
2101 				(void) xdr_free(xdr_COMPOUND4res_clnt,
2102 				    (caddr_t)&res);
2103 				nfs_rw_exit(&mi->mi_fh_lock);
2104 				goto kill_file;
2105 			} else {
2106 				/*
2107 				 * We have volatile file handles that refers
2108 				 * to the same file (at least they have the
2109 				 * same fid) or we don't have fids so we
2110 				 * can't tell. :(. We'll be a kind and accepting
2111 				 * client so we'll update the rnode's file
2112 				 * handle with the otw handle.
2113 				 *
2114 				 * We need to drop mi->mi_fh_lock since
2115 				 * sh4_update acquires it. Since there is
2116 				 * only one recovery thread there is no
2117 				 * race.
2118 				 */
2119 				nfs_rw_exit(&mi->mi_fh_lock);
2120 				sfh4_update(rp->r_fh, &gf_res->object);
2121 			}
2122 		}
2123 	} else {
2124 		nfs_rw_exit(&mi->mi_fh_lock);
2125 	}
2126 
2127 	ASSERT(nfs4_consistent_type(vp));
2128 
2129 	/*
2130 	 * If the server wanted an OPEN_CONFIRM but that fails, just start
2131 	 * over.  Presumably if there is a persistent error it will show up
2132 	 * when we resend the OPEN.
2133 	 */
2134 	if (op_res->rflags & OPEN4_RESULT_CONFIRM) {
2135 		bool_t retry_open = FALSE;
2136 
2137 		nfs4open_confirm(vp, &seqid, &op_res->stateid,
2138 		    cred_otw, is_recov, &retry_open,
2139 		    oop, FALSE, ep, NULL);
2140 		if (ep->error || ep->stat) {
2141 			nfs4args_copen_free(open_args);
2142 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2143 			nfs4_end_open_seqid_sync(oop);
2144 			open_owner_rele(oop);
2145 			oop = NULL;
2146 			goto top;
2147 		}
2148 	}
2149 
2150 	mutex_enter(&osp->os_sync_lock);
2151 	osp->open_stateid = op_res->stateid;
2152 	osp->os_delegation = 0;
2153 	/*
2154 	 * Need to reset this bitfield for the possible case where we were
2155 	 * going to OTW CLOSE the file, got a non-recoverable error, and before
2156 	 * we could retry the CLOSE, OPENed the file again.
2157 	 */
2158 	ASSERT(osp->os_open_owner->oo_seqid_inuse);
2159 	osp->os_final_close = 0;
2160 	osp->os_force_close = 0;
2161 	if (claim == CLAIM_DELEGATE_CUR || claim == CLAIM_PREVIOUS)
2162 		osp->os_dc_openacc = open_args->share_access;
2163 	mutex_exit(&osp->os_sync_lock);
2164 
2165 	nfs4_end_open_seqid_sync(oop);
2166 
2167 	/* accept delegation, if any */
2168 	nfs4_delegation_accept(rp, claim, op_res, garp, cred_otw);
2169 
2170 	nfs4args_copen_free(open_args);
2171 
2172 	nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
2173 
2174 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2175 
2176 	ASSERT(nfs4_consistent_type(vp));
2177 
2178 	open_owner_rele(oop);
2179 	crfree(cr);
2180 	crfree(cred_otw);
2181 	return;
2182 
2183 kill_file:
2184 	nfs4_fail_recov(vp, failed_msg, ep->error, ep->stat);
2185 failed_reopen:
2186 	NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
2187 	    "nfs4_reopen: setting os_failed_reopen for osp %p, cr %p, rp %s",
2188 	    (void *)osp, (void *)cr, rnode4info(rp)));
2189 	mutex_enter(&osp->os_sync_lock);
2190 	osp->os_failed_reopen = 1;
2191 	mutex_exit(&osp->os_sync_lock);
2192 bailout:
2193 	if (oop != NULL) {
2194 		nfs4_end_open_seqid_sync(oop);
2195 		open_owner_rele(oop);
2196 	}
2197 	if (cr != NULL)
2198 		crfree(cr);
2199 	if (cred_otw != NULL)
2200 		crfree(cred_otw);
2201 }
2202 
2203 /* for . and .. OPENs */
2204 /* ARGSUSED */
2205 static int
2206 nfs4_open_non_reg_file(vnode_t **vpp, int flag, cred_t *cr)
2207 {
2208 	rnode4_t *rp;
2209 	nfs4_ga_res_t gar;
2210 
2211 	ASSERT(nfs_zone() == VTOMI4(*vpp)->mi_zone);
2212 
2213 	/*
2214 	 * If close-to-open consistency checking is turned off or
2215 	 * if there is no cached data, we can avoid
2216 	 * the over the wire getattr.  Otherwise, force a
2217 	 * call to the server to get fresh attributes and to
2218 	 * check caches. This is required for close-to-open
2219 	 * consistency.
2220 	 */
2221 	rp = VTOR4(*vpp);
2222 	if (VTOMI4(*vpp)->mi_flags & MI4_NOCTO ||
2223 	    (rp->r_dir == NULL && !nfs4_has_pages(*vpp)))
2224 		return (0);
2225 
2226 	gar.n4g_va.va_mask = AT_ALL;
2227 	return (nfs4_getattr_otw(*vpp, &gar, cr, 0));
2228 }
2229 
2230 /*
2231  * CLOSE a file
2232  */
2233 /* ARGSUSED */
2234 static int
2235 nfs4_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
2236 	caller_context_t *ct)
2237 {
2238 	rnode4_t	*rp;
2239 	int		 error = 0;
2240 	int		 r_error = 0;
2241 	int		 n4error = 0;
2242 	nfs4_error_t	 e = { 0, NFS4_OK, RPC_SUCCESS };
2243 
2244 	/*
2245 	 * Remove client state for this (lockowner, file) pair.
2246 	 * Issue otw v4 call to have the server do the same.
2247 	 */
2248 
2249 	rp = VTOR4(vp);
2250 
2251 	/*
2252 	 * zone_enter(2) prevents processes from changing zones with NFS files
2253 	 * open; if we happen to get here from the wrong zone we can't do
2254 	 * anything over the wire.
2255 	 */
2256 	if (VTOMI4(vp)->mi_zone != nfs_zone()) {
2257 		/*
2258 		 * We could attempt to clean up locks, except we're sure
2259 		 * that the current process didn't acquire any locks on
2260 		 * the file: any attempt to lock a file belong to another zone
2261 		 * will fail, and one can't lock an NFS file and then change
2262 		 * zones, as that fails too.
2263 		 *
2264 		 * Returning an error here is the sane thing to do.  A
2265 		 * subsequent call to VN_RELE() which translates to a
2266 		 * nfs4_inactive() will clean up state: if the zone of the
2267 		 * vnode's origin is still alive and kicking, the inactive
2268 		 * thread will handle the request (from the correct zone), and
2269 		 * everything (minus the OTW close call) should be OK.  If the
2270 		 * zone is going away nfs4_async_inactive() will throw away
2271 		 * delegations, open streams and cached pages inline.
2272 		 */
2273 		return (EIO);
2274 	}
2275 
2276 	/*
2277 	 * If we are using local locking for this filesystem, then
2278 	 * release all of the SYSV style record locks.  Otherwise,
2279 	 * we are doing network locking and we need to release all
2280 	 * of the network locks.  All of the locks held by this
2281 	 * process on this file are released no matter what the
2282 	 * incoming reference count is.
2283 	 */
2284 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK) {
2285 		cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
2286 		cleanshares(vp, ttoproc(curthread)->p_pid);
2287 	} else
2288 		e.error = nfs4_lockrelease(vp, flag, offset, cr);
2289 
2290 	if (e.error) {
2291 		struct lm_sysid *lmsid;
2292 		lmsid = nfs4_find_sysid(VTOMI4(vp));
2293 		if (lmsid == NULL) {
2294 			DTRACE_PROBE2(unknown__sysid, int, e.error,
2295 			    vnode_t *, vp);
2296 		} else {
2297 			cleanlocks(vp, ttoproc(curthread)->p_pid,
2298 			    (lm_sysidt(lmsid) | LM_SYSID_CLIENT));
2299 		}
2300 		return (e.error);
2301 	}
2302 
2303 	if (count > 1)
2304 		return (0);
2305 
2306 	/*
2307 	 * If the file has been `unlinked', then purge the
2308 	 * DNLC so that this vnode will get reycled quicker
2309 	 * and the .nfs* file on the server will get removed.
2310 	 */
2311 	if (rp->r_unldvp != NULL)
2312 		dnlc_purge_vp(vp);
2313 
2314 	/*
2315 	 * If the file was open for write and there are pages,
2316 	 * do a synchronous flush and commit of all of the
2317 	 * dirty and uncommitted pages.
2318 	 */
2319 	ASSERT(!e.error);
2320 	if ((flag & FWRITE) && nfs4_has_pages(vp))
2321 		error = nfs4_putpage_commit(vp, 0, 0, cr);
2322 
2323 	mutex_enter(&rp->r_statelock);
2324 	r_error = rp->r_error;
2325 	rp->r_error = 0;
2326 	mutex_exit(&rp->r_statelock);
2327 
2328 	/*
2329 	 * If this file type is one for which no explicit 'open' was
2330 	 * done, then bail now (ie. no need for protocol 'close'). If
2331 	 * there was an error w/the vm subsystem, return _that_ error,
2332 	 * otherwise, return any errors that may've been reported via
2333 	 * the rnode.
2334 	 */
2335 	if (vp->v_type != VREG)
2336 		return (error ? error : r_error);
2337 
2338 	/*
2339 	 * The sync putpage commit may have failed above, but since
2340 	 * we're working w/a regular file, we need to do the protocol
2341 	 * 'close' (nfs4close_one will figure out if an otw close is
2342 	 * needed or not). Report any errors _after_ doing the protocol
2343 	 * 'close'.
2344 	 */
2345 	nfs4close_one(vp, NULL, cr, flag, NULL, &e, CLOSE_NORM, 0, 0, 0);
2346 	n4error = e.error ? e.error : geterrno4(e.stat);
2347 
2348 	/*
2349 	 * Error reporting prio (Hi -> Lo)
2350 	 *
2351 	 *   i) nfs4_putpage_commit (error)
2352 	 *  ii) rnode's (r_error)
2353 	 * iii) nfs4close_one (n4error)
2354 	 */
2355 	return (error ? error : (r_error ? r_error : n4error));
2356 }
2357 
2358 /*
2359  * Initialize *lost_rqstp.
2360  */
2361 
2362 static void
2363 nfs4close_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
2364     nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
2365     vnode_t *vp)
2366 {
2367 	if (error != ETIMEDOUT && error != EINTR &&
2368 	    !NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
2369 		lost_rqstp->lr_op = 0;
2370 		return;
2371 	}
2372 
2373 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2374 	    "nfs4close_save_lost_rqst: error %d", error));
2375 
2376 	lost_rqstp->lr_op = OP_CLOSE;
2377 	/*
2378 	 * The vp is held and rele'd via the recovery code.
2379 	 * See nfs4_save_lost_rqst.
2380 	 */
2381 	lost_rqstp->lr_vp = vp;
2382 	lost_rqstp->lr_dvp = NULL;
2383 	lost_rqstp->lr_oop = oop;
2384 	lost_rqstp->lr_osp = osp;
2385 	ASSERT(osp != NULL);
2386 	ASSERT(mutex_owned(&osp->os_sync_lock));
2387 	osp->os_pending_close = 1;
2388 	lost_rqstp->lr_lop = NULL;
2389 	lost_rqstp->lr_cr = cr;
2390 	lost_rqstp->lr_flk = NULL;
2391 	lost_rqstp->lr_putfirst = FALSE;
2392 }
2393 
2394 /*
2395  * Assumes you already have the open seqid sync grabbed as well as the
2396  * 'os_sync_lock'.  Note: this will release the open seqid sync and
2397  * 'os_sync_lock' if client recovery starts.  Calling functions have to
2398  * be prepared to handle this.
2399  *
2400  * 'recov' is returned as 1 if the CLOSE operation detected client recovery
2401  * was needed and was started, and that the calling function should retry
2402  * this function; otherwise it is returned as 0.
2403  *
2404  * Errors are returned via the nfs4_error_t parameter.
2405  */
2406 static void
2407 nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop,
2408     nfs4_open_stream_t *osp, int *recov, int *did_start_seqid_syncp,
2409     nfs4_close_type_t close_type, nfs4_error_t *ep, int *have_sync_lockp)
2410 {
2411 	COMPOUND4args_clnt args;
2412 	COMPOUND4res_clnt res;
2413 	CLOSE4args *close_args;
2414 	nfs_resop4 *resop;
2415 	nfs_argop4 argop[3];
2416 	int doqueue = 1;
2417 	mntinfo4_t *mi;
2418 	seqid4 seqid;
2419 	vnode_t *vp;
2420 	bool_t needrecov = FALSE;
2421 	nfs4_lost_rqst_t lost_rqst;
2422 	hrtime_t t;
2423 
2424 	ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
2425 
2426 	ASSERT(MUTEX_HELD(&osp->os_sync_lock));
2427 
2428 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw"));
2429 
2430 	/* Only set this to 1 if recovery is started */
2431 	*recov = 0;
2432 
2433 	/* do the OTW call to close the file */
2434 
2435 	if (close_type == CLOSE_RESEND)
2436 		args.ctag = TAG_CLOSE_LOST;
2437 	else if (close_type == CLOSE_AFTER_RESEND)
2438 		args.ctag = TAG_CLOSE_UNDO;
2439 	else
2440 		args.ctag = TAG_CLOSE;
2441 
2442 	args.array_len = 3;
2443 	args.array = argop;
2444 
2445 	vp = RTOV4(rp);
2446 
2447 	mi = VTOMI4(vp);
2448 
2449 	/* putfh target fh */
2450 	argop[0].argop = OP_CPUTFH;
2451 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
2452 
2453 	argop[1].argop = OP_GETATTR;
2454 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
2455 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
2456 
2457 	argop[2].argop = OP_CLOSE;
2458 	close_args = &argop[2].nfs_argop4_u.opclose;
2459 
2460 	seqid = nfs4_get_open_seqid(oop) + 1;
2461 
2462 	close_args->seqid = seqid;
2463 	close_args->open_stateid = osp->open_stateid;
2464 
2465 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
2466 	    "nfs4close_otw: %s call, rp %s", needrecov ? "recov" : "first",
2467 	    rnode4info(rp)));
2468 
2469 	t = gethrtime();
2470 
2471 	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
2472 
2473 	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
2474 		nfs4_set_open_seqid(seqid, oop, args.ctag);
2475 	}
2476 
2477 	needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
2478 	if (ep->error && !needrecov) {
2479 		/*
2480 		 * if there was an error and no recovery is to be done
2481 		 * then then set up the file to flush its cache if
2482 		 * needed for the next caller.
2483 		 */
2484 		mutex_enter(&rp->r_statelock);
2485 		PURGE_ATTRCACHE4_LOCKED(rp);
2486 		rp->r_flags &= ~R4WRITEMODIFIED;
2487 		mutex_exit(&rp->r_statelock);
2488 		return;
2489 	}
2490 
2491 	if (needrecov) {
2492 		bool_t abort;
2493 		nfs4_bseqid_entry_t *bsep = NULL;
2494 
2495 		if (close_type != CLOSE_RESEND)
2496 			nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
2497 			    osp, cred_otw, vp);
2498 
2499 		if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
2500 			bsep = nfs4_create_bseqid_entry(oop, NULL, vp,
2501 			    0, args.ctag, close_args->seqid);
2502 
2503 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
2504 		    "nfs4close_otw: initiating recovery. error %d "
2505 		    "res.status %d", ep->error, res.status));
2506 
2507 		/*
2508 		 * Drop the 'os_sync_lock' here so we don't hit
2509 		 * a potential recursive mutex_enter via an
2510 		 * 'open_stream_hold()'.
2511 		 */
2512 		mutex_exit(&osp->os_sync_lock);
2513 		*have_sync_lockp = 0;
2514 		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
2515 		    (close_type != CLOSE_RESEND &&
2516 		    lost_rqst.lr_op == OP_CLOSE) ? &lost_rqst : NULL,
2517 		    OP_CLOSE, bsep, NULL, NULL);
2518 
2519 		/* drop open seq sync, and let the calling function regrab it */
2520 		nfs4_end_open_seqid_sync(oop);
2521 		*did_start_seqid_syncp = 0;
2522 
2523 		if (bsep)
2524 			kmem_free(bsep, sizeof (*bsep));
2525 		/*
2526 		 * For signals, the caller wants to quit, so don't say to
2527 		 * retry.  For forced unmount, if it's a user thread, it
2528 		 * wants to quit.  If it's a recovery thread, the retry
2529 		 * will happen higher-up on the call stack.  Either way,
2530 		 * don't say to retry.
2531 		 */
2532 		if (abort == FALSE && ep->error != EINTR &&
2533 		    !NFS4_FRC_UNMT_ERR(ep->error, mi->mi_vfsp) &&
2534 		    close_type != CLOSE_RESEND &&
2535 		    close_type != CLOSE_AFTER_RESEND)
2536 			*recov = 1;
2537 		else
2538 			*recov = 0;
2539 
2540 		if (!ep->error)
2541 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2542 		return;
2543 	}
2544 
2545 	if (res.status) {
2546 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2547 		return;
2548 	}
2549 
2550 	mutex_enter(&rp->r_statev4_lock);
2551 	rp->created_v4 = 0;
2552 	mutex_exit(&rp->r_statev4_lock);
2553 
2554 	resop = &res.array[2];
2555 	osp->open_stateid = resop->nfs_resop4_u.opclose.open_stateid;
2556 	osp->os_valid = 0;
2557 
2558 	/*
2559 	 * This removes the reference obtained at OPEN; ie, when the
2560 	 * open stream structure was created.
2561 	 *
2562 	 * We don't have to worry about calling 'open_stream_rele'
2563 	 * since we our currently holding a reference to the open
2564 	 * stream which means the count cannot go to 0 with this
2565 	 * decrement.
2566 	 */
2567 	ASSERT(osp->os_ref_count >= 2);
2568 	osp->os_ref_count--;
2569 
2570 	if (!ep->error)
2571 		nfs4_attr_cache(vp,
2572 		    &res.array[1].nfs_resop4_u.opgetattr.ga_res,
2573 		    t, cred_otw, TRUE, NULL);
2574 
2575 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "nfs4close_otw:"
2576 	    " returning %d", ep->error));
2577 
2578 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2579 }
2580 
2581 /* ARGSUSED */
2582 static int
2583 nfs4_read(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2584     caller_context_t *ct)
2585 {
2586 	rnode4_t *rp;
2587 	u_offset_t off;
2588 	offset_t diff;
2589 	uint_t on;
2590 	uint_t n;
2591 	caddr_t base;
2592 	uint_t flags;
2593 	int error;
2594 	mntinfo4_t *mi;
2595 
2596 	rp = VTOR4(vp);
2597 
2598 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
2599 
2600 	if (IS_SHADOW(vp, rp))
2601 		vp = RTOV4(rp);
2602 
2603 	if (vp->v_type != VREG)
2604 		return (EISDIR);
2605 
2606 	mi = VTOMI4(vp);
2607 
2608 	if (nfs_zone() != mi->mi_zone)
2609 		return (EIO);
2610 
2611 	if (uiop->uio_resid == 0)
2612 		return (0);
2613 
2614 	if (uiop->uio_loffset < 0 || uiop->uio_loffset + uiop->uio_resid < 0)
2615 		return (EINVAL);
2616 
2617 	mutex_enter(&rp->r_statelock);
2618 	if (rp->r_flags & R4RECOVERRP)
2619 		error = (rp->r_error ? rp->r_error : EIO);
2620 	else
2621 		error = 0;
2622 	mutex_exit(&rp->r_statelock);
2623 	if (error)
2624 		return (error);
2625 
2626 	/*
2627 	 * Bypass VM if caching has been disabled (e.g., locking) or if
2628 	 * using client-side direct I/O and the file is not mmap'd and
2629 	 * there are no cached pages.
2630 	 */
2631 	if ((vp->v_flag & VNOCACHE) ||
2632 	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2633 	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
2634 		size_t resid = 0;
2635 
2636 		return (nfs4read(vp, NULL, uiop->uio_loffset,
2637 		    uiop->uio_resid, &resid, cr, FALSE, uiop));
2638 	}
2639 
2640 	error = 0;
2641 
2642 	do {
2643 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2644 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2645 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
2646 
2647 		if (error = nfs4_validate_caches(vp, cr))
2648 			break;
2649 
2650 		mutex_enter(&rp->r_statelock);
2651 		while (rp->r_flags & R4INCACHEPURGE) {
2652 			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
2653 				mutex_exit(&rp->r_statelock);
2654 				return (EINTR);
2655 			}
2656 		}
2657 		diff = rp->r_size - uiop->uio_loffset;
2658 		mutex_exit(&rp->r_statelock);
2659 		if (diff <= 0)
2660 			break;
2661 		if (diff < n)
2662 			n = (uint_t)diff;
2663 
2664 		if (vpm_enable) {
2665 			/*
2666 			 * Copy data.
2667 			 */
2668 			error = vpm_data_copy(vp, off + on, n, uiop,
2669 			    1, NULL, 0, S_READ);
2670 		} else {
2671 			base = segmap_getmapflt(segkmap, vp, off + on, n, 1,
2672 			    S_READ);
2673 
2674 			error = uiomove(base + on, n, UIO_READ, uiop);
2675 		}
2676 
2677 		if (!error) {
2678 			/*
2679 			 * If read a whole block or read to eof,
2680 			 * won't need this buffer again soon.
2681 			 */
2682 			mutex_enter(&rp->r_statelock);
2683 			if (n + on == MAXBSIZE ||
2684 			    uiop->uio_loffset == rp->r_size)
2685 				flags = SM_DONTNEED;
2686 			else
2687 				flags = 0;
2688 			mutex_exit(&rp->r_statelock);
2689 			if (vpm_enable) {
2690 				error = vpm_sync_pages(vp, off, n, flags);
2691 			} else {
2692 				error = segmap_release(segkmap, base, flags);
2693 			}
2694 		} else {
2695 			if (vpm_enable) {
2696 				(void) vpm_sync_pages(vp, off, n, 0);
2697 			} else {
2698 				(void) segmap_release(segkmap, base, 0);
2699 			}
2700 		}
2701 	} while (!error && uiop->uio_resid > 0);
2702 
2703 	return (error);
2704 }
2705 
2706 /* ARGSUSED */
2707 static int
2708 nfs4_write(vnode_t *vp, struct uio *uiop, int ioflag, cred_t *cr,
2709     caller_context_t *ct)
2710 {
2711 	rlim64_t limit = uiop->uio_llimit;
2712 	rnode4_t *rp;
2713 	u_offset_t off;
2714 	caddr_t base;
2715 	uint_t flags;
2716 	int remainder;
2717 	size_t n;
2718 	int on;
2719 	int error;
2720 	int resid;
2721 	u_offset_t offset;
2722 	mntinfo4_t *mi;
2723 	uint_t bsize;
2724 
2725 	rp = VTOR4(vp);
2726 
2727 	if (IS_SHADOW(vp, rp))
2728 		vp = RTOV4(rp);
2729 
2730 	if (vp->v_type != VREG)
2731 		return (EISDIR);
2732 
2733 	mi = VTOMI4(vp);
2734 
2735 	if (nfs_zone() != mi->mi_zone)
2736 		return (EIO);
2737 
2738 	if (uiop->uio_resid == 0)
2739 		return (0);
2740 
2741 	mutex_enter(&rp->r_statelock);
2742 	if (rp->r_flags & R4RECOVERRP)
2743 		error = (rp->r_error ? rp->r_error : EIO);
2744 	else
2745 		error = 0;
2746 	mutex_exit(&rp->r_statelock);
2747 	if (error)
2748 		return (error);
2749 
2750 	if (ioflag & FAPPEND) {
2751 		struct vattr va;
2752 
2753 		/*
2754 		 * Must serialize if appending.
2755 		 */
2756 		if (nfs_rw_lock_held(&rp->r_rwlock, RW_READER)) {
2757 			nfs_rw_exit(&rp->r_rwlock);
2758 			if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER,
2759 			    INTR4(vp)))
2760 				return (EINTR);
2761 		}
2762 
2763 		va.va_mask = AT_SIZE;
2764 		error = nfs4getattr(vp, &va, cr);
2765 		if (error)
2766 			return (error);
2767 		uiop->uio_loffset = va.va_size;
2768 	}
2769 
2770 	offset = uiop->uio_loffset + uiop->uio_resid;
2771 
2772 	if (uiop->uio_loffset < (offset_t)0 || offset < 0)
2773 		return (EINVAL);
2774 
2775 	if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
2776 		limit = MAXOFFSET_T;
2777 
2778 	/*
2779 	 * Check to make sure that the process will not exceed
2780 	 * its limit on file size.  It is okay to write up to
2781 	 * the limit, but not beyond.  Thus, the write which
2782 	 * reaches the limit will be short and the next write
2783 	 * will return an error.
2784 	 */
2785 	remainder = 0;
2786 	if (offset > uiop->uio_llimit) {
2787 		remainder = offset - uiop->uio_llimit;
2788 		uiop->uio_resid = uiop->uio_llimit - uiop->uio_loffset;
2789 		if (uiop->uio_resid <= 0) {
2790 			proc_t *p = ttoproc(curthread);
2791 
2792 			uiop->uio_resid += remainder;
2793 			mutex_enter(&p->p_lock);
2794 			(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE],
2795 			    p->p_rctls, p, RCA_UNSAFE_SIGINFO);
2796 			mutex_exit(&p->p_lock);
2797 			return (EFBIG);
2798 		}
2799 	}
2800 
2801 	/* update the change attribute, if we have a write delegation */
2802 
2803 	mutex_enter(&rp->r_statev4_lock);
2804 	if (rp->r_deleg_type == OPEN_DELEGATE_WRITE)
2805 		rp->r_deleg_change++;
2806 
2807 	mutex_exit(&rp->r_statev4_lock);
2808 
2809 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp)))
2810 		return (EINTR);
2811 
2812 	/*
2813 	 * Bypass VM if caching has been disabled (e.g., locking) or if
2814 	 * using client-side direct I/O and the file is not mmap'd and
2815 	 * there are no cached pages.
2816 	 */
2817 	if ((vp->v_flag & VNOCACHE) ||
2818 	    (((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO)) &&
2819 	    rp->r_mapcnt == 0 && rp->r_inmap == 0 && !nfs4_has_pages(vp))) {
2820 		size_t bufsize;
2821 		int count;
2822 		u_offset_t org_offset;
2823 		stable_how4 stab_comm;
2824 nfs4_fwrite:
2825 		if (rp->r_flags & R4STALE) {
2826 			resid = uiop->uio_resid;
2827 			offset = uiop->uio_loffset;
2828 			error = rp->r_error;
2829 			/*
2830 			 * A close may have cleared r_error, if so,
2831 			 * propagate ESTALE error return properly
2832 			 */
2833 			if (error == 0)
2834 				error = ESTALE;
2835 			goto bottom;
2836 		}
2837 
2838 		bufsize = MIN(uiop->uio_resid, mi->mi_stsize);
2839 		base = kmem_alloc(bufsize, KM_SLEEP);
2840 		do {
2841 			if (ioflag & FDSYNC)
2842 				stab_comm = DATA_SYNC4;
2843 			else
2844 				stab_comm = FILE_SYNC4;
2845 			resid = uiop->uio_resid;
2846 			offset = uiop->uio_loffset;
2847 			count = MIN(uiop->uio_resid, bufsize);
2848 			org_offset = uiop->uio_loffset;
2849 			error = uiomove(base, count, UIO_WRITE, uiop);
2850 			if (!error) {
2851 				error = nfs4write(vp, base, org_offset,
2852 				    count, cr, &stab_comm);
2853 				if (!error) {
2854 					mutex_enter(&rp->r_statelock);
2855 					if (rp->r_size < uiop->uio_loffset)
2856 						rp->r_size = uiop->uio_loffset;
2857 					mutex_exit(&rp->r_statelock);
2858 				}
2859 			}
2860 		} while (!error && uiop->uio_resid > 0);
2861 		kmem_free(base, bufsize);
2862 		goto bottom;
2863 	}
2864 
2865 	bsize = vp->v_vfsp->vfs_bsize;
2866 
2867 	do {
2868 		off = uiop->uio_loffset & MAXBMASK; /* mapping offset */
2869 		on = uiop->uio_loffset & MAXBOFFSET; /* Relative offset */
2870 		n = MIN(MAXBSIZE - on, uiop->uio_resid);
2871 
2872 		resid = uiop->uio_resid;
2873 		offset = uiop->uio_loffset;
2874 
2875 		if (rp->r_flags & R4STALE) {
2876 			error = rp->r_error;
2877 			/*
2878 			 * A close may have cleared r_error, if so,
2879 			 * propagate ESTALE error return properly
2880 			 */
2881 			if (error == 0)
2882 				error = ESTALE;
2883 			break;
2884 		}
2885 
2886 		/*
2887 		 * Don't create dirty pages faster than they
2888 		 * can be cleaned so that the system doesn't
2889 		 * get imbalanced.  If the async queue is
2890 		 * maxed out, then wait for it to drain before
2891 		 * creating more dirty pages.  Also, wait for
2892 		 * any threads doing pagewalks in the vop_getattr
2893 		 * entry points so that they don't block for
2894 		 * long periods.
2895 		 */
2896 		mutex_enter(&rp->r_statelock);
2897 		while ((mi->mi_max_threads != 0 &&
2898 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
2899 		    rp->r_gcount > 0) {
2900 			if (INTR4(vp)) {
2901 				klwp_t *lwp = ttolwp(curthread);
2902 
2903 				if (lwp != NULL)
2904 					lwp->lwp_nostop++;
2905 				if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
2906 					mutex_exit(&rp->r_statelock);
2907 					if (lwp != NULL)
2908 						lwp->lwp_nostop--;
2909 					error = EINTR;
2910 					goto bottom;
2911 				}
2912 				if (lwp != NULL)
2913 					lwp->lwp_nostop--;
2914 			} else
2915 				cv_wait(&rp->r_cv, &rp->r_statelock);
2916 		}
2917 		mutex_exit(&rp->r_statelock);
2918 
2919 		/*
2920 		 * Touch the page and fault it in if it is not in core
2921 		 * before segmap_getmapflt or vpm_data_copy can lock it.
2922 		 * This is to avoid the deadlock if the buffer is mapped
2923 		 * to the same file through mmap which we want to write.
2924 		 */
2925 		uio_prefaultpages((long)n, uiop);
2926 
2927 		if (vpm_enable) {
2928 			/*
2929 			 * It will use kpm mappings, so no need to
2930 			 * pass an address.
2931 			 */
2932 			error = writerp4(rp, NULL, n, uiop, 0);
2933 		} else  {
2934 			if (segmap_kpm) {
2935 				int pon = uiop->uio_loffset & PAGEOFFSET;
2936 				size_t pn = MIN(PAGESIZE - pon,
2937 				    uiop->uio_resid);
2938 				int pagecreate;
2939 
2940 				mutex_enter(&rp->r_statelock);
2941 				pagecreate = (pon == 0) && (pn == PAGESIZE ||
2942 				    uiop->uio_loffset + pn >= rp->r_size);
2943 				mutex_exit(&rp->r_statelock);
2944 
2945 				base = segmap_getmapflt(segkmap, vp, off + on,
2946 				    pn, !pagecreate, S_WRITE);
2947 
2948 				error = writerp4(rp, base + pon, n, uiop,
2949 				    pagecreate);
2950 
2951 			} else {
2952 				base = segmap_getmapflt(segkmap, vp, off + on,
2953 				    n, 0, S_READ);
2954 				error = writerp4(rp, base + on, n, uiop, 0);
2955 			}
2956 		}
2957 
2958 		if (!error) {
2959 			if (mi->mi_flags & MI4_NOAC)
2960 				flags = SM_WRITE;
2961 			else if ((uiop->uio_loffset % bsize) == 0 ||
2962 			    IS_SWAPVP(vp)) {
2963 				/*
2964 				 * Have written a whole block.
2965 				 * Start an asynchronous write
2966 				 * and mark the buffer to
2967 				 * indicate that it won't be
2968 				 * needed again soon.
2969 				 */
2970 				flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
2971 			} else
2972 				flags = 0;
2973 			if ((ioflag & (FSYNC|FDSYNC)) ||
2974 			    (rp->r_flags & R4OUTOFSPACE)) {
2975 				flags &= ~SM_ASYNC;
2976 				flags |= SM_WRITE;
2977 			}
2978 			if (vpm_enable) {
2979 				error = vpm_sync_pages(vp, off, n, flags);
2980 			} else {
2981 				error = segmap_release(segkmap, base, flags);
2982 			}
2983 		} else {
2984 			if (vpm_enable) {
2985 				(void) vpm_sync_pages(vp, off, n, 0);
2986 			} else {
2987 				(void) segmap_release(segkmap, base, 0);
2988 			}
2989 			/*
2990 			 * In the event that we got an access error while
2991 			 * faulting in a page for a write-only file just
2992 			 * force a write.
2993 			 */
2994 			if (error == EACCES)
2995 				goto nfs4_fwrite;
2996 		}
2997 	} while (!error && uiop->uio_resid > 0);
2998 
2999 bottom:
3000 	if (error) {
3001 		uiop->uio_resid = resid + remainder;
3002 		uiop->uio_loffset = offset;
3003 	} else {
3004 		uiop->uio_resid += remainder;
3005 
3006 		mutex_enter(&rp->r_statev4_lock);
3007 		if (rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
3008 			gethrestime(&rp->r_attr.va_mtime);
3009 			rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3010 		}
3011 		mutex_exit(&rp->r_statev4_lock);
3012 	}
3013 
3014 	nfs_rw_exit(&rp->r_lkserlock);
3015 
3016 	return (error);
3017 }
3018 
3019 /*
3020  * Flags are composed of {B_ASYNC, B_INVAL, B_FREE, B_DONTNEED}
3021  */
3022 static int
3023 nfs4_rdwrlbn(vnode_t *vp, page_t *pp, u_offset_t off, size_t len,
3024     int flags, cred_t *cr)
3025 {
3026 	struct buf *bp;
3027 	int error;
3028 	page_t *savepp;
3029 	uchar_t fsdata;
3030 	stable_how4 stab_comm;
3031 
3032 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
3033 	bp = pageio_setup(pp, len, vp, flags);
3034 	ASSERT(bp != NULL);
3035 
3036 	/*
3037 	 * pageio_setup should have set b_addr to 0.  This
3038 	 * is correct since we want to do I/O on a page
3039 	 * boundary.  bp_mapin will use this addr to calculate
3040 	 * an offset, and then set b_addr to the kernel virtual
3041 	 * address it allocated for us.
3042 	 */
3043 	ASSERT(bp->b_un.b_addr == 0);
3044 
3045 	bp->b_edev = 0;
3046 	bp->b_dev = 0;
3047 	bp->b_lblkno = lbtodb(off);
3048 	bp->b_file = vp;
3049 	bp->b_offset = (offset_t)off;
3050 	bp_mapin(bp);
3051 
3052 	if ((flags & (B_WRITE|B_ASYNC)) == (B_WRITE|B_ASYNC) &&
3053 	    freemem > desfree)
3054 		stab_comm = UNSTABLE4;
3055 	else
3056 		stab_comm = FILE_SYNC4;
3057 
3058 	error = nfs4_bio(bp, &stab_comm, cr, FALSE);
3059 
3060 	bp_mapout(bp);
3061 	pageio_done(bp);
3062 
3063 	if (stab_comm == UNSTABLE4)
3064 		fsdata = C_DELAYCOMMIT;
3065 	else
3066 		fsdata = C_NOCOMMIT;
3067 
3068 	savepp = pp;
3069 	do {
3070 		pp->p_fsdata = fsdata;
3071 	} while ((pp = pp->p_next) != savepp);
3072 
3073 	return (error);
3074 }
3075 
3076 /*
3077  */
3078 static int
3079 nfs4rdwr_check_osid(vnode_t *vp, nfs4_error_t *ep, cred_t *cr)
3080 {
3081 	nfs4_open_owner_t	*oop;
3082 	nfs4_open_stream_t	*osp;
3083 	rnode4_t		*rp = VTOR4(vp);
3084 	mntinfo4_t 		*mi = VTOMI4(vp);
3085 	int 			reopen_needed;
3086 
3087 	ASSERT(nfs_zone() == mi->mi_zone);
3088 
3089 
3090 	oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
3091 	if (!oop)
3092 		return (EIO);
3093 
3094 	/* returns with 'os_sync_lock' held */
3095 	osp = find_open_stream(oop, rp);
3096 	if (!osp) {
3097 		open_owner_rele(oop);
3098 		return (EIO);
3099 	}
3100 
3101 	if (osp->os_failed_reopen) {
3102 		mutex_exit(&osp->os_sync_lock);
3103 		open_stream_rele(osp, rp);
3104 		open_owner_rele(oop);
3105 		return (EIO);
3106 	}
3107 
3108 	/*
3109 	 * Determine whether a reopen is needed.  If this
3110 	 * is a delegation open stream, then the os_delegation bit
3111 	 * should be set.
3112 	 */
3113 
3114 	reopen_needed = osp->os_delegation;
3115 
3116 	mutex_exit(&osp->os_sync_lock);
3117 	open_owner_rele(oop);
3118 
3119 	if (reopen_needed) {
3120 		nfs4_error_zinit(ep);
3121 		nfs4_reopen(vp, osp, ep, CLAIM_NULL, FALSE, FALSE);
3122 		mutex_enter(&osp->os_sync_lock);
3123 		if (ep->error || ep->stat || osp->os_failed_reopen) {
3124 			mutex_exit(&osp->os_sync_lock);
3125 			open_stream_rele(osp, rp);
3126 			return (EIO);
3127 		}
3128 		mutex_exit(&osp->os_sync_lock);
3129 	}
3130 	open_stream_rele(osp, rp);
3131 
3132 	return (0);
3133 }
3134 
3135 /*
3136  * Write to file.  Writes to remote server in largest size
3137  * chunks that the server can handle.  Write is synchronous.
3138  */
3139 static int
3140 nfs4write(vnode_t *vp, caddr_t base, u_offset_t offset, int count, cred_t *cr,
3141     stable_how4 *stab_comm)
3142 {
3143 	mntinfo4_t *mi;
3144 	COMPOUND4args_clnt args;
3145 	COMPOUND4res_clnt res;
3146 	WRITE4args *wargs;
3147 	WRITE4res *wres;
3148 	nfs_argop4 argop[2];
3149 	nfs_resop4 *resop;
3150 	int tsize;
3151 	stable_how4 stable;
3152 	rnode4_t *rp;
3153 	int doqueue = 1;
3154 	bool_t needrecov;
3155 	nfs4_recov_state_t recov_state;
3156 	nfs4_stateid_types_t sid_types;
3157 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3158 	int recov;
3159 
3160 	rp = VTOR4(vp);
3161 	mi = VTOMI4(vp);
3162 
3163 	ASSERT(nfs_zone() == mi->mi_zone);
3164 
3165 	stable = *stab_comm;
3166 	*stab_comm = FILE_SYNC4;
3167 
3168 	needrecov = FALSE;
3169 	recov_state.rs_flags = 0;
3170 	recov_state.rs_num_retry_despite_err = 0;
3171 	nfs4_init_stateid_types(&sid_types);
3172 
3173 	/* Is curthread the recovery thread? */
3174 	mutex_enter(&mi->mi_lock);
3175 	recov = (mi->mi_recovthread == curthread);
3176 	mutex_exit(&mi->mi_lock);
3177 
3178 recov_retry:
3179 	args.ctag = TAG_WRITE;
3180 	args.array_len = 2;
3181 	args.array = argop;
3182 
3183 	if (!recov) {
3184 		e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3185 		    &recov_state, NULL);
3186 		if (e.error)
3187 			return (e.error);
3188 	}
3189 
3190 	/* 0. putfh target fh */
3191 	argop[0].argop = OP_CPUTFH;
3192 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3193 
3194 	/* 1. write */
3195 	nfs4args_write(&argop[1], stable, rp, cr, &wargs, &sid_types);
3196 
3197 	do {
3198 
3199 		wargs->offset = (offset4)offset;
3200 		wargs->data_val = base;
3201 
3202 		if (mi->mi_io_kstats) {
3203 			mutex_enter(&mi->mi_lock);
3204 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3205 			mutex_exit(&mi->mi_lock);
3206 		}
3207 
3208 		if ((vp->v_flag & VNOCACHE) ||
3209 		    (rp->r_flags & R4DIRECTIO) ||
3210 		    (mi->mi_flags & MI4_DIRECTIO))
3211 			tsize = MIN(mi->mi_stsize, count);
3212 		else
3213 			tsize = MIN(mi->mi_curwrite, count);
3214 		wargs->data_len = (uint_t)tsize;
3215 		rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3216 
3217 		if (mi->mi_io_kstats) {
3218 			mutex_enter(&mi->mi_lock);
3219 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3220 			mutex_exit(&mi->mi_lock);
3221 		}
3222 
3223 		if (!recov) {
3224 			needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3225 			if (e.error && !needrecov) {
3226 				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3227 				    &recov_state, needrecov);
3228 				return (e.error);
3229 			}
3230 		} else {
3231 			if (e.error)
3232 				return (e.error);
3233 		}
3234 
3235 		/*
3236 		 * Do handling of OLD_STATEID outside
3237 		 * of the normal recovery framework.
3238 		 *
3239 		 * If write receives a BAD stateid error while using a
3240 		 * delegation stateid, retry using the open stateid (if it
3241 		 * exists).  If it doesn't have an open stateid, reopen the
3242 		 * file first, then retry.
3243 		 */
3244 		if (!e.error && res.status == NFS4ERR_OLD_STATEID &&
3245 		    sid_types.cur_sid_type != SPEC_SID) {
3246 			nfs4_save_stateid(&wargs->stateid, &sid_types);
3247 			if (!recov)
3248 				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3249 				    &recov_state, needrecov);
3250 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3251 			goto recov_retry;
3252 		} else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
3253 		    sid_types.cur_sid_type == DEL_SID) {
3254 			nfs4_save_stateid(&wargs->stateid, &sid_types);
3255 			mutex_enter(&rp->r_statev4_lock);
3256 			rp->r_deleg_return_pending = TRUE;
3257 			mutex_exit(&rp->r_statev4_lock);
3258 			if (nfs4rdwr_check_osid(vp, &e, cr)) {
3259 				if (!recov)
3260 					nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3261 					    &recov_state, needrecov);
3262 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3263 				    (caddr_t)&res);
3264 				return (EIO);
3265 			}
3266 			if (!recov)
3267 				nfs4_end_fop(mi, vp, NULL, OH_WRITE,
3268 				    &recov_state, needrecov);
3269 			/* hold needed for nfs4delegreturn_thread */
3270 			VN_HOLD(vp);
3271 			nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
3272 			    NFS4_DR_DISCARD), FALSE);
3273 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3274 			goto recov_retry;
3275 		}
3276 
3277 		if (needrecov) {
3278 			bool_t abort;
3279 
3280 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3281 			    "nfs4write: client got error %d, res.status %d"
3282 			    ", so start recovery", e.error, res.status));
3283 
3284 			abort = nfs4_start_recovery(&e,
3285 			    VTOMI4(vp), vp, NULL, &wargs->stateid,
3286 			    NULL, OP_WRITE, NULL, NULL, NULL);
3287 			if (!e.error) {
3288 				e.error = geterrno4(res.status);
3289 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3290 				    (caddr_t)&res);
3291 			}
3292 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3293 			    &recov_state, needrecov);
3294 			if (abort == FALSE)
3295 				goto recov_retry;
3296 			return (e.error);
3297 		}
3298 
3299 		if (res.status) {
3300 			e.error = geterrno4(res.status);
3301 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3302 			if (!recov)
3303 				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3304 				    &recov_state, needrecov);
3305 			return (e.error);
3306 		}
3307 
3308 		resop = &res.array[1];	/* write res */
3309 		wres = &resop->nfs_resop4_u.opwrite;
3310 
3311 		if ((int)wres->count > tsize) {
3312 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3313 
3314 			zcmn_err(getzoneid(), CE_WARN,
3315 			    "nfs4write: server wrote %u, requested was %u",
3316 			    (int)wres->count, tsize);
3317 			if (!recov)
3318 				nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE,
3319 				    &recov_state, needrecov);
3320 			return (EIO);
3321 		}
3322 		if (wres->committed == UNSTABLE4) {
3323 			*stab_comm = UNSTABLE4;
3324 			if (wargs->stable == DATA_SYNC4 ||
3325 			    wargs->stable == FILE_SYNC4) {
3326 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3327 				    (caddr_t)&res);
3328 				zcmn_err(getzoneid(), CE_WARN,
3329 				    "nfs4write: server %s did not commit "
3330 				    "to stable storage",
3331 				    rp->r_server->sv_hostname);
3332 				if (!recov)
3333 					nfs4_end_fop(VTOMI4(vp), vp, NULL,
3334 					    OH_WRITE, &recov_state, needrecov);
3335 				return (EIO);
3336 			}
3337 		}
3338 
3339 		tsize = (int)wres->count;
3340 		count -= tsize;
3341 		base += tsize;
3342 		offset += tsize;
3343 		if (mi->mi_io_kstats) {
3344 			mutex_enter(&mi->mi_lock);
3345 			KSTAT_IO_PTR(mi->mi_io_kstats)->writes++;
3346 			KSTAT_IO_PTR(mi->mi_io_kstats)->nwritten +=
3347 			    tsize;
3348 			mutex_exit(&mi->mi_lock);
3349 		}
3350 		lwp_stat_update(LWP_STAT_OUBLK, 1);
3351 		mutex_enter(&rp->r_statelock);
3352 		if (rp->r_flags & R4HAVEVERF) {
3353 			if (rp->r_writeverf != wres->writeverf) {
3354 				nfs4_set_mod(vp);
3355 				rp->r_writeverf = wres->writeverf;
3356 			}
3357 		} else {
3358 			rp->r_writeverf = wres->writeverf;
3359 			rp->r_flags |= R4HAVEVERF;
3360 		}
3361 		PURGE_ATTRCACHE4_LOCKED(rp);
3362 		rp->r_flags |= R4WRITEMODIFIED;
3363 		gethrestime(&rp->r_attr.va_mtime);
3364 		rp->r_attr.va_ctime = rp->r_attr.va_mtime;
3365 		mutex_exit(&rp->r_statelock);
3366 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3367 	} while (count);
3368 
3369 	if (!recov)
3370 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_WRITE, &recov_state,
3371 		    needrecov);
3372 
3373 	return (e.error);
3374 }
3375 
3376 /*
3377  * Read from a file.  Reads data in largest chunks our interface can handle.
3378  */
3379 static int
3380 nfs4read(vnode_t *vp, caddr_t base, offset_t offset, int count,
3381     size_t *residp, cred_t *cr, bool_t async, struct uio *uiop)
3382 {
3383 	mntinfo4_t *mi;
3384 	COMPOUND4args_clnt args;
3385 	COMPOUND4res_clnt res;
3386 	READ4args *rargs;
3387 	nfs_argop4 argop[2];
3388 	int tsize;
3389 	int doqueue;
3390 	rnode4_t *rp;
3391 	int data_len;
3392 	bool_t is_eof;
3393 	bool_t needrecov = FALSE;
3394 	nfs4_recov_state_t recov_state;
3395 	nfs4_stateid_types_t sid_types;
3396 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3397 
3398 	rp = VTOR4(vp);
3399 	mi = VTOMI4(vp);
3400 	doqueue = 1;
3401 
3402 	ASSERT(nfs_zone() == mi->mi_zone);
3403 
3404 	args.ctag = async ? TAG_READAHEAD : TAG_READ;
3405 
3406 	args.array_len = 2;
3407 	args.array = argop;
3408 
3409 	nfs4_init_stateid_types(&sid_types);
3410 
3411 	recov_state.rs_flags = 0;
3412 	recov_state.rs_num_retry_despite_err = 0;
3413 
3414 recov_retry:
3415 	e.error = nfs4_start_fop(mi, vp, NULL, OH_READ,
3416 	    &recov_state, NULL);
3417 	if (e.error)
3418 		return (e.error);
3419 
3420 	/* putfh target fh */
3421 	argop[0].argop = OP_CPUTFH;
3422 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3423 
3424 	/* read */
3425 	argop[1].argop = OP_READ;
3426 	rargs = &argop[1].nfs_argop4_u.opread;
3427 	rargs->stateid = nfs4_get_stateid(cr, rp, curproc->p_pidp->pid_id, mi,
3428 	    OP_READ, &sid_types, async);
3429 
3430 	do {
3431 		if (mi->mi_io_kstats) {
3432 			mutex_enter(&mi->mi_lock);
3433 			kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
3434 			mutex_exit(&mi->mi_lock);
3435 		}
3436 
3437 		NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
3438 		    "nfs4read: %s call, rp %s",
3439 		    needrecov ? "recov" : "first",
3440 		    rnode4info(rp)));
3441 
3442 		if ((vp->v_flag & VNOCACHE) ||
3443 		    (rp->r_flags & R4DIRECTIO) ||
3444 		    (mi->mi_flags & MI4_DIRECTIO))
3445 			tsize = MIN(mi->mi_tsize, count);
3446 		else
3447 			tsize = MIN(mi->mi_curread, count);
3448 
3449 		rargs->offset = (offset4)offset;
3450 		rargs->count = (count4)tsize;
3451 		rargs->res_data_val_alt = NULL;
3452 		rargs->res_mblk = NULL;
3453 		rargs->res_uiop = NULL;
3454 		rargs->res_maxsize = 0;
3455 		rargs->wlist = NULL;
3456 
3457 		if (uiop)
3458 			rargs->res_uiop = uiop;
3459 		else
3460 			rargs->res_data_val_alt = base;
3461 		rargs->res_maxsize = tsize;
3462 
3463 		rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
3464 #ifdef	DEBUG
3465 		if (nfs4read_error_inject) {
3466 			res.status = nfs4read_error_inject;
3467 			nfs4read_error_inject = 0;
3468 		}
3469 #endif
3470 
3471 		if (mi->mi_io_kstats) {
3472 			mutex_enter(&mi->mi_lock);
3473 			kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
3474 			mutex_exit(&mi->mi_lock);
3475 		}
3476 
3477 		needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
3478 		if (e.error != 0 && !needrecov) {
3479 			nfs4_end_fop(mi, vp, NULL, OH_READ,
3480 			    &recov_state, needrecov);
3481 			return (e.error);
3482 		}
3483 
3484 		/*
3485 		 * Do proper retry for OLD and BAD stateid errors outside
3486 		 * of the normal recovery framework.  There are two differences
3487 		 * between async and sync reads.  The first is that we allow
3488 		 * retry on BAD_STATEID for async reads, but not sync reads.
3489 		 * The second is that we mark the file dead for a failed
3490 		 * attempt with a special stateid for sync reads, but just
3491 		 * return EIO for async reads.
3492 		 *
3493 		 * If a sync read receives a BAD stateid error while using a
3494 		 * delegation stateid, retry using the open stateid (if it
3495 		 * exists).  If it doesn't have an open stateid, reopen the
3496 		 * file first, then retry.
3497 		 */
3498 		if (e.error == 0 && (res.status == NFS4ERR_OLD_STATEID ||
3499 		    res.status == NFS4ERR_BAD_STATEID) && async) {
3500 			nfs4_end_fop(mi, vp, NULL, OH_READ,
3501 			    &recov_state, needrecov);
3502 			if (sid_types.cur_sid_type == SPEC_SID) {
3503 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3504 				    (caddr_t)&res);
3505 				return (EIO);
3506 			}
3507 			nfs4_save_stateid(&rargs->stateid, &sid_types);
3508 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3509 			goto recov_retry;
3510 		} else if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3511 		    !async && sid_types.cur_sid_type != SPEC_SID) {
3512 			nfs4_save_stateid(&rargs->stateid, &sid_types);
3513 			nfs4_end_fop(mi, vp, NULL, OH_READ,
3514 			    &recov_state, needrecov);
3515 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3516 			goto recov_retry;
3517 		} else if (e.error == 0 && res.status == NFS4ERR_BAD_STATEID &&
3518 		    sid_types.cur_sid_type == DEL_SID) {
3519 			nfs4_save_stateid(&rargs->stateid, &sid_types);
3520 			mutex_enter(&rp->r_statev4_lock);
3521 			rp->r_deleg_return_pending = TRUE;
3522 			mutex_exit(&rp->r_statev4_lock);
3523 			if (nfs4rdwr_check_osid(vp, &e, cr)) {
3524 				nfs4_end_fop(mi, vp, NULL, OH_READ,
3525 				    &recov_state, needrecov);
3526 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3527 				    (caddr_t)&res);
3528 				return (EIO);
3529 			}
3530 			nfs4_end_fop(mi, vp, NULL, OH_READ,
3531 			    &recov_state, needrecov);
3532 			/* hold needed for nfs4delegreturn_thread */
3533 			VN_HOLD(vp);
3534 			nfs4delegreturn_async(rp, (NFS4_DR_PUSH|NFS4_DR_REOPEN|
3535 			    NFS4_DR_DISCARD), FALSE);
3536 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3537 			goto recov_retry;
3538 		}
3539 		if (needrecov) {
3540 			bool_t abort;
3541 
3542 			NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
3543 			    "nfs4read: initiating recovery\n"));
3544 			abort = nfs4_start_recovery(&e,
3545 			    mi, vp, NULL, &rargs->stateid,
3546 			    NULL, OP_READ, NULL, NULL, NULL);
3547 			nfs4_end_fop(mi, vp, NULL, OH_READ,
3548 			    &recov_state, needrecov);
3549 			/*
3550 			 * Do not retry if we got OLD_STATEID using a special
3551 			 * stateid.  This avoids looping with a broken server.
3552 			 */
3553 			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3554 			    sid_types.cur_sid_type == SPEC_SID)
3555 				abort = TRUE;
3556 
3557 			if (abort == FALSE) {
3558 				/*
3559 				 * Need to retry all possible stateids in
3560 				 * case the recovery error wasn't stateid
3561 				 * related or the stateids have become
3562 				 * stale (server reboot).
3563 				 */
3564 				nfs4_init_stateid_types(&sid_types);
3565 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3566 				    (caddr_t)&res);
3567 				goto recov_retry;
3568 			}
3569 
3570 			if (!e.error) {
3571 				e.error = geterrno4(res.status);
3572 				(void) xdr_free(xdr_COMPOUND4res_clnt,
3573 				    (caddr_t)&res);
3574 			}
3575 			return (e.error);
3576 		}
3577 
3578 		if (res.status) {
3579 			e.error = geterrno4(res.status);
3580 			nfs4_end_fop(mi, vp, NULL, OH_READ,
3581 			    &recov_state, needrecov);
3582 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3583 			return (e.error);
3584 		}
3585 
3586 		data_len = res.array[1].nfs_resop4_u.opread.data_len;
3587 		count -= data_len;
3588 		if (base)
3589 			base += data_len;
3590 		offset += data_len;
3591 		if (mi->mi_io_kstats) {
3592 			mutex_enter(&mi->mi_lock);
3593 			KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
3594 			KSTAT_IO_PTR(mi->mi_io_kstats)->nread += data_len;
3595 			mutex_exit(&mi->mi_lock);
3596 		}
3597 		lwp_stat_update(LWP_STAT_INBLK, 1);
3598 		is_eof = res.array[1].nfs_resop4_u.opread.eof;
3599 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3600 
3601 	} while (count && !is_eof);
3602 
3603 	*residp = count;
3604 
3605 	nfs4_end_fop(mi, vp, NULL, OH_READ, &recov_state, needrecov);
3606 
3607 	return (e.error);
3608 }
3609 
3610 /* ARGSUSED */
3611 static int
3612 nfs4_ioctl(vnode_t *vp, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp,
3613 	caller_context_t *ct)
3614 {
3615 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
3616 		return (EIO);
3617 	switch (cmd) {
3618 		case _FIODIRECTIO:
3619 			return (nfs4_directio(vp, (int)arg, cr));
3620 		default:
3621 			return (ENOTTY);
3622 	}
3623 }
3624 
3625 /* ARGSUSED */
3626 int
3627 nfs4_getattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3628     caller_context_t *ct)
3629 {
3630 	int error;
3631 	rnode4_t *rp = VTOR4(vp);
3632 
3633 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
3634 		return (EIO);
3635 	/*
3636 	 * If it has been specified that the return value will
3637 	 * just be used as a hint, and we are only being asked
3638 	 * for size, fsid or rdevid, then return the client's
3639 	 * notion of these values without checking to make sure
3640 	 * that the attribute cache is up to date.
3641 	 * The whole point is to avoid an over the wire GETATTR
3642 	 * call.
3643 	 */
3644 	if (flags & ATTR_HINT) {
3645 		if (!(vap->va_mask & ~(AT_SIZE | AT_FSID | AT_RDEV))) {
3646 			mutex_enter(&rp->r_statelock);
3647 			if (vap->va_mask & AT_SIZE)
3648 				vap->va_size = rp->r_size;
3649 			if (vap->va_mask & AT_FSID)
3650 				vap->va_fsid = rp->r_attr.va_fsid;
3651 			if (vap->va_mask & AT_RDEV)
3652 				vap->va_rdev = rp->r_attr.va_rdev;
3653 			mutex_exit(&rp->r_statelock);
3654 			return (0);
3655 		}
3656 	}
3657 
3658 	/*
3659 	 * Only need to flush pages if asking for the mtime
3660 	 * and if there any dirty pages or any outstanding
3661 	 * asynchronous (write) requests for this file.
3662 	 */
3663 	if (vap->va_mask & AT_MTIME) {
3664 		rp = VTOR4(vp);
3665 		if (nfs4_has_pages(vp)) {
3666 			mutex_enter(&rp->r_statev4_lock);
3667 			if (rp->r_deleg_type != OPEN_DELEGATE_WRITE) {
3668 				mutex_exit(&rp->r_statev4_lock);
3669 				if (rp->r_flags & R4DIRTY ||
3670 				    rp->r_awcount > 0) {
3671 					mutex_enter(&rp->r_statelock);
3672 					rp->r_gcount++;
3673 					mutex_exit(&rp->r_statelock);
3674 					error =
3675 					    nfs4_putpage(vp, (u_offset_t)0,
3676 					    0, 0, cr, NULL);
3677 					mutex_enter(&rp->r_statelock);
3678 					if (error && (error == ENOSPC ||
3679 					    error == EDQUOT)) {
3680 						if (!rp->r_error)
3681 							rp->r_error = error;
3682 					}
3683 					if (--rp->r_gcount == 0)
3684 						cv_broadcast(&rp->r_cv);
3685 					mutex_exit(&rp->r_statelock);
3686 				}
3687 			} else {
3688 				mutex_exit(&rp->r_statev4_lock);
3689 			}
3690 		}
3691 	}
3692 	return (nfs4getattr(vp, vap, cr));
3693 }
3694 
/*
 * Compare the mode reported by the server with the client's mode after
 * the set-uid and set-gid bits have been stripped from the client's copy.
 *
 * Returns 0 (OK) when the two then match, i.e. the only difference the
 * server can have introduced is clearing S_ISUID and/or S_ISGID, and
 * 1 (BAD) otherwise.
 */
int
nfs4_compare_modes(mode_t from_server, mode_t on_client)
{
	const mode_t stripped = on_client & ~(S_ISUID|S_ISGID);

	return (stripped == from_server ? 0 : 1);
}
3709 
3710 /*ARGSUSED4*/
3711 static int
3712 nfs4_setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3713     caller_context_t *ct)
3714 {
3715 	if (vap->va_mask & AT_NOSET)
3716 		return (EINVAL);
3717 
3718 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
3719 		return (EIO);
3720 
3721 	/*
3722 	 * Don't call secpolicy_vnode_setattr, the client cannot
3723 	 * use its cached attributes to make security decisions
3724 	 * as the server may be faking mode bits or mapping uid/gid.
3725 	 * Always just let the server to the checking.
3726 	 * If we provide the ability to remove basic priviledges
3727 	 * to setattr (e.g. basic without chmod) then we will
3728 	 * need to add a check here before calling the server.
3729 	 */
3730 
3731 	return (nfs4setattr(vp, vap, flags, cr, NULL));
3732 }
3733 
3734 /*
3735  * To replace the "guarded" version 3 setattr, we use two types of compound
3736  * setattr requests:
3737  * 1. The "normal" setattr, used when the size of the file isn't being
3738  *    changed - { Putfh <fh>; Setattr; Getattr }/
3739  * 2. If the size is changed, precede Setattr with: Getattr; Verify
3740  *    with only ctime as the argument. If the server ctime differs from
3741  *    what is cached on the client, the verify will fail, but we would
3742  *    already have the ctime from the preceding getattr, so just set it
3743  *    and retry. Thus the compound here is - { Putfh <fh>; Getattr; Verify;
3744  *	Setattr; Getattr }.
3745  *
3746  * The vsecattr_t * input parameter will be non-NULL if ACLs are being set in
3747  * this setattr and NULL if they are not.
3748  */
3749 static int
3750 nfs4setattr(vnode_t *vp, struct vattr *vap, int flags, cred_t *cr,
3751     vsecattr_t *vsap)
3752 {
3753 	COMPOUND4args_clnt args;
3754 	COMPOUND4res_clnt res, *resp = NULL;
3755 	nfs4_ga_res_t *garp = NULL;
3756 	int numops = 3;			/* { Putfh; Setattr; Getattr } */
3757 	nfs_argop4 argop[5];
3758 	int verify_argop = -1;
3759 	int setattr_argop = 1;
3760 	nfs_resop4 *resop;
3761 	vattr_t va;
3762 	rnode4_t *rp;
3763 	int doqueue = 1;
3764 	uint_t mask = vap->va_mask;
3765 	mode_t omode;
3766 	vsecattr_t *vsp;
3767 	timestruc_t ctime;
3768 	bool_t needrecov = FALSE;
3769 	nfs4_recov_state_t recov_state;
3770 	nfs4_stateid_types_t sid_types;
3771 	stateid4 stateid;
3772 	hrtime_t t;
3773 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
3774 	servinfo4_t *svp;
3775 	bitmap4 supp_attrs;
3776 
3777 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
3778 	rp = VTOR4(vp);
3779 	nfs4_init_stateid_types(&sid_types);
3780 
3781 	/*
3782 	 * Only need to flush pages if there are any pages and
3783 	 * if the file is marked as dirty in some fashion.  The
3784 	 * file must be flushed so that we can accurately
3785 	 * determine the size of the file and the cached data
3786 	 * after the SETATTR returns.  A file is considered to
3787 	 * be dirty if it is either marked with R4DIRTY, has
3788 	 * outstanding i/o's active, or is mmap'd.  In this
3789 	 * last case, we can't tell whether there are dirty
3790 	 * pages, so we flush just to be sure.
3791 	 */
3792 	if (nfs4_has_pages(vp) &&
3793 	    ((rp->r_flags & R4DIRTY) ||
3794 	    rp->r_count > 0 ||
3795 	    rp->r_mapcnt > 0)) {
3796 		ASSERT(vp->v_type != VCHR);
3797 		e.error = nfs4_putpage(vp, (offset_t)0, 0, 0, cr, NULL);
3798 		if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
3799 			mutex_enter(&rp->r_statelock);
3800 			if (!rp->r_error)
3801 				rp->r_error = e.error;
3802 			mutex_exit(&rp->r_statelock);
3803 		}
3804 	}
3805 
3806 	if (mask & AT_SIZE) {
3807 		/*
3808 		 * Verification setattr compound for non-deleg AT_SIZE:
3809 		 *	{ Putfh; Getattr; Verify; Setattr; Getattr }
3810 		 * Set ctime local here (outside the do_again label)
3811 		 * so that subsequent retries (after failed VERIFY)
3812 		 * will use ctime from GETATTR results (from failed
3813 		 * verify compound) as VERIFY arg.
3814 		 * If file has delegation, then VERIFY(time_metadata)
3815 		 * is of little added value, so don't bother.
3816 		 */
3817 		mutex_enter(&rp->r_statev4_lock);
3818 		if (rp->r_deleg_type == OPEN_DELEGATE_NONE ||
3819 		    rp->r_deleg_return_pending) {
3820 			numops = 5;
3821 			ctime = rp->r_attr.va_ctime;
3822 		}
3823 		mutex_exit(&rp->r_statev4_lock);
3824 	}
3825 
3826 	recov_state.rs_flags = 0;
3827 	recov_state.rs_num_retry_despite_err = 0;
3828 
3829 	args.ctag = TAG_SETATTR;
3830 do_again:
3831 recov_retry:
3832 	setattr_argop = numops - 2;
3833 
3834 	args.array = argop;
3835 	args.array_len = numops;
3836 
3837 	e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
3838 	if (e.error)
3839 		return (e.error);
3840 
3841 
3842 	/* putfh target fh */
3843 	argop[0].argop = OP_CPUTFH;
3844 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
3845 
3846 	if (numops == 5) {
3847 		/*
3848 		 * We only care about the ctime, but need to get mtime
3849 		 * and size for proper cache update.
3850 		 */
3851 		/* getattr */
3852 		argop[1].argop = OP_GETATTR;
3853 		argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3854 		argop[1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3855 
3856 		/* verify - set later in loop */
3857 		verify_argop = 2;
3858 	}
3859 
3860 	/* setattr */
3861 	svp = rp->r_server;
3862 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3863 	supp_attrs = svp->sv_supp_attrs;
3864 	nfs_rw_exit(&svp->sv_lock);
3865 
3866 	nfs4args_setattr(&argop[setattr_argop], vap, vsap, flags, rp, cr,
3867 	    supp_attrs, &e.error, &sid_types);
3868 	stateid = argop[setattr_argop].nfs_argop4_u.opsetattr.stateid;
3869 	if (e.error) {
3870 		/* req time field(s) overflow - return immediately */
3871 		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
3872 		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3873 		    opsetattr.obj_attributes);
3874 		return (e.error);
3875 	}
3876 	omode = rp->r_attr.va_mode;
3877 
3878 	/* getattr */
3879 	argop[numops-1].argop = OP_GETATTR;
3880 	argop[numops-1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
3881 	/*
3882 	 * If we are setting the ACL (indicated only by vsap != NULL), request
3883 	 * the ACL in this getattr.  The ACL returned from this getattr will be
3884 	 * used in updating the ACL cache.
3885 	 */
3886 	if (vsap != NULL)
3887 		argop[numops-1].nfs_argop4_u.opgetattr.attr_request |=
3888 		    FATTR4_ACL_MASK;
3889 	argop[numops-1].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
3890 
3891 	/*
3892 	 * setattr iterates if the object size is set and the cached ctime
3893 	 * does not match the file ctime. In that case, verify the ctime first.
3894 	 */
3895 
3896 	do {
3897 		if (verify_argop != -1) {
3898 			/*
3899 			 * Verify that the ctime match before doing setattr.
3900 			 */
3901 			va.va_mask = AT_CTIME;
3902 			va.va_ctime = ctime;
3903 			svp = rp->r_server;
3904 			(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
3905 			supp_attrs = svp->sv_supp_attrs;
3906 			nfs_rw_exit(&svp->sv_lock);
3907 			e.error = nfs4args_verify(&argop[verify_argop], &va,
3908 			    OP_VERIFY, supp_attrs);
3909 			if (e.error) {
3910 				/* req time field(s) overflow - return */
3911 				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3912 				    needrecov);
3913 				break;
3914 			}
3915 		}
3916 
3917 		doqueue = 1;
3918 
3919 		t = gethrtime();
3920 
3921 		rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
3922 
3923 		/*
3924 		 * Purge the access cache and ACL cache if changing either the
3925 		 * owner of the file, the group owner, or the mode.  These may
3926 		 * change the access permissions of the file, so purge old
3927 		 * information and start over again.
3928 		 */
3929 		if (mask & (AT_UID | AT_GID | AT_MODE)) {
3930 			(void) nfs4_access_purge_rp(rp);
3931 			if (rp->r_secattr != NULL) {
3932 				mutex_enter(&rp->r_statelock);
3933 				vsp = rp->r_secattr;
3934 				rp->r_secattr = NULL;
3935 				mutex_exit(&rp->r_statelock);
3936 				if (vsp != NULL)
3937 					nfs4_acl_free_cache(vsp);
3938 			}
3939 		}
3940 
3941 		/*
3942 		 * If res.array_len == numops, then everything succeeded,
3943 		 * except for possibly the final getattr.  If only the
3944 		 * last getattr failed, give up, and don't try recovery.
3945 		 */
3946 		if (res.array_len == numops) {
3947 			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3948 			    needrecov);
3949 			if (! e.error)
3950 				resp = &res;
3951 			break;
3952 		}
3953 
3954 		/*
3955 		 * if either rpc call failed or completely succeeded - done
3956 		 */
3957 		needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
3958 		if (e.error) {
3959 			PURGE_ATTRCACHE4(vp);
3960 			if (!needrecov) {
3961 				nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3962 				    needrecov);
3963 				break;
3964 			}
3965 		}
3966 
3967 		/*
3968 		 * Do proper retry for OLD_STATEID outside of the normal
3969 		 * recovery framework.
3970 		 */
3971 		if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
3972 		    sid_types.cur_sid_type != SPEC_SID &&
3973 		    sid_types.cur_sid_type != NO_SID) {
3974 			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3975 			    needrecov);
3976 			nfs4_save_stateid(&stateid, &sid_types);
3977 			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
3978 			    opsetattr.obj_attributes);
3979 			if (verify_argop != -1) {
3980 				nfs4args_verify_free(&argop[verify_argop]);
3981 				verify_argop = -1;
3982 			}
3983 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
3984 			goto recov_retry;
3985 		}
3986 
3987 		if (needrecov) {
3988 			bool_t abort;
3989 
3990 			abort = nfs4_start_recovery(&e,
3991 			    VTOMI4(vp), vp, NULL, NULL, NULL,
3992 			    OP_SETATTR, NULL, NULL, NULL);
3993 			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
3994 			    needrecov);
3995 			/*
3996 			 * Do not retry if we failed with OLD_STATEID using
3997 			 * a special stateid.  This is done to avoid looping
3998 			 * with a broken server.
3999 			 */
4000 			if (e.error == 0 && res.status == NFS4ERR_OLD_STATEID &&
4001 			    (sid_types.cur_sid_type == SPEC_SID ||
4002 			    sid_types.cur_sid_type == NO_SID))
4003 				abort = TRUE;
4004 			if (!e.error) {
4005 				if (res.status == NFS4ERR_BADOWNER)
4006 					nfs4_log_badowner(VTOMI4(vp),
4007 					    OP_SETATTR);
4008 
4009 				e.error = geterrno4(res.status);
4010 				(void) xdr_free(xdr_COMPOUND4res_clnt,
4011 				    (caddr_t)&res);
4012 			}
4013 			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4014 			    opsetattr.obj_attributes);
4015 			if (verify_argop != -1) {
4016 				nfs4args_verify_free(&argop[verify_argop]);
4017 				verify_argop = -1;
4018 			}
4019 			if (abort == FALSE) {
4020 				/*
4021 				 * Need to retry all possible stateids in
4022 				 * case the recovery error wasn't stateid
4023 				 * related or the stateids have become
4024 				 * stale (server reboot).
4025 				 */
4026 				nfs4_init_stateid_types(&sid_types);
4027 				goto recov_retry;
4028 			}
4029 			return (e.error);
4030 		}
4031 
4032 		/*
4033 		 * Need to call nfs4_end_op before nfs4getattr to
4034 		 * avoid potential nfs4_start_op deadlock. See RFE
4035 		 * 4777612.  Calls to nfs4_invalidate_pages() and
4036 		 * nfs4_purge_stale_fh() might also generate over the
4037 		 * wire calls which my cause nfs4_start_op() deadlock.
4038 		 */
4039 		nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4040 
4041 		/*
4042 		 * Check to update lease.
4043 		 */
4044 		resp = &res;
4045 		if (res.status == NFS4_OK) {
4046 			break;
4047 		}
4048 
4049 		/*
4050 		 * Check if verify failed to see if try again
4051 		 */
4052 		if ((verify_argop == -1) || (res.array_len != 3)) {
4053 			/*
4054 			 * can't continue...
4055 			 */
4056 			if (res.status == NFS4ERR_BADOWNER)
4057 				nfs4_log_badowner(VTOMI4(vp), OP_SETATTR);
4058 
4059 			e.error = geterrno4(res.status);
4060 		} else {
4061 			/*
4062 			 * When the verify request fails, the client ctime is
4063 			 * not in sync with the server. This is the same as
4064 			 * the version 3 "not synchronized" error, and we
4065 			 * handle it in a similar manner (XXX do we need to???).
4066 			 * Use the ctime returned in the first getattr for
4067 			 * the input to the next verify.
4068 			 * If we couldn't get the attributes, then we give up
4069 			 * because we can't complete the operation as required.
4070 			 */
4071 			garp = &res.array[1].nfs_resop4_u.opgetattr.ga_res;
4072 		}
4073 		if (e.error) {
4074 			PURGE_ATTRCACHE4(vp);
4075 			nfs4_purge_stale_fh(e.error, vp, cr);
4076 		} else {
4077 			/*
4078 			 * retry with a new verify value
4079 			 */
4080 			ctime = garp->n4g_va.va_ctime;
4081 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4082 			resp = NULL;
4083 		}
4084 		if (!e.error) {
4085 			nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4086 			    opsetattr.obj_attributes);
4087 			if (verify_argop != -1) {
4088 				nfs4args_verify_free(&argop[verify_argop]);
4089 				verify_argop = -1;
4090 			}
4091 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4092 			goto do_again;
4093 		}
4094 	} while (!e.error);
4095 
4096 	if (e.error) {
4097 		/*
4098 		 * If we are here, rfs4call has an irrecoverable error - return
4099 		 */
4100 		nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4101 		    opsetattr.obj_attributes);
4102 		if (verify_argop != -1) {
4103 			nfs4args_verify_free(&argop[verify_argop]);
4104 			verify_argop = -1;
4105 		}
4106 		if (resp)
4107 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4108 		return (e.error);
4109 	}
4110 
4111 
4112 
4113 	/*
4114 	 * If changing the size of the file, invalidate
4115 	 * any local cached data which is no longer part
4116 	 * of the file.  We also possibly invalidate the
4117 	 * last page in the file.  We could use
4118 	 * pvn_vpzero(), but this would mark the page as
4119 	 * modified and require it to be written back to
4120 	 * the server for no particularly good reason.
4121 	 * This way, if we access it, then we bring it
4122 	 * back in.  A read should be cheaper than a
4123 	 * write.
4124 	 */
4125 	if (mask & AT_SIZE) {
4126 		nfs4_invalidate_pages(vp, (vap->va_size & PAGEMASK), cr);
4127 	}
4128 
4129 	/* either no error or one of the postop getattr failed */
4130 
4131 	/*
4132 	 * XXX Perform a simplified version of wcc checking. Instead of
4133 	 * have another getattr to get pre-op, just purge cache if
4134 	 * any of the ops prior to and including the getattr failed.
4135 	 * If the getattr succeeded then update the attrcache accordingly.
4136 	 */
4137 
4138 	garp = NULL;
4139 	if (res.status == NFS4_OK) {
4140 		/*
4141 		 * Last getattr
4142 		 */
4143 		resop = &res.array[numops - 1];
4144 		garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4145 	}
4146 	/*
4147 	 * In certain cases, nfs4_update_attrcache() will purge the attrcache,
4148 	 * rather than filling it.  See the function itself for details.
4149 	 */
4150 	e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4151 	if (garp != NULL) {
4152 		if (garp->n4g_resbmap & FATTR4_ACL_MASK) {
4153 			nfs4_acl_fill_cache(rp, &garp->n4g_vsa);
4154 			vs_ace4_destroy(&garp->n4g_vsa);
4155 		} else {
4156 			if (vsap != NULL) {
4157 				/*
4158 				 * The ACL was supposed to be set and to be
4159 				 * returned in the last getattr of this
4160 				 * compound, but for some reason the getattr
4161 				 * result doesn't contain the ACL.  In this
4162 				 * case, purge the ACL cache.
4163 				 */
4164 				if (rp->r_secattr != NULL) {
4165 					mutex_enter(&rp->r_statelock);
4166 					vsp = rp->r_secattr;
4167 					rp->r_secattr = NULL;
4168 					mutex_exit(&rp->r_statelock);
4169 					if (vsp != NULL)
4170 						nfs4_acl_free_cache(vsp);
4171 				}
4172 			}
4173 		}
4174 	}
4175 
4176 	if (res.status == NFS4_OK && (mask & AT_SIZE)) {
4177 		/*
4178 		 * Set the size, rather than relying on getting it updated
4179 		 * via a GETATTR.  With delegations the client tries to
4180 		 * suppress GETATTR calls.
4181 		 */
4182 		mutex_enter(&rp->r_statelock);
4183 		rp->r_size = vap->va_size;
4184 		mutex_exit(&rp->r_statelock);
4185 	}
4186 
4187 	/*
4188 	 * Can free up request args and res
4189 	 */
4190 	nfs4_fattr4_free(&argop[setattr_argop].nfs_argop4_u.
4191 	    opsetattr.obj_attributes);
4192 	if (verify_argop != -1) {
4193 		nfs4args_verify_free(&argop[verify_argop]);
4194 		verify_argop = -1;
4195 	}
4196 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4197 
4198 	/*
4199 	 * Some servers will change the mode to clear the setuid
4200 	 * and setgid bits when changing the uid or gid.  The
4201 	 * client needs to compensate appropriately.
4202 	 */
4203 	if (mask & (AT_UID | AT_GID)) {
4204 		int terror, do_setattr;
4205 
4206 		do_setattr = 0;
4207 		va.va_mask = AT_MODE;
4208 		terror = nfs4getattr(vp, &va, cr);
4209 		if (!terror &&
4210 		    (((mask & AT_MODE) && va.va_mode != vap->va_mode) ||
4211 		    (!(mask & AT_MODE) && va.va_mode != omode))) {
4212 			va.va_mask = AT_MODE;
4213 			if (mask & AT_MODE) {
4214 				/*
4215 				 * We asked the mode to be changed and what
4216 				 * we just got from the server in getattr is
4217 				 * not what we wanted it to be, so set it now.
4218 				 */
4219 				va.va_mode = vap->va_mode;
4220 				do_setattr = 1;
4221 			} else {
4222 				/*
4223 				 * We did not ask the mode to be changed,
4224 				 * Check to see that the server just cleared
4225 				 * I_SUID and I_GUID from it. If not then
4226 				 * set mode to omode with UID/GID cleared.
4227 				 */
4228 				if (nfs4_compare_modes(va.va_mode, omode)) {
4229 					omode &= ~(S_ISUID|S_ISGID);
4230 					va.va_mode = omode;
4231 					do_setattr = 1;
4232 				}
4233 			}
4234 
4235 			if (do_setattr)
4236 				(void) nfs4setattr(vp, &va, 0, cr, NULL);
4237 		}
4238 	}
4239 
4240 	return (e.error);
4241 }
4242 
4243 /* ARGSUSED */
4244 static int
4245 nfs4_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct)
4246 {
4247 	COMPOUND4args_clnt args;
4248 	COMPOUND4res_clnt res;
4249 	int doqueue;
4250 	uint32_t acc, resacc, argacc;
4251 	rnode4_t *rp;
4252 	cred_t *cred, *ncr, *ncrfree = NULL;
4253 	nfs4_access_type_t cacc;
4254 	int num_ops;
4255 	nfs_argop4 argop[3];
4256 	nfs_resop4 *resop;
4257 	bool_t needrecov = FALSE, do_getattr;
4258 	nfs4_recov_state_t recov_state;
4259 	int rpc_error;
4260 	hrtime_t t;
4261 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4262 	mntinfo4_t *mi = VTOMI4(vp);
4263 
4264 	if (nfs_zone() != mi->mi_zone)
4265 		return (EIO);
4266 
4267 	acc = 0;
4268 	if (mode & VREAD)
4269 		acc |= ACCESS4_READ;
4270 	if (mode & VWRITE) {
4271 		if ((vp->v_vfsp->vfs_flag & VFS_RDONLY) && !ISVDEV(vp->v_type))
4272 			return (EROFS);
4273 		if (vp->v_type == VDIR)
4274 			acc |= ACCESS4_DELETE;
4275 		acc |= ACCESS4_MODIFY | ACCESS4_EXTEND;
4276 	}
4277 	if (mode & VEXEC) {
4278 		if (vp->v_type == VDIR)
4279 			acc |= ACCESS4_LOOKUP;
4280 		else
4281 			acc |= ACCESS4_EXECUTE;
4282 	}
4283 
4284 	if (VTOR4(vp)->r_acache != NULL) {
4285 		e.error = nfs4_validate_caches(vp, cr);
4286 		if (e.error)
4287 			return (e.error);
4288 	}
4289 
4290 	rp = VTOR4(vp);
4291 	if (vp->v_type == VDIR)
4292 		argacc = ACCESS4_READ | ACCESS4_DELETE | ACCESS4_MODIFY |
4293 		    ACCESS4_EXTEND | ACCESS4_LOOKUP;
4294 	else
4295 		argacc = ACCESS4_READ | ACCESS4_MODIFY | ACCESS4_EXTEND |
4296 		    ACCESS4_EXECUTE;
4297 	recov_state.rs_flags = 0;
4298 	recov_state.rs_num_retry_despite_err = 0;
4299 
4300 	cred = cr;
4301 	/*
4302 	 * ncr and ncrfree both initially
4303 	 * point to the memory area returned
4304 	 * by crnetadjust();
4305 	 * ncrfree not NULL when exiting means
4306 	 * that we need to release it
4307 	 */
4308 	ncr = crnetadjust(cred);
4309 	ncrfree = ncr;
4310 
4311 tryagain:
4312 	cacc = nfs4_access_check(rp, acc, cred);
4313 	if (cacc == NFS4_ACCESS_ALLOWED) {
4314 		if (ncrfree != NULL)
4315 			crfree(ncrfree);
4316 		return (0);
4317 	}
4318 	if (cacc == NFS4_ACCESS_DENIED) {
4319 		/*
4320 		 * If the cred can be adjusted, try again
4321 		 * with the new cred.
4322 		 */
4323 		if (ncr != NULL) {
4324 			cred = ncr;
4325 			ncr = NULL;
4326 			goto tryagain;
4327 		}
4328 		if (ncrfree != NULL)
4329 			crfree(ncrfree);
4330 		return (EACCES);
4331 	}
4332 
4333 recov_retry:
4334 	/*
4335 	 * Don't take with r_statev4_lock here. r_deleg_type could
4336 	 * change as soon as lock is released.  Since it is an int,
4337 	 * there is no atomicity issue.
4338 	 */
4339 	do_getattr = (rp->r_deleg_type == OPEN_DELEGATE_NONE);
4340 	num_ops = do_getattr ? 3 : 2;
4341 
4342 	args.ctag = TAG_ACCESS;
4343 
4344 	args.array_len = num_ops;
4345 	args.array = argop;
4346 
4347 	if (e.error = nfs4_start_fop(mi, vp, NULL, OH_ACCESS,
4348 	    &recov_state, NULL)) {
4349 		if (ncrfree != NULL)
4350 			crfree(ncrfree);
4351 		return (e.error);
4352 	}
4353 
4354 	/* putfh target fh */
4355 	argop[0].argop = OP_CPUTFH;
4356 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
4357 
4358 	/* access */
4359 	argop[1].argop = OP_ACCESS;
4360 	argop[1].nfs_argop4_u.opaccess.access = argacc;
4361 
4362 	/* getattr */
4363 	if (do_getattr) {
4364 		argop[2].argop = OP_GETATTR;
4365 		argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
4366 		argop[2].nfs_argop4_u.opgetattr.mi = mi;
4367 	}
4368 
4369 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
4370 	    "nfs4_access: %s call, rp %s", needrecov ? "recov" : "first",
4371 	    rnode4info(VTOR4(vp))));
4372 
4373 	doqueue = 1;
4374 	t = gethrtime();
4375 	rfs4call(VTOMI4(vp), &args, &res, cred, &doqueue, 0, &e);
4376 	rpc_error = e.error;
4377 
4378 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4379 	if (needrecov) {
4380 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
4381 		    "nfs4_access: initiating recovery\n"));
4382 
4383 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
4384 		    NULL, OP_ACCESS, NULL, NULL, NULL) == FALSE) {
4385 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_ACCESS,
4386 			    &recov_state, needrecov);
4387 			if (!e.error)
4388 				(void) xdr_free(xdr_COMPOUND4res_clnt,
4389 				    (caddr_t)&res);
4390 			goto recov_retry;
4391 		}
4392 	}
4393 	nfs4_end_fop(mi, vp, NULL, OH_ACCESS, &recov_state, needrecov);
4394 
4395 	if (e.error)
4396 		goto out;
4397 
4398 	if (res.status) {
4399 		e.error = geterrno4(res.status);
4400 		/*
4401 		 * This might generate over the wire calls throught
4402 		 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
4403 		 * here to avoid a deadlock.
4404 		 */
4405 		nfs4_purge_stale_fh(e.error, vp, cr);
4406 		goto out;
4407 	}
4408 	resop = &res.array[1];	/* access res */
4409 
4410 	resacc = resop->nfs_resop4_u.opaccess.access;
4411 
4412 	if (do_getattr) {
4413 		resop++;	/* getattr res */
4414 		nfs4_attr_cache(vp, &resop->nfs_resop4_u.opgetattr.ga_res,
4415 		    t, cr, FALSE, NULL);
4416 	}
4417 
4418 	if (!e.error) {
4419 		nfs4_access_cache(rp, argacc, resacc, cred);
4420 		/*
4421 		 * we just cached results with cred; if cred is the
4422 		 * adjusted credentials from crnetadjust, we do not want
4423 		 * to release them before exiting: hence setting ncrfree
4424 		 * to NULL
4425 		 */
4426 		if (cred != cr)
4427 			ncrfree = NULL;
4428 		/* XXX check the supported bits too? */
4429 		if ((acc & resacc) != acc) {
4430 			/*
4431 			 * The following code implements the semantic
4432 			 * that a setuid root program has *at least* the
4433 			 * permissions of the user that is running the
4434 			 * program.  See rfs3call() for more portions
4435 			 * of the implementation of this functionality.
4436 			 */
4437 			/* XXX-LP */
4438 			if (ncr != NULL) {
4439 				(void) xdr_free(xdr_COMPOUND4res_clnt,
4440 				    (caddr_t)&res);
4441 				cred = ncr;
4442 				ncr = NULL;
4443 				goto tryagain;
4444 			}
4445 			e.error = EACCES;
4446 		}
4447 	}
4448 
4449 out:
4450 	if (!rpc_error)
4451 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4452 
4453 	if (ncrfree != NULL)
4454 		crfree(ncrfree);
4455 
4456 	return (e.error);
4457 }
4458 
4459 /* ARGSUSED */
4460 static int
4461 nfs4_readlink(vnode_t *vp, struct uio *uiop, cred_t *cr, caller_context_t *ct)
4462 {
4463 	COMPOUND4args_clnt args;
4464 	COMPOUND4res_clnt res;
4465 	int doqueue;
4466 	rnode4_t *rp;
4467 	nfs_argop4 argop[3];
4468 	nfs_resop4 *resop;
4469 	READLINK4res *lr_res;
4470 	nfs4_ga_res_t *garp;
4471 	uint_t len;
4472 	char *linkdata;
4473 	bool_t needrecov = FALSE;
4474 	nfs4_recov_state_t recov_state;
4475 	hrtime_t t;
4476 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4477 
4478 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
4479 		return (EIO);
4480 	/*
4481 	 * Can't readlink anything other than a symbolic link.
4482 	 */
4483 	if (vp->v_type != VLNK)
4484 		return (EINVAL);
4485 
4486 	rp = VTOR4(vp);
4487 	if (nfs4_do_symlink_cache && rp->r_symlink.contents != NULL) {
4488 		e.error = nfs4_validate_caches(vp, cr);
4489 		if (e.error)
4490 			return (e.error);
4491 		mutex_enter(&rp->r_statelock);
4492 		if (rp->r_symlink.contents != NULL) {
4493 			e.error = uiomove(rp->r_symlink.contents,
4494 			    rp->r_symlink.len, UIO_READ, uiop);
4495 			mutex_exit(&rp->r_statelock);
4496 			return (e.error);
4497 		}
4498 		mutex_exit(&rp->r_statelock);
4499 	}
4500 	recov_state.rs_flags = 0;
4501 	recov_state.rs_num_retry_despite_err = 0;
4502 
4503 recov_retry:
4504 	args.array_len = 3;
4505 	args.array = argop;
4506 	args.ctag = TAG_READLINK;
4507 
4508 	e.error = nfs4_start_op(VTOMI4(vp), vp, NULL, &recov_state);
4509 	if (e.error) {
4510 		return (e.error);
4511 	}
4512 
4513 	/* 0. putfh symlink fh */
4514 	argop[0].argop = OP_CPUTFH;
4515 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
4516 
4517 	/* 1. readlink */
4518 	argop[1].argop = OP_READLINK;
4519 
4520 	/* 2. getattr */
4521 	argop[2].argop = OP_GETATTR;
4522 	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
4523 	argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(vp);
4524 
4525 	doqueue = 1;
4526 
4527 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
4528 	    "nfs4_readlink: %s call, rp %s", needrecov ? "recov" : "first",
4529 	    rnode4info(VTOR4(vp))));
4530 
4531 	t = gethrtime();
4532 
4533 	rfs4call(VTOMI4(vp), &args, &res, cr, &doqueue, 0, &e);
4534 
4535 	needrecov = nfs4_needs_recovery(&e, FALSE, vp->v_vfsp);
4536 	if (needrecov) {
4537 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
4538 		    "nfs4_readlink: initiating recovery\n"));
4539 
4540 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
4541 		    NULL, OP_READLINK, NULL, NULL, NULL) == FALSE) {
4542 			if (!e.error)
4543 				(void) xdr_free(xdr_COMPOUND4res_clnt,
4544 				    (caddr_t)&res);
4545 
4546 			nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state,
4547 			    needrecov);
4548 			goto recov_retry;
4549 		}
4550 	}
4551 
4552 	nfs4_end_op(VTOMI4(vp), vp, NULL, &recov_state, needrecov);
4553 
4554 	if (e.error)
4555 		return (e.error);
4556 
4557 	/*
4558 	 * There is an path in the code below which calls
4559 	 * nfs4_purge_stale_fh(), which may generate otw calls through
4560 	 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
4561 	 * here to avoid nfs4_start_op() deadlock.
4562 	 */
4563 
4564 	if (res.status && (res.array_len < args.array_len)) {
4565 		/*
4566 		 * either Putfh or Link failed
4567 		 */
4568 		e.error = geterrno4(res.status);
4569 		nfs4_purge_stale_fh(e.error, vp, cr);
4570 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4571 		return (e.error);
4572 	}
4573 
4574 	resop = &res.array[1];	/* readlink res */
4575 	lr_res = &resop->nfs_resop4_u.opreadlink;
4576 
4577 	/*
4578 	 * treat symlink names as data
4579 	 */
4580 	linkdata = utf8_to_str(&lr_res->link, &len, NULL);
4581 	if (linkdata != NULL) {
4582 		int uio_len = len - 1;
4583 		/* len includes null byte, which we won't uiomove */
4584 		e.error = uiomove(linkdata, uio_len, UIO_READ, uiop);
4585 		if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
4586 			mutex_enter(&rp->r_statelock);
4587 			if (rp->r_symlink.contents == NULL) {
4588 				rp->r_symlink.contents = linkdata;
4589 				rp->r_symlink.len = uio_len;
4590 				rp->r_symlink.size = len;
4591 				mutex_exit(&rp->r_statelock);
4592 			} else {
4593 				mutex_exit(&rp->r_statelock);
4594 				kmem_free(linkdata, len);
4595 			}
4596 		} else {
4597 			kmem_free(linkdata, len);
4598 		}
4599 	}
4600 	if (res.status == NFS4_OK) {
4601 		resop++;	/* getattr res */
4602 		garp = &resop->nfs_resop4_u.opgetattr.ga_res;
4603 	}
4604 	e.error = nfs4_update_attrcache(res.status, garp, t, vp, cr);
4605 
4606 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
4607 
4608 	/*
4609 	 * The over the wire error for attempting to readlink something
4610 	 * other than a symbolic link is ENXIO.  However, we need to
4611 	 * return EINVAL instead of ENXIO, so we map it here.
4612 	 */
4613 	return (e.error == ENXIO ? EINVAL : e.error);
4614 }
4615 
4616 /*
4617  * Flush local dirty pages to stable storage on the server.
4618  *
4619  * If FNODSYNC is specified, then there is nothing to do because
4620  * metadata changes are not cached on the client before being
4621  * sent to the server.
4622  */
4623 /* ARGSUSED */
4624 static int
4625 nfs4_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
4626 {
4627 	int error;
4628 
4629 	if ((syncflag & FNODSYNC) || IS_SWAPVP(vp))
4630 		return (0);
4631 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
4632 		return (EIO);
4633 	error = nfs4_putpage_commit(vp, (offset_t)0, 0, cr);
4634 	if (!error)
4635 		error = VTOR4(vp)->r_error;
4636 	return (error);
4637 }
4638 
4639 /*
4640  * Weirdness: if the file was removed or the target of a rename
4641  * operation while it was open, it got renamed instead.  Here we
4642  * remove the renamed file.
4643  */
4644 /* ARGSUSED */
4645 void
4646 nfs4_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4647 {
4648 	rnode4_t *rp;
4649 
4650 	ASSERT(vp != DNLC_NO_VNODE);
4651 
4652 	rp = VTOR4(vp);
4653 
4654 	if (IS_SHADOW(vp, rp)) {
4655 		sv_inactive(vp);
4656 		return;
4657 	}
4658 
4659 	/*
4660 	 * If this is coming from the wrong zone, we let someone in the right
4661 	 * zone take care of it asynchronously.  We can get here due to
4662 	 * VN_RELE() being called from pageout() or fsflush().  This call may
4663 	 * potentially turn into an expensive no-op if, for instance, v_count
4664 	 * gets incremented in the meantime, but it's still correct.
4665 	 */
4666 	if (nfs_zone() != VTOMI4(vp)->mi_zone) {
4667 		nfs4_async_inactive(vp, cr);
4668 		return;
4669 	}
4670 
4671 	/*
4672 	 * Some of the cleanup steps might require over-the-wire
4673 	 * operations.  Since VOP_INACTIVE can get called as a result of
4674 	 * other over-the-wire operations (e.g., an attribute cache update
4675 	 * can lead to a DNLC purge), doing those steps now would lead to a
4676 	 * nested call to the recovery framework, which can deadlock.  So
4677 	 * do any over-the-wire cleanups asynchronously, in a separate
4678 	 * thread.
4679 	 */
4680 
4681 	mutex_enter(&rp->r_os_lock);
4682 	mutex_enter(&rp->r_statelock);
4683 	mutex_enter(&rp->r_statev4_lock);
4684 
4685 	if (vp->v_type == VREG && list_head(&rp->r_open_streams) != NULL) {
4686 		mutex_exit(&rp->r_statev4_lock);
4687 		mutex_exit(&rp->r_statelock);
4688 		mutex_exit(&rp->r_os_lock);
4689 		nfs4_async_inactive(vp, cr);
4690 		return;
4691 	}
4692 
4693 	if (rp->r_deleg_type == OPEN_DELEGATE_READ ||
4694 	    rp->r_deleg_type == OPEN_DELEGATE_WRITE) {
4695 		mutex_exit(&rp->r_statev4_lock);
4696 		mutex_exit(&rp->r_statelock);
4697 		mutex_exit(&rp->r_os_lock);
4698 		nfs4_async_inactive(vp, cr);
4699 		return;
4700 	}
4701 
4702 	if (rp->r_unldvp != NULL) {
4703 		mutex_exit(&rp->r_statev4_lock);
4704 		mutex_exit(&rp->r_statelock);
4705 		mutex_exit(&rp->r_os_lock);
4706 		nfs4_async_inactive(vp, cr);
4707 		return;
4708 	}
4709 	mutex_exit(&rp->r_statev4_lock);
4710 	mutex_exit(&rp->r_statelock);
4711 	mutex_exit(&rp->r_os_lock);
4712 
4713 	rp4_addfree(rp, cr);
4714 }
4715 
4716 /*
4717  * nfs4_inactive_otw - nfs4_inactive, plus over-the-wire calls to free up
4718  * various bits of state.  The caller must not refer to vp after this call.
4719  */
4720 
4721 void
4722 nfs4_inactive_otw(vnode_t *vp, cred_t *cr)
4723 {
4724 	rnode4_t *rp = VTOR4(vp);
4725 	nfs4_recov_state_t recov_state;
4726 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
4727 	vnode_t *unldvp;
4728 	char *unlname;
4729 	cred_t *unlcred;
4730 	COMPOUND4args_clnt args;
4731 	COMPOUND4res_clnt res, *resp;
4732 	nfs_argop4 argop[2];
4733 	int doqueue;
4734 #ifdef DEBUG
4735 	char *name;
4736 #endif
4737 
4738 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
4739 	ASSERT(!IS_SHADOW(vp, rp));
4740 
4741 #ifdef DEBUG
4742 	name = fn_name(VTOSV(vp)->sv_name);
4743 	NFS4_DEBUG(nfs4_client_inactive_debug, (CE_NOTE, "nfs4_inactive_otw: "
4744 	    "release vnode %s", name));
4745 	kmem_free(name, MAXNAMELEN);
4746 #endif
4747 
4748 	if (vp->v_type == VREG) {
4749 		bool_t recov_failed = FALSE;
4750 
4751 		e.error = nfs4close_all(vp, cr);
4752 		if (e.error) {
4753 			/* Check to see if recovery failed */
4754 			mutex_enter(&(VTOMI4(vp)->mi_lock));
4755 			if (VTOMI4(vp)->mi_flags & MI4_RECOV_FAIL)
4756 				recov_failed = TRUE;
4757 			mutex_exit(&(VTOMI4(vp)->mi_lock));
4758 			if (!recov_failed) {
4759 				mutex_enter(&rp->r_statelock);
4760 				if (rp->r_flags & R4RECOVERR)
4761 					recov_failed = TRUE;
4762 				mutex_exit(&rp->r_statelock);
4763 			}
4764 			if (recov_failed) {
4765 				NFS4_DEBUG(nfs4_client_recov_debug,
4766 				    (CE_NOTE, "nfs4_inactive_otw: "
4767 				    "close failed (recovery failure)"));
4768 			}
4769 		}
4770 	}
4771 
4772 redo:
4773 	if (rp->r_unldvp == NULL) {
4774 		rp4_addfree(rp, cr);
4775 		return;
4776 	}
4777 
4778 	/*
4779 	 * Save the vnode pointer for the directory where the
4780 	 * unlinked-open file got renamed, then set it to NULL
4781 	 * to prevent another thread from getting here before
4782 	 * we're done with the remove.  While we have the
4783 	 * statelock, make local copies of the pertinent rnode
4784 	 * fields.  If we weren't to do this in an atomic way, the
4785 	 * the unl* fields could become inconsistent with respect
4786 	 * to each other due to a race condition between this
4787 	 * code and nfs_remove().  See bug report 1034328.
4788 	 */
4789 	mutex_enter(&rp->r_statelock);
4790 	if (rp->r_unldvp == NULL) {
4791 		mutex_exit(&rp->r_statelock);
4792 		rp4_addfree(rp, cr);
4793 		return;
4794 	}
4795 
4796 	unldvp = rp->r_unldvp;
4797 	rp->r_unldvp = NULL;
4798 	unlname = rp->r_unlname;
4799 	rp->r_unlname = NULL;
4800 	unlcred = rp->r_unlcred;
4801 	rp->r_unlcred = NULL;
4802 	mutex_exit(&rp->r_statelock);
4803 
4804 	/*
4805 	 * If there are any dirty pages left, then flush
4806 	 * them.  This is unfortunate because they just
4807 	 * may get thrown away during the remove operation,
4808 	 * but we have to do this for correctness.
4809 	 */
4810 	if (nfs4_has_pages(vp) &&
4811 	    ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
4812 		ASSERT(vp->v_type != VCHR);
4813 		e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, NULL);
4814 		if (e.error) {
4815 			mutex_enter(&rp->r_statelock);
4816 			if (!rp->r_error)
4817 				rp->r_error = e.error;
4818 			mutex_exit(&rp->r_statelock);
4819 		}
4820 	}
4821 
4822 	recov_state.rs_flags = 0;
4823 	recov_state.rs_num_retry_despite_err = 0;
4824 recov_retry_remove:
4825 	/*
4826 	 * Do the remove operation on the renamed file
4827 	 */
4828 	args.ctag = TAG_INACTIVE;
4829 
4830 	/*
4831 	 * Remove ops: putfh dir; remove
4832 	 */
4833 	args.array_len = 2;
4834 	args.array = argop;
4835 
4836 	e.error = nfs4_start_op(VTOMI4(unldvp), unldvp, NULL, &recov_state);
4837 	if (e.error) {
4838 		kmem_free(unlname, MAXNAMELEN);
4839 		crfree(unlcred);
4840 		VN_RELE(unldvp);
4841 		/*
4842 		 * Try again; this time around r_unldvp will be NULL, so we'll
4843 		 * just call rp4_addfree() and return.
4844 		 */
4845 		goto redo;
4846 	}
4847 
4848 	/* putfh directory */
4849 	argop[0].argop = OP_CPUTFH;
4850 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(unldvp)->r_fh;
4851 
4852 	/* remove */
4853 	argop[1].argop = OP_CREMOVE;
4854 	argop[1].nfs_argop4_u.opcremove.ctarget = unlname;
4855 
4856 	doqueue = 1;
4857 	resp = &res;
4858 
4859 #if 0 /* notyet */
4860 	/*
4861 	 * Can't do this yet.  We may be being called from
4862 	 * dnlc_purge_XXX while that routine is holding a
4863 	 * mutex lock to the nc_rele list.  The calls to
4864 	 * nfs3_cache_wcc_data may result in calls to
4865 	 * dnlc_purge_XXX.  This will result in a deadlock.
4866 	 */
4867 	rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4868 	if (e.error) {
4869 		PURGE_ATTRCACHE4(unldvp);
4870 		resp = NULL;
4871 	} else if (res.status) {
4872 		e.error = geterrno4(res.status);
4873 		PURGE_ATTRCACHE4(unldvp);
4874 		/*
4875 		 * This code is inactive right now
4876 		 * but if made active there should
4877 		 * be a nfs4_end_op() call before
4878 		 * nfs4_purge_stale_fh to avoid start_op()
4879 		 * deadlock. See BugId: 4948726
4880 		 */
4881 		nfs4_purge_stale_fh(error, unldvp, cr);
4882 	} else {
4883 		nfs_resop4 *resop;
4884 		REMOVE4res *rm_res;
4885 
4886 		resop = &res.array[1];
4887 		rm_res = &resop->nfs_resop4_u.opremove;
4888 		/*
4889 		 * Update directory cache attribute,
4890 		 * readdir and dnlc caches.
4891 		 */
4892 		nfs4_update_dircaches(&rm_res->cinfo, unldvp, NULL, NULL, NULL);
4893 	}
4894 #else
4895 	rfs4call(VTOMI4(unldvp), &args, &res, unlcred, &doqueue, 0, &e);
4896 
4897 	PURGE_ATTRCACHE4(unldvp);
4898 #endif
4899 
4900 	if (nfs4_needs_recovery(&e, FALSE, unldvp->v_vfsp)) {
4901 		if (nfs4_start_recovery(&e, VTOMI4(unldvp), unldvp, NULL,
4902 		    NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
4903 			if (!e.error)
4904 				(void) xdr_free(xdr_COMPOUND4res_clnt,
4905 				    (caddr_t)&res);
4906 			nfs4_end_op(VTOMI4(unldvp), unldvp, NULL,
4907 			    &recov_state, TRUE);
4908 			goto recov_retry_remove;
4909 		}
4910 	}
4911 	nfs4_end_op(VTOMI4(unldvp), unldvp, NULL, &recov_state, FALSE);
4912 
4913 	/*
4914 	 * Release stuff held for the remove
4915 	 */
4916 	VN_RELE(unldvp);
4917 	if (!e.error && resp)
4918 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
4919 
4920 	kmem_free(unlname, MAXNAMELEN);
4921 	crfree(unlcred);
4922 	goto redo;
4923 }
4924 
4925 /*
4926  * Remote file system operations having to do with directory manipulation.
4927  */
4928 /* ARGSUSED3 */
4929 int
4930 nfs4_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
4931     int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
4932     int *direntflags, pathname_t *realpnp)
4933 {
4934 	int error;
4935 	vnode_t *vp, *avp = NULL;
4936 	rnode4_t *drp;
4937 
4938 	*vpp = NULL;
4939 	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
4940 		return (EPERM);
4941 	/*
4942 	 * if LOOKUP_XATTR, must replace dvp (object) with
4943 	 * object's attrdir before continuing with lookup
4944 	 */
4945 	if (flags & LOOKUP_XATTR) {
4946 		error = nfs4lookup_xattr(dvp, nm, &avp, flags, cr);
4947 		if (error)
4948 			return (error);
4949 
4950 		dvp = avp;
4951 
4952 		/*
4953 		 * If lookup is for "", just return dvp now.  The attrdir
4954 		 * has already been activated (from nfs4lookup_xattr), and
4955 		 * the caller will RELE the original dvp -- not
4956 		 * the attrdir.  So, set vpp and return.
4957 		 * Currently, when the LOOKUP_XATTR flag is
4958 		 * passed to VOP_LOOKUP, the name is always empty, and
4959 		 * shortcircuiting here avoids 3 unneeded lock/unlock
4960 		 * pairs.
4961 		 *
4962 		 * If a non-empty name was provided, then it is the
4963 		 * attribute name, and it will be looked up below.
4964 		 */
4965 		if (*nm == '\0') {
4966 			*vpp = dvp;
4967 			return (0);
4968 		}
4969 
4970 		/*
4971 		 * The vfs layer never sends a name when asking for the
4972 		 * attrdir, so we should never get here (unless of course
4973 		 * name is passed at some time in future -- at which time
4974 		 * we'll blow up here).
4975 		 */
4976 		ASSERT(0);
4977 	}
4978 
4979 	drp = VTOR4(dvp);
4980 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
4981 		return (EINTR);
4982 
4983 	error = nfs4lookup(dvp, nm, vpp, cr, 0);
4984 	nfs_rw_exit(&drp->r_rwlock);
4985 
4986 	/*
4987 	 * If vnode is a device, create special vnode.
4988 	 */
4989 	if (!error && ISVDEV((*vpp)->v_type)) {
4990 		vp = *vpp;
4991 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
4992 		VN_RELE(vp);
4993 	}
4994 
4995 	return (error);
4996 }
4997 
4998 /* ARGSUSED */
4999 static int
5000 nfs4lookup_xattr(vnode_t *dvp, char *nm, vnode_t **vpp, int flags, cred_t *cr)
5001 {
5002 	int error;
5003 	rnode4_t *drp;
5004 	int cflag = ((flags & CREATE_XATTR_DIR) != 0);
5005 	mntinfo4_t *mi;
5006 
5007 	mi = VTOMI4(dvp);
5008 	if (!(mi->mi_vfsp->vfs_flag & VFS_XATTR) &&
5009 	    !vfs_has_feature(mi->mi_vfsp, VFSFT_SYSATTR_VIEWS))
5010 		return (EINVAL);
5011 
5012 	drp = VTOR4(dvp);
5013 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp)))
5014 		return (EINTR);
5015 
5016 	mutex_enter(&drp->r_statelock);
5017 	/*
5018 	 * If the server doesn't support xattrs just return EINVAL
5019 	 */
5020 	if (drp->r_xattr_dir == NFS4_XATTR_DIR_NOTSUPP) {
5021 		mutex_exit(&drp->r_statelock);
5022 		nfs_rw_exit(&drp->r_rwlock);
5023 		return (EINVAL);
5024 	}
5025 
5026 	/*
5027 	 * If there is a cached xattr directory entry,
5028 	 * use it as long as the attributes are valid. If the
5029 	 * attributes are not valid, take the simple approach and
5030 	 * free the cached value and re-fetch a new value.
5031 	 *
5032 	 * We don't negative entry cache for now, if we did we
5033 	 * would need to check if the file has changed on every
5034 	 * lookup. But xattrs don't exist very often and failing
5035 	 * an openattr is not much more expensive than and NVERIFY or GETATTR
5036 	 * so do an openattr over the wire for now.
5037 	 */
5038 	if (drp->r_xattr_dir != NULL) {
5039 		if (ATTRCACHE4_VALID(dvp)) {
5040 			VN_HOLD(drp->r_xattr_dir);
5041 			*vpp = drp->r_xattr_dir;
5042 			mutex_exit(&drp->r_statelock);
5043 			nfs_rw_exit(&drp->r_rwlock);
5044 			return (0);
5045 		}
5046 		VN_RELE(drp->r_xattr_dir);
5047 		drp->r_xattr_dir = NULL;
5048 	}
5049 	mutex_exit(&drp->r_statelock);
5050 
5051 	error = nfs4openattr(dvp, vpp, cflag, cr);
5052 
5053 	nfs_rw_exit(&drp->r_rwlock);
5054 
5055 	return (error);
5056 }
5057 
5058 static int
5059 nfs4lookup(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr, int skipdnlc)
5060 {
5061 	int error;
5062 	rnode4_t *drp;
5063 
5064 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
5065 
5066 	/*
5067 	 * If lookup is for "", just return dvp.  Don't need
5068 	 * to send it over the wire, look it up in the dnlc,
5069 	 * or perform any access checks.
5070 	 */
5071 	if (*nm == '\0') {
5072 		VN_HOLD(dvp);
5073 		*vpp = dvp;
5074 		return (0);
5075 	}
5076 
5077 	/*
5078 	 * Can't do lookups in non-directories.
5079 	 */
5080 	if (dvp->v_type != VDIR)
5081 		return (ENOTDIR);
5082 
5083 	/*
5084 	 * If lookup is for ".", just return dvp.  Don't need
5085 	 * to send it over the wire or look it up in the dnlc,
5086 	 * just need to check access.
5087 	 */
5088 	if (nm[0] == '.' && nm[1] == '\0') {
5089 		error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5090 		if (error)
5091 			return (error);
5092 		VN_HOLD(dvp);
5093 		*vpp = dvp;
5094 		return (0);
5095 	}
5096 
5097 	drp = VTOR4(dvp);
5098 	if (!(drp->r_flags & R4LOOKUP)) {
5099 		mutex_enter(&drp->r_statelock);
5100 		drp->r_flags |= R4LOOKUP;
5101 		mutex_exit(&drp->r_statelock);
5102 	}
5103 
5104 	*vpp = NULL;
5105 	/*
5106 	 * Lookup this name in the DNLC.  If there is no entry
5107 	 * lookup over the wire.
5108 	 */
5109 	if (!skipdnlc)
5110 		*vpp = dnlc_lookup(dvp, nm);
5111 	if (*vpp == NULL) {
5112 		/*
5113 		 * We need to go over the wire to lookup the name.
5114 		 */
5115 		return (nfs4lookupnew_otw(dvp, nm, vpp, cr));
5116 	}
5117 
5118 	/*
5119 	 * We hit on the dnlc
5120 	 */
5121 	if (*vpp != DNLC_NO_VNODE ||
5122 	    (dvp->v_vfsp->vfs_flag & VFS_RDONLY)) {
5123 		/*
5124 		 * But our attrs may not be valid.
5125 		 */
5126 		if (ATTRCACHE4_VALID(dvp)) {
5127 			error = nfs4_waitfor_purge_complete(dvp);
5128 			if (error) {
5129 				VN_RELE(*vpp);
5130 				*vpp = NULL;
5131 				return (error);
5132 			}
5133 
5134 			/*
5135 			 * If after the purge completes, check to make sure
5136 			 * our attrs are still valid.
5137 			 */
5138 			if (ATTRCACHE4_VALID(dvp)) {
5139 				/*
5140 				 * If we waited for a purge we may have
5141 				 * lost our vnode so look it up again.
5142 				 */
5143 				VN_RELE(*vpp);
5144 				*vpp = dnlc_lookup(dvp, nm);
5145 				if (*vpp == NULL)
5146 					return (nfs4lookupnew_otw(dvp,
5147 					    nm, vpp, cr));
5148 
5149 				/*
5150 				 * The access cache should almost always hit
5151 				 */
5152 				error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
5153 
5154 				if (error) {
5155 					VN_RELE(*vpp);
5156 					*vpp = NULL;
5157 					return (error);
5158 				}
5159 				if (*vpp == DNLC_NO_VNODE) {
5160 					VN_RELE(*vpp);
5161 					*vpp = NULL;
5162 					return (ENOENT);
5163 				}
5164 				return (0);
5165 			}
5166 		}
5167 	}
5168 
5169 	ASSERT(*vpp != NULL);
5170 
5171 	/*
5172 	 * We may have gotten here we have one of the following cases:
5173 	 *	1) vpp != DNLC_NO_VNODE, our attrs have timed out so we
5174 	 *		need to validate them.
5175 	 *	2) vpp == DNLC_NO_VNODE, a negative entry that we always
5176 	 *		must validate.
5177 	 *
5178 	 * Go to the server and check if the directory has changed, if
5179 	 * it hasn't we are done and can use the dnlc entry.
5180 	 */
5181 	return (nfs4lookupvalidate_otw(dvp, nm, vpp, cr));
5182 }
5183 
/*
 * Go to the server and check if the directory has changed, if
 * it hasn't we are done and can use the dnlc entry.  If it
 * has changed we get a new copy of its attributes and check
 * the access for VEXEC, then relookup the filename and
 * get its filehandle and attributes.
 *
 * PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR
 *	if the NVERIFY failed we must
 *		purge the caches
 *		cache new attributes (will set r_time_attr_inval)
 *		cache new access
 *		recheck VEXEC access
 *		add name to dnlc, possibly negative
 *		if LOOKUP succeeded
 *			cache new attributes
 *	else
 *		set a new r_time_attr_inval for dvp
 *		check to make sure we have access
 *
 * Note that the compound status NFS4ERR_SAME means the NVERIFY stopped
 * the compound because the change attribute still matches, i.e. the
 * directory is unchanged; any other status means the ops after the
 * NVERIFY were attempted.
 *
 * Arguments:
 *	dvp	directory being searched (asserted to be a VDIR)
 *	nm	component name; never "" or "." (asserted)
 *	vpp	in: the held dnlc entry found by the caller, which may be
 *		DNLC_NO_VNODE (asserted non-NULL); out: see below
 *	cr	credentials for the over-the-wire call
 *
 * The vpp returned is the vnode passed in if the directory is valid,
 * a new vnode if successful lookup, or NULL on error.  The incoming
 * *vpp is released whenever it is not the vnode handed back.
 *
 * Returns 0 on success or an errno value; a still-valid negative dnlc
 * entry is reported as ENOENT.
 */
static int
nfs4lookupvalidate_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	fattr4 *ver_fattr;
	fattr4_change dchange;
	int32_t *ptr;
	int argoplist_size  = 7 * sizeof (nfs_argop4);
	nfs_argop4 *argop;
	int doqueue;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	int isdotdot;
	vnode_t *nvp;
	nfs_fh4 *fhp;
	nfs4_sharedfh_t *sfhp;
	nfs4_access_type_t cacc;
	rnode4_t *nrp;
	rnode4_t *drp = VTOR4(dvp);
	nfs4_ga_res_t *garp = NULL;
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
	ASSERT(nm != NULL);
	ASSERT(nm[0] != '\0');
	ASSERT(dvp->v_type == VDIR);
	ASSERT(nm[0] != '.' || nm[1] != '\0');
	ASSERT(*vpp != NULL);

	/* ".." is looked up with LOOKUPP rather than LOOKUP. */
	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
		isdotdot = 1;
		args.ctag = TAG_LOOKUP_VPARENT;
	} else {
		/*
		 * If dvp were a stub, it should have triggered and caused
		 * a mount for us to get this far.
		 */
		ASSERT(!RP_ISSTUB(VTOR4(dvp)));

		isdotdot = 0;
		args.ctag = TAG_LOOKUP_VALID;
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	nvp = NULL;

	/* Save the original mount point security information */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

	/*
	 * Every jump back to recov_retry has already ended the previous
	 * fop and freed both the reply and the argop array, so each pass
	 * starts from scratch.
	 */
recov_retry:
	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
	    &recov_state, NULL);
	if (e.error) {
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		VN_RELE(*vpp);
		*vpp = NULL;
		return (e.error);
	}

	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	/* PUTFH dfh NVERIFY GETATTR ACCESS LOOKUP GETFH GETATTR */
	args.array_len = 7;
	args.array = argop;

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;

	/*
	 * 1. nverify the change info: the attribute list is just the
	 * XDR-encoded change attribute we have cached for the directory.
	 */
	argop[1].argop = OP_NVERIFY;
	ver_fattr = &argop[1].nfs_argop4_u.opnverify.obj_attributes;
	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
	ver_fattr->attrlist4 = (char *)&dchange;
	ptr = (int32_t *)&dchange;
	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
	ver_fattr->attrlist4_len = sizeof (fattr4_change);

	/* 2. getattr directory */
	argop[2].argop = OP_GETATTR;
	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[2].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 3. access directory */
	argop[3].argop = OP_ACCESS;
	argop[3].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
	    ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;

	/* 4. lookup name */
	if (isdotdot) {
		argop[4].argop = OP_LOOKUPP;
	} else {
		argop[4].argop = OP_CLOOKUP;
		argop[4].nfs_argop4_u.opclookup.cname = nm;
	}

	/* 5. resulting file handle */
	argop[5].argop = OP_GETFH;

	/* 6. resulting file attributes */
	argop[6].argop = OP_GETATTR;
	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[6].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	doqueue = 1;
	/* t is the timestamp later handed to the attribute caching code. */
	t = gethrtime();

	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);

	/*
	 * The name has moved to another server: let the referral code
	 * produce the vnode.
	 * NOTE(review): unlike the other error returns, this path neither
	 * resets *vpp to NULL nor calls check_mnt_secinfo() -- confirm
	 * that this is intended.
	 */
	if (!isdotdot && res.status == NFS4ERR_MOVED) {
		e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
		if (e.error != 0 && *vpp != NULL)
			VN_RELE(*vpp);
		nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
		    &recov_state, FALSE);
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
		/*
		 * For WRONGSEC of a non-dotdot case, send secinfo directly
		 * from this thread, do not go thru the recovery thread since
		 * we need the nm information.
		 *
		 * Not doing dotdot case because there is no specification
		 * for (PUTFH, SECINFO "..") yet.
		 */
		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
			else
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, TRUE);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			if (!e.error)
				goto recov_retry;
			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
			VN_RELE(*vpp);
			*vpp = NULL;
			return (e.error);
		}

		/*
		 * A FALSE return from nfs4_start_recovery() leads to a
		 * retry of the whole compound.
		 */
		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
		    OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
			    &recov_state, TRUE);

			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			goto recov_retry;
		}
	}

	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);

	if (e.error || res.array_len == 0) {
		/*
		 * If e.error isn't set, then reply has no ops (or we couldn't
		 * be here).  The only legal way to reply without an op array
		 * is via NFS4ERR_MINOR_VERS_MISMATCH.  An ops array should
		 * be in the reply for all other status values.
		 *
		 * For valid replies without an ops array, return ENOTSUP
		 * (geterrno4 xlation of VERS_MISMATCH).  For illegal replies,
		 * return EIO -- don't trust status.
		 */
		if (e.error == 0)
			e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
			    ENOTSUP : EIO;
		VN_RELE(*vpp);
		*vpp = NULL;
		kmem_free(argop, argoplist_size);
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	if (res.status != NFS4ERR_SAME) {
		e.error = geterrno4(res.status);

		/*
		 * The NVERIFY "failed" so the directory has changed
		 * First make sure PUTFH succeeded and NVERIFY "failed"
		 * cleanly.
		 */
		if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
		    (res.array[1].nfs_resop4_u.opnverify.status != NFS4_OK)) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}

		/*
		 * We know the NVERIFY "failed" so we must:
		 *	purge the caches (access and indirectly dnlc if needed)
		 */
		nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);

		if (res.array[2].nfs_resop4_u.opgetattr.status != NFS4_OK) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}

		/*
		 * Install new cached attributes for the directory
		 */
		nfs4_attr_cache(dvp,
		    &res.array[2].nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);

		if (res.array[3].nfs_resop4_u.opaccess.status != NFS4_OK) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			VN_RELE(*vpp);
			*vpp = NULL;
			e.error = geterrno4(res.status);
			goto exit;
		}

		/*
		 * Now we know the directory is valid,
		 * cache new directory access
		 */
		nfs4_access_cache(drp,
		    args.array[3].nfs_argop4_u.opaccess.access,
		    res.array[3].nfs_resop4_u.opaccess.access, cr);

		/*
		 * recheck VEXEC access
		 */
		cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
		if (cacc != NFS4_ACCESS_ALLOWED) {
			/*
			 * Directory permissions might have been revoked
			 */
			if (cacc == NFS4_ACCESS_DENIED) {
				e.error = EACCES;
				VN_RELE(*vpp);
				*vpp = NULL;
				goto exit;
			}

			/*
			 * Somehow we must not have asked for enough
			 * so try a singleton ACCESS, should never happen.
			 */
			e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
			if (e.error) {
				VN_RELE(*vpp);
				*vpp = NULL;
				goto exit;
			}
		}

		/*
		 * The singleton ACCESS above may have overwritten e.error,
		 * so re-derive it from the compound status.
		 */
		e.error = geterrno4(res.status);
		if (res.array[4].nfs_resop4_u.oplookup.status != NFS4_OK) {
			/*
			 * The lookup failed, probably no entry
			 */
			if (e.error == ENOENT && nfs4_lookup_neg_cache) {
				dnlc_update(dvp, nm, DNLC_NO_VNODE);
			} else {
				/*
				 * Might be some other error, so remove
				 * the dnlc entry to make sure we start all
				 * over again, next time.
				 */
				dnlc_remove(dvp, nm);
			}
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}

		if (res.array[5].nfs_resop4_u.opgetfh.status != NFS4_OK) {
			/*
			 * The file exists but we can't get its fh for
			 * some unknown reason.  Remove it from the dnlc
			 * and error out to be safe.
			 */
			dnlc_remove(dvp, nm);
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}
		fhp = &res.array[5].nfs_resop4_u.opgetfh.object;
		if (fhp->nfs_fh4_len == 0) {
			/*
			 * The file exists but the server returned a bogus
			 * (zero length) fh for some unknown reason.  Remove
			 * it from the dnlc and error out to be safe.
			 *
			 * NOTE(review): nfs4lookupnew_otw() reports EIO for
			 * this same condition; confirm ENOENT is intended.
			 */
			e.error = ENOENT;
			dnlc_remove(dvp, nm);
			VN_RELE(*vpp);
			*vpp = NULL;
			goto exit;
		}
		sfhp = sfh4_get(fhp, mi);

		/*
		 * garp stays NULL if the final GETATTR failed.
		 */
		if (res.array[6].nfs_resop4_u.opgetattr.status == NFS4_OK)
			garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;

		/*
		 * Make the new rnode
		 */
		if (isdotdot) {
			e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
			if (e.error) {
				sfh4_rele(&sfhp);
				VN_RELE(*vpp);
				*vpp = NULL;
				goto exit;
			}
			/*
			 * XXX if nfs4_make_dotdot uses an existing rnode
			 * XXX it doesn't update the attributes.
			 * XXX for now just save them again to save an OTW
			 *
			 * NOTE(review): garp can still be NULL here (see
			 * above); confirm nfs4_attr_cache() tolerates that.
			 */
			nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
		} else {
			nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
			    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
			/*
			 * If v_type == VNON, then garp was NULL because
			 * the last op in the compound failed and makenfs4node
			 * could not find the vnode for sfhp. It created
			 * a new vnode, so we have nothing to purge here.
			 */
			if (nvp->v_type == VNON) {
				vattr_t vattr;

				vattr.va_mask = AT_TYPE;
				/*
				 * N.B. We've already called nfs4_end_fop above.
				 */
				e.error = nfs4getattr(nvp, &vattr, cr);
				if (e.error) {
					sfh4_rele(&sfhp);
					VN_RELE(*vpp);
					*vpp = NULL;
					VN_RELE(nvp);
					goto exit;
				}
				nvp->v_type = vattr.va_type;
			}
		}
		sfh4_rele(&sfhp);

		/*
		 * Enter the new vnode in the dnlc unless the rnode is
		 * flagged created_v4.
		 */
		nrp = VTOR4(nvp);
		mutex_enter(&nrp->r_statev4_lock);
		if (!nrp->created_v4) {
			mutex_exit(&nrp->r_statev4_lock);
			dnlc_update(dvp, nm, nvp);
		} else
			mutex_exit(&nrp->r_statev4_lock);

		/* Swap the stale dnlc vnode for the freshly looked up one. */
		VN_RELE(*vpp);
		*vpp = nvp;
	} else {
		hrtime_t now;
		hrtime_t delta = 0;

		e.error = 0;

		/*
		 * Because the NVERIFY "succeeded" we know that the
		 * directory attributes are still valid
		 * so update r_time_attr_inval
		 *
		 * The new lifetime is the age of the cached attributes,
		 * clamped to [mi_acdirmin, mi_acdirmax]; it stays 0 when
		 * attribute caching is disabled (MI4_NOAC or VNOCACHE).
		 */
		now = gethrtime();
		mutex_enter(&drp->r_statelock);
		if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
			delta = now - drp->r_time_attr_saved;
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		}
		drp->r_time_attr_inval = now + delta;
		mutex_exit(&drp->r_statelock);
		dnlc_update(dvp, nm, *vpp);

		/*
		 * Even though we have a valid directory attr cache
		 * and dnlc entry, we may not have access.
		 * This should almost always hit the cache.
		 */
		e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (e.error) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		/*
		 * A validated negative entry means the name does not exist.
		 * (*vpp is NULL, and so skips this, if access was denied.)
		 */
		if (*vpp == DNLC_NO_VNODE) {
			VN_RELE(*vpp);
			*vpp = NULL;
			e.error = ENOENT;
		}
	}

exit:
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	kmem_free(argop, argoplist_size);
	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
	return (e.error);
}
5624 
/*
 * We need to go over the wire to lookup the name, but
 * while we are there verify that the directory has not
 * changed; if it has, get new attributes and check access.
 *
 * PUTFH dfh SAVEFH LOOKUP nm GETFH GETATTR RESTOREFH
 *					NVERIFY GETATTR ACCESS
 *
 * With the results:
 *	if the NVERIFY failed we must purge the caches, add new attributes,
 *		and cache new access.
 *	set a new r_time_attr_inval
 *	add name to dnlc, possibly negative
 *	if LOOKUP succeeded
 *		cache new attributes
 *
 * Arguments:
 *	dvp	directory being searched (asserted to be a VDIR)
 *	nm	component name; never "" or "." (asserted)
 *	vpp	must point at NULL on entry (asserted); set to the new
 *		vnode on success
 *	cr	credentials for the over-the-wire call
 *
 * Returns 0 on success or an errno value.
 */
static int
nfs4lookupnew_otw(vnode_t *dvp, char *nm, vnode_t **vpp, cred_t *cr)
{
	COMPOUND4args_clnt args;
	COMPOUND4res_clnt res;
	fattr4 *ver_fattr;
	fattr4_change dchange;
	int32_t *ptr;
	nfs4_ga_res_t *garp = NULL;
	int argoplist_size  = 9 * sizeof (nfs_argop4);
	nfs_argop4 *argop;
	int doqueue;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	hrtime_t t;
	int isdotdot;
	vnode_t *nvp;
	nfs_fh4 *fhp;
	nfs4_sharedfh_t *sfhp;
	nfs4_access_type_t cacc;
	rnode4_t *nrp;
	rnode4_t *drp = VTOR4(dvp);
	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };

	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
	ASSERT(nm != NULL);
	ASSERT(nm[0] != '\0');
	ASSERT(dvp->v_type == VDIR);
	ASSERT(nm[0] != '.' || nm[1] != '\0');
	ASSERT(*vpp == NULL);

	/* ".." is looked up with LOOKUPP rather than LOOKUP. */
	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0') {
		isdotdot = 1;
		args.ctag = TAG_LOOKUP_PARENT;
	} else {
		/*
		 * If dvp were a stub, it should have triggered and caused
		 * a mount for us to get this far.
		 */
		ASSERT(!RP_ISSTUB(VTOR4(dvp)));

		isdotdot = 0;
		args.ctag = TAG_LOOKUP;
	}

	mi = VTOMI4(dvp);
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	nvp = NULL;

	/* Save the original mount point security information */
	(void) save_mnt_secinfo(mi->mi_curr_serv);

	/*
	 * Every jump back to recov_retry has already ended the previous
	 * fop and freed both the reply and the argop array, so each pass
	 * starts from scratch.
	 */
recov_retry:
	e.error = nfs4_start_fop(mi, dvp, NULL, OH_LOOKUP,
	    &recov_state, NULL);
	if (e.error) {
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	argop = kmem_alloc(argoplist_size, KM_SLEEP);

	/* PUTFH SAVEFH LOOKUP GETFH GETATTR RESTOREFH NVERIFY GETATTR ACCESS */
	args.array_len = 9;
	args.array = argop;

	/* 0. putfh file */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;

	/* 1. savefh for the nverify */
	argop[1].argop = OP_SAVEFH;

	/* 2. lookup name */
	if (isdotdot) {
		argop[2].argop = OP_LOOKUPP;
	} else {
		argop[2].argop = OP_CLOOKUP;
		argop[2].nfs_argop4_u.opclookup.cname = nm;
	}

	/* 3. resulting file handle */
	argop[3].argop = OP_GETFH;

	/* 4. resulting file attributes */
	argop[4].argop = OP_GETATTR;
	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[4].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 5. restorefh back the directory for the nverify */
	argop[5].argop = OP_RESTOREFH;

	/*
	 * 6. nverify the change info: the attribute list is just the
	 * XDR-encoded change attribute we have cached for the directory.
	 */
	argop[6].argop = OP_NVERIFY;
	ver_fattr = &argop[6].nfs_argop4_u.opnverify.obj_attributes;
	ver_fattr->attrmask = FATTR4_CHANGE_MASK;
	ver_fattr->attrlist4 = (char *)&dchange;
	ptr = (int32_t *)&dchange;
	IXDR_PUT_HYPER(ptr, VTOR4(dvp)->r_change);
	ver_fattr->attrlist4_len = sizeof (fattr4_change);

	/* 7. getattr directory */
	argop[7].argop = OP_GETATTR;
	argop[7].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
	argop[7].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);

	/* 8. access directory */
	argop[8].argop = OP_ACCESS;
	argop[8].nfs_argop4_u.opaccess.access = ACCESS4_READ | ACCESS4_DELETE |
	    ACCESS4_MODIFY | ACCESS4_EXTEND | ACCESS4_LOOKUP;

	doqueue = 1;
	/* t is the timestamp later handed to the attribute caching code. */
	t = gethrtime();

	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);

	/*
	 * The name has moved to another server: let the referral code
	 * produce the vnode.
	 * NOTE(review): unlike the other error returns, this path does
	 * not call check_mnt_secinfo() and may leave a released pointer
	 * in *vpp -- confirm that this is intended.
	 */
	if (!isdotdot && res.status == NFS4ERR_MOVED) {
		e.error = nfs4_setup_referral(dvp, nm, vpp, cr);
		if (e.error != 0 && *vpp != NULL)
			VN_RELE(*vpp);
		nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
		    &recov_state, FALSE);
		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
		kmem_free(argop, argoplist_size);
		return (e.error);
	}

	if (nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp)) {
		/*
		 * For WRONGSEC of a non-dotdot case, send secinfo directly
		 * from this thread, do not go thru the recovery thread since
		 * we need the nm information.
		 *
		 * Not doing dotdot case because there is no specification
		 * for (PUTFH, SECINFO "..") yet.
		 */
		if (!isdotdot && res.status == NFS4ERR_WRONGSEC) {
			if ((e.error = nfs4_secinfo_vnode_otw(dvp, nm, cr)))
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, FALSE);
			else
				nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
				    &recov_state, TRUE);
			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			if (!e.error)
				goto recov_retry;
			(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
			return (e.error);
		}

		/*
		 * A FALSE return from nfs4_start_recovery() leads to a
		 * retry of the whole compound.
		 */
		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
		    OP_LOOKUP, NULL, NULL, NULL) == FALSE) {
			nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP,
			    &recov_state, TRUE);

			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
			kmem_free(argop, argoplist_size);
			goto recov_retry;
		}
	}

	nfs4_end_fop(mi, dvp, NULL, OH_LOOKUP, &recov_state, FALSE);

	if (e.error || res.array_len == 0) {
		/*
		 * If e.error isn't set, then reply has no ops (or we couldn't
		 * be here).  The only legal way to reply without an op array
		 * is via NFS4ERR_MINOR_VERS_MISMATCH.  An ops array should
		 * be in the reply for all other status values.
		 *
		 * For valid replies without an ops array, return ENOTSUP
		 * (geterrno4 xlation of VERS_MISMATCH).  For illegal replies,
		 * return EIO -- don't trust status.
		 */
		if (e.error == 0)
			e.error = (res.status == NFS4ERR_MINOR_VERS_MISMATCH) ?
			    ENOTSUP : EIO;

		kmem_free(argop, argoplist_size);
		(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
		return (e.error);
	}

	/*
	 * From here on e.error carries the errno form of the compound
	 * status unless a check below overrides it.
	 */
	e.error = geterrno4(res.status);

	/*
	 * The PUTFH and SAVEFH may have failed.
	 */
	if ((res.array[0].nfs_resop4_u.opputfh.status != NFS4_OK) ||
	    (res.array[1].nfs_resop4_u.opsavefh.status != NFS4_OK)) {
		nfs4_purge_stale_fh(e.error, dvp, cr);
		goto exit;
	}

	/*
	 * Check if the file exists, if it does delay entering
	 * into the dnlc until after we update the directory
	 * attributes so we don't cause it to get purged immediately.
	 */
	if (res.array[2].nfs_resop4_u.oplookup.status != NFS4_OK) {
		/*
		 * The lookup failed, probably no entry
		 */
		if (e.error == ENOENT && nfs4_lookup_neg_cache)
			dnlc_update(dvp, nm, DNLC_NO_VNODE);
		goto exit;
	}

	if (res.array[3].nfs_resop4_u.opgetfh.status != NFS4_OK) {
		/*
		 * The file exists but we can't get its fh for
		 * some unknown reason. Error out to be safe.
		 */
		goto exit;
	}

	fhp = &res.array[3].nfs_resop4_u.opgetfh.object;
	if (fhp->nfs_fh4_len == 0) {
		/*
		 * The file exists but the server returned a bogus
		 * (zero length) fh for some unknown reason.  Error out
		 * to be safe.
		 */
		e.error = EIO;
		goto exit;
	}
	sfhp = sfh4_get(fhp, mi);

	/*
	 * From this point every early exit must drop the reference
	 * on sfhp taken above.
	 */
	if (res.array[4].nfs_resop4_u.opgetattr.status != NFS4_OK) {
		sfh4_rele(&sfhp);
		goto exit;
	}
	garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;

	/*
	 * The RESTOREFH may have failed
	 */
	if (res.array[5].nfs_resop4_u.oprestorefh.status != NFS4_OK) {
		sfh4_rele(&sfhp);
		e.error = EIO;
		goto exit;
	}

	/*
	 * An NVERIFY status of NFS4ERR_SAME means the change attribute
	 * still matches, i.e. the directory is unchanged (else branch).
	 */
	if (res.array[6].nfs_resop4_u.opnverify.status != NFS4ERR_SAME) {
		/*
		 * First make sure the NVERIFY failed as we expected,
		 * if it didn't then be conservative and error out
		 * as we can't trust the directory.
		 */
		if (res.array[6].nfs_resop4_u.opnverify.status != NFS4_OK) {
			sfh4_rele(&sfhp);
			e.error = EIO;
			goto exit;
		}

		/*
		 * We know the NVERIFY "failed" so the directory has changed,
		 * so we must:
		 *	purge the caches (access and indirectly dnlc if needed)
		 */
		nfs4_purge_caches(dvp, NFS4_NOPURGE_DNLC, cr, TRUE);

		if (res.array[7].nfs_resop4_u.opgetattr.status != NFS4_OK) {
			sfh4_rele(&sfhp);
			goto exit;
		}
		/* Install new cached attributes for the directory. */
		nfs4_attr_cache(dvp,
		    &res.array[7].nfs_resop4_u.opgetattr.ga_res,
		    t, cr, FALSE, NULL);

		if (res.array[8].nfs_resop4_u.opaccess.status != NFS4_OK) {
			nfs4_purge_stale_fh(e.error, dvp, cr);
			sfh4_rele(&sfhp);
			e.error = geterrno4(res.status);
			goto exit;
		}

		/*
		 * Now we know the directory is valid,
		 * cache new directory access
		 */
		nfs4_access_cache(drp,
		    args.array[8].nfs_argop4_u.opaccess.access,
		    res.array[8].nfs_resop4_u.opaccess.access, cr);

		/*
		 * recheck VEXEC access
		 */
		cacc = nfs4_access_check(drp, ACCESS4_LOOKUP, cr);
		if (cacc != NFS4_ACCESS_ALLOWED) {
			/*
			 * Directory permissions might have been revoked
			 */
			if (cacc == NFS4_ACCESS_DENIED) {
				sfh4_rele(&sfhp);
				e.error = EACCES;
				goto exit;
			}

			/*
			 * Somehow we must not have asked for enough
			 * so try a singleton ACCESS, should never happen.
			 */
			e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
			if (e.error) {
				sfh4_rele(&sfhp);
				goto exit;
			}
		}

		/*
		 * The singleton ACCESS above may have overwritten e.error,
		 * so re-derive it from the compound status.
		 */
		e.error = geterrno4(res.status);
	} else {
		hrtime_t now;
		hrtime_t delta = 0;

		e.error = 0;

		/*
		 * Because the NVERIFY "succeeded" we know that the
		 * directory attributes are still valid
		 * so update r_time_attr_inval
		 *
		 * The new lifetime is the age of the cached attributes,
		 * clamped to [mi_acdirmin, mi_acdirmax]; it stays 0 when
		 * attribute caching is disabled (MI4_NOAC or VNOCACHE).
		 */
		now = gethrtime();
		mutex_enter(&drp->r_statelock);
		if (!(mi->mi_flags & MI4_NOAC) && !(dvp->v_flag & VNOCACHE)) {
			delta = now - drp->r_time_attr_saved;
			if (delta < mi->mi_acdirmin)
				delta = mi->mi_acdirmin;
			else if (delta > mi->mi_acdirmax)
				delta = mi->mi_acdirmax;
		}
		drp->r_time_attr_inval = now + delta;
		mutex_exit(&drp->r_statelock);

		/*
		 * Even though we have a valid directory attr cache,
		 * we may not have access.
		 * This should almost always hit the cache.
		 */
		e.error = nfs4_access(dvp, VEXEC, 0, cr, NULL);
		if (e.error) {
			sfh4_rele(&sfhp);
			goto exit;
		}
	}

	/*
	 * Now we have successfully completed the lookup, if the
	 * directory has changed we now have the valid attributes.
	 * We also know we have directory access.
	 * Create the new rnode and insert it in the dnlc.
	 */
	if (isdotdot) {
		e.error = nfs4_make_dotdot(sfhp, t, dvp, cr, &nvp, 1);
		if (e.error) {
			sfh4_rele(&sfhp);
			goto exit;
		}
		/*
		 * XXX if nfs4_make_dotdot uses an existing rnode
		 * XXX it doesn't update the attributes.
		 * XXX for now just save them again to save an OTW
		 */
		nfs4_attr_cache(nvp, garp, t, cr, FALSE, NULL);
	} else {
		nvp = makenfs4node(sfhp, garp, dvp->v_vfsp, t, cr,
		    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
	}
	sfh4_rele(&sfhp);

	/*
	 * Enter the new vnode in the dnlc unless the rnode is
	 * flagged created_v4.
	 */
	nrp = VTOR4(nvp);
	mutex_enter(&nrp->r_statev4_lock);
	if (!nrp->created_v4) {
		mutex_exit(&nrp->r_statev4_lock);
		dnlc_update(dvp, nm, nvp);
	} else
		mutex_exit(&nrp->r_statev4_lock);

	*vpp = nvp;

exit:
	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
	kmem_free(argop, argoplist_size);
	(void) check_mnt_secinfo(mi->mi_curr_serv, nvp);
	return (e.error);
}
6029 
#ifdef DEBUG
/*
 * nfs4lookup_dump_compound - log the ops of a lookup compound (DEBUG only).
 *
 *	where	- tag printed at the start of the dump (typically the
 *		  name of the calling function)
 *	argbase	- first element of the argop array
 *	argcnt	- number of elements of argbase to dump
 *
 * One CE_NOTE line is emitted per op, through zcmn_err() for the zone
 * returned by getzoneid().  Ops that are not specifically recognized are
 * printed by their numeric opcode.
 */
void
nfs4lookup_dump_compound(char *where, nfs_argop4 *argbase, int argcnt)
{
	int i;		/* signed, to match argcnt and the %d format */
	uint_t len;
	zoneid_t zoneid = getzoneid();
	char *s;

	zcmn_err(zoneid, CE_NOTE, "%s: dumping cmpd", where);
	for (i = 0; i < argcnt; i++) {
		nfs_argop4 *op = &argbase[i];
		switch (op->argop) {
		case OP_CPUTFH:
		case OP_PUTFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, putfh", i);
			break;
		case OP_PUTROOTFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, putrootfh", i);
			break;
		case OP_CLOOKUP:
			s = op->nfs_argop4_u.opclookup.cname;
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i, s);
			break;
		case OP_LOOKUP:
			/*
			 * The converted name is a temporary copy that we own
			 * and must free.  Start from a known length and
			 * tolerate a NULL result so that a garbage size is
			 * never handed to kmem_free().
			 * NOTE(review): assumes utf8_to_str() can return NULL
			 * (e.g. for an empty name) without setting len --
			 * confirm against its definition.
			 */
			len = 0;
			s = utf8_to_str(&op->nfs_argop4_u.oplookup.objname,
			    &len, NULL);
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookup %s", i,
			    (s != NULL) ? s : "<null>");
			if (s != NULL)
				kmem_free(s, len);
			break;
		case OP_LOOKUPP:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, lookupp ..", i);
			break;
		case OP_GETFH:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, getfh", i);
			break;
		case OP_GETATTR:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, getattr", i);
			break;
		case OP_OPENATTR:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, openattr", i);
			break;
		default:
			zcmn_err(zoneid, CE_NOTE, "\t op %d, opcode %d", i,
			    op->argop);
			break;
		}
	}
}
#endif
6079 
6080 /*
6081  * nfs4lookup_setup - constructs a multi-lookup compound request.
6082  *
6083  * Given the path "nm1/nm2/.../nmn", the following compound requests
6084  * may be created:
6085  *
6086  * Note: Getfh is not be needed because filehandle attr is mandatory, but it
6087  * is faster, for now.
6088  *
6089  * l4_getattrs indicates the type of compound requested.
6090  *
6091  * LKP4_NO_ATTRIBUTE - no attributes (used by secinfo):
6092  *
6093  *	compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ...  Lookup {nmn} }
6094  *
6095  *   total number of ops is n + 1.
6096  *
6097  * LKP4_LAST_NAMED_ATTR - multi-component path for a named
6098  *      attribute: create lookups plus one OPENATTR/GETFH/GETATTR
6099  *      before the last component, and only get attributes
6100  *      for the last component.  Note that the second-to-last
6101  *	pathname component is XATTR_RPATH, which does NOT go
6102  *	over-the-wire as a lookup.
6103  *
6104  *      compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Lookup {nmn-2};
6105  *		Openattr; Getfh; Getattr; Lookup {nmn}; Getfh; Getattr }
6106  *
6107  *   and total number of ops is n + 5.
6108  *
6109  * LKP4_LAST_ATTRDIR - multi-component path for the hidden named
6110  *      attribute directory: create lookups plus an OPENATTR
6111  *	replacing the last lookup.  Note that the last pathname
6112  *	component is XATTR_RPATH, which does NOT go over-the-wire
6113  *	as a lookup.
6114  *
6115  *      compound { Put*fh; Lookup {nm1}; Lookup {nm2}; ... Getfh; Getattr;
6116  *		Openattr; Getfh; Getattr }
6117  *
6118  *   and total number of ops is n + 5.
6119  *
6120  * LKP4_ALL_ATTRIBUTES - create lookups and get attributes for intermediate
6121  *	nodes too.
6122  *
6123  *	compound { Put*fh; Lookup {nm1}; Getfh; Getattr;
6124  *		Lookup {nm2}; ...  Lookup {nmn}; Getfh; Getattr }
6125  *
6126  *   and total number of ops is 3*n + 1.
6127  *
6128  * All cases: returns the index in the arg array of the final LOOKUP op, or
6129  * -1 if no LOOKUPs were used.
6130  */
6131 int
6132 nfs4lookup_setup(char *nm, lookup4_param_t *lookupargp, int needgetfh)
6133 {
6134 	enum lkp4_attr_setup l4_getattrs = lookupargp->l4_getattrs;
6135 	nfs_argop4 *argbase, *argop;
6136 	int arglen, argcnt;
6137 	int n = 1;	/* number of components */
6138 	int nga = 1;	/* number of Getattr's in request */
6139 	char c = '\0', *s, *p;
6140 	int lookup_idx = -1;
6141 	int argoplist_size;
6142 
6143 	/* set lookuparg response result to 0 */
6144 	lookupargp->resp->status = NFS4_OK;
6145 
6146 	/* skip leading "/" or "." e.g. ".//./" if there is */
6147 	for (; ; nm++) {
6148 		if (*nm != '/' && *nm != '.')
6149 			break;
6150 
6151 		/* ".." is counted as 1 component */
6152 		if (*nm == '.' && *(nm + 1) != '/')
6153 			break;
6154 	}
6155 
6156 	/*
6157 	 * Find n = number of components - nm must be null terminated
6158 	 * Skip "." components.
6159 	 */
6160 	if (*nm != '\0')
6161 		for (n = 1, s = nm; *s != '\0'; s++) {
6162 			if ((*s == '/') && (*(s + 1) != '/') &&
6163 			    (*(s + 1) != '\0') &&
6164 			    !(*(s + 1) == '.' && (*(s + 2) == '/' ||
6165 			    *(s + 2) == '\0')))
6166 				n++;
6167 		}
6168 	else
6169 		n = 0;
6170 
6171 	/*
6172 	 * nga is number of components that need Getfh+Getattr
6173 	 */
6174 	switch (l4_getattrs) {
6175 	case LKP4_NO_ATTRIBUTES:
6176 		nga = 0;
6177 		break;
6178 	case LKP4_ALL_ATTRIBUTES:
6179 		nga = n;
6180 		/*
6181 		 * Always have at least 1 getfh, getattr pair
6182 		 */
6183 		if (nga == 0)
6184 			nga++;
6185 		break;
6186 	case LKP4_LAST_ATTRDIR:
6187 	case LKP4_LAST_NAMED_ATTR:
6188 		nga = n+1;
6189 		break;
6190 	}
6191 
6192 	/*
6193 	 * If change to use the filehandle attr instead of getfh
6194 	 * the following line can be deleted.
6195 	 */
6196 	nga *= 2;
6197 
6198 	/*
6199 	 * calculate number of ops in request as
6200 	 * header + trailer + lookups + getattrs
6201 	 */
6202 	arglen = lookupargp->header_len + lookupargp->trailer_len + n + nga;
6203 
6204 	argoplist_size = arglen * sizeof (nfs_argop4);
6205 	argop = argbase = kmem_alloc(argoplist_size, KM_SLEEP);
6206 	lookupargp->argsp->array = argop;
6207 
6208 	argcnt = lookupargp->header_len;
6209 	argop += argcnt;
6210 
6211 	/*
6212 	 * loop and create a lookup op and possibly getattr/getfh for
6213 	 * each component. Skip "." components.
6214 	 */
6215 	for (s = nm; *s != '\0'; s = p) {
6216 		/*
6217 		 * Set up a pathname struct for each component if needed
6218 		 */
6219 		while (*s == '/')
6220 			s++;
6221 		if (*s == '\0')
6222 			break;
6223 
6224 		for (p = s; (*p != '/') && (*p != '\0'); p++)
6225 			;
6226 		c = *p;
6227 		*p = '\0';
6228 
6229 		if (s[0] == '.' && s[1] == '\0') {
6230 			*p = c;
6231 			continue;
6232 		}
6233 		if (l4_getattrs == LKP4_LAST_ATTRDIR &&
6234 		    strcmp(s, XATTR_RPATH) == 0) {
6235 			/* getfh XXX may not be needed in future */
6236 			argop->argop = OP_GETFH;
6237 			argop++;
6238 			argcnt++;
6239 
6240 			/* getattr */
6241 			argop->argop = OP_GETATTR;
6242 			argop->nfs_argop4_u.opgetattr.attr_request =
6243 			    lookupargp->ga_bits;
6244 			argop->nfs_argop4_u.opgetattr.mi =
6245 			    lookupargp->mi;
6246 			argop++;
6247 			argcnt++;
6248 
6249 			/* openattr */
6250 			argop->argop = OP_OPENATTR;
6251 		} else if (l4_getattrs == LKP4_LAST_NAMED_ATTR &&
6252 		    strcmp(s, XATTR_RPATH) == 0) {
6253 			/* openattr */
6254 			argop->argop = OP_OPENATTR;
6255 			argop++;
6256 			argcnt++;
6257 
6258 			/* getfh XXX may not be needed in future */
6259 			argop->argop = OP_GETFH;
6260 			argop++;
6261 			argcnt++;
6262 
6263 			/* getattr */
6264 			argop->argop = OP_GETATTR;
6265 			argop->nfs_argop4_u.opgetattr.attr_request =
6266 			    lookupargp->ga_bits;
6267 			argop->nfs_argop4_u.opgetattr.mi =
6268 			    lookupargp->mi;
6269 			argop++;
6270 			argcnt++;
6271 			*p = c;
6272 			continue;
6273 		} else if (s[0] == '.' && s[1] == '.' && s[2] == '\0') {
6274 			/* lookupp */
6275 			argop->argop = OP_LOOKUPP;
6276 		} else {
6277 			/* lookup */
6278 			argop->argop = OP_LOOKUP;
6279 			(void) str_to_utf8(s,
6280 			    &argop->nfs_argop4_u.oplookup.objname);
6281 		}
6282 		lookup_idx = argcnt;
6283 		argop++;
6284 		argcnt++;
6285 
6286 		*p = c;
6287 
6288 		if (l4_getattrs == LKP4_ALL_ATTRIBUTES) {
6289 			/* getfh XXX may not be needed in future */
6290 			argop->argop = OP_GETFH;
6291 			argop++;
6292 			argcnt++;
6293 
6294 			/* getattr */
6295 			argop->argop = OP_GETATTR;
6296 			argop->nfs_argop4_u.opgetattr.attr_request =
6297 			    lookupargp->ga_bits;
6298 			argop->nfs_argop4_u.opgetattr.mi =
6299 			    lookupargp->mi;
6300 			argop++;
6301 			argcnt++;
6302 		}
6303 	}
6304 
6305 	if ((l4_getattrs != LKP4_NO_ATTRIBUTES) &&
6306 	    ((l4_getattrs != LKP4_ALL_ATTRIBUTES) || (lookup_idx < 0))) {
6307 		if (needgetfh) {
6308 			/* stick in a post-lookup getfh */
6309 			argop->argop = OP_GETFH;
6310 			argcnt++;
6311 			argop++;
6312 		}
6313 		/* post-lookup getattr */
6314 		argop->argop = OP_GETATTR;
6315 		argop->nfs_argop4_u.opgetattr.attr_request =
6316 		    lookupargp->ga_bits;
6317 		argop->nfs_argop4_u.opgetattr.mi = lookupargp->mi;
6318 		argcnt++;
6319 	}
6320 	argcnt += lookupargp->trailer_len;	/* actual op count */
6321 	lookupargp->argsp->array_len = argcnt;
6322 	lookupargp->arglen = arglen;
6323 
6324 #ifdef DEBUG
6325 	if (nfs4_client_lookup_debug)
6326 		nfs4lookup_dump_compound("nfs4lookup_setup", argbase, argcnt);
6327 #endif
6328 
6329 	return (lookup_idx);
6330 }
6331 
6332 static int
6333 nfs4openattr(vnode_t *dvp, vnode_t **avp, int cflag, cred_t *cr)
6334 {
6335 	COMPOUND4args_clnt	args;
6336 	COMPOUND4res_clnt	res;
6337 	GETFH4res	*gf_res = NULL;
6338 	nfs_argop4	argop[4];
6339 	nfs_resop4	*resop = NULL;
6340 	nfs4_sharedfh_t *sfhp;
6341 	hrtime_t t;
6342 	nfs4_error_t	e;
6343 
6344 	rnode4_t	*drp;
6345 	int		doqueue = 1;
6346 	vnode_t		*vp;
6347 	int		needrecov = 0;
6348 	nfs4_recov_state_t recov_state;
6349 
6350 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
6351 
6352 	*avp = NULL;
6353 	recov_state.rs_flags = 0;
6354 	recov_state.rs_num_retry_despite_err = 0;
6355 
6356 recov_retry:
6357 	/* COMPOUND: putfh, openattr, getfh, getattr */
6358 	args.array_len = 4;
6359 	args.array = argop;
6360 	args.ctag = TAG_OPENATTR;
6361 
6362 	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
6363 	if (e.error)
6364 		return (e.error);
6365 
6366 	drp = VTOR4(dvp);
6367 
6368 	/* putfh */
6369 	argop[0].argop = OP_CPUTFH;
6370 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6371 
6372 	/* openattr */
6373 	argop[1].argop = OP_OPENATTR;
6374 	argop[1].nfs_argop4_u.opopenattr.createdir = (cflag ? TRUE : FALSE);
6375 
6376 	/* getfh */
6377 	argop[2].argop = OP_GETFH;
6378 
6379 	/* getattr */
6380 	argop[3].argop = OP_GETATTR;
6381 	argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6382 	argop[3].nfs_argop4_u.opgetattr.mi = VTOMI4(dvp);
6383 
6384 	NFS4_DEBUG(nfs4_client_call_debug, (CE_NOTE,
6385 	    "nfs4openattr: %s call, drp %s", needrecov ? "recov" : "first",
6386 	    rnode4info(drp)));
6387 
6388 	t = gethrtime();
6389 
6390 	rfs4call(VTOMI4(dvp), &args, &res, cr, &doqueue, 0, &e);
6391 
6392 	needrecov = nfs4_needs_recovery(&e, FALSE, dvp->v_vfsp);
6393 	if (needrecov) {
6394 		bool_t abort;
6395 
6396 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
6397 		    "nfs4openattr: initiating recovery\n"));
6398 
6399 		abort = nfs4_start_recovery(&e,
6400 		    VTOMI4(dvp), dvp, NULL, NULL, NULL,
6401 		    OP_OPENATTR, NULL, NULL, NULL);
6402 		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6403 		if (!e.error) {
6404 			e.error = geterrno4(res.status);
6405 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6406 		}
6407 		if (abort == FALSE)
6408 			goto recov_retry;
6409 		return (e.error);
6410 	}
6411 
6412 	if (e.error) {
6413 		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6414 		return (e.error);
6415 	}
6416 
6417 	if (res.status) {
6418 		/*
6419 		 * If OTW errro is NOTSUPP, then it should be
6420 		 * translated to EINVAL.  All Solaris file system
6421 		 * implementations return EINVAL to the syscall layer
6422 		 * when the attrdir cannot be created due to an
6423 		 * implementation restriction or noxattr mount option.
6424 		 */
6425 		if (res.status == NFS4ERR_NOTSUPP) {
6426 			mutex_enter(&drp->r_statelock);
6427 			if (drp->r_xattr_dir)
6428 				VN_RELE(drp->r_xattr_dir);
6429 			VN_HOLD(NFS4_XATTR_DIR_NOTSUPP);
6430 			drp->r_xattr_dir = NFS4_XATTR_DIR_NOTSUPP;
6431 			mutex_exit(&drp->r_statelock);
6432 
6433 			e.error = EINVAL;
6434 		} else {
6435 			e.error = geterrno4(res.status);
6436 		}
6437 
6438 		if (e.error) {
6439 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6440 			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
6441 			    needrecov);
6442 			return (e.error);
6443 		}
6444 	}
6445 
6446 	resop = &res.array[0];  /* putfh res */
6447 	ASSERT(resop->nfs_resop4_u.opgetfh.status == NFS4_OK);
6448 
6449 	resop = &res.array[1];  /* openattr res */
6450 	ASSERT(resop->nfs_resop4_u.opopenattr.status == NFS4_OK);
6451 
6452 	resop = &res.array[2];  /* getfh res */
6453 	gf_res = &resop->nfs_resop4_u.opgetfh;
6454 	if (gf_res->object.nfs_fh4_len == 0) {
6455 		*avp = NULL;
6456 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6457 		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6458 		return (ENOENT);
6459 	}
6460 
6461 	sfhp = sfh4_get(&gf_res->object, VTOMI4(dvp));
6462 	vp = makenfs4node(sfhp, &res.array[3].nfs_resop4_u.opgetattr.ga_res,
6463 	    dvp->v_vfsp, t, cr, dvp,
6464 	    fn_get(VTOSV(dvp)->sv_name, XATTR_RPATH, sfhp));
6465 	sfh4_rele(&sfhp);
6466 
6467 	if (e.error)
6468 		PURGE_ATTRCACHE4(vp);
6469 
6470 	mutex_enter(&vp->v_lock);
6471 	vp->v_flag |= V_XATTRDIR;
6472 	mutex_exit(&vp->v_lock);
6473 
6474 	*avp = vp;
6475 
6476 	mutex_enter(&drp->r_statelock);
6477 	if (drp->r_xattr_dir)
6478 		VN_RELE(drp->r_xattr_dir);
6479 	VN_HOLD(vp);
6480 	drp->r_xattr_dir = vp;
6481 
6482 	/*
6483 	 * Invalidate pathconf4 cache because r_xattr_dir is no longer
6484 	 * NULL.  xattrs could be created at any time, and we have no
6485 	 * way to update pc4_xattr_exists in the base object if/when
6486 	 * it happens.
6487 	 */
6488 	drp->r_pathconf.pc4_xattr_valid = 0;
6489 
6490 	mutex_exit(&drp->r_statelock);
6491 
6492 	nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
6493 
6494 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
6495 
6496 	return (0);
6497 }
6498 
6499 /* ARGSUSED */
6500 static int
6501 nfs4_create(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
6502 	int mode, vnode_t **vpp, cred_t *cr, int flags, caller_context_t *ct,
6503 	vsecattr_t *vsecp)
6504 {
6505 	int error;
6506 	vnode_t *vp = NULL;
6507 	rnode4_t *rp;
6508 	struct vattr vattr;
6509 	rnode4_t *drp;
6510 	vnode_t *tempvp;
6511 	enum createmode4 createmode;
6512 	bool_t must_trunc = FALSE;
6513 	int	truncating = 0;
6514 
6515 	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
6516 		return (EPERM);
6517 	if (exclusive == EXCL && (dvp->v_flag & V_XATTRDIR)) {
6518 		return (EINVAL);
6519 	}
6520 
6521 	/* . and .. have special meaning in the protocol, reject them. */
6522 
6523 	if (nm[0] == '.' && (nm[1] == '\0' || (nm[1] == '.' && nm[2] == '\0')))
6524 		return (EISDIR);
6525 
6526 	drp = VTOR4(dvp);
6527 
6528 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
6529 		return (EINTR);
6530 
6531 top:
6532 	/*
6533 	 * We make a copy of the attributes because the caller does not
6534 	 * expect us to change what va points to.
6535 	 */
6536 	vattr = *va;
6537 
6538 	/*
6539 	 * If the pathname is "", then dvp is the root vnode of
6540 	 * a remote file mounted over a local directory.
6541 	 * All that needs to be done is access
6542 	 * checking and truncation.  Note that we avoid doing
6543 	 * open w/ create because the parent directory might
6544 	 * be in pseudo-fs and the open would fail.
6545 	 */
6546 	if (*nm == '\0') {
6547 		error = 0;
6548 		VN_HOLD(dvp);
6549 		vp = dvp;
6550 		must_trunc = TRUE;
6551 	} else {
6552 		/*
6553 		 * We need to go over the wire, just to be sure whether the
6554 		 * file exists or not.  Using the DNLC can be dangerous in
6555 		 * this case when making a decision regarding existence.
6556 		 */
6557 		error = nfs4lookup(dvp, nm, &vp, cr, 1);
6558 	}
6559 
6560 	if (exclusive)
6561 		createmode = EXCLUSIVE4;
6562 	else
6563 		createmode = GUARDED4;
6564 
6565 	/*
6566 	 * error would be set if the file does not exist on the
6567 	 * server, so lets go create it.
6568 	 */
6569 	if (error) {
6570 		goto create_otw;
6571 	}
6572 
6573 	/*
6574 	 * File does exist on the server
6575 	 */
6576 	if (exclusive == EXCL)
6577 		error = EEXIST;
6578 	else if (vp->v_type == VDIR && (mode & VWRITE))
6579 		error = EISDIR;
6580 	else {
6581 		/*
6582 		 * If vnode is a device, create special vnode.
6583 		 */
6584 		if (ISVDEV(vp->v_type)) {
6585 			tempvp = vp;
6586 			vp = specvp(vp, vp->v_rdev, vp->v_type, cr);
6587 			VN_RELE(tempvp);
6588 		}
6589 		if (!(error = VOP_ACCESS(vp, mode, 0, cr, ct))) {
6590 			if ((vattr.va_mask & AT_SIZE) &&
6591 			    vp->v_type == VREG) {
6592 				rp = VTOR4(vp);
6593 				/*
6594 				 * Check here for large file handled
6595 				 * by LF-unaware process (as
6596 				 * ufs_create() does)
6597 				 */
6598 				if (!(flags & FOFFMAX)) {
6599 					mutex_enter(&rp->r_statelock);
6600 					if (rp->r_size > MAXOFF32_T)
6601 						error = EOVERFLOW;
6602 					mutex_exit(&rp->r_statelock);
6603 				}
6604 
6605 				/* if error is set then we need to return */
6606 				if (error) {
6607 					nfs_rw_exit(&drp->r_rwlock);
6608 					VN_RELE(vp);
6609 					return (error);
6610 				}
6611 
6612 				if (must_trunc) {
6613 					vattr.va_mask = AT_SIZE;
6614 					error = nfs4setattr(vp, &vattr, 0, cr,
6615 					    NULL);
6616 				} else {
6617 				/*
6618 				 * we know we have a regular file that already
6619 				 * exists and we may end up truncating the file
6620 				 * as a result of the open_otw, so flush out
6621 				 * any dirty pages for this file first.
6622 				 */
6623 					if (nfs4_has_pages(vp) &&
6624 					    ((rp->r_flags & R4DIRTY) ||
6625 					    rp->r_count > 0 ||
6626 					    rp->r_mapcnt > 0)) {
6627 						error = nfs4_putpage(vp,
6628 						    (offset_t)0, 0, 0, cr, ct);
6629 						if (error && (error == ENOSPC ||
6630 						    error == EDQUOT)) {
6631 							mutex_enter(
6632 							    &rp->r_statelock);
6633 							if (!rp->r_error)
6634 								rp->r_error =
6635 								    error;
6636 							mutex_exit(
6637 							    &rp->r_statelock);
6638 						}
6639 					}
6640 					vattr.va_mask = (AT_SIZE |
6641 					    AT_TYPE | AT_MODE);
6642 					vattr.va_type = VREG;
6643 					createmode = UNCHECKED4;
6644 					truncating = 1;
6645 					goto create_otw;
6646 				}
6647 			}
6648 		}
6649 	}
6650 	nfs_rw_exit(&drp->r_rwlock);
6651 	if (error) {
6652 		VN_RELE(vp);
6653 	} else {
6654 		vnode_t *tvp;
6655 		rnode4_t *trp;
6656 		/*
6657 		 * existing file got truncated, notify.
6658 		 */
6659 		tvp = vp;
6660 		if (vp->v_type == VREG) {
6661 			trp = VTOR4(vp);
6662 			if (IS_SHADOW(vp, trp))
6663 				tvp = RTOV4(trp);
6664 		}
6665 		vnevent_create(tvp, ct);
6666 		*vpp = vp;
6667 	}
6668 	return (error);
6669 
6670 create_otw:
6671 	dnlc_remove(dvp, nm);
6672 
6673 	ASSERT(vattr.va_mask & AT_TYPE);
6674 
6675 	/*
6676 	 * If not a regular file let nfs4mknod() handle it.
6677 	 */
6678 	if (vattr.va_type != VREG) {
6679 		error = nfs4mknod(dvp, nm, &vattr, exclusive, mode, vpp, cr);
6680 		nfs_rw_exit(&drp->r_rwlock);
6681 		return (error);
6682 	}
6683 
6684 	/*
6685 	 * It _is_ a regular file.
6686 	 */
6687 	ASSERT(vattr.va_mask & AT_MODE);
6688 	if (MANDMODE(vattr.va_mode)) {
6689 		nfs_rw_exit(&drp->r_rwlock);
6690 		return (EACCES);
6691 	}
6692 
6693 	/*
6694 	 * If this happens to be a mknod of a regular file, then flags will
6695 	 * have neither FREAD or FWRITE.  However, we must set at least one
6696 	 * for the call to nfs4open_otw.  If it's open(O_CREAT) driving
6697 	 * nfs4_create, then either FREAD, FWRITE, or FRDWR has already been
6698 	 * set (based on openmode specified by app).
6699 	 */
6700 	if ((flags & (FREAD|FWRITE)) == 0)
6701 		flags |= (FREAD|FWRITE);
6702 
6703 	error = nfs4open_otw(dvp, nm, &vattr, vpp, cr, 1, flags, createmode, 0);
6704 
6705 	if (vp != NULL) {
6706 		/* if create was successful, throw away the file's pages */
6707 		if (!error && (vattr.va_mask & AT_SIZE))
6708 			nfs4_invalidate_pages(vp, (vattr.va_size & PAGEMASK),
6709 			    cr);
6710 		/* release the lookup hold */
6711 		VN_RELE(vp);
6712 		vp = NULL;
6713 	}
6714 
6715 	/*
6716 	 * validate that we opened a regular file. This handles a misbehaving
6717 	 * server that returns an incorrect FH.
6718 	 */
6719 	if ((error == 0) && *vpp && (*vpp)->v_type != VREG) {
6720 		error = EISDIR;
6721 		VN_RELE(*vpp);
6722 	}
6723 
6724 	/*
6725 	 * If this is not an exclusive create, then the CREATE
6726 	 * request will be made with the GUARDED mode set.  This
6727 	 * means that the server will return EEXIST if the file
6728 	 * exists.  The file could exist because of a retransmitted
6729 	 * request.  In this case, we recover by starting over and
6730 	 * checking to see whether the file exists.  This second
6731 	 * time through it should and a CREATE request will not be
6732 	 * sent.
6733 	 *
6734 	 * This handles the problem of a dangling CREATE request
6735 	 * which contains attributes which indicate that the file
6736 	 * should be truncated.  This retransmitted request could
6737 	 * possibly truncate valid data in the file if not caught
6738 	 * by the duplicate request mechanism on the server or if
6739 	 * not caught by other means.  The scenario is:
6740 	 *
6741 	 * Client transmits CREATE request with size = 0
6742 	 * Client times out, retransmits request.
6743 	 * Response to the first request arrives from the server
6744 	 *  and the client proceeds on.
6745 	 * Client writes data to the file.
6746 	 * The server now processes retransmitted CREATE request
6747 	 *  and truncates file.
6748 	 *
6749 	 * The use of the GUARDED CREATE request prevents this from
6750 	 * happening because the retransmitted CREATE would fail
6751 	 * with EEXIST and would not truncate the file.
6752 	 */
6753 	if (error == EEXIST && exclusive == NONEXCL) {
6754 #ifdef DEBUG
6755 		nfs4_create_misses++;
6756 #endif
6757 		goto top;
6758 	}
6759 	nfs_rw_exit(&drp->r_rwlock);
6760 	if (truncating && !error && *vpp) {
6761 		vnode_t *tvp;
6762 		rnode4_t *trp;
6763 		/*
6764 		 * existing file got truncated, notify.
6765 		 */
6766 		tvp = *vpp;
6767 		trp = VTOR4(tvp);
6768 		if (IS_SHADOW(tvp, trp))
6769 			tvp = RTOV4(trp);
6770 		vnevent_create(tvp, ct);
6771 	}
6772 	return (error);
6773 }
6774 
6775 /*
6776  * Create compound (for mkdir, mknod, symlink):
6777  * { Putfh <dfh>; Create; Getfh; Getattr }
6778  * It's okay if setattr failed to set gid - this is not considered
6779  * an error, but purge attrs in that case.
6780  */
6781 static int
6782 call_nfs4_create_req(vnode_t *dvp, char *nm, void *data, struct vattr *va,
6783     vnode_t **vpp, cred_t *cr, nfs_ftype4 type)
6784 {
6785 	int need_end_op = FALSE;
6786 	COMPOUND4args_clnt args;
6787 	COMPOUND4res_clnt res, *resp = NULL;
6788 	nfs_argop4 *argop;
6789 	nfs_resop4 *resop;
6790 	int doqueue;
6791 	mntinfo4_t *mi;
6792 	rnode4_t *drp = VTOR4(dvp);
6793 	change_info4 *cinfo;
6794 	GETFH4res *gf_res;
6795 	struct vattr vattr;
6796 	vnode_t *vp;
6797 	fattr4 *crattr;
6798 	bool_t needrecov = FALSE;
6799 	nfs4_recov_state_t recov_state;
6800 	nfs4_sharedfh_t *sfhp = NULL;
6801 	hrtime_t t;
6802 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
6803 	int numops, argoplist_size, setgid_flag, idx_create, idx_fattr;
6804 	dirattr_info_t dinfo, *dinfop;
6805 	servinfo4_t *svp;
6806 	bitmap4 supp_attrs;
6807 
6808 	ASSERT(type == NF4DIR || type == NF4LNK || type == NF4BLK ||
6809 	    type == NF4CHR || type == NF4SOCK || type == NF4FIFO);
6810 
6811 	mi = VTOMI4(dvp);
6812 
6813 	/*
6814 	 * Make sure we properly deal with setting the right gid
6815 	 * on a new directory to reflect the parent's setgid bit
6816 	 */
6817 	setgid_flag = 0;
6818 	if (type == NF4DIR) {
6819 		struct vattr dva;
6820 
6821 		va->va_mode &= ~VSGID;
6822 		dva.va_mask = AT_MODE | AT_GID;
6823 		if (VOP_GETATTR(dvp, &dva, 0, cr, NULL) == 0) {
6824 
6825 			/*
6826 			 * If the parent's directory has the setgid bit set
6827 			 * _and_ the client was able to get a valid mapping
6828 			 * for the parent dir's owner_group, we want to
6829 			 * append NVERIFY(owner_group == dva.va_gid) and
6830 			 * SETTATTR to the CREATE compound.
6831 			 */
6832 			if (mi->mi_flags & MI4_GRPID || dva.va_mode & VSGID) {
6833 				setgid_flag = 1;
6834 				va->va_mode |= VSGID;
6835 				if (dva.va_gid != GID_NOBODY) {
6836 					va->va_mask |= AT_GID;
6837 					va->va_gid = dva.va_gid;
6838 				}
6839 			}
6840 		}
6841 	}
6842 
6843 	/*
6844 	 * Create ops:
6845 	 *	0:putfh(dir) 1:savefh(dir) 2:create 3:getfh(new) 4:getattr(new)
6846 	 *	5:restorefh(dir) 6:getattr(dir)
6847 	 *
6848 	 * if (setgid)
6849 	 *	0:putfh(dir) 1:create 2:getfh(new) 3:getattr(new)
6850 	 *	4:savefh(new) 5:putfh(dir) 6:getattr(dir) 7:restorefh(new)
6851 	 *	8:nverify 9:setattr
6852 	 */
6853 	if (setgid_flag) {
6854 		numops = 10;
6855 		idx_create = 1;
6856 		idx_fattr = 3;
6857 	} else {
6858 		numops = 7;
6859 		idx_create = 2;
6860 		idx_fattr = 4;
6861 	}
6862 
6863 	ASSERT(nfs_zone() == mi->mi_zone);
6864 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp))) {
6865 		return (EINTR);
6866 	}
6867 	recov_state.rs_flags = 0;
6868 	recov_state.rs_num_retry_despite_err = 0;
6869 
6870 	argoplist_size = numops * sizeof (nfs_argop4);
6871 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
6872 
6873 recov_retry:
6874 	if (type == NF4LNK)
6875 		args.ctag = TAG_SYMLINK;
6876 	else if (type == NF4DIR)
6877 		args.ctag = TAG_MKDIR;
6878 	else
6879 		args.ctag = TAG_MKNOD;
6880 
6881 	args.array_len = numops;
6882 	args.array = argop;
6883 
6884 	if (e.error = nfs4_start_op(mi, dvp, NULL, &recov_state)) {
6885 		nfs_rw_exit(&drp->r_rwlock);
6886 		kmem_free(argop, argoplist_size);
6887 		return (e.error);
6888 	}
6889 	need_end_op = TRUE;
6890 
6891 
6892 	/* 0: putfh directory */
6893 	argop[0].argop = OP_CPUTFH;
6894 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6895 
6896 	/* 1/2: Create object */
6897 	argop[idx_create].argop = OP_CCREATE;
6898 	argop[idx_create].nfs_argop4_u.opccreate.cname = nm;
6899 	argop[idx_create].nfs_argop4_u.opccreate.type = type;
6900 	if (type == NF4LNK) {
6901 		/*
6902 		 * symlink, treat name as data
6903 		 */
6904 		ASSERT(data != NULL);
6905 		argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.clinkdata =
6906 		    (char *)data;
6907 	}
6908 	if (type == NF4BLK || type == NF4CHR) {
6909 		ASSERT(data != NULL);
6910 		argop[idx_create].nfs_argop4_u.opccreate.ftype4_u.devdata =
6911 		    *((specdata4 *)data);
6912 	}
6913 
6914 	crattr = &argop[idx_create].nfs_argop4_u.opccreate.createattrs;
6915 
6916 	svp = drp->r_server;
6917 	(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
6918 	supp_attrs = svp->sv_supp_attrs;
6919 	nfs_rw_exit(&svp->sv_lock);
6920 
6921 	if (vattr_to_fattr4(va, NULL, crattr, 0, OP_CREATE, supp_attrs)) {
6922 		nfs_rw_exit(&drp->r_rwlock);
6923 		nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
6924 		e.error = EINVAL;
6925 		kmem_free(argop, argoplist_size);
6926 		return (e.error);
6927 	}
6928 
6929 	/* 2/3: getfh fh of created object */
6930 	ASSERT(idx_create + 1 == idx_fattr - 1);
6931 	argop[idx_create + 1].argop = OP_GETFH;
6932 
6933 	/* 3/4: getattr of new object */
6934 	argop[idx_fattr].argop = OP_GETATTR;
6935 	argop[idx_fattr].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6936 	argop[idx_fattr].nfs_argop4_u.opgetattr.mi = mi;
6937 
6938 	if (setgid_flag) {
6939 		vattr_t	_v;
6940 
6941 		argop[4].argop = OP_SAVEFH;
6942 
6943 		argop[5].argop = OP_CPUTFH;
6944 		argop[5].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
6945 
6946 		argop[6].argop = OP_GETATTR;
6947 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6948 		argop[6].nfs_argop4_u.opgetattr.mi = mi;
6949 
6950 		argop[7].argop = OP_RESTOREFH;
6951 
6952 		/*
6953 		 * nverify
6954 		 *
6955 		 * XXX - Revisit the last argument to nfs4_end_op()
6956 		 *	 once 5020486 is fixed.
6957 		 */
6958 		_v.va_mask = AT_GID;
6959 		_v.va_gid = va->va_gid;
6960 		if (e.error = nfs4args_verify(&argop[8], &_v, OP_NVERIFY,
6961 		    supp_attrs)) {
6962 			nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
6963 			nfs_rw_exit(&drp->r_rwlock);
6964 			nfs4_fattr4_free(crattr);
6965 			kmem_free(argop, argoplist_size);
6966 			return (e.error);
6967 		}
6968 
6969 		/*
6970 		 * setattr
6971 		 *
6972 		 * We _know_ we're not messing with AT_SIZE or AT_XTIME,
6973 		 * so no need for stateid or flags. Also we specify NULL
6974 		 * rp since we're only interested in setting owner_group
6975 		 * attributes.
6976 		 */
6977 		nfs4args_setattr(&argop[9], &_v, NULL, 0, NULL, cr, supp_attrs,
6978 		    &e.error, 0);
6979 
6980 		if (e.error) {
6981 			nfs4_end_op(mi, dvp, *vpp, &recov_state, TRUE);
6982 			nfs_rw_exit(&drp->r_rwlock);
6983 			nfs4_fattr4_free(crattr);
6984 			nfs4args_verify_free(&argop[8]);
6985 			kmem_free(argop, argoplist_size);
6986 			return (e.error);
6987 		}
6988 	} else {
6989 		argop[1].argop = OP_SAVEFH;
6990 
6991 		argop[5].argop = OP_RESTOREFH;
6992 
6993 		argop[6].argop = OP_GETATTR;
6994 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
6995 		argop[6].nfs_argop4_u.opgetattr.mi = mi;
6996 	}
6997 
6998 	dnlc_remove(dvp, nm);
6999 
7000 	doqueue = 1;
7001 	t = gethrtime();
7002 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7003 
7004 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7005 	if (e.error) {
7006 		PURGE_ATTRCACHE4(dvp);
7007 		if (!needrecov)
7008 			goto out;
7009 	}
7010 
7011 	if (needrecov) {
7012 		if (nfs4_start_recovery(&e, mi, dvp, NULL, NULL, NULL,
7013 		    OP_CREATE, NULL, NULL, NULL) == FALSE) {
7014 			nfs4_end_op(mi, dvp, NULL, &recov_state,
7015 			    needrecov);
7016 			need_end_op = FALSE;
7017 			nfs4_fattr4_free(crattr);
7018 			if (setgid_flag) {
7019 				nfs4args_verify_free(&argop[8]);
7020 				nfs4args_setattr_free(&argop[9]);
7021 			}
7022 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
7023 			goto recov_retry;
7024 		}
7025 	}
7026 
7027 	resp = &res;
7028 
7029 	if (res.status != NFS4_OK && res.array_len <= idx_fattr + 1) {
7030 
7031 		if (res.status == NFS4ERR_BADOWNER)
7032 			nfs4_log_badowner(mi, OP_CREATE);
7033 
7034 		e.error = geterrno4(res.status);
7035 
7036 		/*
7037 		 * This check is left over from when create was implemented
7038 		 * using a setattr op (instead of createattrs).  If the
7039 		 * putfh/create/getfh failed, the error was returned.  If
7040 		 * setattr/getattr failed, we keep going.
7041 		 *
7042 		 * It might be better to get rid of the GETFH also, and just
7043 		 * do PUTFH/CREATE/GETATTR since the FH attr is mandatory.
7044 		 * Then if any of the operations failed, we could return the
7045 		 * error now, and remove much of the error code below.
7046 		 */
7047 		if (res.array_len <= idx_fattr) {
7048 			/*
7049 			 * Either Putfh, Create or Getfh failed.
7050 			 */
7051 			PURGE_ATTRCACHE4(dvp);
7052 			/*
7053 			 * nfs4_purge_stale_fh() may generate otw calls through
7054 			 * nfs4_invalidate_pages. Hence the need to call
7055 			 * nfs4_end_op() here to avoid nfs4_start_op() deadlock.
7056 			 */
7057 			nfs4_end_op(mi, dvp, NULL, &recov_state,
7058 			    needrecov);
7059 			need_end_op = FALSE;
7060 			nfs4_purge_stale_fh(e.error, dvp, cr);
7061 			goto out;
7062 		}
7063 	}
7064 
7065 	resop = &res.array[idx_create];	/* create res */
7066 	cinfo = &resop->nfs_resop4_u.opcreate.cinfo;
7067 
7068 	resop = &res.array[idx_create + 1]; /* getfh res */
7069 	gf_res = &resop->nfs_resop4_u.opgetfh;
7070 
7071 	sfhp = sfh4_get(&gf_res->object, mi);
7072 	if (e.error) {
7073 		*vpp = vp = makenfs4node(sfhp, NULL, dvp->v_vfsp, t, cr, dvp,
7074 		    fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
7075 		if (vp->v_type == VNON) {
7076 			vattr.va_mask = AT_TYPE;
7077 			/*
7078 			 * Need to call nfs4_end_op before nfs4getattr to avoid
7079 			 * potential nfs4_start_op deadlock. See RFE 4777612.
7080 			 */
7081 			nfs4_end_op(mi, dvp, NULL, &recov_state,
7082 			    needrecov);
7083 			need_end_op = FALSE;
7084 			e.error = nfs4getattr(vp, &vattr, cr);
7085 			if (e.error) {
7086 				VN_RELE(vp);
7087 				*vpp = NULL;
7088 				goto out;
7089 			}
7090 			vp->v_type = vattr.va_type;
7091 		}
7092 		e.error = 0;
7093 	} else {
7094 		*vpp = vp = makenfs4node(sfhp,
7095 		    &res.array[idx_fattr].nfs_resop4_u.opgetattr.ga_res,
7096 		    dvp->v_vfsp, t, cr,
7097 		    dvp, fn_get(VTOSV(dvp)->sv_name, nm, sfhp));
7098 	}
7099 
7100 	/*
7101 	 * If compound succeeded, then update dir attrs
7102 	 */
7103 	if (res.status == NFS4_OK) {
7104 		dinfo.di_garp = &res.array[6].nfs_resop4_u.opgetattr.ga_res;
7105 		dinfo.di_cred = cr;
7106 		dinfo.di_time_call = t;
7107 		dinfop = &dinfo;
7108 	} else
7109 		dinfop = NULL;
7110 
7111 	/* Update directory cache attribute, readdir and dnlc caches */
7112 	nfs4_update_dircaches(cinfo, dvp, vp, nm, dinfop);
7113 
7114 out:
7115 	if (sfhp != NULL)
7116 		sfh4_rele(&sfhp);
7117 	nfs_rw_exit(&drp->r_rwlock);
7118 	nfs4_fattr4_free(crattr);
7119 	if (setgid_flag) {
7120 		nfs4args_verify_free(&argop[8]);
7121 		nfs4args_setattr_free(&argop[9]);
7122 	}
7123 	if (resp)
7124 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7125 	if (need_end_op)
7126 		nfs4_end_op(mi, dvp, NULL, &recov_state, needrecov);
7127 
7128 	kmem_free(argop, argoplist_size);
7129 	return (e.error);
7130 }
7131 
7132 /* ARGSUSED */
7133 static int
7134 nfs4mknod(vnode_t *dvp, char *nm, struct vattr *va, enum vcexcl exclusive,
7135     int mode, vnode_t **vpp, cred_t *cr)
7136 {
7137 	int error;
7138 	vnode_t *vp;
7139 	nfs_ftype4 type;
7140 	specdata4 spec, *specp = NULL;
7141 
7142 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
7143 
7144 	switch (va->va_type) {
7145 	case VCHR:
7146 	case VBLK:
7147 		type = (va->va_type == VCHR) ? NF4CHR : NF4BLK;
7148 		spec.specdata1 = getmajor(va->va_rdev);
7149 		spec.specdata2 = getminor(va->va_rdev);
7150 		specp = &spec;
7151 		break;
7152 
7153 	case VFIFO:
7154 		type = NF4FIFO;
7155 		break;
7156 	case VSOCK:
7157 		type = NF4SOCK;
7158 		break;
7159 
7160 	default:
7161 		return (EINVAL);
7162 	}
7163 
7164 	error = call_nfs4_create_req(dvp, nm, specp, va, &vp, cr, type);
7165 	if (error) {
7166 		return (error);
7167 	}
7168 
7169 	/*
7170 	 * This might not be needed any more; special case to deal
7171 	 * with problematic v2/v3 servers.  Since create was unable
7172 	 * to set group correctly, not sure what hope setattr has.
7173 	 */
7174 	if (va->va_gid != VTOR4(vp)->r_attr.va_gid) {
7175 		va->va_mask = AT_GID;
7176 		(void) nfs4setattr(vp, va, 0, cr, NULL);
7177 	}
7178 
7179 	/*
7180 	 * If vnode is a device create special vnode
7181 	 */
7182 	if (ISVDEV(vp->v_type)) {
7183 		*vpp = specvp(vp, vp->v_rdev, vp->v_type, cr);
7184 		VN_RELE(vp);
7185 	} else {
7186 		*vpp = vp;
7187 	}
7188 	return (error);
7189 }
7190 
7191 /*
7192  * Remove requires that the current fh be the target directory.
7193  * After the operation, the current fh is unchanged.
7194  * The compound op structure is:
7195  *      PUTFH(targetdir), REMOVE
7196  *
7197  * Weirdness: if the vnode to be removed is open
7198  * we rename it instead of removing it and nfs_inactive
7199  * will remove the new name.
7200  */
7201 /* ARGSUSED */
7202 static int
7203 nfs4_remove(vnode_t *dvp, char *nm, cred_t *cr, caller_context_t *ct, int flags)
7204 {
7205 	COMPOUND4args_clnt args;
7206 	COMPOUND4res_clnt res, *resp = NULL;
7207 	REMOVE4res *rm_res;
7208 	nfs_argop4 argop[3];
7209 	nfs_resop4 *resop;
7210 	vnode_t *vp;
7211 	char *tmpname;
7212 	int doqueue;
7213 	mntinfo4_t *mi;
7214 	rnode4_t *rp;
7215 	rnode4_t *drp;
7216 	int needrecov = 0;
7217 	nfs4_recov_state_t recov_state;
7218 	int isopen;
7219 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7220 	dirattr_info_t dinfo;
7221 
7222 	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
7223 		return (EPERM);
7224 	drp = VTOR4(dvp);
7225 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
7226 		return (EINTR);
7227 
7228 	e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
7229 	if (e.error) {
7230 		nfs_rw_exit(&drp->r_rwlock);
7231 		return (e.error);
7232 	}
7233 
7234 	if (vp->v_type == VDIR) {
7235 		VN_RELE(vp);
7236 		nfs_rw_exit(&drp->r_rwlock);
7237 		return (EISDIR);
7238 	}
7239 
7240 	/*
7241 	 * First just remove the entry from the name cache, as it
7242 	 * is most likely the only entry for this vp.
7243 	 */
7244 	dnlc_remove(dvp, nm);
7245 
7246 	rp = VTOR4(vp);
7247 
7248 	/*
7249 	 * For regular file types, check to see if the file is open by looking
7250 	 * at the open streams.
7251 	 * For all other types, check the reference count on the vnode.  Since
7252 	 * they are not opened OTW they never have an open stream.
7253 	 *
7254 	 * If the file is open, rename it to .nfsXXXX.
7255 	 */
7256 	if (vp->v_type != VREG) {
7257 		/*
7258 		 * If the file has a v_count > 1 then there may be more than one
7259 		 * entry in the name cache due multiple links or an open file,
7260 		 * but we don't have the real reference count so flush all
7261 		 * possible entries.
7262 		 */
7263 		if (vp->v_count > 1)
7264 			dnlc_purge_vp(vp);
7265 
7266 		/*
7267 		 * Now we have the real reference count.
7268 		 */
7269 		isopen = vp->v_count > 1;
7270 	} else {
7271 		mutex_enter(&rp->r_os_lock);
7272 		isopen = list_head(&rp->r_open_streams) != NULL;
7273 		mutex_exit(&rp->r_os_lock);
7274 	}
7275 
7276 	mutex_enter(&rp->r_statelock);
7277 	if (isopen &&
7278 	    (rp->r_unldvp == NULL || strcmp(nm, rp->r_unlname) == 0)) {
7279 		mutex_exit(&rp->r_statelock);
7280 		tmpname = newname();
7281 		e.error = nfs4rename(dvp, nm, dvp, tmpname, cr, ct);
7282 		if (e.error)
7283 			kmem_free(tmpname, MAXNAMELEN);
7284 		else {
7285 			mutex_enter(&rp->r_statelock);
7286 			if (rp->r_unldvp == NULL) {
7287 				VN_HOLD(dvp);
7288 				rp->r_unldvp = dvp;
7289 				if (rp->r_unlcred != NULL)
7290 					crfree(rp->r_unlcred);
7291 				crhold(cr);
7292 				rp->r_unlcred = cr;
7293 				rp->r_unlname = tmpname;
7294 			} else {
7295 				kmem_free(rp->r_unlname, MAXNAMELEN);
7296 				rp->r_unlname = tmpname;
7297 			}
7298 			mutex_exit(&rp->r_statelock);
7299 		}
7300 		VN_RELE(vp);
7301 		nfs_rw_exit(&drp->r_rwlock);
7302 		return (e.error);
7303 	}
7304 	/*
7305 	 * Actually remove the file/dir
7306 	 */
7307 	mutex_exit(&rp->r_statelock);
7308 
7309 	/*
7310 	 * We need to flush any dirty pages which happen to
7311 	 * be hanging around before removing the file.
7312 	 * This shouldn't happen very often since in NFSv4
7313 	 * we should be close to open consistent.
7314 	 */
7315 	if (nfs4_has_pages(vp) &&
7316 	    ((rp->r_flags & R4DIRTY) || rp->r_count > 0)) {
7317 		e.error = nfs4_putpage(vp, (u_offset_t)0, 0, 0, cr, ct);
7318 		if (e.error && (e.error == ENOSPC || e.error == EDQUOT)) {
7319 			mutex_enter(&rp->r_statelock);
7320 			if (!rp->r_error)
7321 				rp->r_error = e.error;
7322 			mutex_exit(&rp->r_statelock);
7323 		}
7324 	}
7325 
7326 	mi = VTOMI4(dvp);
7327 
7328 	(void) nfs4delegreturn(rp, NFS4_DR_REOPEN);
7329 	recov_state.rs_flags = 0;
7330 	recov_state.rs_num_retry_despite_err = 0;
7331 
7332 recov_retry:
7333 	/*
7334 	 * Remove ops: putfh dir; remove
7335 	 */
7336 	args.ctag = TAG_REMOVE;
7337 	args.array_len = 3;
7338 	args.array = argop;
7339 
7340 	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
7341 	if (e.error) {
7342 		nfs_rw_exit(&drp->r_rwlock);
7343 		VN_RELE(vp);
7344 		return (e.error);
7345 	}
7346 
7347 	/* putfh directory */
7348 	argop[0].argop = OP_CPUTFH;
7349 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
7350 
7351 	/* remove */
7352 	argop[1].argop = OP_CREMOVE;
7353 	argop[1].nfs_argop4_u.opcremove.ctarget = nm;
7354 
7355 	/* getattr dir */
7356 	argop[2].argop = OP_GETATTR;
7357 	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7358 	argop[2].nfs_argop4_u.opgetattr.mi = mi;
7359 
7360 	doqueue = 1;
7361 	dinfo.di_time_call = gethrtime();
7362 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
7363 
7364 	PURGE_ATTRCACHE4(vp);
7365 
7366 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
7367 	if (e.error)
7368 		PURGE_ATTRCACHE4(dvp);
7369 
7370 	if (needrecov) {
7371 		if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp,
7372 		    NULL, NULL, NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
7373 			if (!e.error)
7374 				(void) xdr_free(xdr_COMPOUND4res_clnt,
7375 				    (caddr_t)&res);
7376 			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
7377 			    needrecov);
7378 			goto recov_retry;
7379 		}
7380 	}
7381 
7382 	/*
7383 	 * Matching nfs4_end_op() for start_op() above.
7384 	 * There is a path in the code below which calls
7385 	 * nfs4_purge_stale_fh(), which may generate otw calls through
7386 	 * nfs4_invalidate_pages. Hence we need to call nfs4_end_op()
7387 	 * here to avoid nfs4_start_op() deadlock.
7388 	 */
7389 	nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
7390 
7391 	if (!e.error) {
7392 		resp = &res;
7393 
7394 		if (res.status) {
7395 			e.error = geterrno4(res.status);
7396 			PURGE_ATTRCACHE4(dvp);
7397 			nfs4_purge_stale_fh(e.error, dvp, cr);
7398 		} else {
7399 			resop = &res.array[1];	/* remove res */
7400 			rm_res = &resop->nfs_resop4_u.opremove;
7401 
7402 			dinfo.di_garp =
7403 			    &res.array[2].nfs_resop4_u.opgetattr.ga_res;
7404 			dinfo.di_cred = cr;
7405 
7406 			/* Update directory attr, readdir and dnlc caches */
7407 			nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
7408 			    &dinfo);
7409 		}
7410 	}
7411 	nfs_rw_exit(&drp->r_rwlock);
7412 	if (resp)
7413 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7414 
7415 	if (e.error == 0) {
7416 		vnode_t *tvp;
7417 		rnode4_t *trp;
7418 		trp = VTOR4(vp);
7419 		tvp = vp;
7420 		if (IS_SHADOW(vp, trp))
7421 			tvp = RTOV4(trp);
7422 		vnevent_remove(tvp, dvp, nm, ct);
7423 	}
7424 	VN_RELE(vp);
7425 	return (e.error);
7426 }
7427 
7428 /*
7429  * Link requires that the current fh be the target directory and the
7430  * saved fh be the source fh. After the operation, the current fh is unchanged.
7431  * Thus the compound op structure is:
7432  *	PUTFH(file), SAVEFH, PUTFH(targetdir), LINK, RESTOREFH,
7433  *	GETATTR(file)
7434  */
7435 /* ARGSUSED */
7436 static int
7437 nfs4_link(vnode_t *tdvp, vnode_t *svp, char *tnm, cred_t *cr,
7438     caller_context_t *ct, int flags)
7439 {
7440 	COMPOUND4args_clnt args;
7441 	COMPOUND4res_clnt res, *resp = NULL;
7442 	LINK4res *ln_res;
7443 	int argoplist_size  = 7 * sizeof (nfs_argop4);
7444 	nfs_argop4 *argop;
7445 	nfs_resop4 *resop;
7446 	vnode_t *realvp, *nvp;
7447 	int doqueue;
7448 	mntinfo4_t *mi;
7449 	rnode4_t *tdrp;
7450 	bool_t needrecov = FALSE;
7451 	nfs4_recov_state_t recov_state;
7452 	hrtime_t t;
7453 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
7454 	dirattr_info_t dinfo;
7455 
7456 	ASSERT(*tnm != '\0');
7457 	ASSERT(tdvp->v_type == VDIR);
7458 	ASSERT(nfs4_consistent_type(tdvp));
7459 	ASSERT(nfs4_consistent_type(svp));
7460 
7461 	if (nfs_zone() != VTOMI4(tdvp)->mi_zone)
7462 		return (EPERM);
7463 	if (VOP_REALVP(svp, &realvp, ct) == 0) {
7464 		svp = realvp;
7465 		ASSERT(nfs4_consistent_type(svp));
7466 	}
7467 
7468 	tdrp = VTOR4(tdvp);
7469 	mi = VTOMI4(svp);
7470 
7471 	if (!(mi->mi_flags & MI4_LINK)) {
7472 		return (EOPNOTSUPP);
7473 	}
7474 	recov_state.rs_flags = 0;
7475 	recov_state.rs_num_retry_despite_err = 0;
7476 
7477 	if (nfs_rw_enter_sig(&tdrp->r_rwlock, RW_WRITER, INTR4(tdvp)))
7478 		return (EINTR);
7479 
7480 recov_retry:
7481 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
7482 
7483 	args.ctag = TAG_LINK;
7484 
7485 	/*
7486 	 * Link ops: putfh fl; savefh; putfh tdir; link; getattr(dir);
7487 	 * restorefh; getattr(fl)
7488 	 */
7489 	args.array_len = 7;
7490 	args.array = argop;
7491 
7492 	e.error = nfs4_start_op(VTOMI4(svp), svp, tdvp, &recov_state);
7493 	if (e.error) {
7494 		kmem_free(argop, argoplist_size);
7495 		nfs_rw_exit(&tdrp->r_rwlock);
7496 		return (e.error);
7497 	}
7498 
7499 	/* 0. putfh file */
7500 	argop[0].argop = OP_CPUTFH;
7501 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(svp)->r_fh;
7502 
7503 	/* 1. save current fh to free up the space for the dir */
7504 	argop[1].argop = OP_SAVEFH;
7505 
7506 	/* 2. putfh targetdir */
7507 	argop[2].argop = OP_CPUTFH;
7508 	argop[2].nfs_argop4_u.opcputfh.sfh = tdrp->r_fh;
7509 
7510 	/* 3. link: current_fh is targetdir, saved_fh is source */
7511 	argop[3].argop = OP_CLINK;
7512 	argop[3].nfs_argop4_u.opclink.cnewname = tnm;
7513 
7514 	/* 4. Get attributes of dir */
7515 	argop[4].argop = OP_GETATTR;
7516 	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7517 	argop[4].nfs_argop4_u.opgetattr.mi = mi;
7518 
7519 	/* 5. If link was successful, restore current vp to file */
7520 	argop[5].argop = OP_RESTOREFH;
7521 
7522 	/* 6. Get attributes of linked object */
7523 	argop[6].argop = OP_GETATTR;
7524 	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
7525 	argop[6].nfs_argop4_u.opgetattr.mi = mi;
7526 
7527 	dnlc_remove(tdvp, tnm);
7528 
7529 	doqueue = 1;
7530 	t = gethrtime();
7531 
7532 	rfs4call(VTOMI4(svp), &args, &res, cr, &doqueue, 0, &e);
7533 
7534 	needrecov = nfs4_needs_recovery(&e, FALSE, svp->v_vfsp);
7535 	if (e.error != 0 && !needrecov) {
7536 		PURGE_ATTRCACHE4(tdvp);
7537 		PURGE_ATTRCACHE4(svp);
7538 		nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7539 		goto out;
7540 	}
7541 
7542 	if (needrecov) {
7543 		bool_t abort;
7544 
7545 		abort = nfs4_start_recovery(&e, VTOMI4(svp), svp, tdvp,
7546 		    NULL, NULL, OP_LINK, NULL, NULL, NULL);
7547 		if (abort == FALSE) {
7548 			nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state,
7549 			    needrecov);
7550 			kmem_free(argop, argoplist_size);
7551 			if (!e.error)
7552 				(void) xdr_free(xdr_COMPOUND4res_clnt,
7553 				    (caddr_t)&res);
7554 			goto recov_retry;
7555 		} else {
7556 			if (e.error != 0) {
7557 				PURGE_ATTRCACHE4(tdvp);
7558 				PURGE_ATTRCACHE4(svp);
7559 				nfs4_end_op(VTOMI4(svp), svp, tdvp,
7560 				    &recov_state, needrecov);
7561 				goto out;
7562 			}
7563 			/* fall through for res.status case */
7564 		}
7565 	}
7566 
7567 	nfs4_end_op(VTOMI4(svp), svp, tdvp, &recov_state, needrecov);
7568 
7569 	resp = &res;
7570 	if (res.status) {
7571 		/* If link succeeded, then don't return error */
7572 		e.error = geterrno4(res.status);
7573 		if (res.array_len <= 4) {
7574 			/*
7575 			 * Either Putfh, Savefh, Putfh dir, or Link failed
7576 			 */
7577 			PURGE_ATTRCACHE4(svp);
7578 			PURGE_ATTRCACHE4(tdvp);
7579 			if (e.error == EOPNOTSUPP) {
7580 				mutex_enter(&mi->mi_lock);
7581 				mi->mi_flags &= ~MI4_LINK;
7582 				mutex_exit(&mi->mi_lock);
7583 			}
7584 			/* Remap EISDIR to EPERM for non-root user for SVVS */
7585 			/* XXX-LP */
7586 			if (e.error == EISDIR && crgetuid(cr) != 0)
7587 				e.error = EPERM;
7588 			goto out;
7589 		}
7590 	}
7591 
7592 	/* either no error or one of the postop getattr failed */
7593 
7594 	/*
7595 	 * XXX - if LINK succeeded, but no attrs were returned for link
7596 	 * file, purge its cache.
7597 	 *
7598 	 * XXX Perform a simplified version of wcc checking. Instead of
7599 	 * have another getattr to get pre-op, just purge cache if
7600 	 * any of the ops prior to and including the getattr failed.
7601 	 * If the getattr succeeded then update the attrcache accordingly.
7602 	 */
7603 
7604 	/*
7605 	 * update cache with link file postattrs.
7606 	 * Note: at this point resop points to link res.
7607 	 */
7608 	resop = &res.array[3];	/* link res */
7609 	ln_res = &resop->nfs_resop4_u.oplink;
7610 	if (res.status == NFS4_OK)
7611 		e.error = nfs4_update_attrcache(res.status,
7612 		    &res.array[6].nfs_resop4_u.opgetattr.ga_res,
7613 		    t, svp, cr);
7614 
7615 	/*
7616 	 * Call makenfs4node to create the new shadow vp for tnm.
7617 	 * We pass NULL attrs because we just cached attrs for
7618 	 * the src object.  All we're trying to accomplish is to
7619 	 * to create the new shadow vnode.
7620 	 */
7621 	nvp = makenfs4node(VTOR4(svp)->r_fh, NULL, tdvp->v_vfsp, t, cr,
7622 	    tdvp, fn_get(VTOSV(tdvp)->sv_name, tnm, VTOR4(svp)->r_fh));
7623 
7624 	/* Update target cache attribute, readdir and dnlc caches */
7625 	dinfo.di_garp = &res.array[4].nfs_resop4_u.opgetattr.ga_res;
7626 	dinfo.di_time_call = t;
7627 	dinfo.di_cred = cr;
7628 
7629 	nfs4_update_dircaches(&ln_res->cinfo, tdvp, nvp, tnm, &dinfo);
7630 	ASSERT(nfs4_consistent_type(tdvp));
7631 	ASSERT(nfs4_consistent_type(svp));
7632 	ASSERT(nfs4_consistent_type(nvp));
7633 	VN_RELE(nvp);
7634 
7635 	if (!e.error) {
7636 		vnode_t *tvp;
7637 		rnode4_t *trp;
7638 		/*
7639 		 * Notify the source file of this link operation.
7640 		 */
7641 		trp = VTOR4(svp);
7642 		tvp = svp;
7643 		if (IS_SHADOW(svp, trp))
7644 			tvp = RTOV4(trp);
7645 		vnevent_link(tvp, ct);
7646 	}
7647 out:
7648 	kmem_free(argop, argoplist_size);
7649 	if (resp)
7650 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
7651 
7652 	nfs_rw_exit(&tdrp->r_rwlock);
7653 
7654 	return (e.error);
7655 }
7656 
7657 /* ARGSUSED */
7658 static int
7659 nfs4_rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
7660     caller_context_t *ct, int flags)
7661 {
7662 	vnode_t *realvp;
7663 
7664 	if (nfs_zone() != VTOMI4(odvp)->mi_zone)
7665 		return (EPERM);
7666 	if (VOP_REALVP(ndvp, &realvp, ct) == 0)
7667 		ndvp = realvp;
7668 
7669 	return (nfs4rename(odvp, onm, ndvp, nnm, cr, ct));
7670 }
7671 
7672 /*
7673  * nfs4rename does the real work of renaming in NFS Version 4.
7674  *
7675  * A file handle is considered volatile for renaming purposes if either
7676  * of the volatile bits are turned on. However, the compound may differ
7677  * based on the likelihood of the filehandle to change during rename.
7678  */
7679 static int
7680 nfs4rename(vnode_t *odvp, char *onm, vnode_t *ndvp, char *nnm, cred_t *cr,
7681     caller_context_t *ct)
7682 {
7683 	int error;
7684 	mntinfo4_t *mi;
7685 	vnode_t *nvp = NULL;
7686 	vnode_t *ovp = NULL;
7687 	char *tmpname = NULL;
7688 	rnode4_t *rp;
7689 	rnode4_t *odrp;
7690 	rnode4_t *ndrp;
7691 	int did_link = 0;
7692 	int do_link = 1;
7693 	nfsstat4 stat = NFS4_OK;
7694 
7695 	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
7696 	ASSERT(nfs4_consistent_type(odvp));
7697 	ASSERT(nfs4_consistent_type(ndvp));
7698 
7699 	if (onm[0] == '.' && (onm[1] == '\0' ||
7700 	    (onm[1] == '.' && onm[2] == '\0')))
7701 		return (EINVAL);
7702 
7703 	if (nnm[0] == '.' && (nnm[1] == '\0' ||
7704 	    (nnm[1] == '.' && nnm[2] == '\0')))
7705 		return (EINVAL);
7706 
7707 	odrp = VTOR4(odvp);
7708 	ndrp = VTOR4(ndvp);
7709 	if ((intptr_t)odrp < (intptr_t)ndrp) {
7710 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp)))
7711 			return (EINTR);
7712 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp))) {
7713 			nfs_rw_exit(&odrp->r_rwlock);
7714 			return (EINTR);
7715 		}
7716 	} else {
7717 		if (nfs_rw_enter_sig(&ndrp->r_rwlock, RW_WRITER, INTR4(ndvp)))
7718 			return (EINTR);
7719 		if (nfs_rw_enter_sig(&odrp->r_rwlock, RW_WRITER, INTR4(odvp))) {
7720 			nfs_rw_exit(&ndrp->r_rwlock);
7721 			return (EINTR);
7722 		}
7723 	}
7724 
7725 	/*
7726 	 * Lookup the target file.  If it exists, it needs to be
7727 	 * checked to see whether it is a mount point and whether
7728 	 * it is active (open).
7729 	 */
7730 	error = nfs4lookup(ndvp, nnm, &nvp, cr, 0);
7731 	if (!error) {
7732 		int	isactive;
7733 
7734 		ASSERT(nfs4_consistent_type(nvp));
7735 		/*
7736 		 * If this file has been mounted on, then just
7737 		 * return busy because renaming to it would remove
7738 		 * the mounted file system from the name space.
7739 		 */
7740 		if (vn_ismntpt(nvp)) {
7741 			VN_RELE(nvp);
7742 			nfs_rw_exit(&odrp->r_rwlock);
7743 			nfs_rw_exit(&ndrp->r_rwlock);
7744 			return (EBUSY);
7745 		}
7746 
7747 		/*
7748 		 * First just remove the entry from the name cache, as it
7749 		 * is most likely the only entry for this vp.
7750 		 */
7751 		dnlc_remove(ndvp, nnm);
7752 
7753 		rp = VTOR4(nvp);
7754 
7755 		if (nvp->v_type != VREG) {
7756 			/*
7757 			 * Purge the name cache of all references to this vnode
7758 			 * so that we can check the reference count to infer
7759 			 * whether it is active or not.
7760 			 */
7761 			if (nvp->v_count > 1)
7762 				dnlc_purge_vp(nvp);
7763 
7764 			isactive = nvp->v_count > 1;
7765 		} else {
7766 			mutex_enter(&rp->r_os_lock);
7767 			isactive = list_head(&rp->r_open_streams) != NULL;
7768 			mutex_exit(&rp->r_os_lock);
7769 		}
7770 
7771 		/*
7772 		 * If the vnode is active and is not a directory,
7773 		 * arrange to rename it to a
7774 		 * temporary file so that it will continue to be
7775 		 * accessible.  This implements the "unlink-open-file"
7776 		 * semantics for the target of a rename operation.
7777 		 * Before doing this though, make sure that the
7778 		 * source and target files are not already the same.
7779 		 */
7780 		if (isactive && nvp->v_type != VDIR) {
7781 			/*
7782 			 * Lookup the source name.
7783 			 */
7784 			error = nfs4lookup(odvp, onm, &ovp, cr, 0);
7785 
7786 			/*
7787 			 * The source name *should* already exist.
7788 			 */
7789 			if (error) {
7790 				VN_RELE(nvp);
7791 				nfs_rw_exit(&odrp->r_rwlock);
7792 				nfs_rw_exit(&ndrp->r_rwlock);
7793 				return (error);
7794 			}
7795 
7796 			ASSERT(nfs4_consistent_type(ovp));
7797 
7798 			/*
7799 			 * Compare the two vnodes.  If they are the same,
7800 			 * just release all held vnodes and return success.
7801 			 */
7802 			if (VN_CMP(ovp, nvp)) {
7803 				VN_RELE(ovp);
7804 				VN_RELE(nvp);
7805 				nfs_rw_exit(&odrp->r_rwlock);
7806 				nfs_rw_exit(&ndrp->r_rwlock);
7807 				return (0);
7808 			}
7809 
7810 			/*
7811 			 * Can't mix and match directories and non-
7812 			 * directories in rename operations.  We already
7813 			 * know that the target is not a directory.  If
7814 			 * the source is a directory, return an error.
7815 			 */
7816 			if (ovp->v_type == VDIR) {
7817 				VN_RELE(ovp);
7818 				VN_RELE(nvp);
7819 				nfs_rw_exit(&odrp->r_rwlock);
7820 				nfs_rw_exit(&ndrp->r_rwlock);
7821 				return (ENOTDIR);
7822 			}
7823 link_call:
7824 			/*
7825 			 * The target file exists, is not the same as
7826 			 * the source file, and is active.  We first
7827 			 * try to Link it to a temporary filename to
7828 			 * avoid having the server removing the file
7829 			 * completely (which could cause data loss to
7830 			 * the user's POV in the event the Rename fails
7831 			 * -- see bug 1165874).
7832 			 */
7833 			/*
7834 			 * The do_link and did_link booleans are
7835 			 * introduced in the event we get NFS4ERR_FILE_OPEN
7836 			 * returned for the Rename.  Some servers can
7837 			 * not Rename over an Open file, so they return
7838 			 * this error.  The client needs to Remove the
7839 			 * newly created Link and do two Renames, just
7840 			 * as if the server didn't support LINK.
7841 			 */
7842 			tmpname = newname();
7843 			error = 0;
7844 
7845 			if (do_link) {
7846 				error = nfs4_link(ndvp, nvp, tmpname, cr,
7847 				    NULL, 0);
7848 			}
7849 			if (error == EOPNOTSUPP || !do_link) {
7850 				error = nfs4_rename(ndvp, nnm, ndvp, tmpname,
7851 				    cr, NULL, 0);
7852 				did_link = 0;
7853 			} else {
7854 				did_link = 1;
7855 			}
7856 			if (error) {
7857 				kmem_free(tmpname, MAXNAMELEN);
7858 				VN_RELE(ovp);
7859 				VN_RELE(nvp);
7860 				nfs_rw_exit(&odrp->r_rwlock);
7861 				nfs_rw_exit(&ndrp->r_rwlock);
7862 				return (error);
7863 			}
7864 
7865 			mutex_enter(&rp->r_statelock);
7866 			if (rp->r_unldvp == NULL) {
7867 				VN_HOLD(ndvp);
7868 				rp->r_unldvp = ndvp;
7869 				if (rp->r_unlcred != NULL)
7870 					crfree(rp->r_unlcred);
7871 				crhold(cr);
7872 				rp->r_unlcred = cr;
7873 				rp->r_unlname = tmpname;
7874 			} else {
7875 				if (rp->r_unlname)
7876 					kmem_free(rp->r_unlname, MAXNAMELEN);
7877 				rp->r_unlname = tmpname;
7878 			}
7879 			mutex_exit(&rp->r_statelock);
7880 		}
7881 
7882 		(void) nfs4delegreturn(VTOR4(nvp), NFS4_DR_PUSH|NFS4_DR_REOPEN);
7883 
7884 		ASSERT(nfs4_consistent_type(nvp));
7885 	}
7886 
7887 	if (ovp == NULL) {
7888 		/*
7889 		 * When renaming directories to be a subdirectory of a
7890 		 * different parent, the dnlc entry for ".." will no
7891 		 * longer be valid, so it must be removed.
7892 		 *
7893 		 * We do a lookup here to determine whether we are renaming
7894 		 * a directory and we need to check if we are renaming
7895 		 * an unlinked file.  This might have already been done
7896 		 * in previous code, so we check ovp == NULL to avoid
7897 		 * doing it twice.
7898 		 */
7899 		error = nfs4lookup(odvp, onm, &ovp, cr, 0);
7900 		/*
7901 		 * The source name *should* already exist.
7902 		 */
7903 		if (error) {
7904 			nfs_rw_exit(&odrp->r_rwlock);
7905 			nfs_rw_exit(&ndrp->r_rwlock);
7906 			if (nvp) {
7907 				VN_RELE(nvp);
7908 			}
7909 			return (error);
7910 		}
7911 		ASSERT(ovp != NULL);
7912 		ASSERT(nfs4_consistent_type(ovp));
7913 	}
7914 
7915 	/*
7916 	 * Is the object being renamed a dir, and if so, is
7917 	 * it being renamed to a child of itself?  The underlying
7918 	 * fs should ultimately return EINVAL for this case;
7919 	 * however, buggy beta non-Solaris NFSv4 servers at
7920 	 * interop testing events have allowed this behavior,
7921 	 * and it caused our client to panic due to a recursive
7922 	 * mutex_enter in fn_move.
7923 	 *
7924 	 * The tedious locking in fn_move could be changed to
7925 	 * deal with this case, and the client could avoid the
7926 	 * panic; however, the client would just confuse itself
7927 	 * later and misbehave.  A better way to handle the broken
7928 	 * server is to detect this condition and return EINVAL
7929 	 * without ever sending the the bogus rename to the server.
7930 	 * We know the rename is invalid -- just fail it now.
7931 	 */
7932 	if (ovp->v_type == VDIR && VN_CMP(ndvp, ovp)) {
7933 		VN_RELE(ovp);
7934 		nfs_rw_exit(&odrp->r_rwlock);
7935 		nfs_rw_exit(&ndrp->r_rwlock);
7936 		if (nvp) {
7937 			VN_RELE(nvp);
7938 		}
7939 		return (EINVAL);
7940 	}
7941 
7942 	(void) nfs4delegreturn(VTOR4(ovp), NFS4_DR_PUSH|NFS4_DR_REOPEN);
7943 
7944 	/*
7945 	 * If FH4_VOL_RENAME or FH4_VOLATILE_ANY bits are set, it is
7946 	 * possible for the filehandle to change due to the rename.
7947 	 * If neither of these bits is set, but FH4_VOL_MIGRATION is set,
7948 	 * the fh will not change because of the rename, but we still need
7949 	 * to update its rnode entry with the new name for
7950 	 * an eventual fh change due to migration. The FH4_NOEXPIRE_ON_OPEN
7951 	 * has no effect on these for now, but for future improvements,
7952 	 * we might want to use it too to simplify handling of files
7953 	 * that are open with that flag on. (XXX)
7954 	 */
7955 	mi = VTOMI4(odvp);
7956 	if (NFS4_VOLATILE_FH(mi))
7957 		error = nfs4rename_volatile_fh(odvp, onm, ovp, ndvp, nnm, cr,
7958 		    &stat);
7959 	else
7960 		error = nfs4rename_persistent_fh(odvp, onm, ovp, ndvp, nnm, cr,
7961 		    &stat);
7962 
7963 	ASSERT(nfs4_consistent_type(odvp));
7964 	ASSERT(nfs4_consistent_type(ndvp));
7965 	ASSERT(nfs4_consistent_type(ovp));
7966 
7967 	if (stat == NFS4ERR_FILE_OPEN && did_link) {
7968 		do_link = 0;
7969 		/*
7970 		 * Before the 'link_call' code, we did a nfs4_lookup
7971 		 * that puts a VN_HOLD on nvp.  After the nfs4_link
7972 		 * call we call VN_RELE to match that hold.  We need
7973 		 * to place an additional VN_HOLD here since we will
7974 		 * be hitting that VN_RELE again.
7975 		 */
7976 		VN_HOLD(nvp);
7977 
7978 		(void) nfs4_remove(ndvp, tmpname, cr, NULL, 0);
7979 
7980 		/* Undo the unlinked file naming stuff we just did */
7981 		mutex_enter(&rp->r_statelock);
7982 		if (rp->r_unldvp) {
7983 			VN_RELE(ndvp);
7984 			rp->r_unldvp = NULL;
7985 			if (rp->r_unlcred != NULL)
7986 				crfree(rp->r_unlcred);
7987 			rp->r_unlcred = NULL;
7988 			/* rp->r_unlanme points to tmpname */
7989 			if (rp->r_unlname)
7990 				kmem_free(rp->r_unlname, MAXNAMELEN);
7991 			rp->r_unlname = NULL;
7992 		}
7993 		mutex_exit(&rp->r_statelock);
7994 
7995 		if (nvp) {
7996 			VN_RELE(nvp);
7997 		}
7998 		goto link_call;
7999 	}
8000 
8001 	if (error) {
8002 		VN_RELE(ovp);
8003 		nfs_rw_exit(&odrp->r_rwlock);
8004 		nfs_rw_exit(&ndrp->r_rwlock);
8005 		if (nvp) {
8006 			VN_RELE(nvp);
8007 		}
8008 		return (error);
8009 	}
8010 
8011 	/*
8012 	 * when renaming directories to be a subdirectory of a
8013 	 * different parent, the dnlc entry for ".." will no
8014 	 * longer be valid, so it must be removed
8015 	 */
8016 	rp = VTOR4(ovp);
8017 	if (ndvp != odvp) {
8018 		if (ovp->v_type == VDIR) {
8019 			dnlc_remove(ovp, "..");
8020 			if (rp->r_dir != NULL)
8021 				nfs4_purge_rddir_cache(ovp);
8022 		}
8023 	}
8024 
8025 	/*
8026 	 * If we are renaming the unlinked file, update the
8027 	 * r_unldvp and r_unlname as needed.
8028 	 */
8029 	mutex_enter(&rp->r_statelock);
8030 	if (rp->r_unldvp != NULL) {
8031 		if (strcmp(rp->r_unlname, onm) == 0) {
8032 			(void) strncpy(rp->r_unlname, nnm, MAXNAMELEN);
8033 			rp->r_unlname[MAXNAMELEN - 1] = '\0';
8034 			if (ndvp != rp->r_unldvp) {
8035 				VN_RELE(rp->r_unldvp);
8036 				rp->r_unldvp = ndvp;
8037 				VN_HOLD(ndvp);
8038 			}
8039 		}
8040 	}
8041 	mutex_exit(&rp->r_statelock);
8042 
8043 	/*
8044 	 * Notify the rename vnevents to source vnode, and to the target
8045 	 * vnode if it already existed.
8046 	 */
8047 	if (error == 0) {
8048 		vnode_t *tvp;
8049 		rnode4_t *trp;
8050 		/*
8051 		 * Notify the vnode. Each links is represented by
8052 		 * a different vnode, in nfsv4.
8053 		 */
8054 		if (nvp) {
8055 			trp = VTOR4(nvp);
8056 			tvp = nvp;
8057 			if (IS_SHADOW(nvp, trp))
8058 				tvp = RTOV4(trp);
8059 			vnevent_rename_dest(tvp, ndvp, nnm, ct);
8060 		}
8061 
8062 		/*
8063 		 * if the source and destination directory are not the
8064 		 * same notify the destination directory.
8065 		 */
8066 		if (VTOR4(odvp) != VTOR4(ndvp)) {
8067 			trp = VTOR4(ndvp);
8068 			tvp = ndvp;
8069 			if (IS_SHADOW(ndvp, trp))
8070 				tvp = RTOV4(trp);
8071 			vnevent_rename_dest_dir(tvp, ct);
8072 		}
8073 
8074 		trp = VTOR4(ovp);
8075 		tvp = ovp;
8076 		if (IS_SHADOW(ovp, trp))
8077 			tvp = RTOV4(trp);
8078 		vnevent_rename_src(tvp, odvp, onm, ct);
8079 	}
8080 
8081 	if (nvp) {
8082 		VN_RELE(nvp);
8083 	}
8084 	VN_RELE(ovp);
8085 
8086 	nfs_rw_exit(&odrp->r_rwlock);
8087 	nfs_rw_exit(&ndrp->r_rwlock);
8088 
8089 	return (error);
8090 }
8091 
8092 /*
8093  * When the parent directory has changed, sv_dfh must be updated
8094  */
8095 static void
8096 update_parentdir_sfh(vnode_t *vp, vnode_t *ndvp)
8097 {
8098 	svnode_t *sv = VTOSV(vp);
8099 	nfs4_sharedfh_t *old_dfh = sv->sv_dfh;
8100 	nfs4_sharedfh_t *new_dfh = VTOR4(ndvp)->r_fh;
8101 
8102 	sfh4_hold(new_dfh);
8103 	sv->sv_dfh = new_dfh;
8104 	sfh4_rele(&old_dfh);
8105 }
8106 
8107 /*
8108  * nfs4rename_persistent does the otw portion of renaming in NFS Version 4,
8109  * when it is known that the filehandle is persistent through rename.
8110  *
8111  * Rename requires that the current fh be the target directory and the
8112  * saved fh be the source directory. After the operation, the current fh
8113  * is unchanged.
8114  * The compound op structure for persistent fh rename is:
8115  *      PUTFH(sourcdir), SAVEFH, PUTFH(targetdir), RENAME
8116  * Rather than bother with the directory postop args, we'll simply
8117  * update that a change occurred in the cache, so no post-op getattrs.
8118  */
8119 static int
8120 nfs4rename_persistent_fh(vnode_t *odvp, char *onm, vnode_t *renvp,
8121     vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8122 {
8123 	COMPOUND4args_clnt args;
8124 	COMPOUND4res_clnt res, *resp = NULL;
8125 	nfs_argop4 *argop;
8126 	nfs_resop4 *resop;
8127 	int doqueue, argoplist_size;
8128 	mntinfo4_t *mi;
8129 	rnode4_t *odrp = VTOR4(odvp);
8130 	rnode4_t *ndrp = VTOR4(ndvp);
8131 	RENAME4res *rn_res;
8132 	bool_t needrecov;
8133 	nfs4_recov_state_t recov_state;
8134 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8135 	dirattr_info_t dinfo, *dinfop;
8136 
8137 	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8138 
8139 	recov_state.rs_flags = 0;
8140 	recov_state.rs_num_retry_despite_err = 0;
8141 
8142 	/*
8143 	 * Rename ops: putfh sdir; savefh; putfh tdir; rename; getattr tdir
8144 	 *
8145 	 * If source/target are different dirs, then append putfh(src); getattr
8146 	 */
8147 	args.array_len = (odvp == ndvp) ? 5 : 7;
8148 	argoplist_size = args.array_len * sizeof (nfs_argop4);
8149 	args.array = argop = kmem_alloc(argoplist_size, KM_SLEEP);
8150 
8151 recov_retry:
8152 	*statp = NFS4_OK;
8153 
8154 	/* No need to Lookup the file, persistent fh */
8155 	args.ctag = TAG_RENAME;
8156 
8157 	mi = VTOMI4(odvp);
8158 	e.error = nfs4_start_op(mi, odvp, ndvp, &recov_state);
8159 	if (e.error) {
8160 		kmem_free(argop, argoplist_size);
8161 		return (e.error);
8162 	}
8163 
8164 	/* 0: putfh source directory */
8165 	argop[0].argop = OP_CPUTFH;
8166 	argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8167 
8168 	/* 1: Save source fh to free up current for target */
8169 	argop[1].argop = OP_SAVEFH;
8170 
8171 	/* 2: putfh targetdir */
8172 	argop[2].argop = OP_CPUTFH;
8173 	argop[2].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8174 
8175 	/* 3: current_fh is targetdir, saved_fh is sourcedir */
8176 	argop[3].argop = OP_CRENAME;
8177 	argop[3].nfs_argop4_u.opcrename.coldname = onm;
8178 	argop[3].nfs_argop4_u.opcrename.cnewname = nnm;
8179 
8180 	/* 4: getattr (targetdir) */
8181 	argop[4].argop = OP_GETATTR;
8182 	argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8183 	argop[4].nfs_argop4_u.opgetattr.mi = mi;
8184 
8185 	if (ndvp != odvp) {
8186 
8187 		/* 5: putfh (sourcedir) */
8188 		argop[5].argop = OP_CPUTFH;
8189 		argop[5].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8190 
8191 		/* 6: getattr (sourcedir) */
8192 		argop[6].argop = OP_GETATTR;
8193 		argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8194 		argop[6].nfs_argop4_u.opgetattr.mi = mi;
8195 	}
8196 
8197 	dnlc_remove(odvp, onm);
8198 	dnlc_remove(ndvp, nnm);
8199 
8200 	doqueue = 1;
8201 	dinfo.di_time_call = gethrtime();
8202 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8203 
8204 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8205 	if (e.error) {
8206 		PURGE_ATTRCACHE4(odvp);
8207 		PURGE_ATTRCACHE4(ndvp);
8208 	} else {
8209 		*statp = res.status;
8210 	}
8211 
8212 	if (needrecov) {
8213 		if (nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8214 		    OP_RENAME, NULL, NULL, NULL) == FALSE) {
8215 			nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
8216 			if (!e.error)
8217 				(void) xdr_free(xdr_COMPOUND4res_clnt,
8218 				    (caddr_t)&res);
8219 			goto recov_retry;
8220 		}
8221 	}
8222 
8223 	if (!e.error) {
8224 		resp = &res;
8225 		/*
8226 		 * as long as OP_RENAME
8227 		 */
8228 		if (res.status != NFS4_OK && res.array_len <= 4) {
8229 			e.error = geterrno4(res.status);
8230 			PURGE_ATTRCACHE4(odvp);
8231 			PURGE_ATTRCACHE4(ndvp);
8232 			/*
8233 			 * System V defines rename to return EEXIST, not
8234 			 * ENOTEMPTY if the target directory is not empty.
8235 			 * Over the wire, the error is NFSERR_ENOTEMPTY
8236 			 * which geterrno4 maps to ENOTEMPTY.
8237 			 */
8238 			if (e.error == ENOTEMPTY)
8239 				e.error = EEXIST;
8240 		} else {
8241 
8242 			resop = &res.array[3];	/* rename res */
8243 			rn_res = &resop->nfs_resop4_u.oprename;
8244 
8245 			if (res.status == NFS4_OK) {
8246 				/*
8247 				 * Update target attribute, readdir and dnlc
8248 				 * caches.
8249 				 */
8250 				dinfo.di_garp =
8251 				    &res.array[4].nfs_resop4_u.opgetattr.ga_res;
8252 				dinfo.di_cred = cr;
8253 				dinfop = &dinfo;
8254 			} else
8255 				dinfop = NULL;
8256 
8257 			nfs4_update_dircaches(&rn_res->target_cinfo,
8258 			    ndvp, NULL, NULL, dinfop);
8259 
8260 			/*
8261 			 * Update source attribute, readdir and dnlc caches
8262 			 *
8263 			 */
8264 			if (ndvp != odvp) {
8265 				update_parentdir_sfh(renvp, ndvp);
8266 
8267 				if (dinfop)
8268 					dinfo.di_garp =
8269 					    &(res.array[6].nfs_resop4_u.
8270 					    opgetattr.ga_res);
8271 
8272 				nfs4_update_dircaches(&rn_res->source_cinfo,
8273 				    odvp, NULL, NULL, dinfop);
8274 			}
8275 
8276 			fn_move(VTOSV(renvp)->sv_name, VTOSV(ndvp)->sv_name,
8277 			    nnm);
8278 		}
8279 	}
8280 
8281 	if (resp)
8282 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8283 	nfs4_end_op(mi, odvp, ndvp, &recov_state, needrecov);
8284 	kmem_free(argop, argoplist_size);
8285 
8286 	return (e.error);
8287 }
8288 
8289 /*
8290  * nfs4rename_volatile_fh does the otw part of renaming in NFS Version 4, when
8291  * it is possible for the filehandle to change due to the rename.
8292  *
8293  * The compound req in this case includes a post-rename lookup and getattr
8294  * to ensure that we have the correct fh and attributes for the object.
8295  *
8296  * Rename requires that the current fh be the target directory and the
8297  * saved fh be the source directory. After the operation, the current fh
8298  * is unchanged.
8299  *
8300  * We need the new filehandle (hence a LOOKUP and GETFH) so that we can
8301  * update the filehandle for the renamed object.  We also get the old
8302  * filehandle for historical reasons; this should be taken out sometime.
8303  * This results in a rather cumbersome compound...
8304  *
8305  *    PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old),
8306  *    PUTFH(targetdir), RENAME, LOOKUP(trgt), GETFH(new), GETATTR
8307  *
8308  */
8309 static int
8310 nfs4rename_volatile_fh(vnode_t *odvp, char *onm, vnode_t *ovp,
8311     vnode_t *ndvp, char *nnm, cred_t *cr, nfsstat4 *statp)
8312 {
8313 	COMPOUND4args_clnt args;
8314 	COMPOUND4res_clnt res, *resp = NULL;
8315 	int argoplist_size;
8316 	nfs_argop4 *argop;
8317 	nfs_resop4 *resop;
8318 	int doqueue;
8319 	mntinfo4_t *mi;
8320 	rnode4_t *odrp = VTOR4(odvp);	/* old directory */
8321 	rnode4_t *ndrp = VTOR4(ndvp);	/* new directory */
8322 	rnode4_t *orp = VTOR4(ovp);	/* object being renamed */
8323 	RENAME4res *rn_res;
8324 	GETFH4res *ngf_res;
8325 	bool_t needrecov;
8326 	nfs4_recov_state_t recov_state;
8327 	hrtime_t t;
8328 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8329 	dirattr_info_t dinfo, *dinfop = &dinfo;
8330 
8331 	ASSERT(nfs_zone() == VTOMI4(odvp)->mi_zone);
8332 
8333 	recov_state.rs_flags = 0;
8334 	recov_state.rs_num_retry_despite_err = 0;
8335 
8336 recov_retry:
8337 	*statp = NFS4_OK;
8338 
8339 	/*
8340 	 * There is a window between the RPC and updating the path and
8341 	 * filehandle stored in the rnode.  Lock out the FHEXPIRED recovery
8342 	 * code, so that it doesn't try to use the old path during that
8343 	 * window.
8344 	 */
8345 	mutex_enter(&orp->r_statelock);
8346 	while (orp->r_flags & R4RECEXPFH) {
8347 		klwp_t *lwp = ttolwp(curthread);
8348 
8349 		if (lwp != NULL)
8350 			lwp->lwp_nostop++;
8351 		if (cv_wait_sig(&orp->r_cv, &orp->r_statelock) == 0) {
8352 			mutex_exit(&orp->r_statelock);
8353 			if (lwp != NULL)
8354 				lwp->lwp_nostop--;
8355 			return (EINTR);
8356 		}
8357 		if (lwp != NULL)
8358 			lwp->lwp_nostop--;
8359 	}
8360 	orp->r_flags |= R4RECEXPFH;
8361 	mutex_exit(&orp->r_statelock);
8362 
8363 	mi = VTOMI4(odvp);
8364 
8365 	args.ctag = TAG_RENAME_VFH;
8366 	args.array_len = (odvp == ndvp) ? 10 : 12;
8367 	argoplist_size  = args.array_len * sizeof (nfs_argop4);
8368 	argop = kmem_alloc(argoplist_size, KM_SLEEP);
8369 
8370 	/*
8371 	 * Rename ops:
8372 	 *    PUTFH(sourcdir), SAVEFH, LOOKUP(src), GETFH(old),
8373 	 *    PUTFH(targetdir), RENAME, GETATTR(targetdir)
8374 	 *    LOOKUP(trgt), GETFH(new), GETATTR,
8375 	 *
8376 	 *    if (odvp != ndvp)
8377 	 *	add putfh(sourcedir), getattr(sourcedir) }
8378 	 */
8379 	args.array = argop;
8380 
8381 	e.error = nfs4_start_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8382 	    &recov_state, NULL);
8383 	if (e.error) {
8384 		kmem_free(argop, argoplist_size);
8385 		mutex_enter(&orp->r_statelock);
8386 		orp->r_flags &= ~R4RECEXPFH;
8387 		cv_broadcast(&orp->r_cv);
8388 		mutex_exit(&orp->r_statelock);
8389 		return (e.error);
8390 	}
8391 
8392 	/* 0: putfh source directory */
8393 	argop[0].argop = OP_CPUTFH;
8394 	argop[0].nfs_argop4_u.opcputfh.sfh = odrp->r_fh;
8395 
8396 	/* 1: Save source fh to free up current for target */
8397 	argop[1].argop = OP_SAVEFH;
8398 
8399 	/* 2: Lookup pre-rename fh of renamed object */
8400 	argop[2].argop = OP_CLOOKUP;
8401 	argop[2].nfs_argop4_u.opclookup.cname = onm;
8402 
8403 	/* 3: getfh fh of renamed object (before rename) */
8404 	argop[3].argop = OP_GETFH;
8405 
8406 	/* 4: putfh targetdir */
8407 	argop[4].argop = OP_CPUTFH;
8408 	argop[4].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8409 
8410 	/* 5: current_fh is targetdir, saved_fh is sourcedir */
8411 	argop[5].argop = OP_CRENAME;
8412 	argop[5].nfs_argop4_u.opcrename.coldname = onm;
8413 	argop[5].nfs_argop4_u.opcrename.cnewname = nnm;
8414 
8415 	/* 6: getattr of target dir (post op attrs) */
8416 	argop[6].argop = OP_GETATTR;
8417 	argop[6].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8418 	argop[6].nfs_argop4_u.opgetattr.mi = mi;
8419 
8420 	/* 7: Lookup post-rename fh of renamed object */
8421 	argop[7].argop = OP_CLOOKUP;
8422 	argop[7].nfs_argop4_u.opclookup.cname = nnm;
8423 
8424 	/* 8: getfh fh of renamed object (after rename) */
8425 	argop[8].argop = OP_GETFH;
8426 
8427 	/* 9: getattr of renamed object */
8428 	argop[9].argop = OP_GETATTR;
8429 	argop[9].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8430 	argop[9].nfs_argop4_u.opgetattr.mi = mi;
8431 
8432 	/*
8433 	 * If source/target dirs are different, then get new post-op
8434 	 * attrs for source dir also.
8435 	 */
8436 	if (ndvp != odvp) {
8437 		/* 10: putfh (sourcedir) */
8438 		argop[10].argop = OP_CPUTFH;
8439 		argop[10].nfs_argop4_u.opcputfh.sfh = ndrp->r_fh;
8440 
8441 		/* 11: getattr (sourcedir) */
8442 		argop[11].argop = OP_GETATTR;
8443 		argop[11].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8444 		argop[11].nfs_argop4_u.opgetattr.mi = mi;
8445 	}
8446 
8447 	dnlc_remove(odvp, onm);
8448 	dnlc_remove(ndvp, nnm);
8449 
8450 	doqueue = 1;
8451 	t = gethrtime();
8452 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8453 
8454 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8455 	if (e.error) {
8456 		PURGE_ATTRCACHE4(odvp);
8457 		PURGE_ATTRCACHE4(ndvp);
8458 		if (!needrecov) {
8459 			nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8460 			    &recov_state, needrecov);
8461 			goto out;
8462 		}
8463 	} else {
8464 		*statp = res.status;
8465 	}
8466 
8467 	if (needrecov) {
8468 		bool_t abort;
8469 
8470 		abort = nfs4_start_recovery(&e, mi, odvp, ndvp, NULL, NULL,
8471 		    OP_RENAME, NULL, NULL, NULL);
8472 		if (abort == FALSE) {
8473 			nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8474 			    &recov_state, needrecov);
8475 			kmem_free(argop, argoplist_size);
8476 			if (!e.error)
8477 				(void) xdr_free(xdr_COMPOUND4res_clnt,
8478 				    (caddr_t)&res);
8479 			mutex_enter(&orp->r_statelock);
8480 			orp->r_flags &= ~R4RECEXPFH;
8481 			cv_broadcast(&orp->r_cv);
8482 			mutex_exit(&orp->r_statelock);
8483 			goto recov_retry;
8484 		} else {
8485 			if (e.error != 0) {
8486 				nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME,
8487 				    &recov_state, needrecov);
8488 				goto out;
8489 			}
8490 			/* fall through for res.status case */
8491 		}
8492 	}
8493 
8494 	resp = &res;
8495 	/*
8496 	 * If OP_RENAME (or any prev op) failed, then return an error.
8497 	 * OP_RENAME is index 5, so if array len <= 6 we return an error.
8498 	 */
8499 	if ((res.status != NFS4_OK) && (res.array_len <= 6)) {
8500 		/*
8501 		 * Error in an op other than last Getattr
8502 		 */
8503 		e.error = geterrno4(res.status);
8504 		PURGE_ATTRCACHE4(odvp);
8505 		PURGE_ATTRCACHE4(ndvp);
8506 		/*
8507 		 * System V defines rename to return EEXIST, not
8508 		 * ENOTEMPTY if the target directory is not empty.
8509 		 * Over the wire, the error is NFSERR_ENOTEMPTY
8510 		 * which geterrno4 maps to ENOTEMPTY.
8511 		 */
8512 		if (e.error == ENOTEMPTY)
8513 			e.error = EEXIST;
8514 		nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state,
8515 		    needrecov);
8516 		goto out;
8517 	}
8518 
8519 	/* rename results */
8520 	rn_res = &res.array[5].nfs_resop4_u.oprename;
8521 
8522 	if (res.status == NFS4_OK) {
8523 		/* Update target attribute, readdir and dnlc caches */
8524 		dinfo.di_garp =
8525 		    &res.array[6].nfs_resop4_u.opgetattr.ga_res;
8526 		dinfo.di_cred = cr;
8527 		dinfo.di_time_call = t;
8528 	} else
8529 		dinfop = NULL;
8530 
8531 	/* Update source cache attribute, readdir and dnlc caches */
8532 	nfs4_update_dircaches(&rn_res->target_cinfo, ndvp, NULL, NULL, dinfop);
8533 
8534 	/* Update source cache attribute, readdir and dnlc caches */
8535 	if (ndvp != odvp) {
8536 		update_parentdir_sfh(ovp, ndvp);
8537 
8538 		/*
8539 		 * If dinfop is non-NULL, then compound succeded, so
8540 		 * set di_garp to attrs for source dir.  dinfop is only
8541 		 * set to NULL when compound fails.
8542 		 */
8543 		if (dinfop)
8544 			dinfo.di_garp =
8545 			    &res.array[11].nfs_resop4_u.opgetattr.ga_res;
8546 		nfs4_update_dircaches(&rn_res->source_cinfo, odvp, NULL, NULL,
8547 		    dinfop);
8548 	}
8549 
8550 	/*
8551 	 * Update the rnode with the new component name and args,
8552 	 * and if the file handle changed, also update it with the new fh.
8553 	 * This is only necessary if the target object has an rnode
8554 	 * entry and there is no need to create one for it.
8555 	 */
8556 	resop = &res.array[8];	/* getfh new res */
8557 	ngf_res = &resop->nfs_resop4_u.opgetfh;
8558 
8559 	/*
8560 	 * Update the path and filehandle for the renamed object.
8561 	 */
8562 	nfs4rename_update(ovp, ndvp, &ngf_res->object, nnm);
8563 
8564 	nfs4_end_fop(mi, odvp, ndvp, OH_VFH_RENAME, &recov_state, needrecov);
8565 
8566 	if (res.status == NFS4_OK) {
8567 		resop++;	/* getattr res */
8568 		e.error = nfs4_update_attrcache(res.status,
8569 		    &resop->nfs_resop4_u.opgetattr.ga_res,
8570 		    t, ovp, cr);
8571 	}
8572 
8573 out:
8574 	kmem_free(argop, argoplist_size);
8575 	if (resp)
8576 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8577 	mutex_enter(&orp->r_statelock);
8578 	orp->r_flags &= ~R4RECEXPFH;
8579 	cv_broadcast(&orp->r_cv);
8580 	mutex_exit(&orp->r_statelock);
8581 
8582 	return (e.error);
8583 }
8584 
8585 /* ARGSUSED */
8586 static int
8587 nfs4_mkdir(vnode_t *dvp, char *nm, struct vattr *va, vnode_t **vpp, cred_t *cr,
8588     caller_context_t *ct, int flags, vsecattr_t *vsecp)
8589 {
8590 	int error;
8591 	vnode_t *vp;
8592 
8593 	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
8594 		return (EPERM);
8595 	/*
8596 	 * As ".." has special meaning and rather than send a mkdir
8597 	 * over the wire to just let the server freak out, we just
8598 	 * short circuit it here and return EEXIST
8599 	 */
8600 	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
8601 		return (EEXIST);
8602 
8603 	/*
8604 	 * Decision to get the right gid and setgid bit of the
8605 	 * new directory is now made in call_nfs4_create_req.
8606 	 */
8607 	va->va_mask |= AT_MODE;
8608 	error = call_nfs4_create_req(dvp, nm, NULL, va, &vp, cr, NF4DIR);
8609 	if (error)
8610 		return (error);
8611 
8612 	*vpp = vp;
8613 	return (0);
8614 }
8615 
8616 
8617 /*
8618  * rmdir is using the same remove v4 op as does remove.
8619  * Remove requires that the current fh be the target directory.
8620  * After the operation, the current fh is unchanged.
8621  * The compound op structure is:
8622  *      PUTFH(targetdir), REMOVE
8623  */
8624 /*ARGSUSED4*/
8625 static int
8626 nfs4_rmdir(vnode_t *dvp, char *nm, vnode_t *cdir, cred_t *cr,
8627     caller_context_t *ct, int flags)
8628 {
8629 	int need_end_op = FALSE;
8630 	COMPOUND4args_clnt args;
8631 	COMPOUND4res_clnt res, *resp = NULL;
8632 	REMOVE4res *rm_res;
8633 	nfs_argop4 argop[3];
8634 	nfs_resop4 *resop;
8635 	vnode_t *vp;
8636 	int doqueue;
8637 	mntinfo4_t *mi;
8638 	rnode4_t *drp;
8639 	bool_t needrecov = FALSE;
8640 	nfs4_recov_state_t recov_state;
8641 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
8642 	dirattr_info_t dinfo, *dinfop;
8643 
8644 	if (nfs_zone() != VTOMI4(dvp)->mi_zone)
8645 		return (EPERM);
8646 	/*
8647 	 * As ".." has special meaning and rather than send a rmdir
8648 	 * over the wire to just let the server freak out, we just
8649 	 * short circuit it here and return EEXIST
8650 	 */
8651 	if (nm[0] == '.' && nm[1] == '.' && nm[2] == '\0')
8652 		return (EEXIST);
8653 
8654 	drp = VTOR4(dvp);
8655 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_WRITER, INTR4(dvp)))
8656 		return (EINTR);
8657 
8658 	/*
8659 	 * Attempt to prevent a rmdir(".") from succeeding.
8660 	 */
8661 	e.error = nfs4lookup(dvp, nm, &vp, cr, 0);
8662 	if (e.error) {
8663 		nfs_rw_exit(&drp->r_rwlock);
8664 		return (e.error);
8665 	}
8666 	if (vp == cdir) {
8667 		VN_RELE(vp);
8668 		nfs_rw_exit(&drp->r_rwlock);
8669 		return (EINVAL);
8670 	}
8671 
8672 	/*
8673 	 * Since nfsv4 remove op works on both files and directories,
8674 	 * check that the removed object is indeed a directory.
8675 	 */
8676 	if (vp->v_type != VDIR) {
8677 		VN_RELE(vp);
8678 		nfs_rw_exit(&drp->r_rwlock);
8679 		return (ENOTDIR);
8680 	}
8681 
8682 	/*
8683 	 * First just remove the entry from the name cache, as it
8684 	 * is most likely an entry for this vp.
8685 	 */
8686 	dnlc_remove(dvp, nm);
8687 
8688 	/*
8689 	 * If there vnode reference count is greater than one, then
8690 	 * there may be additional references in the DNLC which will
8691 	 * need to be purged.  First, trying removing the entry for
8692 	 * the parent directory and see if that removes the additional
8693 	 * reference(s).  If that doesn't do it, then use dnlc_purge_vp
8694 	 * to completely remove any references to the directory which
8695 	 * might still exist in the DNLC.
8696 	 */
8697 	if (vp->v_count > 1) {
8698 		dnlc_remove(vp, "..");
8699 		if (vp->v_count > 1)
8700 			dnlc_purge_vp(vp);
8701 	}
8702 
8703 	mi = VTOMI4(dvp);
8704 	recov_state.rs_flags = 0;
8705 	recov_state.rs_num_retry_despite_err = 0;
8706 
8707 recov_retry:
8708 	args.ctag = TAG_RMDIR;
8709 
8710 	/*
8711 	 * Rmdir ops: putfh dir; remove
8712 	 */
8713 	args.array_len = 3;
8714 	args.array = argop;
8715 
8716 	e.error = nfs4_start_op(VTOMI4(dvp), dvp, NULL, &recov_state);
8717 	if (e.error) {
8718 		nfs_rw_exit(&drp->r_rwlock);
8719 		return (e.error);
8720 	}
8721 	need_end_op = TRUE;
8722 
8723 	/* putfh directory */
8724 	argop[0].argop = OP_CPUTFH;
8725 	argop[0].nfs_argop4_u.opcputfh.sfh = drp->r_fh;
8726 
8727 	/* remove */
8728 	argop[1].argop = OP_CREMOVE;
8729 	argop[1].nfs_argop4_u.opcremove.ctarget = nm;
8730 
8731 	/* getattr (postop attrs for dir that contained removed dir) */
8732 	argop[2].argop = OP_GETATTR;
8733 	argop[2].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
8734 	argop[2].nfs_argop4_u.opgetattr.mi = mi;
8735 
8736 	dinfo.di_time_call = gethrtime();
8737 	doqueue = 1;
8738 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
8739 
8740 	PURGE_ATTRCACHE4(vp);
8741 
8742 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
8743 	if (e.error) {
8744 		PURGE_ATTRCACHE4(dvp);
8745 	}
8746 
8747 	if (needrecov) {
8748 		if (nfs4_start_recovery(&e, VTOMI4(dvp), dvp, NULL, NULL,
8749 		    NULL, OP_REMOVE, NULL, NULL, NULL) == FALSE) {
8750 			if (!e.error)
8751 				(void) xdr_free(xdr_COMPOUND4res_clnt,
8752 				    (caddr_t)&res);
8753 
8754 			nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state,
8755 			    needrecov);
8756 			need_end_op = FALSE;
8757 			goto recov_retry;
8758 		}
8759 	}
8760 
8761 	if (!e.error) {
8762 		resp = &res;
8763 
8764 		/*
8765 		 * Only return error if first 2 ops (OP_REMOVE or earlier)
8766 		 * failed.
8767 		 */
8768 		if (res.status != NFS4_OK && res.array_len <= 2) {
8769 			e.error = geterrno4(res.status);
8770 			PURGE_ATTRCACHE4(dvp);
8771 			nfs4_end_op(VTOMI4(dvp), dvp, NULL,
8772 			    &recov_state, needrecov);
8773 			need_end_op = FALSE;
8774 			nfs4_purge_stale_fh(e.error, dvp, cr);
8775 			/*
8776 			 * System V defines rmdir to return EEXIST, not
8777 			 * ENOTEMPTY if the directory is not empty.  Over
8778 			 * the wire, the error is NFSERR_ENOTEMPTY which
8779 			 * geterrno4 maps to ENOTEMPTY.
8780 			 */
8781 			if (e.error == ENOTEMPTY)
8782 				e.error = EEXIST;
8783 		} else {
8784 			resop = &res.array[1];	/* remove res */
8785 			rm_res = &resop->nfs_resop4_u.opremove;
8786 
8787 			if (res.status == NFS4_OK) {
8788 				resop = &res.array[2];	/* dir attrs */
8789 				dinfo.di_garp =
8790 				    &resop->nfs_resop4_u.opgetattr.ga_res;
8791 				dinfo.di_cred = cr;
8792 				dinfop = &dinfo;
8793 			} else
8794 				dinfop = NULL;
8795 
8796 			/* Update dir attribute, readdir and dnlc caches */
8797 			nfs4_update_dircaches(&rm_res->cinfo, dvp, NULL, NULL,
8798 			    dinfop);
8799 
8800 			/* destroy rddir cache for dir that was removed */
8801 			if (VTOR4(vp)->r_dir != NULL)
8802 				nfs4_purge_rddir_cache(vp);
8803 		}
8804 	}
8805 
8806 	if (need_end_op)
8807 		nfs4_end_op(VTOMI4(dvp), dvp, NULL, &recov_state, needrecov);
8808 
8809 	nfs_rw_exit(&drp->r_rwlock);
8810 
8811 	if (resp)
8812 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
8813 
8814 	if (e.error == 0) {
8815 		vnode_t *tvp;
8816 		rnode4_t *trp;
8817 		trp = VTOR4(vp);
8818 		tvp = vp;
8819 		if (IS_SHADOW(vp, trp))
8820 			tvp = RTOV4(trp);
8821 		vnevent_rmdir(tvp, dvp, nm, ct);
8822 	}
8823 
8824 	VN_RELE(vp);
8825 
8826 	return (e.error);
8827 }
8828 
8829 /* ARGSUSED */
8830 static int
8831 nfs4_symlink(vnode_t *dvp, char *lnm, struct vattr *tva, char *tnm, cred_t *cr,
8832     caller_context_t *ct, int flags)
8833 {
8834 	int error;
8835 	vnode_t *vp;
8836 	rnode4_t *rp;
8837 	char *contents;
8838 	mntinfo4_t *mi = VTOMI4(dvp);
8839 
8840 	if (nfs_zone() != mi->mi_zone)
8841 		return (EPERM);
8842 	if (!(mi->mi_flags & MI4_SYMLINK))
8843 		return (EOPNOTSUPP);
8844 
8845 	error = call_nfs4_create_req(dvp, lnm, tnm, tva, &vp, cr, NF4LNK);
8846 	if (error)
8847 		return (error);
8848 
8849 	ASSERT(nfs4_consistent_type(vp));
8850 	rp = VTOR4(vp);
8851 	if (nfs4_do_symlink_cache && rp->r_symlink.contents == NULL) {
8852 
8853 		contents = kmem_alloc(MAXPATHLEN, KM_SLEEP);
8854 
8855 		if (contents != NULL) {
8856 			mutex_enter(&rp->r_statelock);
8857 			if (rp->r_symlink.contents == NULL) {
8858 				rp->r_symlink.len = strlen(tnm);
8859 				bcopy(tnm, contents, rp->r_symlink.len);
8860 				rp->r_symlink.contents = contents;
8861 				rp->r_symlink.size = MAXPATHLEN;
8862 				mutex_exit(&rp->r_statelock);
8863 			} else {
8864 				mutex_exit(&rp->r_statelock);
8865 				kmem_free((void *)contents, MAXPATHLEN);
8866 			}
8867 		}
8868 	}
8869 	VN_RELE(vp);
8870 
8871 	return (error);
8872 }
8873 
8874 
8875 /*
8876  * Read directory entries.
8877  * There are some weird things to look out for here.  The uio_loffset
8878  * field is either 0 or it is the offset returned from a previous
8879  * readdir.  It is an opaque value used by the server to find the
8880  * correct directory block to read. The count field is the number
8881  * of blocks to read on the server.  This is advisory only, the server
8882  * may return only one block's worth of entries.  Entries may be compressed
8883  * on the server.
8884  */
8885 /* ARGSUSED */
8886 static int
8887 nfs4_readdir(vnode_t *vp, struct uio *uiop, cred_t *cr, int *eofp,
8888 	caller_context_t *ct, int flags)
8889 {
8890 	int error;
8891 	uint_t count;
8892 	rnode4_t *rp;
8893 	rddir4_cache *rdc;
8894 	rddir4_cache *rrdc;
8895 
8896 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
8897 		return (EIO);
8898 	rp = VTOR4(vp);
8899 
8900 	ASSERT(nfs_rw_lock_held(&rp->r_rwlock, RW_READER));
8901 
8902 	/*
8903 	 * Make sure that the directory cache is valid.
8904 	 */
8905 	if (rp->r_dir != NULL) {
8906 		if (nfs_disable_rddir_cache != 0) {
8907 			/*
8908 			 * Setting nfs_disable_rddir_cache in /etc/system
8909 			 * allows interoperability with servers that do not
8910 			 * properly update the attributes of directories.
8911 			 * Any cached information gets purged before an
8912 			 * access is made to it.
8913 			 */
8914 			nfs4_purge_rddir_cache(vp);
8915 		}
8916 
8917 		error = nfs4_validate_caches(vp, cr);
8918 		if (error)
8919 			return (error);
8920 	}
8921 
8922 	count = MIN(uiop->uio_iov->iov_len, MAXBSIZE);
8923 
8924 	/*
8925 	 * Short circuit last readdir which always returns 0 bytes.
8926 	 * This can be done after the directory has been read through
8927 	 * completely at least once.  This will set r_direof which
8928 	 * can be used to find the value of the last cookie.
8929 	 */
8930 	mutex_enter(&rp->r_statelock);
8931 	if (rp->r_direof != NULL &&
8932 	    uiop->uio_loffset == rp->r_direof->nfs4_ncookie) {
8933 		mutex_exit(&rp->r_statelock);
8934 #ifdef DEBUG
8935 		nfs4_readdir_cache_shorts++;
8936 #endif
8937 		if (eofp)
8938 			*eofp = 1;
8939 		return (0);
8940 	}
8941 
8942 	/*
8943 	 * Look for a cache entry.  Cache entries are identified
8944 	 * by the NFS cookie value and the byte count requested.
8945 	 */
8946 	rdc = rddir4_cache_lookup(rp, uiop->uio_loffset, count);
8947 
8948 	/*
8949 	 * If rdc is NULL then the lookup resulted in an unrecoverable error.
8950 	 */
8951 	if (rdc == NULL) {
8952 		mutex_exit(&rp->r_statelock);
8953 		return (EINTR);
8954 	}
8955 
8956 	/*
8957 	 * Check to see if we need to fill this entry in.
8958 	 */
8959 	if (rdc->flags & RDDIRREQ) {
8960 		rdc->flags &= ~RDDIRREQ;
8961 		rdc->flags |= RDDIR;
8962 		mutex_exit(&rp->r_statelock);
8963 
8964 		/*
8965 		 * Do the readdir.
8966 		 */
8967 		nfs4readdir(vp, rdc, cr);
8968 
8969 		/*
8970 		 * Reacquire the lock, so that we can continue
8971 		 */
8972 		mutex_enter(&rp->r_statelock);
8973 		/*
8974 		 * The entry is now complete
8975 		 */
8976 		rdc->flags &= ~RDDIR;
8977 	}
8978 
8979 	ASSERT(!(rdc->flags & RDDIR));
8980 
8981 	/*
8982 	 * If an error occurred while attempting
8983 	 * to fill the cache entry, mark the entry invalid and
8984 	 * just return the error.
8985 	 */
8986 	if (rdc->error) {
8987 		error = rdc->error;
8988 		rdc->flags |= RDDIRREQ;
8989 		rddir4_cache_rele(rp, rdc);
8990 		mutex_exit(&rp->r_statelock);
8991 		return (error);
8992 	}
8993 
8994 	/*
8995 	 * The cache entry is complete and good,
8996 	 * copyout the dirent structs to the calling
8997 	 * thread.
8998 	 */
8999 	error = uiomove(rdc->entries, rdc->actlen, UIO_READ, uiop);
9000 
9001 	/*
9002 	 * If no error occurred during the copyout,
9003 	 * update the offset in the uio struct to
9004 	 * contain the value of the next NFS 4 cookie
9005 	 * and set the eof value appropriately.
9006 	 */
9007 	if (!error) {
9008 		uiop->uio_loffset = rdc->nfs4_ncookie;
9009 		if (eofp)
9010 			*eofp = rdc->eof;
9011 	}
9012 
9013 	/*
9014 	 * Decide whether to do readahead.  Don't if we
9015 	 * have already read to the end of directory.
9016 	 */
9017 	if (rdc->eof) {
9018 		/*
9019 		 * Make the entry the direof only if it is cached
9020 		 */
9021 		if (rdc->flags & RDDIRCACHED)
9022 			rp->r_direof = rdc;
9023 		rddir4_cache_rele(rp, rdc);
9024 		mutex_exit(&rp->r_statelock);
9025 		return (error);
9026 	}
9027 
9028 	/* Determine if a readdir readahead should be done */
9029 	if (!(rp->r_flags & R4LOOKUP)) {
9030 		rddir4_cache_rele(rp, rdc);
9031 		mutex_exit(&rp->r_statelock);
9032 		return (error);
9033 	}
9034 
9035 	/*
9036 	 * Now look for a readahead entry.
9037 	 *
9038 	 * Check to see whether we found an entry for the readahead.
9039 	 * If so, we don't need to do anything further, so free the new
9040 	 * entry if one was allocated.  Otherwise, allocate a new entry, add
9041 	 * it to the cache, and then initiate an asynchronous readdir
9042 	 * operation to fill it.
9043 	 */
9044 	rrdc = rddir4_cache_lookup(rp, rdc->nfs4_ncookie, count);
9045 
9046 	/*
9047 	 * A readdir cache entry could not be obtained for the readahead.  In
9048 	 * this case we skip the readahead and return.
9049 	 */
9050 	if (rrdc == NULL) {
9051 		rddir4_cache_rele(rp, rdc);
9052 		mutex_exit(&rp->r_statelock);
9053 		return (error);
9054 	}
9055 
9056 	/*
9057 	 * Check to see if we need to fill this entry in.
9058 	 */
9059 	if (rrdc->flags & RDDIRREQ) {
9060 		rrdc->flags &= ~RDDIRREQ;
9061 		rrdc->flags |= RDDIR;
9062 		rddir4_cache_rele(rp, rdc);
9063 		mutex_exit(&rp->r_statelock);
9064 #ifdef DEBUG
9065 		nfs4_readdir_readahead++;
9066 #endif
9067 		/*
9068 		 * Do the readdir.
9069 		 */
9070 		nfs4_async_readdir(vp, rrdc, cr, do_nfs4readdir);
9071 		return (error);
9072 	}
9073 
9074 	rddir4_cache_rele(rp, rrdc);
9075 	rddir4_cache_rele(rp, rdc);
9076 	mutex_exit(&rp->r_statelock);
9077 	return (error);
9078 }
9079 
9080 static int
9081 do_nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
9082 {
9083 	int error;
9084 	rnode4_t *rp;
9085 
9086 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
9087 
9088 	rp = VTOR4(vp);
9089 
9090 	/*
9091 	 * Obtain the readdir results for the caller.
9092 	 */
9093 	nfs4readdir(vp, rdc, cr);
9094 
9095 	mutex_enter(&rp->r_statelock);
9096 	/*
9097 	 * The entry is now complete
9098 	 */
9099 	rdc->flags &= ~RDDIR;
9100 
9101 	error = rdc->error;
9102 	if (error)
9103 		rdc->flags |= RDDIRREQ;
9104 	rddir4_cache_rele(rp, rdc);
9105 	mutex_exit(&rp->r_statelock);
9106 
9107 	return (error);
9108 }
9109 
9110 /*
9111  * Read directory entries.
9112  * There are some weird things to look out for here.  The uio_loffset
9113  * field is either 0 or it is the offset returned from a previous
9114  * readdir.  It is an opaque value used by the server to find the
9115  * correct directory block to read. The count field is the number
9116  * of blocks to read on the server.  This is advisory only, the server
9117  * may return only one block's worth of entries.  Entries may be compressed
9118  * on the server.
9119  *
9120  * Generates the following compound request:
9121  * 1. If readdir offset is zero and no dnlc entry for parent exists,
9122  *    must include a Lookupp as well. In this case, send:
9123  *    { Putfh <fh>; Readdir; Lookupp; Getfh; Getattr }
9124  * 2. Otherwise just do: { Putfh <fh>; Readdir }
9125  *
9126  * Get complete attributes and filehandles for entries if this is the
9127  * first read of the directory. Otherwise, just get fileid's.
9128  */
9129 static void
9130 nfs4readdir(vnode_t *vp, rddir4_cache *rdc, cred_t *cr)
9131 {
9132 	COMPOUND4args_clnt args;
9133 	COMPOUND4res_clnt res;
9134 	READDIR4args *rargs;
9135 	READDIR4res_clnt *rd_res;
9136 	bitmap4 rd_bitsval;
9137 	nfs_argop4 argop[5];
9138 	nfs_resop4 *resop;
9139 	rnode4_t *rp = VTOR4(vp);
9140 	mntinfo4_t *mi = VTOMI4(vp);
9141 	int doqueue;
9142 	u_longlong_t nodeid, pnodeid;	/* id's of dir and its parents */
9143 	vnode_t *dvp;
9144 	nfs_cookie4 cookie = (nfs_cookie4)rdc->nfs4_cookie;
9145 	int num_ops, res_opcnt;
9146 	bool_t needrecov = FALSE;
9147 	nfs4_recov_state_t recov_state;
9148 	hrtime_t t;
9149 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
9150 
9151 	ASSERT(nfs_zone() == mi->mi_zone);
9152 	ASSERT(rdc->flags & RDDIR);
9153 	ASSERT(rdc->entries == NULL);
9154 
9155 	/*
9156 	 * If rp were a stub, it should have triggered and caused
9157 	 * a mount for us to get this far.
9158 	 */
9159 	ASSERT(!RP_ISSTUB(rp));
9160 
9161 	num_ops = 2;
9162 	if (cookie == (nfs_cookie4)0 || cookie == (nfs_cookie4)1) {
9163 		/*
9164 		 * Since nfsv4 readdir may not return entries for "." and "..",
9165 		 * the client must recreate them:
9166 		 * To find the correct nodeid, do the following:
9167 		 * For current node, get nodeid from dnlc.
9168 		 * - if current node is rootvp, set pnodeid to nodeid.
9169 		 * - else if parent is in the dnlc, get its nodeid from there.
9170 		 * - else add LOOKUPP+GETATTR to compound.
9171 		 */
9172 		nodeid = rp->r_attr.va_nodeid;
9173 		if (vp->v_flag & VROOT) {
9174 			pnodeid = nodeid;	/* root of mount point */
9175 		} else {
9176 			dvp = dnlc_lookup(vp, "..");
9177 			if (dvp != NULL && dvp != DNLC_NO_VNODE) {
9178 				/* parent in dnlc cache - no need for otw */
9179 				pnodeid = VTOR4(dvp)->r_attr.va_nodeid;
9180 			} else {
9181 				/*
9182 				 * parent not in dnlc cache,
9183 				 * do lookupp to get its id
9184 				 */
9185 				num_ops = 5;
9186 				pnodeid = 0; /* set later by getattr parent */
9187 			}
9188 			if (dvp)
9189 				VN_RELE(dvp);
9190 		}
9191 	}
9192 	recov_state.rs_flags = 0;
9193 	recov_state.rs_num_retry_despite_err = 0;
9194 
9195 	/* Save the original mount point security flavor */
9196 	(void) save_mnt_secinfo(mi->mi_curr_serv);
9197 
9198 recov_retry:
9199 	args.ctag = TAG_READDIR;
9200 
9201 	args.array = argop;
9202 	args.array_len = num_ops;
9203 
9204 	if (e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9205 	    &recov_state, NULL)) {
9206 		/*
9207 		 * If readdir a node that is a stub for a crossed mount point,
9208 		 * keep the original secinfo flavor for the current file
9209 		 * system, not the crossed one.
9210 		 */
9211 		(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9212 		rdc->error = e.error;
9213 		return;
9214 	}
9215 
9216 	/*
9217 	 * Determine which attrs to request for dirents.  This code
9218 	 * must be protected by nfs4_start/end_fop because of r_server
9219 	 * (which will change during failover recovery).
9220 	 *
9221 	 */
9222 	if (rp->r_flags & (R4LOOKUP | R4READDIRWATTR)) {
9223 		/*
9224 		 * Get all vattr attrs plus filehandle and rdattr_error
9225 		 */
9226 		rd_bitsval = NFS4_VATTR_MASK |
9227 		    FATTR4_RDATTR_ERROR_MASK |
9228 		    FATTR4_FILEHANDLE_MASK;
9229 
9230 		if (rp->r_flags & R4READDIRWATTR) {
9231 			mutex_enter(&rp->r_statelock);
9232 			rp->r_flags &= ~R4READDIRWATTR;
9233 			mutex_exit(&rp->r_statelock);
9234 		}
9235 	} else {
9236 		servinfo4_t *svp = rp->r_server;
9237 
9238 		/*
9239 		 * Already read directory. Use readdir with
9240 		 * no attrs (except for mounted_on_fileid) for updates.
9241 		 */
9242 		rd_bitsval = FATTR4_RDATTR_ERROR_MASK;
9243 
9244 		/*
9245 		 * request mounted on fileid if supported, else request
9246 		 * fileid.  maybe we should verify that fileid is supported
9247 		 * and request something else if not.
9248 		 */
9249 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
9250 		if (svp->sv_supp_attrs & FATTR4_MOUNTED_ON_FILEID_MASK)
9251 			rd_bitsval |= FATTR4_MOUNTED_ON_FILEID_MASK;
9252 		nfs_rw_exit(&svp->sv_lock);
9253 	}
9254 
9255 	/* putfh directory fh */
9256 	argop[0].argop = OP_CPUTFH;
9257 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
9258 
9259 	argop[1].argop = OP_READDIR;
9260 	rargs = &argop[1].nfs_argop4_u.opreaddir;
9261 	/*
9262 	 * 1 and 2 are reserved for client "." and ".." entry offset.
9263 	 * cookie 0 should be used over-the-wire to start reading at
9264 	 * the beginning of the directory excluding "." and "..".
9265 	 */
9266 	if (rdc->nfs4_cookie == 0 ||
9267 	    rdc->nfs4_cookie == 1 ||
9268 	    rdc->nfs4_cookie == 2) {
9269 		rargs->cookie = (nfs_cookie4)0;
9270 		rargs->cookieverf = 0;
9271 	} else {
9272 		rargs->cookie = (nfs_cookie4)rdc->nfs4_cookie;
9273 		mutex_enter(&rp->r_statelock);
9274 		rargs->cookieverf = rp->r_cookieverf4;
9275 		mutex_exit(&rp->r_statelock);
9276 	}
9277 	rargs->dircount = MIN(rdc->buflen, mi->mi_tsize);
9278 	rargs->maxcount = mi->mi_tsize;
9279 	rargs->attr_request = rd_bitsval;
9280 	rargs->rdc = rdc;
9281 	rargs->dvp = vp;
9282 	rargs->mi = mi;
9283 	rargs->cr = cr;
9284 
9285 
9286 	/*
9287 	 * If count < than the minimum required, we return no entries
9288 	 * and fail with EINVAL
9289 	 */
9290 	if (rargs->dircount < (DIRENT64_RECLEN(1) + DIRENT64_RECLEN(2))) {
9291 		rdc->error = EINVAL;
9292 		goto out;
9293 	}
9294 
9295 	if (args.array_len == 5) {
9296 		/*
9297 		 * Add lookupp and getattr for parent nodeid.
9298 		 */
9299 		argop[2].argop = OP_LOOKUPP;
9300 
9301 		argop[3].argop = OP_GETFH;
9302 
9303 		/* getattr parent */
9304 		argop[4].argop = OP_GETATTR;
9305 		argop[4].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
9306 		argop[4].nfs_argop4_u.opgetattr.mi = mi;
9307 	}
9308 
9309 	doqueue = 1;
9310 
9311 	if (mi->mi_io_kstats) {
9312 		mutex_enter(&mi->mi_lock);
9313 		kstat_runq_enter(KSTAT_IO_PTR(mi->mi_io_kstats));
9314 		mutex_exit(&mi->mi_lock);
9315 	}
9316 
9317 	/* capture the time of this call */
9318 	rargs->t = t = gethrtime();
9319 
9320 	rfs4call(mi, &args, &res, cr, &doqueue, 0, &e);
9321 
9322 	if (mi->mi_io_kstats) {
9323 		mutex_enter(&mi->mi_lock);
9324 		kstat_runq_exit(KSTAT_IO_PTR(mi->mi_io_kstats));
9325 		mutex_exit(&mi->mi_lock);
9326 	}
9327 
9328 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
9329 
9330 	/*
9331 	 * If RPC error occurred and it isn't an error that
9332 	 * triggers recovery, then go ahead and fail now.
9333 	 */
9334 	if (e.error != 0 && !needrecov) {
9335 		rdc->error = e.error;
9336 		goto out;
9337 	}
9338 
9339 	if (needrecov) {
9340 		bool_t abort;
9341 
9342 		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
9343 		    "nfs4readdir: initiating recovery.\n"));
9344 
9345 		abort = nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
9346 		    NULL, OP_READDIR, NULL, NULL, NULL);
9347 		if (abort == FALSE) {
9348 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9349 			    &recov_state, needrecov);
9350 			if (!e.error)
9351 				(void) xdr_free(xdr_COMPOUND4res_clnt,
9352 				    (caddr_t)&res);
9353 			if (rdc->entries != NULL) {
9354 				kmem_free(rdc->entries, rdc->entlen);
9355 				rdc->entries = NULL;
9356 			}
9357 			goto recov_retry;
9358 		}
9359 
9360 		if (e.error != 0) {
9361 			rdc->error = e.error;
9362 			goto out;
9363 		}
9364 
9365 		/* fall through for res.status case */
9366 	}
9367 
9368 	res_opcnt = res.array_len;
9369 
9370 	/*
9371 	 * If compound failed first 2 ops (PUTFH+READDIR), then return
9372 	 * failure here.  Subsequent ops are for filling out dot-dot
9373 	 * dirent, and if they fail, we still want to give the caller
9374 	 * the dirents returned by (the successful) READDIR op, so we need
9375 	 * to silently ignore failure for subsequent ops (LOOKUPP+GETATTR).
9376 	 *
9377 	 * One example where PUTFH+READDIR ops would succeed but
9378 	 * LOOKUPP+GETATTR would fail would be a dir that has r perm
9379 	 * but lacks x.  In this case, a POSIX server's VOP_READDIR
9380 	 * would succeed; however, VOP_LOOKUP(..) would fail since no
9381 	 * x perm.  We need to come up with a non-vendor-specific way
9382 	 * for a POSIX server to return d_ino from dotdot's dirent if
9383 	 * client only requests mounted_on_fileid, and just say the
9384 	 * LOOKUPP succeeded and fill out the GETATTR.  However, if
9385 	 * client requested any mandatory attrs, server would be required
9386 	 * to fail the GETATTR op because it can't call VOP_LOOKUP+VOP_GETATTR
9387 	 * for dotdot.
9388 	 */
9389 
9390 	if (res.status) {
9391 		if (res_opcnt <= 2) {
9392 			e.error = geterrno4(res.status);
9393 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_READDIR,
9394 			    &recov_state, needrecov);
9395 			nfs4_purge_stale_fh(e.error, vp, cr);
9396 			rdc->error = e.error;
9397 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9398 			if (rdc->entries != NULL) {
9399 				kmem_free(rdc->entries, rdc->entlen);
9400 				rdc->entries = NULL;
9401 			}
9402 			/*
9403 			 * If readdir a node that is a stub for a
9404 			 * crossed mount point, keep the original
9405 			 * secinfo flavor for the current file system,
9406 			 * not the crossed one.
9407 			 */
9408 			(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9409 			return;
9410 		}
9411 	}
9412 
9413 	resop = &res.array[1];	/* readdir res */
9414 	rd_res = &resop->nfs_resop4_u.opreaddirclnt;
9415 
9416 	mutex_enter(&rp->r_statelock);
9417 	rp->r_cookieverf4 = rd_res->cookieverf;
9418 	mutex_exit(&rp->r_statelock);
9419 
9420 	/*
9421 	 * For "." and ".." entries
9422 	 * e.g.
9423 	 *	seek(cookie=0) -> "." entry with d_off = 1
9424 	 *	seek(cookie=1) -> ".." entry with d_off = 2
9425 	 */
9426 	if (cookie == (nfs_cookie4) 0) {
9427 		if (rd_res->dotp)
9428 			rd_res->dotp->d_ino = nodeid;
9429 		if (rd_res->dotdotp)
9430 			rd_res->dotdotp->d_ino = pnodeid;
9431 	}
9432 	if (cookie == (nfs_cookie4) 1) {
9433 		if (rd_res->dotdotp)
9434 			rd_res->dotdotp->d_ino = pnodeid;
9435 	}
9436 
9437 
9438 	/* LOOKUPP+GETATTR attemped */
9439 	if (args.array_len == 5 && rd_res->dotdotp) {
9440 		if (res.status == NFS4_OK && res_opcnt == 5) {
9441 			nfs_fh4 *fhp;
9442 			nfs4_sharedfh_t *sfhp;
9443 			vnode_t *pvp;
9444 			nfs4_ga_res_t *garp;
9445 
9446 			resop++;	/* lookupp */
9447 			resop++;	/* getfh   */
9448 			fhp = &resop->nfs_resop4_u.opgetfh.object;
9449 
9450 			resop++;	/* getattr of parent */
9451 
9452 			/*
9453 			 * First, take care of finishing the
9454 			 * readdir results.
9455 			 */
9456 			garp = &resop->nfs_resop4_u.opgetattr.ga_res;
9457 			/*
9458 			 * The d_ino of .. must be the inode number
9459 			 * of the mounted filesystem.
9460 			 */
9461 			if (garp->n4g_va.va_mask & AT_NODEID)
9462 				rd_res->dotdotp->d_ino =
9463 				    garp->n4g_va.va_nodeid;
9464 
9465 
9466 			/*
9467 			 * Next, create the ".." dnlc entry
9468 			 */
9469 			sfhp = sfh4_get(fhp, mi);
9470 			if (!nfs4_make_dotdot(sfhp, t, vp, cr, &pvp, 0)) {
9471 				dnlc_update(vp, "..", pvp);
9472 				VN_RELE(pvp);
9473 			}
9474 			sfh4_rele(&sfhp);
9475 		}
9476 	}
9477 
9478 	if (mi->mi_io_kstats) {
9479 		mutex_enter(&mi->mi_lock);
9480 		KSTAT_IO_PTR(mi->mi_io_kstats)->reads++;
9481 		KSTAT_IO_PTR(mi->mi_io_kstats)->nread += rdc->actlen;
9482 		mutex_exit(&mi->mi_lock);
9483 	}
9484 
9485 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
9486 
9487 out:
9488 	/*
9489 	 * If readdir a node that is a stub for a crossed mount point,
9490 	 * keep the original secinfo flavor for the current file system,
9491 	 * not the crossed one.
9492 	 */
9493 	(void) check_mnt_secinfo(mi->mi_curr_serv, vp);
9494 
9495 	nfs4_end_fop(mi, vp, NULL, OH_READDIR, &recov_state, needrecov);
9496 }
9497 
9498 
9499 static int
9500 nfs4_bio(struct buf *bp, stable_how4 *stab_comm, cred_t *cr, bool_t readahead)
9501 {
9502 	rnode4_t *rp = VTOR4(bp->b_vp);
9503 	int count;
9504 	int error;
9505 	cred_t *cred_otw = NULL;
9506 	offset_t offset;
9507 	nfs4_open_stream_t *osp = NULL;
9508 	bool_t first_time = TRUE;	/* first time getting otw cred */
9509 	bool_t last_time = FALSE;	/* last time getting otw cred */
9510 
9511 	ASSERT(nfs_zone() == VTOMI4(bp->b_vp)->mi_zone);
9512 
9513 	DTRACE_IO1(start, struct buf *, bp);
9514 	offset = ldbtob(bp->b_lblkno);
9515 
9516 	if (bp->b_flags & B_READ) {
9517 	read_again:
9518 		/*
9519 		 * Releases the osp, if it is provided.
9520 		 * Puts a hold on the cred_otw and the new osp (if found).
9521 		 */
9522 		cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
9523 		    &first_time, &last_time);
9524 		error = bp->b_error = nfs4read(bp->b_vp, bp->b_un.b_addr,
9525 		    offset, bp->b_bcount, &bp->b_resid, cred_otw,
9526 		    readahead, NULL);
9527 		crfree(cred_otw);
9528 		if (!error) {
9529 			if (bp->b_resid) {
9530 				/*
9531 				 * Didn't get it all because we hit EOF,
9532 				 * zero all the memory beyond the EOF.
9533 				 */
9534 				/* bzero(rdaddr + */
9535 				bzero(bp->b_un.b_addr +
9536 				    bp->b_bcount - bp->b_resid, bp->b_resid);
9537 			}
9538 			mutex_enter(&rp->r_statelock);
9539 			if (bp->b_resid == bp->b_bcount &&
9540 			    offset >= rp->r_size) {
9541 				/*
9542 				 * We didn't read anything at all as we are
9543 				 * past EOF.  Return an error indicator back
9544 				 * but don't destroy the pages (yet).
9545 				 */
9546 				error = NFS_EOF;
9547 			}
9548 			mutex_exit(&rp->r_statelock);
9549 		} else if (error == EACCES && last_time == FALSE) {
9550 				goto read_again;
9551 		}
9552 	} else {
9553 		if (!(rp->r_flags & R4STALE)) {
9554 write_again:
9555 			/*
9556 			 * Releases the osp, if it is provided.
9557 			 * Puts a hold on the cred_otw and the new
9558 			 * osp (if found).
9559 			 */
9560 			cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
9561 			    &first_time, &last_time);
9562 			mutex_enter(&rp->r_statelock);
9563 			count = MIN(bp->b_bcount, rp->r_size - offset);
9564 			mutex_exit(&rp->r_statelock);
9565 			if (count < 0)
9566 				cmn_err(CE_PANIC, "nfs4_bio: write count < 0");
9567 #ifdef DEBUG
9568 			if (count == 0) {
9569 				zoneid_t zoneid = getzoneid();
9570 
9571 				zcmn_err(zoneid, CE_WARN,
9572 				    "nfs4_bio: zero length write at %lld",
9573 				    offset);
9574 				zcmn_err(zoneid, CE_CONT, "flags=0x%x, "
9575 				    "b_bcount=%ld, file size=%lld",
9576 				    rp->r_flags, (long)bp->b_bcount,
9577 				    rp->r_size);
9578 				sfh4_printfhandle(VTOR4(bp->b_vp)->r_fh);
9579 				if (nfs4_bio_do_stop)
9580 					debug_enter("nfs4_bio");
9581 			}
9582 #endif
9583 			error = nfs4write(bp->b_vp, bp->b_un.b_addr, offset,
9584 			    count, cred_otw, stab_comm);
9585 			if (error == EACCES && last_time == FALSE) {
9586 				crfree(cred_otw);
9587 				goto write_again;
9588 			}
9589 			bp->b_error = error;
9590 			if (error && error != EINTR &&
9591 			    !(bp->b_vp->v_vfsp->vfs_flag & VFS_UNMOUNTED)) {
9592 				/*
9593 				 * Don't print EDQUOT errors on the console.
9594 				 * Don't print asynchronous EACCES errors.
9595 				 * Don't print EFBIG errors.
9596 				 * Print all other write errors.
9597 				 */
9598 				if (error != EDQUOT && error != EFBIG &&
9599 				    (error != EACCES ||
9600 				    !(bp->b_flags & B_ASYNC)))
9601 					nfs4_write_error(bp->b_vp,
9602 					    error, cred_otw);
9603 				/*
9604 				 * Update r_error and r_flags as appropriate.
9605 				 * If the error was ESTALE, then mark the
9606 				 * rnode as not being writeable and save
9607 				 * the error status.  Otherwise, save any
9608 				 * errors which occur from asynchronous
9609 				 * page invalidations.  Any errors occurring
9610 				 * from other operations should be saved
9611 				 * by the caller.
9612 				 */
9613 				mutex_enter(&rp->r_statelock);
9614 				if (error == ESTALE) {
9615 					rp->r_flags |= R4STALE;
9616 					if (!rp->r_error)
9617 						rp->r_error = error;
9618 				} else if (!rp->r_error &&
9619 				    (bp->b_flags &
9620 				    (B_INVAL|B_FORCE|B_ASYNC)) ==
9621 				    (B_INVAL|B_FORCE|B_ASYNC)) {
9622 					rp->r_error = error;
9623 				}
9624 				mutex_exit(&rp->r_statelock);
9625 			}
9626 			crfree(cred_otw);
9627 		} else {
9628 			error = rp->r_error;
9629 			/*
9630 			 * A close may have cleared r_error, if so,
9631 			 * propagate ESTALE error return properly
9632 			 */
9633 			if (error == 0)
9634 				error = ESTALE;
9635 		}
9636 	}
9637 
9638 	if (error != 0 && error != NFS_EOF)
9639 		bp->b_flags |= B_ERROR;
9640 
9641 	if (osp)
9642 		open_stream_rele(osp, rp);
9643 
9644 	DTRACE_IO1(done, struct buf *, bp);
9645 
9646 	return (error);
9647 }
9648 
9649 /* ARGSUSED */
9650 int
9651 nfs4_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
9652 {
9653 	return (EREMOTE);
9654 }
9655 
9656 /* ARGSUSED2 */
9657 int
9658 nfs4_rwlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9659 {
9660 	rnode4_t *rp = VTOR4(vp);
9661 
9662 	if (!write_lock) {
9663 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9664 		return (V_WRITELOCK_FALSE);
9665 	}
9666 
9667 	if ((rp->r_flags & R4DIRECTIO) ||
9668 	    (VTOMI4(vp)->mi_flags & MI4_DIRECTIO)) {
9669 		(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_READER, FALSE);
9670 		if (rp->r_mapcnt == 0 && !nfs4_has_pages(vp))
9671 			return (V_WRITELOCK_FALSE);
9672 		nfs_rw_exit(&rp->r_rwlock);
9673 	}
9674 
9675 	(void) nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, FALSE);
9676 	return (V_WRITELOCK_TRUE);
9677 }
9678 
9679 /* ARGSUSED */
9680 void
9681 nfs4_rwunlock(vnode_t *vp, int write_lock, caller_context_t *ctp)
9682 {
9683 	rnode4_t *rp = VTOR4(vp);
9684 
9685 	nfs_rw_exit(&rp->r_rwlock);
9686 }
9687 
9688 /* ARGSUSED */
9689 static int
9690 nfs4_seek(vnode_t *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
9691 {
9692 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
9693 		return (EIO);
9694 
9695 	/*
9696 	 * Because we stuff the readdir cookie into the offset field
9697 	 * someone may attempt to do an lseek with the cookie which
9698 	 * we want to succeed.
9699 	 */
9700 	if (vp->v_type == VDIR)
9701 		return (0);
9702 	if (*noffp < 0)
9703 		return (EINVAL);
9704 	return (0);
9705 }
9706 
9707 
9708 /*
9709  * Return all the pages from [off..off+len) in file
9710  */
9711 /* ARGSUSED */
9712 static int
9713 nfs4_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
9714     page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
9715 	enum seg_rw rw, cred_t *cr, caller_context_t *ct)
9716 {
9717 	rnode4_t *rp;
9718 	int error;
9719 	mntinfo4_t *mi;
9720 
9721 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
9722 		return (EIO);
9723 	rp = VTOR4(vp);
9724 	if (IS_SHADOW(vp, rp))
9725 		vp = RTOV4(rp);
9726 
9727 	if (vp->v_flag & VNOMAP)
9728 		return (ENOSYS);
9729 
9730 	if (protp != NULL)
9731 		*protp = PROT_ALL;
9732 
9733 	/*
9734 	 * Now validate that the caches are up to date.
9735 	 */
9736 	if (error = nfs4_validate_caches(vp, cr))
9737 		return (error);
9738 
9739 	mi = VTOMI4(vp);
9740 retry:
9741 	mutex_enter(&rp->r_statelock);
9742 
9743 	/*
9744 	 * Don't create dirty pages faster than they
9745 	 * can be cleaned so that the system doesn't
9746 	 * get imbalanced.  If the async queue is
9747 	 * maxed out, then wait for it to drain before
9748 	 * creating more dirty pages.  Also, wait for
9749 	 * any threads doing pagewalks in the vop_getattr
9750 	 * entry points so that they don't block for
9751 	 * long periods.
9752 	 */
9753 	if (rw == S_CREATE) {
9754 		while ((mi->mi_max_threads != 0 &&
9755 		    rp->r_awcount > 2 * mi->mi_max_threads) ||
9756 		    rp->r_gcount > 0)
9757 			cv_wait(&rp->r_cv, &rp->r_statelock);
9758 	}
9759 
9760 	/*
9761 	 * If we are getting called as a side effect of an nfs_write()
9762 	 * operation the local file size might not be extended yet.
9763 	 * In this case we want to be able to return pages of zeroes.
9764 	 */
9765 	if (off + len > rp->r_size + PAGEOFFSET && seg != segkmap) {
9766 		NFS4_DEBUG(nfs4_pageio_debug,
9767 		    (CE_NOTE, "getpage beyond EOF: off=%lld, "
9768 		    "len=%llu, size=%llu, attrsize =%llu", off,
9769 		    (u_longlong_t)len, rp->r_size, rp->r_attr.va_size));
9770 		mutex_exit(&rp->r_statelock);
9771 		return (EFAULT);		/* beyond EOF */
9772 	}
9773 
9774 	mutex_exit(&rp->r_statelock);
9775 
9776 	if (len <= PAGESIZE) {
9777 		error = nfs4_getapage(vp, off, len, protp, pl, plsz,
9778 		    seg, addr, rw, cr);
9779 		NFS4_DEBUG(nfs4_pageio_debug && error,
9780 		    (CE_NOTE, "getpage error %d; off=%lld, "
9781 		    "len=%lld", error, off, (u_longlong_t)len));
9782 	} else {
9783 		error = pvn_getpages(nfs4_getapage, vp, off, len, protp,
9784 		    pl, plsz, seg, addr, rw, cr);
9785 		NFS4_DEBUG(nfs4_pageio_debug && error,
9786 		    (CE_NOTE, "getpages error %d; off=%lld, "
9787 		    "len=%lld", error, off, (u_longlong_t)len));
9788 	}
9789 
9790 	switch (error) {
9791 	case NFS_EOF:
9792 		nfs4_purge_caches(vp, NFS4_NOPURGE_DNLC, cr, FALSE);
9793 		goto retry;
9794 	case ESTALE:
9795 		nfs4_purge_stale_fh(error, vp, cr);
9796 	}
9797 
9798 	return (error);
9799 }
9800 
9801 /*
9802  * Called from pvn_getpages or nfs4_getpage to get a particular page.
9803  */
9804 /* ARGSUSED */
9805 static int
9806 nfs4_getapage(vnode_t *vp, u_offset_t off, size_t len, uint_t *protp,
9807     page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
9808     enum seg_rw rw, cred_t *cr)
9809 {
9810 	rnode4_t *rp;
9811 	uint_t bsize;
9812 	struct buf *bp;
9813 	page_t *pp;
9814 	u_offset_t lbn;
9815 	u_offset_t io_off;
9816 	u_offset_t blkoff;
9817 	u_offset_t rablkoff;
9818 	size_t io_len;
9819 	uint_t blksize;
9820 	int error;
9821 	int readahead;
9822 	int readahead_issued = 0;
9823 	int ra_window; /* readahead window */
9824 	page_t *pagefound;
9825 	page_t *savepp;
9826 
9827 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
9828 		return (EIO);
9829 
9830 	rp = VTOR4(vp);
9831 	ASSERT(!IS_SHADOW(vp, rp));
9832 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
9833 
9834 reread:
9835 	bp = NULL;
9836 	pp = NULL;
9837 	pagefound = NULL;
9838 
9839 	if (pl != NULL)
9840 		pl[0] = NULL;
9841 
9842 	error = 0;
9843 	lbn = off / bsize;
9844 	blkoff = lbn * bsize;
9845 
9846 	/*
9847 	 * Queueing up the readahead before doing the synchronous read
9848 	 * results in a significant increase in read throughput because
9849 	 * of the increased parallelism between the async threads and
9850 	 * the process context.
9851 	 */
9852 	if ((off & ((vp->v_vfsp->vfs_bsize) - 1)) == 0 &&
9853 	    rw != S_CREATE &&
9854 	    !(vp->v_flag & VNOCACHE)) {
9855 		mutex_enter(&rp->r_statelock);
9856 
9857 		/*
9858 		 * Calculate the number of readaheads to do.
9859 		 * a) No readaheads at offset = 0.
9860 		 * b) Do maximum(nfs4_nra) readaheads when the readahead
9861 		 *    window is closed.
9862 		 * c) Do readaheads between 1 to (nfs4_nra - 1) depending
9863 		 *    upon how far the readahead window is open or close.
9864 		 * d) No readaheads if rp->r_nextr is not within the scope
9865 		 *    of the readahead window (random i/o).
9866 		 */
9867 
9868 		if (off == 0)
9869 			readahead = 0;
9870 		else if (blkoff == rp->r_nextr)
9871 			readahead = nfs4_nra;
9872 		else if (rp->r_nextr > blkoff &&
9873 		    ((ra_window = (rp->r_nextr - blkoff) / bsize)
9874 		    <= (nfs4_nra - 1)))
9875 			readahead = nfs4_nra - ra_window;
9876 		else
9877 			readahead = 0;
9878 
9879 		rablkoff = rp->r_nextr;
9880 		while (readahead > 0 && rablkoff + bsize < rp->r_size) {
9881 			mutex_exit(&rp->r_statelock);
9882 			if (nfs4_async_readahead(vp, rablkoff + bsize,
9883 			    addr + (rablkoff + bsize - off),
9884 			    seg, cr, nfs4_readahead) < 0) {
9885 				mutex_enter(&rp->r_statelock);
9886 				break;
9887 			}
9888 			readahead--;
9889 			rablkoff += bsize;
9890 			/*
9891 			 * Indicate that we did a readahead so
9892 			 * readahead offset is not updated
9893 			 * by the synchronous read below.
9894 			 */
9895 			readahead_issued = 1;
9896 			mutex_enter(&rp->r_statelock);
9897 			/*
9898 			 * set readahead offset to
9899 			 * offset of last async readahead
9900 			 * request.
9901 			 */
9902 			rp->r_nextr = rablkoff;
9903 		}
9904 		mutex_exit(&rp->r_statelock);
9905 	}
9906 
9907 again:
9908 	if ((pagefound = page_exists(vp, off)) == NULL) {
9909 		if (pl == NULL) {
9910 			(void) nfs4_async_readahead(vp, blkoff, addr, seg, cr,
9911 			    nfs4_readahead);
9912 		} else if (rw == S_CREATE) {
9913 			/*
9914 			 * Block for this page is not allocated, or the offset
9915 			 * is beyond the current allocation size, or we're
9916 			 * allocating a swap slot and the page was not found,
9917 			 * so allocate it and return a zero page.
9918 			 */
9919 			if ((pp = page_create_va(vp, off,
9920 			    PAGESIZE, PG_WAIT, seg, addr)) == NULL)
9921 				cmn_err(CE_PANIC, "nfs4_getapage: page_create");
9922 			io_len = PAGESIZE;
9923 			mutex_enter(&rp->r_statelock);
9924 			rp->r_nextr = off + PAGESIZE;
9925 			mutex_exit(&rp->r_statelock);
9926 		} else {
9927 			/*
9928 			 * Need to go to server to get a block
9929 			 */
9930 			mutex_enter(&rp->r_statelock);
9931 			if (blkoff < rp->r_size &&
9932 			    blkoff + bsize > rp->r_size) {
9933 				/*
9934 				 * If less than a block left in
9935 				 * file read less than a block.
9936 				 */
9937 				if (rp->r_size <= off) {
9938 					/*
9939 					 * Trying to access beyond EOF,
9940 					 * set up to get at least one page.
9941 					 */
9942 					blksize = off + PAGESIZE - blkoff;
9943 				} else
9944 					blksize = rp->r_size - blkoff;
9945 			} else if ((off == 0) ||
9946 			    (off != rp->r_nextr && !readahead_issued)) {
9947 				blksize = PAGESIZE;
9948 				blkoff = off; /* block = page here */
9949 			} else
9950 				blksize = bsize;
9951 			mutex_exit(&rp->r_statelock);
9952 
9953 			pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
9954 			    &io_len, blkoff, blksize, 0);
9955 
9956 			/*
9957 			 * Some other thread has entered the page,
9958 			 * so just use it.
9959 			 */
9960 			if (pp == NULL)
9961 				goto again;
9962 
9963 			/*
9964 			 * Now round the request size up to page boundaries.
9965 			 * This ensures that the entire page will be
9966 			 * initialized to zeroes if EOF is encountered.
9967 			 */
9968 			io_len = ptob(btopr(io_len));
9969 
9970 			bp = pageio_setup(pp, io_len, vp, B_READ);
9971 			ASSERT(bp != NULL);
9972 
9973 			/*
9974 			 * pageio_setup should have set b_addr to 0.  This
9975 			 * is correct since we want to do I/O on a page
9976 			 * boundary.  bp_mapin will use this addr to calculate
9977 			 * an offset, and then set b_addr to the kernel virtual
9978 			 * address it allocated for us.
9979 			 */
9980 			ASSERT(bp->b_un.b_addr == 0);
9981 
9982 			bp->b_edev = 0;
9983 			bp->b_dev = 0;
9984 			bp->b_lblkno = lbtodb(io_off);
9985 			bp->b_file = vp;
9986 			bp->b_offset = (offset_t)off;
9987 			bp_mapin(bp);
9988 
9989 			/*
9990 			 * If doing a write beyond what we believe is EOF,
9991 			 * don't bother trying to read the pages from the
9992 			 * server, we'll just zero the pages here.  We
9993 			 * don't check that the rw flag is S_WRITE here
9994 			 * because some implementations may attempt a
9995 			 * read access to the buffer before copying data.
9996 			 */
9997 			mutex_enter(&rp->r_statelock);
9998 			if (io_off >= rp->r_size && seg == segkmap) {
9999 				mutex_exit(&rp->r_statelock);
10000 				bzero(bp->b_un.b_addr, io_len);
10001 			} else {
10002 				mutex_exit(&rp->r_statelock);
10003 				error = nfs4_bio(bp, NULL, cr, FALSE);
10004 			}
10005 
10006 			/*
10007 			 * Unmap the buffer before freeing it.
10008 			 */
10009 			bp_mapout(bp);
10010 			pageio_done(bp);
10011 
10012 			savepp = pp;
10013 			do {
10014 				pp->p_fsdata = C_NOCOMMIT;
10015 			} while ((pp = pp->p_next) != savepp);
10016 
10017 			if (error == NFS_EOF) {
10018 				/*
10019 				 * If doing a write system call just return
10020 				 * zeroed pages, else user tried to get pages
10021 				 * beyond EOF, return error.  We don't check
10022 				 * that the rw flag is S_WRITE here because
10023 				 * some implementations may attempt a read
10024 				 * access to the buffer before copying data.
10025 				 */
10026 				if (seg == segkmap)
10027 					error = 0;
10028 				else
10029 					error = EFAULT;
10030 			}
10031 
10032 			if (!readahead_issued && !error) {
10033 				mutex_enter(&rp->r_statelock);
10034 				rp->r_nextr = io_off + io_len;
10035 				mutex_exit(&rp->r_statelock);
10036 			}
10037 		}
10038 	}
10039 
10040 out:
10041 	if (pl == NULL)
10042 		return (error);
10043 
10044 	if (error) {
10045 		if (pp != NULL)
10046 			pvn_read_done(pp, B_ERROR);
10047 		return (error);
10048 	}
10049 
10050 	if (pagefound) {
10051 		se_t se = (rw == S_CREATE ? SE_EXCL : SE_SHARED);
10052 
10053 		/*
10054 		 * Page exists in the cache, acquire the appropriate lock.
10055 		 * If this fails, start all over again.
10056 		 */
10057 		if ((pp = page_lookup(vp, off, se)) == NULL) {
10058 #ifdef DEBUG
10059 			nfs4_lostpage++;
10060 #endif
10061 			goto reread;
10062 		}
10063 		pl[0] = pp;
10064 		pl[1] = NULL;
10065 		return (0);
10066 	}
10067 
10068 	if (pp != NULL)
10069 		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
10070 
10071 	return (error);
10072 }
10073 
10074 static void
10075 nfs4_readahead(vnode_t *vp, u_offset_t blkoff, caddr_t addr, struct seg *seg,
10076     cred_t *cr)
10077 {
10078 	int error;
10079 	page_t *pp;
10080 	u_offset_t io_off;
10081 	size_t io_len;
10082 	struct buf *bp;
10083 	uint_t bsize, blksize;
10084 	rnode4_t *rp = VTOR4(vp);
10085 	page_t *savepp;
10086 
10087 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
10088 
10089 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
10090 
10091 	mutex_enter(&rp->r_statelock);
10092 	if (blkoff < rp->r_size && blkoff + bsize > rp->r_size) {
10093 		/*
10094 		 * If less than a block left in file read less
10095 		 * than a block.
10096 		 */
10097 		blksize = rp->r_size - blkoff;
10098 	} else
10099 		blksize = bsize;
10100 	mutex_exit(&rp->r_statelock);
10101 
10102 	pp = pvn_read_kluster(vp, blkoff, segkmap, addr,
10103 	    &io_off, &io_len, blkoff, blksize, 1);
10104 	/*
10105 	 * The isra flag passed to the kluster function is 1, we may have
10106 	 * gotten a return value of NULL for a variety of reasons (# of free
10107 	 * pages < minfree, someone entered the page on the vnode etc). In all
10108 	 * cases, we want to punt on the readahead.
10109 	 */
10110 	if (pp == NULL)
10111 		return;
10112 
10113 	/*
10114 	 * Now round the request size up to page boundaries.
10115 	 * This ensures that the entire page will be
10116 	 * initialized to zeroes if EOF is encountered.
10117 	 */
10118 	io_len = ptob(btopr(io_len));
10119 
10120 	bp = pageio_setup(pp, io_len, vp, B_READ);
10121 	ASSERT(bp != NULL);
10122 
10123 	/*
10124 	 * pageio_setup should have set b_addr to 0.  This is correct since
10125 	 * we want to do I/O on a page boundary. bp_mapin() will use this addr
10126 	 * to calculate an offset, and then set b_addr to the kernel virtual
10127 	 * address it allocated for us.
10128 	 */
10129 	ASSERT(bp->b_un.b_addr == 0);
10130 
10131 	bp->b_edev = 0;
10132 	bp->b_dev = 0;
10133 	bp->b_lblkno = lbtodb(io_off);
10134 	bp->b_file = vp;
10135 	bp->b_offset = (offset_t)blkoff;
10136 	bp_mapin(bp);
10137 
10138 	/*
10139 	 * If doing a write beyond what we believe is EOF, don't bother trying
10140 	 * to read the pages from the server, we'll just zero the pages here.
10141 	 * We don't check that the rw flag is S_WRITE here because some
10142 	 * implementations may attempt a read access to the buffer before
10143 	 * copying data.
10144 	 */
10145 	mutex_enter(&rp->r_statelock);
10146 	if (io_off >= rp->r_size && seg == segkmap) {
10147 		mutex_exit(&rp->r_statelock);
10148 		bzero(bp->b_un.b_addr, io_len);
10149 		error = 0;
10150 	} else {
10151 		mutex_exit(&rp->r_statelock);
10152 		error = nfs4_bio(bp, NULL, cr, TRUE);
10153 		if (error == NFS_EOF)
10154 			error = 0;
10155 	}
10156 
10157 	/*
10158 	 * Unmap the buffer before freeing it.
10159 	 */
10160 	bp_mapout(bp);
10161 	pageio_done(bp);
10162 
10163 	savepp = pp;
10164 	do {
10165 		pp->p_fsdata = C_NOCOMMIT;
10166 	} while ((pp = pp->p_next) != savepp);
10167 
10168 	pvn_read_done(pp, error ? B_READ | B_ERROR : B_READ);
10169 
10170 	/*
10171 	 * In case of error set readahead offset
10172 	 * to the lowest offset.
10173 	 * pvn_read_done() calls VN_DISPOSE to destroy the pages
10174 	 */
10175 	if (error && rp->r_nextr > io_off) {
10176 		mutex_enter(&rp->r_statelock);
10177 		if (rp->r_nextr > io_off)
10178 			rp->r_nextr = io_off;
10179 		mutex_exit(&rp->r_statelock);
10180 	}
10181 }
10182 
10183 /*
10184  * Flags are composed of {B_INVAL, B_FREE, B_DONTNEED, B_FORCE}
10185  * If len == 0, do from off to EOF.
10186  *
10187  * The normal cases should be len == 0 && off == 0 (entire vp list) or
10188  * len == MAXBSIZE (from segmap_release actions), and len == PAGESIZE
10189  * (from pageout).
10190  */
10191 /* ARGSUSED */
10192 static int
10193 nfs4_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
10194 	caller_context_t *ct)
10195 {
10196 	int error;
10197 	rnode4_t *rp;
10198 
10199 	ASSERT(cr != NULL);
10200 
10201 	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
10202 		return (EIO);
10203 
10204 	rp = VTOR4(vp);
10205 	if (IS_SHADOW(vp, rp))
10206 		vp = RTOV4(rp);
10207 
10208 	/*
10209 	 * XXX - Why should this check be made here?
10210 	 */
10211 	if (vp->v_flag & VNOMAP)
10212 		return (ENOSYS);
10213 
10214 	if (len == 0 && !(flags & B_INVAL) &&
10215 	    (vp->v_vfsp->vfs_flag & VFS_RDONLY))
10216 		return (0);
10217 
10218 	mutex_enter(&rp->r_statelock);
10219 	rp->r_count++;
10220 	mutex_exit(&rp->r_statelock);
10221 	error = nfs4_putpages(vp, off, len, flags, cr);
10222 	mutex_enter(&rp->r_statelock);
10223 	rp->r_count--;
10224 	cv_broadcast(&rp->r_cv);
10225 	mutex_exit(&rp->r_statelock);
10226 
10227 	return (error);
10228 }
10229 
10230 /*
10231  * Write out a single page, possibly klustering adjacent dirty pages.
10232  */
10233 int
10234 nfs4_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
10235     int flags, cred_t *cr)
10236 {
10237 	u_offset_t io_off;
10238 	u_offset_t lbn_off;
10239 	u_offset_t lbn;
10240 	size_t io_len;
10241 	uint_t bsize;
10242 	int error;
10243 	rnode4_t *rp;
10244 
10245 	ASSERT(!(vp->v_vfsp->vfs_flag & VFS_RDONLY));
10246 	ASSERT(pp != NULL);
10247 	ASSERT(cr != NULL);
10248 	ASSERT((flags & B_ASYNC) || nfs_zone() == VTOMI4(vp)->mi_zone);
10249 
10250 	rp = VTOR4(vp);
10251 	ASSERT(rp->r_count > 0);
10252 	ASSERT(!IS_SHADOW(vp, rp));
10253 
10254 	bsize = MAX(vp->v_vfsp->vfs_bsize, PAGESIZE);
10255 	lbn = pp->p_offset / bsize;
10256 	lbn_off = lbn * bsize;
10257 
10258 	/*
10259 	 * Find a kluster that fits in one block, or in
10260 	 * one page if pages are bigger than blocks.  If
10261 	 * there is less file space allocated than a whole
10262 	 * page, we'll shorten the i/o request below.
10263 	 */
10264 	pp = pvn_write_kluster(vp, pp, &io_off, &io_len, lbn_off,
10265 	    roundup(bsize, PAGESIZE), flags);
10266 
10267 	/*
10268 	 * pvn_write_kluster shouldn't have returned a page with offset
10269 	 * behind the original page we were given.  Verify that.
10270 	 */
10271 	ASSERT((pp->p_offset / bsize) >= lbn);
10272 
10273 	/*
10274 	 * Now pp will have the list of kept dirty pages marked for
10275 	 * write back.  It will also handle invalidation and freeing
10276 	 * of pages that are not dirty.  Check for page length rounding
10277 	 * problems.
10278 	 */
10279 	if (io_off + io_len > lbn_off + bsize) {
10280 		ASSERT((io_off + io_len) - (lbn_off + bsize) < PAGESIZE);
10281 		io_len = lbn_off + bsize - io_off;
10282 	}
10283 	/*
10284 	 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
10285 	 * consistent value of r_size. R4MODINPROGRESS is set in writerp4().
10286 	 * When R4MODINPROGRESS is set it indicates that a uiomove() is in
10287 	 * progress and the r_size has not been made consistent with the
10288 	 * new size of the file. When the uiomove() completes the r_size is
10289 	 * updated and the R4MODINPROGRESS flag is cleared.
10290 	 *
10291 	 * The R4MODINPROGRESS flag makes sure that nfs4_bio() sees a
10292 	 * consistent value of r_size. Without this handshaking, it is
10293 	 * possible that nfs4_bio() picks  up the old value of r_size
10294 	 * before the uiomove() in writerp4() completes. This will result
10295 	 * in the write through nfs4_bio() being dropped.
10296 	 *
10297 	 * More precisely, there is a window between the time the uiomove()
10298 	 * completes and the time the r_size is updated. If a VOP_PUTPAGE()
10299 	 * operation intervenes in this window, the page will be picked up,
10300 	 * because it is dirty (it will be unlocked, unless it was
10301 	 * pagecreate'd). When the page is picked up as dirty, the dirty
10302 	 * bit is reset (pvn_getdirty()). In nfs4write(), r_size is
10303 	 * checked. This will still be the old size. Therefore the page will
10304 	 * not be written out. When segmap_release() calls VOP_PUTPAGE(),
10305 	 * the page will be found to be clean and the write will be dropped.
10306 	 */
10307 	if (rp->r_flags & R4MODINPROGRESS) {
10308 		mutex_enter(&rp->r_statelock);
10309 		if ((rp->r_flags & R4MODINPROGRESS) &&
10310 		    rp->r_modaddr + MAXBSIZE > io_off &&
10311 		    rp->r_modaddr < io_off + io_len) {
10312 			page_t *plist;
10313 			/*
10314 			 * A write is in progress for this region of the file.
10315 			 * If we did not detect R4MODINPROGRESS here then this
10316 			 * path through nfs_putapage() would eventually go to
10317 			 * nfs4_bio() and may not write out all of the data
10318 			 * in the pages. We end up losing data. So we decide
10319 			 * to set the modified bit on each page in the page
10320 			 * list and mark the rnode with R4DIRTY. This write
10321 			 * will be restarted at some later time.
10322 			 */
10323 			plist = pp;
10324 			while (plist != NULL) {
10325 				pp = plist;
10326 				page_sub(&plist, pp);
10327 				hat_setmod(pp);
10328 				page_io_unlock(pp);
10329 				page_unlock(pp);
10330 			}
10331 			rp->r_flags |= R4DIRTY;
10332 			mutex_exit(&rp->r_statelock);
10333 			if (offp)
10334 				*offp = io_off;
10335 			if (lenp)
10336 				*lenp = io_len;
10337 			return (0);
10338 		}
10339 		mutex_exit(&rp->r_statelock);
10340 	}
10341 
10342 	if (flags & B_ASYNC) {
10343 		error = nfs4_async_putapage(vp, pp, io_off, io_len, flags, cr,
10344 		    nfs4_sync_putapage);
10345 	} else
10346 		error = nfs4_sync_putapage(vp, pp, io_off, io_len, flags, cr);
10347 
10348 	if (offp)
10349 		*offp = io_off;
10350 	if (lenp)
10351 		*lenp = io_len;
10352 	return (error);
10353 }
10354 
10355 static int
10356 nfs4_sync_putapage(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
10357     int flags, cred_t *cr)
10358 {
10359 	int error;
10360 	rnode4_t *rp;
10361 
10362 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
10363 
10364 	flags |= B_WRITE;
10365 
10366 	error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
10367 
10368 	rp = VTOR4(vp);
10369 
10370 	if ((error == ENOSPC || error == EDQUOT || error == EFBIG ||
10371 	    error == EACCES) &&
10372 	    (flags & (B_INVAL|B_FORCE)) != (B_INVAL|B_FORCE)) {
10373 		if (!(rp->r_flags & R4OUTOFSPACE)) {
10374 			mutex_enter(&rp->r_statelock);
10375 			rp->r_flags |= R4OUTOFSPACE;
10376 			mutex_exit(&rp->r_statelock);
10377 		}
10378 		flags |= B_ERROR;
10379 		pvn_write_done(pp, flags);
10380 		/*
10381 		 * If this was not an async thread, then try again to
10382 		 * write out the pages, but this time, also destroy
10383 		 * them whether or not the write is successful.  This
10384 		 * will prevent memory from filling up with these
10385 		 * pages and destroying them is the only alternative
10386 		 * if they can't be written out.
10387 		 *
10388 		 * Don't do this if this is an async thread because
10389 		 * when the pages are unlocked in pvn_write_done,
10390 		 * some other thread could have come along, locked
10391 		 * them, and queued for an async thread.  It would be
10392 		 * possible for all of the async threads to be tied
10393 		 * up waiting to lock the pages again and they would
10394 		 * all already be locked and waiting for an async
10395 		 * thread to handle them.  Deadlock.
10396 		 */
10397 		if (!(flags & B_ASYNC)) {
10398 			error = nfs4_putpage(vp, io_off, io_len,
10399 			    B_INVAL | B_FORCE, cr, NULL);
10400 		}
10401 	} else {
10402 		if (error)
10403 			flags |= B_ERROR;
10404 		else if (rp->r_flags & R4OUTOFSPACE) {
10405 			mutex_enter(&rp->r_statelock);
10406 			rp->r_flags &= ~R4OUTOFSPACE;
10407 			mutex_exit(&rp->r_statelock);
10408 		}
10409 		pvn_write_done(pp, flags);
10410 		if (freemem < desfree)
10411 			(void) nfs4_commit_vp(vp, (u_offset_t)0, 0, cr,
10412 			    NFS4_WRITE_NOWAIT);
10413 	}
10414 
10415 	return (error);
10416 }
10417 
10418 #ifdef DEBUG
10419 int nfs4_force_open_before_mmap = 0;
10420 #endif
10421 
10422 /* ARGSUSED */
10423 static int
10424 nfs4_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
10425     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
10426     caller_context_t *ct)
10427 {
10428 	struct segvn_crargs vn_a;
10429 	int error = 0;
10430 	rnode4_t *rp = VTOR4(vp);
10431 	mntinfo4_t *mi = VTOMI4(vp);
10432 
10433 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
10434 		return (EIO);
10435 
10436 	if (vp->v_flag & VNOMAP)
10437 		return (ENOSYS);
10438 
10439 	if (off < 0 || (off + len) < 0)
10440 		return (ENXIO);
10441 
10442 	if (vp->v_type != VREG)
10443 		return (ENODEV);
10444 
10445 	/*
10446 	 * If the file is delegated to the client don't do anything.
10447 	 * If the file is not delegated, then validate the data cache.
10448 	 */
10449 	mutex_enter(&rp->r_statev4_lock);
10450 	if (rp->r_deleg_type == OPEN_DELEGATE_NONE) {
10451 		mutex_exit(&rp->r_statev4_lock);
10452 		error = nfs4_validate_caches(vp, cr);
10453 		if (error)
10454 			return (error);
10455 	} else {
10456 		mutex_exit(&rp->r_statev4_lock);
10457 	}
10458 
10459 	/*
10460 	 * Check to see if the vnode is currently marked as not cachable.
10461 	 * This means portions of the file are locked (through VOP_FRLOCK).
10462 	 * In this case the map request must be refused.  We use
10463 	 * rp->r_lkserlock to avoid a race with concurrent lock requests.
10464 	 *
10465 	 * Atomically increment r_inmap after acquiring r_rwlock. The
10466 	 * idea here is to acquire r_rwlock to block read/write and
10467 	 * not to protect r_inmap. r_inmap will inform nfs4_read/write()
10468 	 * that we are in nfs4_map(). Now, r_rwlock is acquired in order
10469 	 * and we can prevent the deadlock that would have occurred
10470 	 * when nfs4_addmap() would have acquired it out of order.
10471 	 *
10472 	 * Since we are not protecting r_inmap by any lock, we do not
10473 	 * hold any lock when we decrement it. We atomically decrement
10474 	 * r_inmap after we release r_lkserlock.
10475 	 */
10476 
10477 	if (nfs_rw_enter_sig(&rp->r_rwlock, RW_WRITER, INTR4(vp)))
10478 		return (EINTR);
10479 	atomic_add_int(&rp->r_inmap, 1);
10480 	nfs_rw_exit(&rp->r_rwlock);
10481 
10482 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_READER, INTR4(vp))) {
10483 		atomic_add_int(&rp->r_inmap, -1);
10484 		return (EINTR);
10485 	}
10486 
10487 
10488 	if (vp->v_flag & VNOCACHE) {
10489 		error = EAGAIN;
10490 		goto done;
10491 	}
10492 
10493 	/*
10494 	 * Don't allow concurrent locks and mapping if mandatory locking is
10495 	 * enabled.
10496 	 */
10497 	if (flk_has_remote_locks(vp)) {
10498 		struct vattr va;
10499 		va.va_mask = AT_MODE;
10500 		error = nfs4getattr(vp, &va, cr);
10501 		if (error != 0)
10502 			goto done;
10503 		if (MANDLOCK(vp, va.va_mode)) {
10504 			error = EAGAIN;
10505 			goto done;
10506 		}
10507 	}
10508 
10509 	/*
10510 	 * It is possible that the rnode has a lost lock request that we
10511 	 * are still trying to recover, and that the request conflicts with
10512 	 * this map request.
10513 	 *
10514 	 * An alternative approach would be for nfs4_safemap() to consider
10515 	 * queued lock requests when deciding whether to set or clear
10516 	 * VNOCACHE.  This would require the frlock code path to call
10517 	 * nfs4_safemap() after enqueing a lost request.
10518 	 */
10519 	if (nfs4_map_lost_lock_conflict(vp)) {
10520 		error = EAGAIN;
10521 		goto done;
10522 	}
10523 
10524 	as_rangelock(as);
10525 	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
10526 	if (error != 0) {
10527 		as_rangeunlock(as);
10528 		goto done;
10529 	}
10530 
10531 	if (vp->v_type == VREG) {
10532 		/*
10533 		 * We need to retrieve the open stream
10534 		 */
10535 		nfs4_open_stream_t	*osp = NULL;
10536 		nfs4_open_owner_t	*oop = NULL;
10537 
10538 		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10539 		if (oop != NULL) {
10540 			/* returns with 'os_sync_lock' held */
10541 			osp = find_open_stream(oop, rp);
10542 			open_owner_rele(oop);
10543 		}
10544 		if (osp == NULL) {
10545 #ifdef DEBUG
10546 			if (nfs4_force_open_before_mmap) {
10547 				error = EIO;
10548 				goto done;
10549 			}
10550 #endif
10551 			/* returns with 'os_sync_lock' held */
10552 			error = open_and_get_osp(vp, cr, &osp);
10553 			if (osp == NULL) {
10554 				NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10555 				    "nfs4_map: we tried to OPEN the file "
10556 				    "but again no osp, so fail with EIO"));
10557 				goto done;
10558 			}
10559 		}
10560 
10561 		if (osp->os_failed_reopen) {
10562 			mutex_exit(&osp->os_sync_lock);
10563 			open_stream_rele(osp, rp);
10564 			NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
10565 			    "nfs4_map: os_failed_reopen set on "
10566 			    "osp %p, cr %p, rp %s", (void *)osp,
10567 			    (void *)cr, rnode4info(rp)));
10568 			error = EIO;
10569 			goto done;
10570 		}
10571 		mutex_exit(&osp->os_sync_lock);
10572 		open_stream_rele(osp, rp);
10573 	}
10574 
10575 	vn_a.vp = vp;
10576 	vn_a.offset = off;
10577 	vn_a.type = (flags & MAP_TYPE);
10578 	vn_a.prot = (uchar_t)prot;
10579 	vn_a.maxprot = (uchar_t)maxprot;
10580 	vn_a.flags = (flags & ~MAP_TYPE);
10581 	vn_a.cred = cr;
10582 	vn_a.amp = NULL;
10583 	vn_a.szc = 0;
10584 	vn_a.lgrp_mem_policy_flags = 0;
10585 
10586 	error = as_map(as, *addrp, len, segvn_create, &vn_a);
10587 	as_rangeunlock(as);
10588 
10589 done:
10590 	nfs_rw_exit(&rp->r_lkserlock);
10591 	atomic_add_int(&rp->r_inmap, -1);
10592 	return (error);
10593 }
10594 
10595 /*
10596  * We're most likely dealing with a kernel module that likes to READ
10597  * and mmap without OPENing the file (ie: lookup/read/mmap), so lets
10598  * officially OPEN the file to create the necessary client state
10599  * for bookkeeping of os_mmap_read/write counts.
10600  *
10601  * Since VOP_MAP only passes in a pointer to the vnode rather than
10602  * a double pointer, we can't handle the case where nfs4open_otw()
10603  * returns a different vnode than the one passed into VOP_MAP (since
10604  * VOP_DELMAP will not see the vnode nfs4open_otw used).  In this case,
10605  * we return NULL and let nfs4_map() fail.  Note: the only case where
10606  * this should happen is if the file got removed and replaced with the
10607  * same name on the server (in addition to the fact that we're trying
10608  * to VOP_MAP withouth VOP_OPENing the file in the first place).
10609  */
10610 static int
10611 open_and_get_osp(vnode_t *map_vp, cred_t *cr, nfs4_open_stream_t **ospp)
10612 {
10613 	rnode4_t		*rp, *drp;
10614 	vnode_t			*dvp, *open_vp;
10615 	char			file_name[MAXNAMELEN];
10616 	int			just_created;
10617 	nfs4_open_stream_t	*osp;
10618 	nfs4_open_owner_t	*oop;
10619 	int			error;
10620 
10621 	*ospp = NULL;
10622 	open_vp = map_vp;
10623 
10624 	rp = VTOR4(open_vp);
10625 	if ((error = vtodv(open_vp, &dvp, cr, TRUE)) != 0)
10626 		return (error);
10627 	drp = VTOR4(dvp);
10628 
10629 	if (nfs_rw_enter_sig(&drp->r_rwlock, RW_READER, INTR4(dvp))) {
10630 		VN_RELE(dvp);
10631 		return (EINTR);
10632 	}
10633 
10634 	if ((error = vtoname(open_vp, file_name, MAXNAMELEN)) != 0) {
10635 		nfs_rw_exit(&drp->r_rwlock);
10636 		VN_RELE(dvp);
10637 		return (error);
10638 	}
10639 
10640 	mutex_enter(&rp->r_statev4_lock);
10641 	if (rp->created_v4) {
10642 		rp->created_v4 = 0;
10643 		mutex_exit(&rp->r_statev4_lock);
10644 
10645 		dnlc_update(dvp, file_name, open_vp);
10646 		/* This is needed so we don't bump the open ref count */
10647 		just_created = 1;
10648 	} else {
10649 		mutex_exit(&rp->r_statev4_lock);
10650 		just_created = 0;
10651 	}
10652 
10653 	VN_HOLD(map_vp);
10654 
10655 	error = nfs4open_otw(dvp, file_name, NULL, &open_vp, cr, 0, FREAD, 0,
10656 	    just_created);
10657 	if (error) {
10658 		nfs_rw_exit(&drp->r_rwlock);
10659 		VN_RELE(dvp);
10660 		VN_RELE(map_vp);
10661 		return (error);
10662 	}
10663 
10664 	nfs_rw_exit(&drp->r_rwlock);
10665 	VN_RELE(dvp);
10666 
10667 	/*
10668 	 * If nfs4open_otw() returned a different vnode then "undo"
10669 	 * the open and return failure to the caller.
10670 	 */
10671 	if (!VN_CMP(open_vp, map_vp)) {
10672 		nfs4_error_t e;
10673 
10674 		NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
10675 		    "open returned a different vnode"));
10676 		/*
10677 		 * If there's an error, ignore it,
10678 		 * and let VOP_INACTIVE handle it.
10679 		 */
10680 		(void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
10681 		    CLOSE_NORM, 0, 0, 0);
10682 		VN_RELE(map_vp);
10683 		return (EIO);
10684 	}
10685 
10686 	VN_RELE(map_vp);
10687 
10688 	oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(open_vp));
10689 	if (!oop) {
10690 		nfs4_error_t e;
10691 
10692 		NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "open_and_get_osp: "
10693 		    "no open owner"));
10694 		/*
10695 		 * If there's an error, ignore it,
10696 		 * and let VOP_INACTIVE handle it.
10697 		 */
10698 		(void) nfs4close_one(open_vp, NULL, cr, FREAD, NULL, &e,
10699 		    CLOSE_NORM, 0, 0, 0);
10700 		return (EIO);
10701 	}
10702 	osp = find_open_stream(oop, rp);
10703 	open_owner_rele(oop);
10704 	*ospp = osp;
10705 	return (0);
10706 }
10707 
10708 /*
10709  * Please be aware that when this function is called, the address space write
10710  * a_lock is held.  Do not put over the wire calls in this function.
10711  */
10712 /* ARGSUSED */
10713 static int
10714 nfs4_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
10715     size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
10716     caller_context_t *ct)
10717 {
10718 	rnode4_t		*rp;
10719 	int			error = 0;
10720 	mntinfo4_t		*mi;
10721 
10722 	mi = VTOMI4(vp);
10723 	rp = VTOR4(vp);
10724 
10725 	if (nfs_zone() != mi->mi_zone)
10726 		return (EIO);
10727 	if (vp->v_flag & VNOMAP)
10728 		return (ENOSYS);
10729 
10730 	/*
10731 	 * Don't need to update the open stream first, since this
10732 	 * mmap can't add any additional share access that isn't
10733 	 * already contained in the open stream (for the case where we
10734 	 * open/mmap/only update rp->r_mapcnt/server reboots/reopen doesn't
10735 	 * take into account os_mmap_read[write] counts).
10736 	 */
10737 	atomic_add_long((ulong_t *)&rp->r_mapcnt, btopr(len));
10738 
10739 	if (vp->v_type == VREG) {
10740 		/*
10741 		 * We need to retrieve the open stream and update the counts.
10742 		 * If there is no open stream here, something is wrong.
10743 		 */
10744 		nfs4_open_stream_t	*osp = NULL;
10745 		nfs4_open_owner_t	*oop = NULL;
10746 
10747 		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
10748 		if (oop != NULL) {
10749 			/* returns with 'os_sync_lock' held */
10750 			osp = find_open_stream(oop, rp);
10751 			open_owner_rele(oop);
10752 		}
10753 		if (osp == NULL) {
10754 			NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE,
10755 			    "nfs4_addmap: we should have an osp"
10756 			    "but we don't, so fail with EIO"));
10757 			error = EIO;
10758 			goto out;
10759 		}
10760 
10761 		NFS4_DEBUG(nfs4_mmap_debug, (CE_NOTE, "nfs4_addmap: osp %p,"
10762 		    " pages %ld, prot 0x%x", (void *)osp, btopr(len), prot));
10763 
10764 		/*
10765 		 * Update the map count in the open stream.
10766 		 * This is necessary in the case where we
10767 		 * open/mmap/close/, then the server reboots, and we
10768 		 * attempt to reopen.  If the mmap doesn't add share
10769 		 * access then we send an invalid reopen with
10770 		 * access = NONE.
10771 		 *
10772 		 * We need to specifically check each PROT_* so a mmap
10773 		 * call of (PROT_WRITE | PROT_EXEC) will ensure us both
10774 		 * read and write access.  A simple comparison of prot
10775 		 * to ~PROT_WRITE to determine read access is insufficient
10776 		 * since prot can be |= with PROT_USER, etc.
10777 		 */
10778 
10779 		/*
10780 		 * Unless we're MAP_SHARED, no sense in adding os_mmap_write
10781 		 */
10782 		if ((flags & MAP_SHARED) && (maxprot & PROT_WRITE))
10783 			osp->os_mmap_write += btopr(len);
10784 		if (maxprot & PROT_READ)
10785 			osp->os_mmap_read += btopr(len);
10786 		if (maxprot & PROT_EXEC)
10787 			osp->os_mmap_read += btopr(len);
10788 		/*
10789 		 * Ensure that os_mmap_read gets incremented, even if
10790 		 * maxprot were to look like PROT_NONE.
10791 		 */
10792 		if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
10793 		    !(maxprot & PROT_EXEC))
10794 			osp->os_mmap_read += btopr(len);
10795 		osp->os_mapcnt += btopr(len);
10796 		mutex_exit(&osp->os_sync_lock);
10797 		open_stream_rele(osp, rp);
10798 	}
10799 
10800 out:
10801 	/*
10802 	 * If we got an error, then undo our
10803 	 * incrementing of 'r_mapcnt'.
10804 	 */
10805 
10806 	if (error) {
10807 		atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(len));
10808 		ASSERT(rp->r_mapcnt >= 0);
10809 	}
10810 	return (error);
10811 }
10812 
10813 /* ARGSUSED */
10814 static int
10815 nfs4_cmp(vnode_t *vp1, vnode_t *vp2, caller_context_t *ct)
10816 {
10817 
10818 	return (VTOR4(vp1) == VTOR4(vp2));
10819 }
10820 
10821 /* ARGSUSED */
10822 static int
10823 nfs4_frlock(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
10824     offset_t offset, struct flk_callback *flk_cbp, cred_t *cr,
10825     caller_context_t *ct)
10826 {
10827 	int rc;
10828 	u_offset_t start, end;
10829 	rnode4_t *rp;
10830 	int error = 0, intr = INTR4(vp);
10831 	nfs4_error_t e;
10832 
10833 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
10834 		return (EIO);
10835 
10836 	/* check for valid cmd parameter */
10837 	if (cmd != F_GETLK && cmd != F_SETLK && cmd != F_SETLKW)
10838 		return (EINVAL);
10839 
10840 	/* Verify l_type. */
10841 	switch (bfp->l_type) {
10842 	case F_RDLCK:
10843 		if (cmd != F_GETLK && !(flag & FREAD))
10844 			return (EBADF);
10845 		break;
10846 	case F_WRLCK:
10847 		if (cmd != F_GETLK && !(flag & FWRITE))
10848 			return (EBADF);
10849 		break;
10850 	case F_UNLCK:
10851 		intr = 0;
10852 		break;
10853 
10854 	default:
10855 		return (EINVAL);
10856 	}
10857 
10858 	/* check the validity of the lock range */
10859 	if (rc = flk_convert_lock_data(vp, bfp, &start, &end, offset))
10860 		return (rc);
10861 	if (rc = flk_check_lock_data(start, end, MAXEND))
10862 		return (rc);
10863 
10864 	/*
10865 	 * If the filesystem is mounted using local locking, pass the
10866 	 * request off to the local locking code.
10867 	 */
10868 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK || vp->v_type != VREG) {
10869 		if (cmd == F_SETLK || cmd == F_SETLKW) {
10870 			/*
10871 			 * For complete safety, we should be holding
10872 			 * r_lkserlock.  However, we can't call
10873 			 * nfs4_safelock and then fs_frlock while
10874 			 * holding r_lkserlock, so just invoke
10875 			 * nfs4_safelock and expect that this will
10876 			 * catch enough of the cases.
10877 			 */
10878 			if (!nfs4_safelock(vp, bfp, cr))
10879 				return (EAGAIN);
10880 		}
10881 		return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
10882 	}
10883 
10884 	rp = VTOR4(vp);
10885 
10886 	/*
10887 	 * Check whether the given lock request can proceed, given the
10888 	 * current file mappings.
10889 	 */
10890 	if (nfs_rw_enter_sig(&rp->r_lkserlock, RW_WRITER, intr))
10891 		return (EINTR);
10892 	if (cmd == F_SETLK || cmd == F_SETLKW) {
10893 		if (!nfs4_safelock(vp, bfp, cr)) {
10894 			rc = EAGAIN;
10895 			goto done;
10896 		}
10897 	}
10898 
10899 	/*
10900 	 * Flush the cache after waiting for async I/O to finish.  For new
10901 	 * locks, this is so that the process gets the latest bits from the
10902 	 * server.  For unlocks, this is so that other clients see the
10903 	 * latest bits once the file has been unlocked.  If currently dirty
10904 	 * pages can't be flushed, then don't allow a lock to be set.  But
10905 	 * allow unlocks to succeed, to avoid having orphan locks on the
10906 	 * server.
10907 	 */
10908 	if (cmd != F_GETLK) {
10909 		mutex_enter(&rp->r_statelock);
10910 		while (rp->r_count > 0) {
10911 			if (intr) {
10912 				klwp_t *lwp = ttolwp(curthread);
10913 
10914 				if (lwp != NULL)
10915 					lwp->lwp_nostop++;
10916 				if (cv_wait_sig(&rp->r_cv,
10917 				    &rp->r_statelock) == 0) {
10918 					if (lwp != NULL)
10919 						lwp->lwp_nostop--;
10920 					rc = EINTR;
10921 					break;
10922 				}
10923 				if (lwp != NULL)
10924 					lwp->lwp_nostop--;
10925 				} else
10926 					cv_wait(&rp->r_cv, &rp->r_statelock);
10927 		}
10928 		mutex_exit(&rp->r_statelock);
10929 		if (rc != 0)
10930 			goto done;
10931 		error = nfs4_putpage(vp, (offset_t)0, 0, B_INVAL, cr, ct);
10932 		if (error) {
10933 			if (error == ENOSPC || error == EDQUOT) {
10934 				mutex_enter(&rp->r_statelock);
10935 				if (!rp->r_error)
10936 					rp->r_error = error;
10937 				mutex_exit(&rp->r_statelock);
10938 			}
10939 			if (bfp->l_type != F_UNLCK) {
10940 				rc = ENOLCK;
10941 				goto done;
10942 			}
10943 		}
10944 	}
10945 
10946 	/*
10947 	 * Call the lock manager to do the real work of contacting
10948 	 * the server and obtaining the lock.
10949 	 */
10950 	nfs4frlock(NFS4_LCK_CTYPE_NORM, vp, cmd, bfp, flag, offset,
10951 	    cr, &e, NULL, NULL);
10952 	rc = e.error;
10953 
10954 	if (rc == 0)
10955 		nfs4_lockcompletion(vp, cmd);
10956 
10957 done:
10958 	nfs_rw_exit(&rp->r_lkserlock);
10959 
10960 	return (rc);
10961 }
10962 
10963 /*
10964  * Free storage space associated with the specified vnode.  The portion
10965  * to be freed is specified by bfp->l_start and bfp->l_len (already
10966  * normalized to a "whence" of 0).
10967  *
10968  * This is an experimental facility whose continued existence is not
10969  * guaranteed.  Currently, we only support the special case
10970  * of l_len == 0, meaning free to end of file.
10971  */
10972 /* ARGSUSED */
10973 static int
10974 nfs4_space(vnode_t *vp, int cmd, struct flock64 *bfp, int flag,
10975     offset_t offset, cred_t *cr, caller_context_t *ct)
10976 {
10977 	int error;
10978 
10979 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
10980 		return (EIO);
10981 	ASSERT(vp->v_type == VREG);
10982 	if (cmd != F_FREESP)
10983 		return (EINVAL);
10984 
10985 	error = convoff(vp, bfp, 0, offset);
10986 	if (!error) {
10987 		ASSERT(bfp->l_start >= 0);
10988 		if (bfp->l_len == 0) {
10989 			struct vattr va;
10990 
10991 			va.va_mask = AT_SIZE;
10992 			va.va_size = bfp->l_start;
10993 			error = nfs4setattr(vp, &va, 0, cr, NULL);
10994 		} else
10995 			error = EINVAL;
10996 	}
10997 
10998 	return (error);
10999 }
11000 
11001 /* ARGSUSED */
11002 int
11003 nfs4_realvp(vnode_t *vp, vnode_t **vpp, caller_context_t *ct)
11004 {
11005 	rnode4_t *rp;
11006 	rp = VTOR4(vp);
11007 
11008 	if (vp->v_type == VREG && IS_SHADOW(vp, rp)) {
11009 		vp = RTOV4(rp);
11010 	}
11011 	*vpp = vp;
11012 	return (0);
11013 }
11014 
11015 /*
11016  * Setup and add an address space callback to do the work of the delmap call.
11017  * The callback will (and must be) deleted in the actual callback function.
11018  *
11019  * This is done in order to take care of the problem that we have with holding
11020  * the address space's a_lock for a long period of time (e.g. if the NFS server
11021  * is down).  Callbacks will be executed in the address space code while the
11022  * a_lock is not held.  Holding the address space's a_lock causes things such
11023  * as ps and fork to hang because they are trying to acquire this lock as well.
11024  */
11025 /* ARGSUSED */
11026 static int
11027 nfs4_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
11028     size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
11029     caller_context_t *ct)
11030 {
11031 	int			caller_found;
11032 	int			error;
11033 	rnode4_t		*rp;
11034 	nfs4_delmap_args_t	*dmapp;
11035 	nfs4_delmapcall_t	*delmap_call;
11036 
11037 	if (vp->v_flag & VNOMAP)
11038 		return (ENOSYS);
11039 
11040 	/*
11041 	 * A process may not change zones if it has NFS pages mmap'ed
11042 	 * in, so we can't legitimately get here from the wrong zone.
11043 	 */
11044 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11045 
11046 	rp = VTOR4(vp);
11047 
11048 	/*
11049 	 * The way that the address space of this process deletes its mapping
11050 	 * of this file is via the following call chains:
11051 	 * - as_free()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
11052 	 * - as_unmap()->SEGOP_UNMAP()/segvn_unmap()->VOP_DELMAP()/nfs4_delmap()
11053 	 *
11054 	 * With the use of address space callbacks we are allowed to drop the
11055 	 * address space lock, a_lock, while executing the NFS operations that
11056 	 * need to go over the wire.  Returning EAGAIN to the caller of this
11057 	 * function is what drives the execution of the callback that we add
11058 	 * below.  The callback will be executed by the address space code
11059 	 * after dropping the a_lock.  When the callback is finished, since
11060 	 * we dropped the a_lock, it must be re-acquired and segvn_unmap()
11061 	 * is called again on the same segment to finish the rest of the work
11062 	 * that needs to happen during unmapping.
11063 	 *
11064 	 * This action of calling back into the segment driver causes
11065 	 * nfs4_delmap() to get called again, but since the callback was
11066 	 * already executed at this point, it already did the work and there
11067 	 * is nothing left for us to do.
11068 	 *
11069 	 * To Summarize:
11070 	 * - The first time nfs4_delmap is called by the current thread is when
11071 	 * we add the caller associated with this delmap to the delmap caller
11072 	 * list, add the callback, and return EAGAIN.
11073 	 * - The second time in this call chain when nfs4_delmap is called we
11074 	 * will find this caller in the delmap caller list and realize there
11075 	 * is no more work to do thus removing this caller from the list and
11076 	 * returning the error that was set in the callback execution.
11077 	 */
11078 	caller_found = nfs4_find_and_delete_delmapcall(rp, &error);
11079 	if (caller_found) {
11080 		/*
11081 		 * 'error' is from the actual delmap operations.  To avoid
11082 		 * hangs, we need to handle the return of EAGAIN differently
11083 		 * since this is what drives the callback execution.
11084 		 * In this case, we don't want to return EAGAIN and do the
11085 		 * callback execution because there are none to execute.
11086 		 */
11087 		if (error == EAGAIN)
11088 			return (0);
11089 		else
11090 			return (error);
11091 	}
11092 
11093 	/* current caller was not in the list */
11094 	delmap_call = nfs4_init_delmapcall();
11095 
11096 	mutex_enter(&rp->r_statelock);
11097 	list_insert_tail(&rp->r_indelmap, delmap_call);
11098 	mutex_exit(&rp->r_statelock);
11099 
11100 	dmapp = kmem_alloc(sizeof (nfs4_delmap_args_t), KM_SLEEP);
11101 
11102 	dmapp->vp = vp;
11103 	dmapp->off = off;
11104 	dmapp->addr = addr;
11105 	dmapp->len = len;
11106 	dmapp->prot = prot;
11107 	dmapp->maxprot = maxprot;
11108 	dmapp->flags = flags;
11109 	dmapp->cr = cr;
11110 	dmapp->caller = delmap_call;
11111 
11112 	error = as_add_callback(as, nfs4_delmap_callback, dmapp,
11113 	    AS_UNMAP_EVENT, addr, len, KM_SLEEP);
11114 
11115 	return (error ? error : EAGAIN);
11116 }
11117 
11118 static nfs4_delmapcall_t *
11119 nfs4_init_delmapcall()
11120 {
11121 	nfs4_delmapcall_t	*delmap_call;
11122 
11123 	delmap_call = kmem_alloc(sizeof (nfs4_delmapcall_t), KM_SLEEP);
11124 	delmap_call->call_id = curthread;
11125 	delmap_call->error = 0;
11126 
11127 	return (delmap_call);
11128 }
11129 
11130 static void
11131 nfs4_free_delmapcall(nfs4_delmapcall_t *delmap_call)
11132 {
11133 	kmem_free(delmap_call, sizeof (nfs4_delmapcall_t));
11134 }
11135 
11136 /*
11137  * Searches for the current delmap caller (based on curthread) in the list of
11138  * callers.  If it is found, we remove it and free the delmap caller.
11139  * Returns:
11140  *      0 if the caller wasn't found
11141  *      1 if the caller was found, removed and freed.  *errp will be set
11142  *	to what the result of the delmap was.
11143  */
11144 static int
11145 nfs4_find_and_delete_delmapcall(rnode4_t *rp, int *errp)
11146 {
11147 	nfs4_delmapcall_t	*delmap_call;
11148 
11149 	/*
11150 	 * If the list doesn't exist yet, we create it and return
11151 	 * that the caller wasn't found.  No list = no callers.
11152 	 */
11153 	mutex_enter(&rp->r_statelock);
11154 	if (!(rp->r_flags & R4DELMAPLIST)) {
11155 		/* The list does not exist */
11156 		list_create(&rp->r_indelmap, sizeof (nfs4_delmapcall_t),
11157 		    offsetof(nfs4_delmapcall_t, call_node));
11158 		rp->r_flags |= R4DELMAPLIST;
11159 		mutex_exit(&rp->r_statelock);
11160 		return (0);
11161 	} else {
11162 		/* The list exists so search it */
11163 		for (delmap_call = list_head(&rp->r_indelmap);
11164 		    delmap_call != NULL;
11165 		    delmap_call = list_next(&rp->r_indelmap, delmap_call)) {
11166 			if (delmap_call->call_id == curthread) {
11167 				/* current caller is in the list */
11168 				*errp = delmap_call->error;
11169 				list_remove(&rp->r_indelmap, delmap_call);
11170 				mutex_exit(&rp->r_statelock);
11171 				nfs4_free_delmapcall(delmap_call);
11172 				return (1);
11173 			}
11174 		}
11175 	}
11176 	mutex_exit(&rp->r_statelock);
11177 	return (0);
11178 }
11179 
11180 /*
11181  * Remove some pages from an mmap'd vnode.  Just update the
11182  * count of pages.  If doing close-to-open, then flush and
11183  * commit all of the pages associated with this file.
11184  * Otherwise, start an asynchronous page flush to write out
11185  * any dirty pages.  This will also associate a credential
11186  * with the rnode which can be used to write the pages.
11187  */
11188 /* ARGSUSED */
11189 static void
11190 nfs4_delmap_callback(struct as *as, void *arg, uint_t event)
11191 {
11192 	nfs4_error_t		e = { 0, NFS4_OK, RPC_SUCCESS };
11193 	rnode4_t		*rp;
11194 	mntinfo4_t		*mi;
11195 	nfs4_delmap_args_t	*dmapp = (nfs4_delmap_args_t *)arg;
11196 
11197 	rp = VTOR4(dmapp->vp);
11198 	mi = VTOMI4(dmapp->vp);
11199 
11200 	atomic_add_long((ulong_t *)&rp->r_mapcnt, -btopr(dmapp->len));
11201 	ASSERT(rp->r_mapcnt >= 0);
11202 
11203 	/*
11204 	 * Initiate a page flush and potential commit if there are
11205 	 * pages, the file system was not mounted readonly, the segment
11206 	 * was mapped shared, and the pages themselves were writeable.
11207 	 */
11208 	if (nfs4_has_pages(dmapp->vp) &&
11209 	    !(dmapp->vp->v_vfsp->vfs_flag & VFS_RDONLY) &&
11210 	    dmapp->flags == MAP_SHARED && (dmapp->maxprot & PROT_WRITE)) {
11211 		mutex_enter(&rp->r_statelock);
11212 		rp->r_flags |= R4DIRTY;
11213 		mutex_exit(&rp->r_statelock);
11214 		e.error = nfs4_putpage_commit(dmapp->vp, dmapp->off,
11215 		    dmapp->len, dmapp->cr);
11216 		if (!e.error) {
11217 			mutex_enter(&rp->r_statelock);
11218 			e.error = rp->r_error;
11219 			rp->r_error = 0;
11220 			mutex_exit(&rp->r_statelock);
11221 		}
11222 	} else
11223 		e.error = 0;
11224 
11225 	if ((rp->r_flags & R4DIRECTIO) || (mi->mi_flags & MI4_DIRECTIO))
11226 		(void) nfs4_putpage(dmapp->vp, dmapp->off, dmapp->len,
11227 		    B_INVAL, dmapp->cr, NULL);
11228 
11229 	if (e.error) {
11230 		e.stat = puterrno4(e.error);
11231 		nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
11232 		    OP_COMMIT, FALSE, NULL, 0, dmapp->vp);
11233 		dmapp->caller->error = e.error;
11234 	}
11235 
11236 	/* Check to see if we need to close the file */
11237 
11238 	if (dmapp->vp->v_type == VREG) {
11239 		nfs4close_one(dmapp->vp, NULL, dmapp->cr, 0, NULL, &e,
11240 		    CLOSE_DELMAP, dmapp->len, dmapp->maxprot, dmapp->flags);
11241 
11242 		if (e.error != 0 || e.stat != NFS4_OK) {
11243 			/*
11244 			 * Since it is possible that e.error == 0 and
11245 			 * e.stat != NFS4_OK (and vice versa),
11246 			 * we do the proper checking in order to get both
11247 			 * e.error and e.stat reporting the correct info.
11248 			 */
11249 			if (e.stat == NFS4_OK)
11250 				e.stat = puterrno4(e.error);
11251 			if (e.error == 0)
11252 				e.error = geterrno4(e.stat);
11253 
11254 			nfs4_queue_fact(RF_DELMAP_CB_ERR, mi, e.stat, 0,
11255 			    OP_CLOSE, FALSE, NULL, 0, dmapp->vp);
11256 			dmapp->caller->error = e.error;
11257 		}
11258 	}
11259 
11260 	(void) as_delete_callback(as, arg);
11261 	kmem_free(dmapp, sizeof (nfs4_delmap_args_t));
11262 }
11263 
11264 
/*
 * Convert a maximum file size into a bit count: the 1-based position of
 * the most significant set bit of ll (1 -> 1, 2 and 3 -> 2, 4 -> 3, ...),
 * or 0 when ll is 0.
 */
static uint_t
fattr4_maxfilesize_to_bits(uint64_t ll)
{
	uint_t nbits;
	uint_t width;

	if (ll == 0)
		return (0);

	/*
	 * Binary search for the top set bit: at each step, if anything
	 * is set above the low `width' bits, account for those bits and
	 * shift them away.
	 */
	nbits = 1;
	for (width = 32; width != 0; width >>= 1) {
		if ((ll >> width) != 0) {
			nbits += width;
			ll >>= width;
		}
	}

	return (nbits);
}
11294 
11295 static int
11296 nfs4_have_xattrs(vnode_t *vp, ulong_t *valp, cred_t *cr)
11297 {
11298 	vnode_t *avp = NULL;
11299 	int error;
11300 
11301 	if ((error = nfs4lookup_xattr(vp, "", &avp,
11302 	    LOOKUP_XATTR, cr)) == 0)
11303 		error = do_xattr_exists_check(avp, valp, cr);
11304 	if (avp)
11305 		VN_RELE(avp);
11306 
11307 	return (error);
11308 }
11309 
11310 /* ARGSUSED */
11311 int
11312 nfs4_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
11313 	caller_context_t *ct)
11314 {
11315 	int error;
11316 	hrtime_t t;
11317 	rnode4_t *rp;
11318 	nfs4_ga_res_t gar;
11319 	nfs4_ga_ext_res_t ger;
11320 
11321 	gar.n4g_ext_res = &ger;
11322 
11323 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
11324 		return (EIO);
11325 	if (cmd == _PC_PATH_MAX || cmd == _PC_SYMLINK_MAX) {
11326 		*valp = MAXPATHLEN;
11327 		return (0);
11328 	}
11329 	if (cmd == _PC_ACL_ENABLED) {
11330 		*valp = _ACL_ACE_ENABLED;
11331 		return (0);
11332 	}
11333 
11334 	rp = VTOR4(vp);
11335 	if (cmd == _PC_XATTR_EXISTS) {
11336 		/*
11337 		 * The existence of the xattr directory is not sufficient
11338 		 * for determining whether generic user attributes exists.
11339 		 * The attribute directory could only be a transient directory
11340 		 * used for Solaris sysattr support.  Do a small readdir
11341 		 * to verify if the only entries are sysattrs or not.
11342 		 *
11343 		 * pc4_xattr_valid can be only be trusted when r_xattr_dir
11344 		 * is NULL.  Once the xadir vp exists, we can create xattrs,
11345 		 * and we don't have any way to update the "base" object's
11346 		 * pc4_xattr_exists from the xattr or xadir.  Maybe FEM
11347 		 * could help out.
11348 		 */
11349 		if (ATTRCACHE4_VALID(vp) && rp->r_pathconf.pc4_xattr_valid &&
11350 		    rp->r_xattr_dir == NULL) {
11351 			return (nfs4_have_xattrs(vp, valp, cr));
11352 		}
11353 	} else {  /* OLD CODE */
11354 		if (ATTRCACHE4_VALID(vp)) {
11355 			mutex_enter(&rp->r_statelock);
11356 			if (rp->r_pathconf.pc4_cache_valid) {
11357 				error = 0;
11358 				switch (cmd) {
11359 				case _PC_FILESIZEBITS:
11360 					*valp =
11361 					    rp->r_pathconf.pc4_filesizebits;
11362 					break;
11363 				case _PC_LINK_MAX:
11364 					*valp =
11365 					    rp->r_pathconf.pc4_link_max;
11366 					break;
11367 				case _PC_NAME_MAX:
11368 					*valp =
11369 					    rp->r_pathconf.pc4_name_max;
11370 					break;
11371 				case _PC_CHOWN_RESTRICTED:
11372 					*valp =
11373 					    rp->r_pathconf.pc4_chown_restricted;
11374 					break;
11375 				case _PC_NO_TRUNC:
11376 					*valp =
11377 					    rp->r_pathconf.pc4_no_trunc;
11378 					break;
11379 				default:
11380 					error = EINVAL;
11381 					break;
11382 				}
11383 				mutex_exit(&rp->r_statelock);
11384 #ifdef DEBUG
11385 				nfs4_pathconf_cache_hits++;
11386 #endif
11387 				return (error);
11388 			}
11389 			mutex_exit(&rp->r_statelock);
11390 		}
11391 	}
11392 #ifdef DEBUG
11393 	nfs4_pathconf_cache_misses++;
11394 #endif
11395 
11396 	t = gethrtime();
11397 
11398 	error = nfs4_attr_otw(vp, TAG_PATHCONF, &gar, NFS4_PATHCONF_MASK, cr);
11399 
11400 	if (error) {
11401 		mutex_enter(&rp->r_statelock);
11402 		rp->r_pathconf.pc4_cache_valid = FALSE;
11403 		rp->r_pathconf.pc4_xattr_valid = FALSE;
11404 		mutex_exit(&rp->r_statelock);
11405 		return (error);
11406 	}
11407 
11408 	/* interpret the max filesize */
11409 	gar.n4g_ext_res->n4g_pc4.pc4_filesizebits =
11410 	    fattr4_maxfilesize_to_bits(gar.n4g_ext_res->n4g_maxfilesize);
11411 
11412 	/* Store the attributes we just received */
11413 	nfs4_attr_cache(vp, &gar, t, cr, TRUE, NULL);
11414 
11415 	switch (cmd) {
11416 	case _PC_FILESIZEBITS:
11417 		*valp = gar.n4g_ext_res->n4g_pc4.pc4_filesizebits;
11418 		break;
11419 	case _PC_LINK_MAX:
11420 		*valp = gar.n4g_ext_res->n4g_pc4.pc4_link_max;
11421 		break;
11422 	case _PC_NAME_MAX:
11423 		*valp = gar.n4g_ext_res->n4g_pc4.pc4_name_max;
11424 		break;
11425 	case _PC_CHOWN_RESTRICTED:
11426 		*valp = gar.n4g_ext_res->n4g_pc4.pc4_chown_restricted;
11427 		break;
11428 	case _PC_NO_TRUNC:
11429 		*valp = gar.n4g_ext_res->n4g_pc4.pc4_no_trunc;
11430 		break;
11431 	case _PC_XATTR_EXISTS:
11432 		if (gar.n4g_ext_res->n4g_pc4.pc4_xattr_exists) {
11433 			if (error = nfs4_have_xattrs(vp, valp, cr))
11434 				return (error);
11435 		}
11436 		break;
11437 	default:
11438 		return (EINVAL);
11439 	}
11440 
11441 	return (0);
11442 }
11443 
11444 /*
11445  * Called by async thread to do synchronous pageio. Do the i/o, wait
11446  * for it to complete, and cleanup the page list when done.
11447  */
11448 static int
11449 nfs4_sync_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11450     int flags, cred_t *cr)
11451 {
11452 	int error;
11453 
11454 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11455 
11456 	error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
11457 	if (flags & B_READ)
11458 		pvn_read_done(pp, (error ? B_ERROR : 0) | flags);
11459 	else
11460 		pvn_write_done(pp, (error ? B_ERROR : 0) | flags);
11461 	return (error);
11462 }
11463 
11464 /* ARGSUSED */
11465 static int
11466 nfs4_pageio(vnode_t *vp, page_t *pp, u_offset_t io_off, size_t io_len,
11467 	int flags, cred_t *cr, caller_context_t *ct)
11468 {
11469 	int error;
11470 	rnode4_t *rp;
11471 
11472 	if (!(flags & B_ASYNC) && nfs_zone() != VTOMI4(vp)->mi_zone)
11473 		return (EIO);
11474 
11475 	if (pp == NULL)
11476 		return (EINVAL);
11477 
11478 	rp = VTOR4(vp);
11479 	mutex_enter(&rp->r_statelock);
11480 	rp->r_count++;
11481 	mutex_exit(&rp->r_statelock);
11482 
11483 	if (flags & B_ASYNC) {
11484 		error = nfs4_async_pageio(vp, pp, io_off, io_len, flags, cr,
11485 		    nfs4_sync_pageio);
11486 	} else
11487 		error = nfs4_rdwrlbn(vp, pp, io_off, io_len, flags, cr);
11488 	mutex_enter(&rp->r_statelock);
11489 	rp->r_count--;
11490 	cv_broadcast(&rp->r_cv);
11491 	mutex_exit(&rp->r_statelock);
11492 	return (error);
11493 }
11494 
/*
 * Free (fl == B_FREE) or destroy (fl == B_INVAL) page pp of vnode vp.
 *
 * A page that needs no commit, belongs to a stale rnode, or lies at or
 * beyond the offset of an in-progress truncation is freed or destroyed
 * right away.  Otherwise the page, together with whatever other
 * committable pages nfs4_get_commit() can gather, is committed on the
 * server first; pageout, fsflush and threads in a zone other than the
 * mount's zone hand that commit to nfs4_async_commit() instead.
 *
 * The page is only unlocked, and left to be dealt with later, when
 * another thread is already waiting to commit, when the page was marked
 * for a delayed commit, when a signal is pending, when the commit fails,
 * or when the page was modified again while the commit was in flight.
 */
11495 /* ARGSUSED */
11496 static void
11497 nfs4_dispose(vnode_t *vp, page_t *pp, int fl, int dn, cred_t *cr,
11498 	caller_context_t *ct)
11499 {
11500 	int error;
11501 	rnode4_t *rp;
11502 	page_t *plist;
11503 	page_t *pptr;
11504 	offset3 offset;
11505 	count3 len;
11506 	k_sigset_t smask;
11507 
11508 	/*
11509 	 * We should get called with fl equal to either B_FREE or
11510 	 * B_INVAL.  Any other value is illegal.
11511 	 *
11512 	 * The page that we are either supposed to free or destroy
11513 	 * should be exclusive locked and its io lock should not
11514 	 * be held.
11515 	 */
11516 	ASSERT(fl == B_FREE || fl == B_INVAL);
11517 	ASSERT((PAGE_EXCL(pp) && !page_iolock_assert(pp)) || panicstr);
11518 
11519 	rp = VTOR4(vp);
11520 
11521 	/*
11522 	 * If the page doesn't need to be committed or we shouldn't
11523 	 * even bother attempting to commit it, then just make sure
11524 	 * that the p_fsdata byte is clear and then either free or
11525 	 * destroy the page as appropriate.
11526 	 */
11527 	if (pp->p_fsdata == C_NOCOMMIT || (rp->r_flags & R4STALE)) {
11528 		pp->p_fsdata = C_NOCOMMIT;
11529 		if (fl == B_FREE)
11530 			page_free(pp, dn);
11531 		else
11532 			page_destroy(pp, dn);
11533 		return;
11534 	}
11535 
11536 	/*
11537 	 * If there is a page invalidation operation going on, then
11538 	 * if this is one of the pages being destroyed, then just
11539 	 * clear the p_fsdata byte and then either free or destroy
11540 	 * the page as appropriate.
11541 	 */
11542 	mutex_enter(&rp->r_statelock);
11543 	if ((rp->r_flags & R4TRUNCATE) && pp->p_offset >= rp->r_truncaddr) {
11544 		mutex_exit(&rp->r_statelock);
11545 		pp->p_fsdata = C_NOCOMMIT;
11546 		if (fl == B_FREE)
11547 			page_free(pp, dn);
11548 		else
11549 			page_destroy(pp, dn);
11550 		return;
11551 	}
11552 
11553 	/*
11554 	 * If we are freeing this page and someone else is already
11555 	 * waiting to do a commit, then just unlock the page and
11556 	 * return.  That other thread will take care of committing
11557 	 * this page.  The page can be freed sometime after the
11558 	 * commit has finished.  Otherwise, if the page is marked
11559 	 * as delay commit, then we may be getting called from
11560 	 * pvn_write_done, one page at a time.   This could result
11561 	 * in one commit per page, so we end up doing lots of small
11562 	 * commits instead of fewer larger commits.  This is bad,
11563 	 * we want to do as few commits as possible.
11564 	 */
11565 	if (fl == B_FREE) {
11566 		if (rp->r_flags & R4COMMITWAIT) {
11567 			page_unlock(pp);
11568 			mutex_exit(&rp->r_statelock);
11569 			return;
11570 		}
11571 		if (pp->p_fsdata == C_DELAYCOMMIT) {
11572 			pp->p_fsdata = C_COMMIT;
11573 			page_unlock(pp);
11574 			mutex_exit(&rp->r_statelock);
11575 			return;
11576 		}
11577 	}
11578 
11579 	/*
11580 	 * Check to see if there is a signal which would prevent an
11581 	 * attempt to commit the pages from being successful.  If so,
11582 	 * then don't bother with all of the work to gather pages and
11583 	 * generate the unsuccessful RPC.  Just return from here and
11584 	 * let the page be committed at some later time.
11585 	 */
11586 	sigintr(&smask, VTOMI4(vp)->mi_flags & MI4_INT);
11587 	if (ttolwp(curthread) != NULL && ISSIG(curthread, JUSTLOOKING)) {
11588 		sigunintr(&smask);
11589 		page_unlock(pp);
11590 		mutex_exit(&rp->r_statelock);
11591 		return;
11592 	}
11593 	sigunintr(&smask);
11594 
11595 	/*
11596 	 * We are starting to need to commit pages, so let's try
11597 	 * to commit as many as possible at once to reduce the
11598 	 * overhead.
11599 	 *
11600 	 * Set the `commit inprogress' state bit.  We must
11601 	 * first wait until any current one finishes.  Then
11602 	 * we initialize the c_pages list with this page.
11603 	 */
11604 	while (rp->r_flags & R4COMMIT) {
11605 		rp->r_flags |= R4COMMITWAIT;
11606 		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
11607 		rp->r_flags &= ~R4COMMITWAIT;
11608 	}
11609 	rp->r_flags |= R4COMMIT;
11610 	mutex_exit(&rp->r_statelock);
11611 	ASSERT(rp->r_commit.c_pages == NULL);
11612 	rp->r_commit.c_pages = pp;
11613 	rp->r_commit.c_commbase = (offset3)pp->p_offset;
11614 	rp->r_commit.c_commlen = PAGESIZE;
11615 
11616 	/*
11617 	 * Gather together all other pages which can be committed.
11618 	 * They will all be chained off r_commit.c_pages.
11619 	 */
11620 	nfs4_get_commit(vp);
11621 
11622 	/*
11623 	 * Clear the `commit inprogress' status and disconnect
11624 	 * the list of pages to be committed from the rnode.
11625 	 * At this same time, we also save the starting offset
11626 	 * and length of data to be committed on the server.
11627 	 */
11628 	plist = rp->r_commit.c_pages;
11629 	rp->r_commit.c_pages = NULL;
11630 	offset = rp->r_commit.c_commbase;
11631 	len = rp->r_commit.c_commlen;
11632 	mutex_enter(&rp->r_statelock);
11633 	rp->r_flags &= ~R4COMMIT;
11634 	cv_broadcast(&rp->r_commit.c_cv);
11635 	mutex_exit(&rp->r_statelock);
11636 
	/*
	 * pageout, fsflush and callers in a zone other than the mount's
	 * zone do not issue the commit themselves; the gathered page
	 * list is passed to nfs4_async_commit() instead.
	 */
11637 	if (curproc == proc_pageout || curproc == proc_fsflush ||
11638 	    nfs_zone() != VTOMI4(vp)->mi_zone) {
11639 		nfs4_async_commit(vp, plist, offset, len,
11640 		    cr, do_nfs4_async_commit);
11641 		return;
11642 	}
11643 
11644 	/*
11645 	 * Actually generate the COMMIT op over the wire operation.
11646 	 */
11647 	error = nfs4_commit(vp, (offset4)offset, (count4)len, cr);
11648 
11649 	/*
11650 	 * If we got an error during the commit, just unlock all
11651 	 * of the pages.  The pages will get retransmitted to the
11652 	 * server during a putpage operation.
11653 	 */
11654 	if (error) {
11655 		while (plist != NULL) {
11656 			pptr = plist;
11657 			page_sub(&plist, pptr);
11658 			page_unlock(pptr);
11659 		}
11660 		return;
11661 	}
11662 
11663 	/*
11664 	 * We've tried as hard as we can to commit the data to stable
11665 	 * storage on the server.  We just unlock the rest of the pages
11666 	 * and clear the commit required state.  They will be put
11667 	 * onto the tail of the cachelist if they are no longer
11668 	 * mapped.
11669 	 */
11670 	while (plist != pp) {
11671 		pptr = plist;
11672 		page_sub(&plist, pptr);
11673 		pptr->p_fsdata = C_NOCOMMIT;
11674 		page_unlock(pptr);
11675 	}
11676 
11677 	/*
11678 	 * It is possible that nfs4_commit didn't return error but
11679 	 * some other thread has modified the page we are going
11680 	 * to free/destroy.
11681 	 *    In this case we need to rewrite the page. Do an explicit check
11682 	 * before attempting to free/destroy the page. If modified, needs to
11683 	 * be rewritten so unlock the page and return.
11684 	 */
11685 	if (hat_ismod(pp)) {
11686 		pp->p_fsdata = C_NOCOMMIT;
11687 		page_unlock(pp);
11688 		return;
11689 	}
11690 
11691 	/*
11692 	 * Now, as appropriate, either free or destroy the page
11693 	 * that we were called with.
11694 	 */
11695 	pp->p_fsdata = C_NOCOMMIT;
11696 	if (fl == B_FREE)
11697 		page_free(pp, dn);
11698 	else
11699 		page_destroy(pp, dn);
11700 }
11701 
11702 /*
11703  * Commit requires that the current fh be the file written to.
11704  * The compound op structure is:
11705  *      PUTFH(file), COMMIT
 *
 * Asks the server to commit count bytes starting at offset of vp.
 *
 * Returns:
 *	0 if the COMMIT succeeded and the write verifier it returned
 *	  matches the one saved in the rnode
 *	NFS_VERF_MISMATCH if the COMMIT succeeded but the verifier
 *	  differs; the vnode's pages have then been marked through
 *	  nfs4_set_mod() and the new verifier saved in the rnode
 *	otherwise the error from nfs4_start_fop(), from the call itself,
 *	  or the errno that corresponds to the compound's status
 *
 * An EACCES failure is retried with the next over-the-wire credential
 * until nfs4_get_otw_cred_by_osp() reports that the last one was used.
11706  */
11707 static int
11708 nfs4_commit(vnode_t *vp, offset4 offset, count4 count, cred_t *cr)
11709 {
11710 	COMPOUND4args_clnt args;
11711 	COMPOUND4res_clnt res;
11712 	COMMIT4res *cm_res;
11713 	nfs_argop4 argop[2];
11714 	nfs_resop4 *resop;
11715 	int doqueue;
11716 	mntinfo4_t *mi;
11717 	rnode4_t *rp;
11718 	cred_t *cred_otw = NULL;
11719 	bool_t needrecov = FALSE;
11720 	nfs4_recov_state_t recov_state;
11721 	nfs4_open_stream_t *osp = NULL;
11722 	bool_t first_time = TRUE;	/* first time getting OTW cred */
11723 	bool_t last_time = FALSE;	/* last time getting OTW cred */
11724 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
11725 
11726 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11727 
11728 	rp = VTOR4(vp);
11729 
11730 	mi = VTOMI4(vp);
11731 	recov_state.rs_flags = 0;
11732 	recov_state.rs_num_retry_despite_err = 0;
11733 get_commit_cred:
11734 	/*
11735 	 * Releases the osp, if a valid open stream is provided.
11736 	 * Puts a hold on the cred_otw and the new osp (if found).
11737 	 */
11738 	cred_otw = nfs4_get_otw_cred_by_osp(rp, cr, &osp,
11739 	    &first_time, &last_time);
11740 	args.ctag = TAG_COMMIT;
11741 recov_retry:
11742 	/*
11743 	 * Commit ops: putfh file; commit
11744 	 */
11745 	args.array_len = 2;
11746 	args.array = argop;
11747 
11748 	e.error = nfs4_start_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11749 	    &recov_state, NULL);
11750 	if (e.error) {
11751 		crfree(cred_otw);
11752 		if (osp != NULL)
11753 			open_stream_rele(osp, rp);
11754 		return (e.error);
11755 	}
11756 
11757 	/* putfh file */
11758 	argop[0].argop = OP_CPUTFH;
11759 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
11760 
11761 	/* commit */
11762 	argop[1].argop = OP_COMMIT;
11763 	argop[1].nfs_argop4_u.opcommit.offset = offset;
11764 	argop[1].nfs_argop4_u.opcommit.count = count;
11765 
11766 	doqueue = 1;
11767 	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, &e);
11768 
11769 	needrecov = nfs4_needs_recovery(&e, FALSE, mi->mi_vfsp);
	/*
	 * The call failed and recovery will not help.  Give up, unless
	 * the failure was EACCES and another credential is left to try.
	 */
11770 	if (!needrecov && e.error) {
11771 		nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state,
11772 		    needrecov);
11773 		crfree(cred_otw);
11774 		if (e.error == EACCES && last_time == FALSE)
11775 			goto get_commit_cred;
11776 		if (osp != NULL)
11777 			open_stream_rele(osp, rp);
11778 		return (e.error);
11779 	}
11780 
11781 	if (needrecov) {
		/*
		 * When nfs4_start_recovery() returns FALSE the operation
		 * is retried from recov_retry with the same credential.
		 */
11782 		if (nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
11783 		    NULL, OP_COMMIT, NULL, NULL, NULL) == FALSE) {
11784 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11785 			    &recov_state, needrecov);
11786 			if (!e.error)
11787 				(void) xdr_free(xdr_COMPOUND4res_clnt,
11788 				    (caddr_t)&res);
11789 			goto recov_retry;
11790 		}
11791 		if (e.error) {
11792 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11793 			    &recov_state, needrecov);
11794 			crfree(cred_otw);
11795 			if (osp != NULL)
11796 				open_stream_rele(osp, rp);
11797 			return (e.error);
11798 		}
11799 		/* fall through for res.status case */
11800 	}
11801 
11802 	if (res.status) {
11803 		e.error = geterrno4(res.status);
11804 		if (e.error == EACCES && last_time == FALSE) {
11805 			crfree(cred_otw);
11806 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11807 			    &recov_state, needrecov);
11808 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11809 			goto get_commit_cred;
11810 		}
11811 		/*
11812 		 * Can't do a nfs4_purge_stale_fh here because this
11813 		 * can cause a deadlock.  nfs4_commit can
11814 		 * be called from nfs4_dispose which can be called
11815 		 * indirectly via pvn_vplist_dirty.  nfs4_purge_stale_fh
11816 		 * can call back to pvn_vplist_dirty.
11817 		 */
11818 		if (e.error == ESTALE) {
11819 			mutex_enter(&rp->r_statelock);
11820 			rp->r_flags |= R4STALE;
11821 			if (!rp->r_error)
11822 				rp->r_error = e.error;
11823 			mutex_exit(&rp->r_statelock);
11824 			PURGE_ATTRCACHE4(vp);
11825 		} else {
11826 			mutex_enter(&rp->r_statelock);
11827 			if (!rp->r_error)
11828 				rp->r_error = e.error;
11829 			mutex_exit(&rp->r_statelock);
11830 		}
11831 	} else {
11832 		ASSERT(rp->r_flags & R4HAVEVERF);
11833 		resop = &res.array[1];	/* commit res */
11834 		cm_res = &resop->nfs_resop4_u.opcommit;
11835 		mutex_enter(&rp->r_statelock);
		/* verifier unchanged: the commit is good, clean up and return */
11836 		if (cm_res->writeverf == rp->r_writeverf) {
11837 			mutex_exit(&rp->r_statelock);
11838 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11839 			nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT,
11840 			    &recov_state, needrecov);
11841 			crfree(cred_otw);
11842 			if (osp != NULL)
11843 				open_stream_rele(osp, rp);
11844 			return (0);
11845 		}
		/*
		 * The verifier changed: mark the pages via nfs4_set_mod(),
		 * remember the new verifier and report the mismatch.
		 */
11846 		nfs4_set_mod(vp);
11847 		rp->r_writeverf = cm_res->writeverf;
11848 		mutex_exit(&rp->r_statelock);
11849 		e.error = NFS_VERF_MISMATCH;
11850 	}
11851 
11852 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
11853 	nfs4_end_fop(VTOMI4(vp), vp, NULL, OH_COMMIT, &recov_state, needrecov);
11854 	crfree(cred_otw);
11855 	if (osp != NULL)
11856 		open_stream_rele(osp, rp);
11857 
11858 	return (e.error);
11859 }
11860 
11861 static void
11862 nfs4_set_mod(vnode_t *vp)
11863 {
11864 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11865 
11866 	/* make sure we're looking at the master vnode, not a shadow */
11867 	pvn_vplist_setdirty(RTOV4(VTOR4(vp)), nfs_setmod_check);
11868 }
11869 
11870 /*
11871  * This function is used to gather a page list of the pages which
11872  * can be committed on the server.
11873  *
11874  * The calling thread must have set R4COMMIT.  This bit is used to
11875  * serialize access to the commit structure in the rnode.  As long
11876  * as the thread has set R4COMMIT, then it can manipulate the commit
11877  * structure without requiring any other locks.
11878  *
11879  * When this function is called from nfs4_dispose() the page passed
11880  * into nfs4_dispose() will be SE_EXCL locked, and so this function
11881  * will skip it. This is not a problem since we initially add the
11882  * page to the r_commit page list.
11883  *
11884  */
11885 static void
11886 nfs4_get_commit(vnode_t *vp)
11887 {
11888 	rnode4_t *rp;
11889 	page_t *pp;
11890 	kmutex_t *vphm;
11891 
11892 	rp = VTOR4(vp);
11893 
11894 	ASSERT(rp->r_flags & R4COMMIT);
11895 
11896 	/* make sure we're looking at the master vnode, not a shadow */
11897 
11898 	if (IS_SHADOW(vp, rp))
11899 		vp = RTOV4(rp);
11900 
11901 	vphm = page_vnode_mutex(vp);
11902 	mutex_enter(vphm);
11903 
11904 	/*
11905 	 * If there are no pages associated with this vnode, then
11906 	 * just return.
11907 	 */
11908 	if ((pp = vp->v_pages) == NULL) {
11909 		mutex_exit(vphm);
11910 		return;
11911 	}
11912 
11913 	/*
11914 	 * Step through all of the pages associated with this vnode
11915 	 * looking for pages which need to be committed.
11916 	 */
11917 	do {
11918 		/* Skip marker pages. */
11919 		if (pp->p_hash == PVN_VPLIST_HASH_TAG)
11920 			continue;
11921 
11922 		/*
11923 		 * First short-cut everything (without the page_lock)
11924 		 * and see if this page does not need to be committed
11925 		 * or is modified if so then we'll just skip it.
11926 		 */
11927 		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp))
11928 			continue;
11929 
11930 		/*
11931 		 * Attempt to lock the page.  If we can't, then
11932 		 * someone else is messing with it or we have been
11933 		 * called from nfs4_dispose and this is the page that
11934 		 * nfs4_dispose was called with.. anyway just skip it.
11935 		 */
11936 		if (!page_trylock(pp, SE_EXCL))
11937 			continue;
11938 
11939 		/*
11940 		 * Lets check again now that we have the page lock.
11941 		 */
11942 		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
11943 			page_unlock(pp);
11944 			continue;
11945 		}
11946 
11947 		/* this had better not be a free page */
11948 		ASSERT(PP_ISFREE(pp) == 0);
11949 
11950 		/*
11951 		 * The page needs to be committed and we locked it.
11952 		 * Update the base and length parameters and add it
11953 		 * to r_pages.
11954 		 */
11955 		if (rp->r_commit.c_pages == NULL) {
11956 			rp->r_commit.c_commbase = (offset3)pp->p_offset;
11957 			rp->r_commit.c_commlen = PAGESIZE;
11958 		} else if (pp->p_offset < rp->r_commit.c_commbase) {
11959 			rp->r_commit.c_commlen = rp->r_commit.c_commbase -
11960 			    (offset3)pp->p_offset + rp->r_commit.c_commlen;
11961 			rp->r_commit.c_commbase = (offset3)pp->p_offset;
11962 		} else if ((rp->r_commit.c_commbase + rp->r_commit.c_commlen)
11963 		    <= pp->p_offset) {
11964 			rp->r_commit.c_commlen = (offset3)pp->p_offset -
11965 			    rp->r_commit.c_commbase + PAGESIZE;
11966 		}
11967 		page_add(&rp->r_commit.c_pages, pp);
11968 	} while ((pp = pp->p_vpnext) != vp->v_pages);
11969 
11970 	mutex_exit(vphm);
11971 }
11972 
11973 /*
11974  * This routine is used to gather together a page list of the pages
11975  * which are to be committed on the server.  This routine must not
11976  * be called if the calling thread holds any locked pages.
11977  *
11978  * The calling thread must have set R4COMMIT.  This bit is used to
11979  * serialize access to the commit structure in the rnode.  As long
11980  * as the thread has set R4COMMIT, then it can manipulate the commit
11981  * structure without requiring any other locks.
11982  */
11983 static void
11984 nfs4_get_commit_range(vnode_t *vp, u_offset_t soff, size_t len)
11985 {
11986 
11987 	rnode4_t *rp;
11988 	page_t *pp;
11989 	u_offset_t end;
11990 	u_offset_t off;
11991 	ASSERT(len != 0);
11992 	rp = VTOR4(vp);
11993 	ASSERT(rp->r_flags & R4COMMIT);
11994 
11995 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
11996 
11997 	/* make sure we're looking at the master vnode, not a shadow */
11998 
11999 	if (IS_SHADOW(vp, rp))
12000 		vp = RTOV4(rp);
12001 
12002 	/*
12003 	 * If there are no pages associated with this vnode, then
12004 	 * just return.
12005 	 */
12006 	if ((pp = vp->v_pages) == NULL)
12007 		return;
12008 	/*
12009 	 * Calculate the ending offset.
12010 	 */
12011 	end = soff + len;
12012 	for (off = soff; off < end; off += PAGESIZE) {
12013 		/*
12014 		 * Lookup each page by vp, offset.
12015 		 */
12016 		if ((pp = page_lookup_nowait(vp, off, SE_EXCL)) == NULL)
12017 			continue;
12018 		/*
12019 		 * If this page does not need to be committed or is
12020 		 * modified, then just skip it.
12021 		 */
12022 		if (pp->p_fsdata == C_NOCOMMIT || hat_ismod(pp)) {
12023 			page_unlock(pp);
12024 			continue;
12025 		}
12026 
12027 		ASSERT(PP_ISFREE(pp) == 0);
12028 		/*
12029 		 * The page needs to be committed and we locked it.
12030 		 * Update the base and length parameters and add it
12031 		 * to r_pages.
12032 		 */
12033 		if (rp->r_commit.c_pages == NULL) {
12034 			rp->r_commit.c_commbase = (offset3)pp->p_offset;
12035 			rp->r_commit.c_commlen = PAGESIZE;
12036 		} else {
12037 			rp->r_commit.c_commlen = (offset3)pp->p_offset -
12038 			    rp->r_commit.c_commbase + PAGESIZE;
12039 		}
12040 		page_add(&rp->r_commit.c_pages, pp);
12041 	}
12042 }
12043 
12044 /*
12045  * Called from nfs4_close(), nfs4_fsync() and nfs4_delmap().
12046  * Flushes and commits data to the server.
12047  */
12048 static int
12049 nfs4_putpage_commit(vnode_t *vp, offset_t poff, size_t plen, cred_t *cr)
12050 {
12051 	int error;
12052 	verifier4 write_verf;
12053 	rnode4_t *rp = VTOR4(vp);
12054 
12055 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12056 
12057 	/*
12058 	 * Flush the data portion of the file and then commit any
12059 	 * portions which need to be committed.  This may need to
12060 	 * be done twice if the server has changed state since
12061 	 * data was last written.  The data will need to be
12062 	 * rewritten to the server and then a new commit done.
12063 	 *
12064 	 * In fact, this may need to be done several times if the
12065 	 * server is having problems and crashing while we are
12066 	 * attempting to do this.
12067 	 */
12068 
12069 top:
12070 	/*
12071 	 * Do a flush based on the poff and plen arguments.  This
12072 	 * will synchronously write out any modified pages in the
12073 	 * range specified by (poff, plen). This starts all of the
12074 	 * i/o operations which will be waited for in the next
12075 	 * call to nfs4_putpage
12076 	 */
12077 
12078 	mutex_enter(&rp->r_statelock);
12079 	write_verf = rp->r_writeverf;
12080 	mutex_exit(&rp->r_statelock);
12081 
12082 	error = nfs4_putpage(vp, poff, plen, B_ASYNC, cr, NULL);
12083 	if (error == EAGAIN)
12084 		error = 0;
12085 
12086 	/*
12087 	 * Do a flush based on the poff and plen arguments.  This
12088 	 * will synchronously write out any modified pages in the
12089 	 * range specified by (poff, plen) and wait until all of
12090 	 * the asynchronous i/o's in that range are done as well.
12091 	 */
12092 	if (!error)
12093 		error = nfs4_putpage(vp, poff, plen, 0, cr, NULL);
12094 
12095 	if (error)
12096 		return (error);
12097 
12098 	mutex_enter(&rp->r_statelock);
12099 	if (rp->r_writeverf != write_verf) {
12100 		mutex_exit(&rp->r_statelock);
12101 		goto top;
12102 	}
12103 	mutex_exit(&rp->r_statelock);
12104 
12105 	/*
12106 	 * Now commit any pages which might need to be committed.
12107 	 * If the error, NFS_VERF_MISMATCH, is returned, then
12108 	 * start over with the flush operation.
12109 	 */
12110 	error = nfs4_commit_vp(vp, poff, plen, cr, NFS4_WRITE_WAIT);
12111 
12112 	if (error == NFS_VERF_MISMATCH)
12113 		goto top;
12114 
12115 	return (error);
12116 }
12117 
12118 /*
12119  * nfs4_commit_vp()  will wait for other pending commits and
12120  * will either commit the whole file or a range, plen dictates
12121  * if we commit whole file. a value of zero indicates the whole
12122  * file. Called from nfs4_putpage_commit() or nfs4_sync_putapage()
12123  */
12124 static int
12125 nfs4_commit_vp(vnode_t *vp, u_offset_t poff, size_t plen,
12126     cred_t *cr, int wait_on_writes)
12127 {
12128 	rnode4_t *rp;
12129 	page_t *plist;
12130 	offset3 offset;
12131 	count3 len;
12132 
12133 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12134 
12135 	rp = VTOR4(vp);
12136 
12137 	/*
12138 	 *  before we gather commitable pages make
12139 	 *  sure there are no outstanding async writes
12140 	 */
12141 	if (rp->r_count && wait_on_writes == NFS4_WRITE_WAIT) {
12142 		mutex_enter(&rp->r_statelock);
12143 		while (rp->r_count > 0) {
12144 			cv_wait(&rp->r_cv, &rp->r_statelock);
12145 		}
12146 		mutex_exit(&rp->r_statelock);
12147 	}
12148 
12149 	/*
12150 	 * Set the `commit inprogress' state bit.  We must
12151 	 * first wait until any current one finishes.
12152 	 */
12153 	mutex_enter(&rp->r_statelock);
12154 	while (rp->r_flags & R4COMMIT) {
12155 		rp->r_flags |= R4COMMITWAIT;
12156 		cv_wait(&rp->r_commit.c_cv, &rp->r_statelock);
12157 		rp->r_flags &= ~R4COMMITWAIT;
12158 	}
12159 	rp->r_flags |= R4COMMIT;
12160 	mutex_exit(&rp->r_statelock);
12161 
12162 	/*
12163 	 * Gather all of the pages which need to be
12164 	 * committed.
12165 	 */
12166 	if (plen == 0)
12167 		nfs4_get_commit(vp);
12168 	else
12169 		nfs4_get_commit_range(vp, poff, plen);
12170 
12171 	/*
12172 	 * Clear the `commit inprogress' bit and disconnect the
12173 	 * page list which was gathered by nfs4_get_commit.
12174 	 */
12175 	plist = rp->r_commit.c_pages;
12176 	rp->r_commit.c_pages = NULL;
12177 	offset = rp->r_commit.c_commbase;
12178 	len = rp->r_commit.c_commlen;
12179 	mutex_enter(&rp->r_statelock);
12180 	rp->r_flags &= ~R4COMMIT;
12181 	cv_broadcast(&rp->r_commit.c_cv);
12182 	mutex_exit(&rp->r_statelock);
12183 
12184 	/*
12185 	 * If any pages need to be committed, commit them and
12186 	 * then unlock them so that they can be freed some
12187 	 * time later.
12188 	 */
12189 	if (plist == NULL)
12190 		return (0);
12191 
12192 	/*
12193 	 * No error occurred during the flush portion
12194 	 * of this operation, so now attempt to commit
12195 	 * the data to stable storage on the server.
12196 	 *
12197 	 * This will unlock all of the pages on the list.
12198 	 */
12199 	return (nfs4_sync_commit(vp, plist, offset, len, cr));
12200 }
12201 
12202 static int
12203 nfs4_sync_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
12204     cred_t *cr)
12205 {
12206 	int error;
12207 	page_t *pp;
12208 
12209 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12210 
12211 	error = nfs4_commit(vp, (offset4)offset, (count3)count, cr);
12212 
12213 	/*
12214 	 * If we got an error, then just unlock all of the pages
12215 	 * on the list.
12216 	 */
12217 	if (error) {
12218 		while (plist != NULL) {
12219 			pp = plist;
12220 			page_sub(&plist, pp);
12221 			page_unlock(pp);
12222 		}
12223 		return (error);
12224 	}
12225 	/*
12226 	 * We've tried as hard as we can to commit the data to stable
12227 	 * storage on the server.  We just unlock the pages and clear
12228 	 * the commit required state.  They will get freed later.
12229 	 */
12230 	while (plist != NULL) {
12231 		pp = plist;
12232 		page_sub(&plist, pp);
12233 		pp->p_fsdata = C_NOCOMMIT;
12234 		page_unlock(pp);
12235 	}
12236 
12237 	return (error);
12238 }
12239 
12240 static void
12241 do_nfs4_async_commit(vnode_t *vp, page_t *plist, offset3 offset, count3 count,
12242     cred_t *cr)
12243 {
12244 
12245 	(void) nfs4_sync_commit(vp, plist, offset, count, cr);
12246 }
12247 
12248 /*ARGSUSED*/
12249 static int
12250 nfs4_setsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
12251 	caller_context_t *ct)
12252 {
12253 	int		error = 0;
12254 	mntinfo4_t	*mi;
12255 	vattr_t		va;
12256 	vsecattr_t	nfsace4_vsap;
12257 
12258 	mi = VTOMI4(vp);
12259 	if (nfs_zone() != mi->mi_zone)
12260 		return (EIO);
12261 	if (mi->mi_flags & MI4_ACL) {
12262 		/* if we have a delegation, return it */
12263 		if (VTOR4(vp)->r_deleg_type != OPEN_DELEGATE_NONE)
12264 			(void) nfs4delegreturn(VTOR4(vp),
12265 			    NFS4_DR_REOPEN|NFS4_DR_PUSH);
12266 
12267 		error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask,
12268 		    NFS4_ACL_SET);
12269 		if (error) /* EINVAL */
12270 			return (error);
12271 
12272 		if (vsecattr->vsa_mask & (VSA_ACL | VSA_DFACL)) {
12273 			/*
12274 			 * These are aclent_t type entries.
12275 			 */
12276 			error = vs_aent_to_ace4(vsecattr, &nfsace4_vsap,
12277 			    vp->v_type == VDIR, FALSE);
12278 			if (error)
12279 				return (error);
12280 		} else {
12281 			/*
12282 			 * These are ace_t type entries.
12283 			 */
12284 			error = vs_acet_to_ace4(vsecattr, &nfsace4_vsap,
12285 			    FALSE);
12286 			if (error)
12287 				return (error);
12288 		}
12289 		bzero(&va, sizeof (va));
12290 		error = nfs4setattr(vp, &va, flag, cr, &nfsace4_vsap);
12291 		vs_ace4_destroy(&nfsace4_vsap);
12292 		return (error);
12293 	}
12294 	return (ENOSYS);
12295 }
12296 
12297 /* ARGSUSED */
12298 int
12299 nfs4_getsecattr(vnode_t *vp, vsecattr_t *vsecattr, int flag, cred_t *cr,
12300 	caller_context_t *ct)
12301 {
12302 	int		error;
12303 	mntinfo4_t	*mi;
12304 	nfs4_ga_res_t	gar;
12305 	rnode4_t	*rp = VTOR4(vp);
12306 
12307 	mi = VTOMI4(vp);
12308 	if (nfs_zone() != mi->mi_zone)
12309 		return (EIO);
12310 
12311 	bzero(&gar, sizeof (gar));
12312 	gar.n4g_vsa.vsa_mask = vsecattr->vsa_mask;
12313 
12314 	/*
12315 	 * vsecattr->vsa_mask holds the original acl request mask.
12316 	 * This is needed when determining what to return.
12317 	 * (See: nfs4_create_getsecattr_return())
12318 	 */
12319 	error = nfs4_is_acl_mask_valid(vsecattr->vsa_mask, NFS4_ACL_GET);
12320 	if (error) /* EINVAL */
12321 		return (error);
12322 
12323 	/*
12324 	 * If this is a referral stub, don't try to go OTW for an ACL
12325 	 */
12326 	if (RP_ISSTUB_REFERRAL(VTOR4(vp)))
12327 		return (fs_fab_acl(vp, vsecattr, flag, cr, ct));
12328 
12329 	if (mi->mi_flags & MI4_ACL) {
12330 		/*
12331 		 * Check if the data is cached and the cache is valid.  If it
12332 		 * is we don't go over the wire.
12333 		 */
12334 		if (rp->r_secattr != NULL && ATTRCACHE4_VALID(vp)) {
12335 			mutex_enter(&rp->r_statelock);
12336 			if (rp->r_secattr != NULL) {
12337 				error = nfs4_create_getsecattr_return(
12338 				    rp->r_secattr, vsecattr, rp->r_attr.va_uid,
12339 				    rp->r_attr.va_gid,
12340 				    vp->v_type == VDIR);
12341 				if (!error) { /* error == 0 - Success! */
12342 					mutex_exit(&rp->r_statelock);
12343 					return (error);
12344 				}
12345 			}
12346 			mutex_exit(&rp->r_statelock);
12347 		}
12348 
12349 		/*
12350 		 * The getattr otw call will always get both the acl, in
12351 		 * the form of a list of nfsace4's, and the number of acl
12352 		 * entries; independent of the value of gar.n4g_vsa.vsa_mask.
12353 		 */
12354 		gar.n4g_va.va_mask = AT_ALL;
12355 		error =  nfs4_getattr_otw(vp, &gar, cr, 1);
12356 		if (error) {
12357 			vs_ace4_destroy(&gar.n4g_vsa);
12358 			if (error == ENOTSUP || error == EOPNOTSUPP)
12359 				error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
12360 			return (error);
12361 		}
12362 
12363 		if (!(gar.n4g_resbmap & FATTR4_ACL_MASK)) {
12364 			/*
12365 			 * No error was returned, but according to the response
12366 			 * bitmap, neither was an acl.
12367 			 */
12368 			vs_ace4_destroy(&gar.n4g_vsa);
12369 			error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
12370 			return (error);
12371 		}
12372 
12373 		/*
12374 		 * Update the cache with the ACL.
12375 		 */
12376 		nfs4_acl_fill_cache(rp, &gar.n4g_vsa);
12377 
12378 		error = nfs4_create_getsecattr_return(&gar.n4g_vsa,
12379 		    vsecattr, gar.n4g_va.va_uid, gar.n4g_va.va_gid,
12380 		    vp->v_type == VDIR);
12381 		vs_ace4_destroy(&gar.n4g_vsa);
12382 		if ((error) && (vsecattr->vsa_mask &
12383 		    (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) &&
12384 		    (error != EACCES)) {
12385 			error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
12386 		}
12387 		return (error);
12388 	}
12389 	error = fs_fab_acl(vp, vsecattr, flag, cr, ct);
12390 	return (error);
12391 }
12392 
12393 /*
12394  * The function returns:
12395  * 	- 0 (zero) if the passed in "acl_mask" is a valid request.
12396  * 	- EINVAL if the passed in "acl_mask" is an invalid request.
12397  *
12398  * In the case of getting an acl (op == NFS4_ACL_GET) the mask is invalid if:
12399  * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12400  *
12401  * In the case of setting an acl (op == NFS4_ACL_SET) the mask is invalid if:
12402  * - We have a mixture of ACE and ACL requests (e.g. VSA_ACL | VSA_ACE)
12403  * - We have a count field set without the corresponding acl field set. (e.g. -
12404  * VSA_ACECNT is set, but VSA_ACE is not)
12405  */
12406 static int
12407 nfs4_is_acl_mask_valid(uint_t acl_mask, nfs4_acl_op_t op)
12408 {
12409 	/* Shortcut the masks that are always valid. */
12410 	if (acl_mask == (VSA_ACE | VSA_ACECNT))
12411 		return (0);
12412 	if (acl_mask == (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT))
12413 		return (0);
12414 
12415 	if (acl_mask & (VSA_ACE | VSA_ACECNT)) {
12416 		/*
12417 		 * We can't have any VSA_ACL type stuff in the mask now.
12418 		 */
12419 		if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12420 		    VSA_DFACLCNT))
12421 			return (EINVAL);
12422 
12423 		if (op == NFS4_ACL_SET) {
12424 			if ((acl_mask & VSA_ACECNT) && !(acl_mask & VSA_ACE))
12425 				return (EINVAL);
12426 		}
12427 	}
12428 
12429 	if (acl_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL | VSA_DFACLCNT)) {
12430 		/*
12431 		 * We can't have any VSA_ACE type stuff in the mask now.
12432 		 */
12433 		if (acl_mask & (VSA_ACE | VSA_ACECNT))
12434 			return (EINVAL);
12435 
12436 		if (op == NFS4_ACL_SET) {
12437 			if ((acl_mask & VSA_ACLCNT) && !(acl_mask & VSA_ACL))
12438 				return (EINVAL);
12439 
12440 			if ((acl_mask & VSA_DFACLCNT) &&
12441 			    !(acl_mask & VSA_DFACL))
12442 				return (EINVAL);
12443 		}
12444 	}
12445 	return (0);
12446 }
12447 
12448 /*
12449  * The theory behind creating the correct getsecattr return is simply this:
12450  * "Don't return anything that the caller is not expecting to have to free."
12451  */
12452 static int
12453 nfs4_create_getsecattr_return(vsecattr_t *filled_vsap, vsecattr_t *vsap,
12454     uid_t uid, gid_t gid, int isdir)
12455 {
12456 	int error = 0;
12457 	/* Save the mask since the translators modify it. */
12458 	uint_t	orig_mask = vsap->vsa_mask;
12459 
12460 	if (orig_mask & (VSA_ACE | VSA_ACECNT)) {
12461 		error = vs_ace4_to_acet(filled_vsap, vsap, uid, gid, FALSE);
12462 
12463 		if (error)
12464 			return (error);
12465 
12466 		/*
12467 		 * If the caller only asked for the ace count (VSA_ACECNT)
12468 		 * don't give them the full acl (VSA_ACE), free it.
12469 		 */
12470 		if (!orig_mask & VSA_ACE) {
12471 			if (vsap->vsa_aclentp != NULL) {
12472 				kmem_free(vsap->vsa_aclentp,
12473 				    vsap->vsa_aclcnt * sizeof (ace_t));
12474 				vsap->vsa_aclentp = NULL;
12475 			}
12476 		}
12477 		vsap->vsa_mask = orig_mask;
12478 
12479 	} else if (orig_mask & (VSA_ACL | VSA_ACLCNT | VSA_DFACL |
12480 	    VSA_DFACLCNT)) {
12481 		error = vs_ace4_to_aent(filled_vsap, vsap, uid, gid,
12482 		    isdir, FALSE);
12483 
12484 		if (error)
12485 			return (error);
12486 
12487 		/*
12488 		 * If the caller only asked for the acl count (VSA_ACLCNT)
12489 		 * and/or the default acl count (VSA_DFACLCNT) don't give them
12490 		 * the acl (VSA_ACL) or default acl (VSA_DFACL), free it.
12491 		 */
12492 		if (!orig_mask & VSA_ACL) {
12493 			if (vsap->vsa_aclentp != NULL) {
12494 				kmem_free(vsap->vsa_aclentp,
12495 				    vsap->vsa_aclcnt * sizeof (aclent_t));
12496 				vsap->vsa_aclentp = NULL;
12497 			}
12498 		}
12499 
12500 		if (!orig_mask & VSA_DFACL) {
12501 			if (vsap->vsa_dfaclentp != NULL) {
12502 				kmem_free(vsap->vsa_dfaclentp,
12503 				    vsap->vsa_dfaclcnt * sizeof (aclent_t));
12504 				vsap->vsa_dfaclentp = NULL;
12505 			}
12506 		}
12507 		vsap->vsa_mask = orig_mask;
12508 	}
12509 	return (0);
12510 }
12511 
12512 /* ARGSUSED */
12513 int
12514 nfs4_shrlock(vnode_t *vp, int cmd, struct shrlock *shr, int flag, cred_t *cr,
12515     caller_context_t *ct)
12516 {
12517 	int error;
12518 
12519 	if (nfs_zone() != VTOMI4(vp)->mi_zone)
12520 		return (EIO);
12521 	/*
12522 	 * check for valid cmd parameter
12523 	 */
12524 	if (cmd != F_SHARE && cmd != F_UNSHARE && cmd != F_HASREMOTELOCKS)
12525 		return (EINVAL);
12526 
12527 	/*
12528 	 * Check access permissions
12529 	 */
12530 	if ((cmd & F_SHARE) &&
12531 	    (((shr->s_access & F_RDACC) && (flag & FREAD) == 0) ||
12532 	    (shr->s_access == F_WRACC && (flag & FWRITE) == 0)))
12533 		return (EBADF);
12534 
12535 	/*
12536 	 * If the filesystem is mounted using local locking, pass the
12537 	 * request off to the local share code.
12538 	 */
12539 	if (VTOMI4(vp)->mi_flags & MI4_LLOCK)
12540 		return (fs_shrlock(vp, cmd, shr, flag, cr, ct));
12541 
12542 	switch (cmd) {
12543 	case F_SHARE:
12544 	case F_UNSHARE:
12545 		/*
12546 		 * This will be properly implemented later,
12547 		 * see RFE: 4823948 .
12548 		 */
12549 		error = EAGAIN;
12550 		break;
12551 
12552 	case F_HASREMOTELOCKS:
12553 		/*
12554 		 * NFS client can't store remote locks itself
12555 		 */
12556 		shr->s_access = 0;
12557 		error = 0;
12558 		break;
12559 
12560 	default:
12561 		error = EINVAL;
12562 		break;
12563 	}
12564 
12565 	return (error);
12566 }
12567 
12568 /*
12569  * Common code called by directory ops to update the attrcache
12570  */
12571 static int
12572 nfs4_update_attrcache(nfsstat4 status, nfs4_ga_res_t *garp,
12573     hrtime_t t, vnode_t *vp, cred_t *cr)
12574 {
12575 	int error = 0;
12576 
12577 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12578 
12579 	if (status != NFS4_OK) {
12580 		/* getattr not done or failed */
12581 		PURGE_ATTRCACHE4(vp);
12582 		return (error);
12583 	}
12584 
12585 	if (garp) {
12586 		nfs4_attr_cache(vp, garp, t, cr, FALSE, NULL);
12587 	} else {
12588 		PURGE_ATTRCACHE4(vp);
12589 	}
12590 	return (error);
12591 }
12592 
12593 /*
12594  * Update directory caches for directory modification ops (link, rename, etc.)
12595  * When dinfo is NULL, manage dircaches in the old way.
12596  */
12597 static void
12598 nfs4_update_dircaches(change_info4 *cinfo, vnode_t *dvp, vnode_t *vp, char *nm,
12599     dirattr_info_t *dinfo)
12600 {
12601 	rnode4_t	*drp = VTOR4(dvp);
12602 
12603 	ASSERT(nfs_zone() == VTOMI4(dvp)->mi_zone);
12604 
12605 	/* Purge rddir cache for dir since it changed */
12606 	if (drp->r_dir != NULL)
12607 		nfs4_purge_rddir_cache(dvp);
12608 
12609 	/*
12610 	 * If caller provided dinfo, then use it to manage dir caches.
12611 	 */
12612 	if (dinfo != NULL) {
12613 		if (vp != NULL) {
12614 			mutex_enter(&VTOR4(vp)->r_statev4_lock);
12615 			if (!VTOR4(vp)->created_v4) {
12616 				mutex_exit(&VTOR4(vp)->r_statev4_lock);
12617 				dnlc_update(dvp, nm, vp);
12618 			} else {
12619 				/*
12620 				 * XXX don't update if the created_v4 flag is
12621 				 * set
12622 				 */
12623 				mutex_exit(&VTOR4(vp)->r_statev4_lock);
12624 				NFS4_DEBUG(nfs4_client_state_debug,
12625 				    (CE_NOTE, "nfs4_update_dircaches: "
12626 				    "don't update dnlc: created_v4 flag"));
12627 			}
12628 		}
12629 
12630 		nfs4_attr_cache(dvp, dinfo->di_garp, dinfo->di_time_call,
12631 		    dinfo->di_cred, FALSE, cinfo);
12632 
12633 		return;
12634 	}
12635 
12636 	/*
12637 	 * Caller didn't provide dinfo, then check change_info4 to update DNLC.
12638 	 * Since caller modified dir but didn't receive post-dirmod-op dir
12639 	 * attrs, the dir's attrs must be purged.
12640 	 *
12641 	 * XXX this check and dnlc update/purge should really be atomic,
12642 	 * XXX but can't use rnode statelock because it'll deadlock in
12643 	 * XXX dnlc_purge_vp, however, the risk is minimal even if a race
12644 	 * XXX does occur.
12645 	 *
12646 	 * XXX We also may want to check that atomic is true in the
12647 	 * XXX change_info struct. If it is not, the change_info may
12648 	 * XXX reflect changes by more than one clients which means that
12649 	 * XXX our cache may not be valid.
12650 	 */
12651 	PURGE_ATTRCACHE4(dvp);
12652 	if (drp->r_change == cinfo->before) {
12653 		/* no changes took place in the directory prior to our link */
12654 		if (vp != NULL) {
12655 			mutex_enter(&VTOR4(vp)->r_statev4_lock);
12656 			if (!VTOR4(vp)->created_v4) {
12657 				mutex_exit(&VTOR4(vp)->r_statev4_lock);
12658 				dnlc_update(dvp, nm, vp);
12659 			} else {
12660 				/*
12661 				 * XXX dont' update if the created_v4 flag
12662 				 * is set
12663 				 */
12664 				mutex_exit(&VTOR4(vp)->r_statev4_lock);
12665 				NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
12666 				    "nfs4_update_dircaches: don't"
12667 				    " update dnlc: created_v4 flag"));
12668 			}
12669 		}
12670 	} else {
12671 		/* Another client modified directory - purge its dnlc cache */
12672 		dnlc_purge_vp(dvp);
12673 	}
12674 }
12675 
12676 /*
12677  * The OPEN_CONFIRM operation confirms the sequence number used in OPENing a
12678  * file.
12679  *
12680  * The 'reopening_file' boolean should be set to TRUE if we are reopening this
12681  * file (ie: client recovery) and otherwise set to FALSE.
12682  *
12683  * 'nfs4_start/end_op' should have been called by the proper (ie: not recovery
12684  * initiated) calling functions.
12685  *
12686  * 'resend' is set to TRUE if this is a OPEN_CONFIRM issued as a result
12687  * of resending a 'lost' open request.
12688  *
12689  * 'num_bseqid_retryp' makes sure we don't loop forever on a broken
12690  * server that hands out BAD_SEQID on open confirm.
12691  *
12692  * Errors are returned via the nfs4_error_t parameter.
12693  */
12694 void
12695 nfs4open_confirm(vnode_t *vp, seqid4 *seqid, stateid4 *stateid, cred_t *cr,
12696     bool_t reopening_file, bool_t *retry_open, nfs4_open_owner_t *oop,
12697     bool_t resend, nfs4_error_t *ep, int *num_bseqid_retryp)
12698 {
12699 	COMPOUND4args_clnt args;
12700 	COMPOUND4res_clnt res;
12701 	nfs_argop4 argop[2];
12702 	nfs_resop4 *resop;
12703 	int doqueue = 1;
12704 	mntinfo4_t *mi;
12705 	OPEN_CONFIRM4args *open_confirm_args;
12706 	int needrecov;
12707 
12708 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12709 #if DEBUG
12710 	mutex_enter(&oop->oo_lock);
12711 	ASSERT(oop->oo_seqid_inuse);
12712 	mutex_exit(&oop->oo_lock);
12713 #endif
12714 
12715 recov_retry_confirm:
12716 	nfs4_error_zinit(ep);
12717 	*retry_open = FALSE;
12718 
12719 	if (resend)
12720 		args.ctag = TAG_OPEN_CONFIRM_LOST;
12721 	else
12722 		args.ctag = TAG_OPEN_CONFIRM;
12723 
12724 	args.array_len = 2;
12725 	args.array = argop;
12726 
12727 	/* putfh target fh */
12728 	argop[0].argop = OP_CPUTFH;
12729 	argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(vp)->r_fh;
12730 
12731 	argop[1].argop = OP_OPEN_CONFIRM;
12732 	open_confirm_args = &argop[1].nfs_argop4_u.opopen_confirm;
12733 
12734 	(*seqid) += 1;
12735 	open_confirm_args->seqid = *seqid;
12736 	open_confirm_args->open_stateid = *stateid;
12737 
12738 	mi = VTOMI4(vp);
12739 
12740 	rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
12741 
12742 	if (!ep->error && nfs4_need_to_bump_seqid(&res)) {
12743 		nfs4_set_open_seqid((*seqid), oop, args.ctag);
12744 	}
12745 
12746 	needrecov = nfs4_needs_recovery(ep, FALSE, mi->mi_vfsp);
12747 	if (!needrecov && ep->error)
12748 		return;
12749 
12750 	if (needrecov) {
12751 		bool_t abort = FALSE;
12752 
12753 		if (reopening_file == FALSE) {
12754 			nfs4_bseqid_entry_t *bsep = NULL;
12755 
12756 			if (!ep->error && res.status == NFS4ERR_BAD_SEQID)
12757 				bsep = nfs4_create_bseqid_entry(oop, NULL,
12758 				    vp, 0, args.ctag,
12759 				    open_confirm_args->seqid);
12760 
12761 			abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL,
12762 			    NULL, NULL, OP_OPEN_CONFIRM, bsep, NULL, NULL);
12763 			if (bsep) {
12764 				kmem_free(bsep, sizeof (*bsep));
12765 				if (num_bseqid_retryp &&
12766 				    --(*num_bseqid_retryp) == 0)
12767 					abort = TRUE;
12768 			}
12769 		}
12770 		if ((ep->error == ETIMEDOUT ||
12771 		    res.status == NFS4ERR_RESOURCE) &&
12772 		    abort == FALSE && resend == FALSE) {
12773 			if (!ep->error)
12774 				(void) xdr_free(xdr_COMPOUND4res_clnt,
12775 				    (caddr_t)&res);
12776 
12777 			delay(SEC_TO_TICK(confirm_retry_sec));
12778 			goto recov_retry_confirm;
12779 		}
12780 		/* State may have changed so retry the entire OPEN op */
12781 		if (abort == FALSE)
12782 			*retry_open = TRUE;
12783 		else
12784 			*retry_open = FALSE;
12785 		if (!ep->error)
12786 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
12787 		return;
12788 	}
12789 
12790 	if (res.status) {
12791 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
12792 		return;
12793 	}
12794 
12795 	resop = &res.array[1];  /* open confirm res */
12796 	bcopy(&resop->nfs_resop4_u.opopen_confirm.open_stateid,
12797 	    stateid, sizeof (*stateid));
12798 
12799 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
12800 }
12801 
12802 /*
12803  * Return the credentials associated with a client state object.  The
12804  * caller is responsible for freeing the credentials.
12805  */
12806 
12807 static cred_t *
12808 state_to_cred(nfs4_open_stream_t *osp)
12809 {
12810 	cred_t *cr;
12811 
12812 	/*
12813 	 * It's ok to not lock the open stream and open owner to get
12814 	 * the oo_cred since this is only written once (upon creation)
12815 	 * and will not change.
12816 	 */
12817 	cr = osp->os_open_owner->oo_cred;
12818 	crhold(cr);
12819 
12820 	return (cr);
12821 }
12822 
12823 /*
12824  * nfs4_find_sysid
12825  *
12826  * Find the sysid for the knetconfig associated with the given mi.
12827  */
12828 static struct lm_sysid *
12829 nfs4_find_sysid(mntinfo4_t *mi)
12830 {
12831 	ASSERT(nfs_zone() == mi->mi_zone);
12832 
12833 	/*
12834 	 * Switch from RDMA knconf to original mount knconf
12835 	 */
12836 	return (lm_get_sysid(ORIG_KNCONF(mi), &mi->mi_curr_serv->sv_addr,
12837 	    mi->mi_curr_serv->sv_hostname, NULL));
12838 }
12839 
#ifdef DEBUG
/*
 * Translate a lock call type into a printable name (DEBUG only).
 * An unknown type is reported through cmn_err(CE_PANIC, ...).
 */
static char *
nfs4frlock_get_call_type(nfs4_lock_call_type_t ctype)
{
	char *name;

	switch (ctype) {
	case NFS4_LCK_CTYPE_NORM:
		name = "NORMAL";
		break;
	case NFS4_LCK_CTYPE_RECLAIM:
		name = "RECLAIM";
		break;
	case NFS4_LCK_CTYPE_RESEND:
		name = "RESEND";
		break;
	case NFS4_LCK_CTYPE_REINSTATE:
		name = "REINSTATE";
		break;
	default:
		cmn_err(CE_PANIC, "nfs4frlock_get_call_type: got illegal "
		    "type %d", ctype);
		name = "";
		break;
	}

	return (name);
}
#endif
12863 
12864 /*
12865  * Map the frlock cmd and lock type to the NFSv4 over-the-wire lock type
12866  * Unlock requests don't have an over-the-wire locktype, so we just return
12867  * something non-threatening.
12868  */
12869 
12870 static nfs_lock_type4
12871 flk_to_locktype(int cmd, int l_type)
12872 {
12873 	ASSERT(l_type == F_RDLCK || l_type == F_WRLCK || l_type == F_UNLCK);
12874 
12875 	switch (l_type) {
12876 	case F_UNLCK:
12877 		return (READ_LT);
12878 	case F_RDLCK:
12879 		if (cmd == F_SETLK)
12880 			return (READ_LT);
12881 		else
12882 			return (READW_LT);
12883 	case F_WRLCK:
12884 		if (cmd == F_SETLK)
12885 			return (WRITE_LT);
12886 		else
12887 			return (WRITEW_LT);
12888 	}
12889 	panic("flk_to_locktype");
12890 	/*NOTREACHED*/
12891 }
12892 
12893 /*
12894  * Do some preliminary checks for nfs4frlock.
12895  */
12896 static int
12897 nfs4frlock_validate_args(int cmd, flock64_t *flk, int flag, vnode_t *vp,
12898     u_offset_t offset)
12899 {
12900 	int error = 0;
12901 
12902 	/*
12903 	 * If we are setting a lock, check that the file is opened
12904 	 * with the correct mode.
12905 	 */
12906 	if (cmd == F_SETLK || cmd == F_SETLKW) {
12907 		if ((flk->l_type == F_RDLCK && (flag & FREAD) == 0) ||
12908 		    (flk->l_type == F_WRLCK && (flag & FWRITE) == 0)) {
12909 			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12910 			    "nfs4frlock_validate_args: file was opened with "
12911 			    "incorrect mode"));
12912 			return (EBADF);
12913 		}
12914 	}
12915 
12916 	/* Convert the offset. It may need to be restored before returning. */
12917 	if (error = convoff(vp, flk, 0, offset)) {
12918 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12919 		    "nfs4frlock_validate_args: convoff  =>  error= %d\n",
12920 		    error));
12921 		return (error);
12922 	}
12923 
12924 	return (error);
12925 }
12926 
12927 /*
12928  * Set the flock64's lm_sysid for nfs4frlock.
12929  */
12930 static int
12931 nfs4frlock_get_sysid(struct lm_sysid **lspp, vnode_t *vp, flock64_t *flk)
12932 {
12933 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
12934 
12935 	/* Find the lm_sysid */
12936 	*lspp = nfs4_find_sysid(VTOMI4(vp));
12937 
12938 	if (*lspp == NULL) {
12939 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
12940 		    "nfs4frlock_get_sysid: no sysid, return ENOLCK"));
12941 		return (ENOLCK);
12942 	}
12943 
12944 	flk->l_sysid = lm_sysidt(*lspp);
12945 
12946 	return (0);
12947 }
12948 
12949 /*
12950  * Do the remaining preliminary setup for nfs4frlock.
12951  */
12952 static void
12953 nfs4frlock_pre_setup(clock_t *tick_delayp, nfs4_recov_state_t *recov_statep,
12954     flock64_t *flk, short *whencep, vnode_t *vp, cred_t *search_cr,
12955     cred_t **cred_otw)
12956 {
12957 	/*
12958 	 * set tick_delay to the base delay time.
12959 	 * (NFS4_BASE_WAIT_TIME is in secs)
12960 	 */
12961 
12962 	*tick_delayp = drv_usectohz(NFS4_BASE_WAIT_TIME * 1000 * 1000);
12963 
12964 	/*
12965 	 * If lock is relative to EOF, we need the newest length of the
12966 	 * file. Therefore invalidate the ATTR_CACHE.
12967 	 */
12968 
12969 	*whencep = flk->l_whence;
12970 
12971 	if (*whencep == 2)		/* SEEK_END */
12972 		PURGE_ATTRCACHE4(vp);
12973 
12974 	recov_statep->rs_flags = 0;
12975 	recov_statep->rs_num_retry_despite_err = 0;
12976 	*cred_otw = nfs4_get_otw_cred(search_cr, VTOMI4(vp), NULL);
12977 }
12978 
12979 /*
12980  * Initialize and allocate the data structures necessary for
12981  * the nfs4frlock call.
12982  * Allocates argsp's op array, frees up the saved_rqstpp if there is one.
12983  */
12984 static void
12985 nfs4frlock_call_init(COMPOUND4args_clnt *argsp, COMPOUND4args_clnt **argspp,
12986     nfs_argop4 **argopp, nfs4_op_hint_t *op_hintp, flock64_t *flk, int cmd,
12987     bool_t *retry, bool_t *did_start_fop, COMPOUND4res_clnt **respp,
12988     bool_t *skip_get_err, nfs4_lost_rqst_t *lost_rqstp)
12989 {
12990 	int		argoplist_size;
12991 	int		num_ops = 2;
12992 
12993 	*retry = FALSE;
12994 	*did_start_fop = FALSE;
12995 	*skip_get_err = FALSE;
12996 	lost_rqstp->lr_op = 0;
12997 	argoplist_size  = num_ops * sizeof (nfs_argop4);
12998 	/* fill array with zero */
12999 	*argopp = kmem_zalloc(argoplist_size, KM_SLEEP);
13000 
13001 	*argspp = argsp;
13002 	*respp = NULL;
13003 
13004 	argsp->array_len = num_ops;
13005 	argsp->array = *argopp;
13006 
13007 	/* initialize in case of error; will get real value down below */
13008 	argsp->ctag = TAG_NONE;
13009 
13010 	if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK)
13011 		*op_hintp = OH_LOCKU;
13012 	else
13013 		*op_hintp = OH_OTHER;
13014 }
13015 
13016 /*
13017  * Call the nfs4_start_fop() for nfs4frlock, if necessary.  Assign
13018  * the proper nfs4_server_t for this instance of nfs4frlock.
13019  * Returns 0 (success) or an errno value.
13020  */
13021 static int
13022 nfs4frlock_start_call(nfs4_lock_call_type_t ctype, vnode_t *vp,
13023     nfs4_op_hint_t op_hint, nfs4_recov_state_t *recov_statep,
13024     bool_t *did_start_fop, bool_t *startrecovp)
13025 {
13026 	int error = 0;
13027 	rnode4_t *rp;
13028 
13029 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13030 
13031 	if (ctype == NFS4_LCK_CTYPE_NORM) {
13032 		error = nfs4_start_fop(VTOMI4(vp), vp, NULL, op_hint,
13033 		    recov_statep, startrecovp);
13034 		if (error)
13035 			return (error);
13036 		*did_start_fop = TRUE;
13037 	} else {
13038 		*did_start_fop = FALSE;
13039 		*startrecovp = FALSE;
13040 	}
13041 
13042 	if (!error) {
13043 		rp = VTOR4(vp);
13044 
13045 		/* If the file failed recovery, just quit. */
13046 		mutex_enter(&rp->r_statelock);
13047 		if (rp->r_flags & R4RECOVERR) {
13048 			error = EIO;
13049 		}
13050 		mutex_exit(&rp->r_statelock);
13051 	}
13052 
13053 	return (error);
13054 }
13055 
13056 /*
13057  * Setup the LOCK4/LOCKU4 arguments for resending a lost lock request.  A
13058  * resend nfs4frlock call is initiated by the recovery framework.
13059  * Acquires the lop and oop seqid synchronization.
13060  */
13061 static void
13062 nfs4frlock_setup_resend_lock_args(nfs4_lost_rqst_t *resend_rqstp,
13063     COMPOUND4args_clnt *argsp, nfs_argop4 *argop, nfs4_lock_owner_t **lopp,
13064     nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
13065     LOCK4args **lock_argsp, LOCKU4args **locku_argsp)
13066 {
13067 	mntinfo4_t *mi = VTOMI4(resend_rqstp->lr_vp);
13068 	int error;
13069 
13070 	NFS4_DEBUG((nfs4_lost_rqst_debug || nfs4_client_lock_debug),
13071 	    (CE_NOTE,
13072 	    "nfs4frlock_setup_resend_lock_args: have lost lock to resend"));
13073 	ASSERT(resend_rqstp != NULL);
13074 	ASSERT(resend_rqstp->lr_op == OP_LOCK ||
13075 	    resend_rqstp->lr_op == OP_LOCKU);
13076 
13077 	*oopp = resend_rqstp->lr_oop;
13078 	if (resend_rqstp->lr_oop) {
13079 		open_owner_hold(resend_rqstp->lr_oop);
13080 		error = nfs4_start_open_seqid_sync(resend_rqstp->lr_oop, mi);
13081 		ASSERT(error == 0);	/* recov thread always succeeds */
13082 	}
13083 
13084 	/* Must resend this lost lock/locku request. */
13085 	ASSERT(resend_rqstp->lr_lop != NULL);
13086 	*lopp = resend_rqstp->lr_lop;
13087 	lock_owner_hold(resend_rqstp->lr_lop);
13088 	error = nfs4_start_lock_seqid_sync(resend_rqstp->lr_lop, mi);
13089 	ASSERT(error == 0);	/* recov thread always succeeds */
13090 
13091 	*ospp = resend_rqstp->lr_osp;
13092 	if (*ospp)
13093 		open_stream_hold(resend_rqstp->lr_osp);
13094 
13095 	if (resend_rqstp->lr_op == OP_LOCK) {
13096 		LOCK4args *lock_args;
13097 
13098 		argop->argop = OP_LOCK;
13099 		*lock_argsp = lock_args = &argop->nfs_argop4_u.oplock;
13100 		lock_args->locktype = resend_rqstp->lr_locktype;
13101 		lock_args->reclaim =
13102 		    (resend_rqstp->lr_ctype == NFS4_LCK_CTYPE_RECLAIM);
13103 		lock_args->offset = resend_rqstp->lr_flk->l_start;
13104 		lock_args->length = resend_rqstp->lr_flk->l_len;
13105 		if (lock_args->length == 0)
13106 			lock_args->length = ~lock_args->length;
13107 		nfs4_setup_lock_args(*lopp, *oopp, *ospp,
13108 		    mi2clientid(mi), &lock_args->locker);
13109 
13110 		switch (resend_rqstp->lr_ctype) {
13111 		case NFS4_LCK_CTYPE_RESEND:
13112 			argsp->ctag = TAG_LOCK_RESEND;
13113 			break;
13114 		case NFS4_LCK_CTYPE_REINSTATE:
13115 			argsp->ctag = TAG_LOCK_REINSTATE;
13116 			break;
13117 		case NFS4_LCK_CTYPE_RECLAIM:
13118 			argsp->ctag = TAG_LOCK_RECLAIM;
13119 			break;
13120 		default:
13121 			argsp->ctag = TAG_LOCK_UNKNOWN;
13122 			break;
13123 		}
13124 	} else {
13125 		LOCKU4args *locku_args;
13126 		nfs4_lock_owner_t *lop = resend_rqstp->lr_lop;
13127 
13128 		argop->argop = OP_LOCKU;
13129 		*locku_argsp = locku_args = &argop->nfs_argop4_u.oplocku;
13130 		locku_args->locktype = READ_LT;
13131 		locku_args->seqid = lop->lock_seqid + 1;
13132 		mutex_enter(&lop->lo_lock);
13133 		locku_args->lock_stateid = lop->lock_stateid;
13134 		mutex_exit(&lop->lo_lock);
13135 		locku_args->offset = resend_rqstp->lr_flk->l_start;
13136 		locku_args->length = resend_rqstp->lr_flk->l_len;
13137 		if (locku_args->length == 0)
13138 			locku_args->length = ~locku_args->length;
13139 
13140 		switch (resend_rqstp->lr_ctype) {
13141 		case NFS4_LCK_CTYPE_RESEND:
13142 			argsp->ctag = TAG_LOCKU_RESEND;
13143 			break;
13144 		case NFS4_LCK_CTYPE_REINSTATE:
13145 			argsp->ctag = TAG_LOCKU_REINSTATE;
13146 			break;
13147 		default:
13148 			argsp->ctag = TAG_LOCK_UNKNOWN;
13149 			break;
13150 		}
13151 	}
13152 }
13153 
13154 /*
13155  * Setup the LOCKT4 arguments.
13156  */
13157 static void
13158 nfs4frlock_setup_lockt_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
13159     LOCKT4args **lockt_argsp, COMPOUND4args_clnt *argsp, flock64_t *flk,
13160     rnode4_t *rp)
13161 {
13162 	LOCKT4args *lockt_args;
13163 
13164 	ASSERT(nfs_zone() == VTOMI4(RTOV4(rp))->mi_zone);
13165 	ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13166 	argop->argop = OP_LOCKT;
13167 	argsp->ctag = TAG_LOCKT;
13168 	lockt_args = &argop->nfs_argop4_u.oplockt;
13169 
13170 	/*
13171 	 * The locktype will be READ_LT unless it's
13172 	 * a write lock. We do this because the Solaris
13173 	 * system call allows the combination of
13174 	 * F_UNLCK and F_GETLK* and so in that case the
13175 	 * unlock is mapped to a read.
13176 	 */
13177 	if (flk->l_type == F_WRLCK)
13178 		lockt_args->locktype = WRITE_LT;
13179 	else
13180 		lockt_args->locktype = READ_LT;
13181 
13182 	lockt_args->owner.clientid = mi2clientid(VTOMI4(RTOV4(rp)));
13183 	/* set the lock owner4 args */
13184 	nfs4_setlockowner_args(&lockt_args->owner, rp,
13185 	    ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
13186 	    flk->l_pid);
13187 	lockt_args->offset = flk->l_start;
13188 	lockt_args->length = flk->l_len;
13189 	if (flk->l_len == 0)
13190 		lockt_args->length = ~lockt_args->length;
13191 
13192 	*lockt_argsp = lockt_args;
13193 }
13194 
13195 /*
13196  * If the client is holding a delegation, and the open stream to be used
13197  * with this lock request is a delegation open stream, then re-open the stream.
13198  * Sets the nfs4_error_t to all zeros unless the open stream has already
13199  * failed a reopen or we couldn't find the open stream.  NFS4ERR_DELAY
13200  * means the caller should retry (like a recovery retry).
13201  */
13202 static void
13203 nfs4frlock_check_deleg(vnode_t *vp, nfs4_error_t *ep, cred_t *cr, int lt)
13204 {
13205 	open_delegation_type4	dt;
13206 	bool_t			reopen_needed, force;
13207 	nfs4_open_stream_t	*osp;
13208 	open_claim_type4 	oclaim;
13209 	rnode4_t		*rp = VTOR4(vp);
13210 	mntinfo4_t		*mi = VTOMI4(vp);
13211 
13212 	ASSERT(nfs_zone() == mi->mi_zone);
13213 
13214 	nfs4_error_zinit(ep);
13215 
13216 	mutex_enter(&rp->r_statev4_lock);
13217 	dt = rp->r_deleg_type;
13218 	mutex_exit(&rp->r_statev4_lock);
13219 
13220 	if (dt != OPEN_DELEGATE_NONE) {
13221 		nfs4_open_owner_t	*oop;
13222 
13223 		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
13224 		if (!oop) {
13225 			ep->stat = NFS4ERR_IO;
13226 			return;
13227 		}
13228 		/* returns with 'os_sync_lock' held */
13229 		osp = find_open_stream(oop, rp);
13230 		if (!osp) {
13231 			open_owner_rele(oop);
13232 			ep->stat = NFS4ERR_IO;
13233 			return;
13234 		}
13235 
13236 		if (osp->os_failed_reopen) {
13237 			NFS4_DEBUG((nfs4_open_stream_debug ||
13238 			    nfs4_client_lock_debug), (CE_NOTE,
13239 			    "nfs4frlock_check_deleg: os_failed_reopen set "
13240 			    "for osp %p, cr %p, rp %s", (void *)osp,
13241 			    (void *)cr, rnode4info(rp)));
13242 			mutex_exit(&osp->os_sync_lock);
13243 			open_stream_rele(osp, rp);
13244 			open_owner_rele(oop);
13245 			ep->stat = NFS4ERR_IO;
13246 			return;
13247 		}
13248 
13249 		/*
13250 		 * Determine whether a reopen is needed.  If this
13251 		 * is a delegation open stream, then send the open
13252 		 * to the server to give visibility to the open owner.
13253 		 * Even if it isn't a delegation open stream, we need
13254 		 * to check if the previous open CLAIM_DELEGATE_CUR
13255 		 * was sufficient.
13256 		 */
13257 
13258 		reopen_needed = osp->os_delegation ||
13259 		    ((lt == F_RDLCK &&
13260 		    !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_READ)) ||
13261 		    (lt == F_WRLCK &&
13262 		    !(osp->os_dc_openacc & OPEN4_SHARE_ACCESS_WRITE)));
13263 
13264 		mutex_exit(&osp->os_sync_lock);
13265 		open_owner_rele(oop);
13266 
13267 		if (reopen_needed) {
13268 			/*
13269 			 * Always use CLAIM_PREVIOUS after server reboot.
13270 			 * The server will reject CLAIM_DELEGATE_CUR if
13271 			 * it is used during the grace period.
13272 			 */
13273 			mutex_enter(&mi->mi_lock);
13274 			if (mi->mi_recovflags & MI4R_SRV_REBOOT) {
13275 				oclaim = CLAIM_PREVIOUS;
13276 				force = TRUE;
13277 			} else {
13278 				oclaim = CLAIM_DELEGATE_CUR;
13279 				force = FALSE;
13280 			}
13281 			mutex_exit(&mi->mi_lock);
13282 
13283 			nfs4_reopen(vp, osp, ep, oclaim, force, FALSE);
13284 			if (ep->error == EAGAIN) {
13285 				nfs4_error_zinit(ep);
13286 				ep->stat = NFS4ERR_DELAY;
13287 			}
13288 		}
13289 		open_stream_rele(osp, rp);
13290 		osp = NULL;
13291 	}
13292 }
13293 
13294 /*
13295  * Setup the LOCKU4 arguments.
13296  * Returns errors via the nfs4_error_t.
13297  * NFS4_OK		no problems.  *go_otwp is TRUE if call should go
13298  *			over-the-wire.  The caller must release the
13299  *			reference on *lopp.
13300  * NFS4ERR_DELAY	caller should retry (like recovery retry)
13301  * (other)		unrecoverable error.
13302  */
13303 static void
13304 nfs4frlock_setup_locku_args(nfs4_lock_call_type_t ctype, nfs_argop4 *argop,
13305     LOCKU4args **locku_argsp, flock64_t *flk,
13306     nfs4_lock_owner_t **lopp, nfs4_error_t *ep, COMPOUND4args_clnt *argsp,
13307     vnode_t *vp, int flag, u_offset_t offset, cred_t *cr,
13308     bool_t *skip_get_err, bool_t *go_otwp)
13309 {
13310 	nfs4_lock_owner_t	*lop = NULL;
13311 	LOCKU4args		*locku_args;
13312 	pid_t			pid;
13313 	bool_t			is_spec = FALSE;
13314 	rnode4_t		*rp = VTOR4(vp);
13315 
13316 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13317 	ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13318 
13319 	nfs4frlock_check_deleg(vp, ep, cr, F_UNLCK);
13320 	if (ep->error || ep->stat)
13321 		return;
13322 
13323 	argop->argop = OP_LOCKU;
13324 	if (ctype == NFS4_LCK_CTYPE_REINSTATE)
13325 		argsp->ctag = TAG_LOCKU_REINSTATE;
13326 	else
13327 		argsp->ctag = TAG_LOCKU;
13328 	locku_args = &argop->nfs_argop4_u.oplocku;
13329 	*locku_argsp = locku_args;
13330 
13331 	/*
13332 	 * XXX what should locku_args->locktype be?
13333 	 * setting to ALWAYS be READ_LT so at least
13334 	 * it is a valid locktype.
13335 	 */
13336 
13337 	locku_args->locktype = READ_LT;
13338 
13339 	pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pidp->pid_id :
13340 	    flk->l_pid;
13341 
13342 	/*
13343 	 * Get the lock owner stateid.  If no lock owner
13344 	 * exists, return success.
13345 	 */
13346 	lop = find_lock_owner(rp, pid, LOWN_ANY);
13347 	*lopp = lop;
13348 	if (lop && CLNT_ISSPECIAL(&lop->lock_stateid))
13349 		is_spec = TRUE;
13350 	if (!lop || is_spec) {
13351 		/*
13352 		 * No lock owner so no locks to unlock.
13353 		 * Return success.  If there was a failed
13354 		 * reclaim earlier, the lock might still be
13355 		 * registered with the local locking code,
13356 		 * so notify it of the unlock.
13357 		 *
13358 		 * If the lockowner is using a special stateid,
13359 		 * then the original lock request (that created
13360 		 * this lockowner) was never successful, so we
13361 		 * have no lock to undo OTW.
13362 		 */
13363 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13364 		    "nfs4frlock_setup_locku_args: LOCKU: no lock owner "
13365 		    "(%ld) so return success", (long)pid));
13366 
13367 		if (ctype == NFS4_LCK_CTYPE_NORM)
13368 			flk->l_pid = curproc->p_pid;
13369 		nfs4_register_lock_locally(vp, flk, flag, offset);
13370 		/*
13371 		 * Release our hold and NULL out so final_cleanup
13372 		 * doesn't try to end a lock seqid sync we
13373 		 * never started.
13374 		 */
13375 		if (is_spec) {
13376 			lock_owner_rele(lop);
13377 			*lopp = NULL;
13378 		}
13379 		*skip_get_err = TRUE;
13380 		*go_otwp = FALSE;
13381 		return;
13382 	}
13383 
13384 	ep->error = nfs4_start_lock_seqid_sync(lop, VTOMI4(vp));
13385 	if (ep->error == EAGAIN) {
13386 		lock_owner_rele(lop);
13387 		*lopp = NULL;
13388 		return;
13389 	}
13390 
13391 	mutex_enter(&lop->lo_lock);
13392 	locku_args->lock_stateid = lop->lock_stateid;
13393 	mutex_exit(&lop->lo_lock);
13394 	locku_args->seqid = lop->lock_seqid + 1;
13395 
13396 	/* leave the ref count on lop, rele after RPC call */
13397 
13398 	locku_args->offset = flk->l_start;
13399 	locku_args->length = flk->l_len;
13400 	if (flk->l_len == 0)
13401 		locku_args->length = ~locku_args->length;
13402 
13403 	*go_otwp = TRUE;
13404 }
13405 
13406 /*
13407  * Setup the LOCK4 arguments.
13408  *
13409  * Returns errors via the nfs4_error_t.
13410  * NFS4_OK		no problems
13411  * NFS4ERR_DELAY	caller should retry (like recovery retry)
13412  * (other)		unrecoverable error
13413  */
13414 static void
13415 nfs4frlock_setup_lock_args(nfs4_lock_call_type_t ctype, LOCK4args **lock_argsp,
13416     nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
13417     nfs4_lock_owner_t **lopp, nfs_argop4 *argop, COMPOUND4args_clnt *argsp,
13418     flock64_t *flk, int cmd, vnode_t *vp, cred_t *cr, nfs4_error_t *ep)
13419 {
13420 	LOCK4args		*lock_args;
13421 	nfs4_open_owner_t	*oop = NULL;
13422 	nfs4_open_stream_t	*osp = NULL;
13423 	nfs4_lock_owner_t	*lop = NULL;
13424 	pid_t			pid;
13425 	rnode4_t		*rp = VTOR4(vp);
13426 
13427 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13428 
13429 	nfs4frlock_check_deleg(vp, ep, cr, flk->l_type);
13430 	if (ep->error || ep->stat != NFS4_OK)
13431 		return;
13432 
13433 	argop->argop = OP_LOCK;
13434 	if (ctype == NFS4_LCK_CTYPE_NORM)
13435 		argsp->ctag = TAG_LOCK;
13436 	else if (ctype == NFS4_LCK_CTYPE_RECLAIM)
13437 		argsp->ctag = TAG_RELOCK;
13438 	else
13439 		argsp->ctag = TAG_LOCK_REINSTATE;
13440 	lock_args = &argop->nfs_argop4_u.oplock;
13441 	lock_args->locktype = flk_to_locktype(cmd, flk->l_type);
13442 	lock_args->reclaim = ctype == NFS4_LCK_CTYPE_RECLAIM ? 1 : 0;
13443 	/*
13444 	 * Get the lock owner.  If no lock owner exists,
13445 	 * create a 'temporary' one and grab the open seqid
13446 	 * synchronization (which puts a hold on the open
13447 	 * owner and open stream).
13448 	 * This also grabs the lock seqid synchronization.
13449 	 */
13450 	pid = ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid : flk->l_pid;
13451 	ep->stat =
13452 	    nfs4_find_or_create_lock_owner(pid, rp, cr, &oop, &osp, &lop);
13453 
13454 	if (ep->stat != NFS4_OK)
13455 		goto out;
13456 
13457 	nfs4_setup_lock_args(lop, oop, osp, mi2clientid(VTOMI4(vp)),
13458 	    &lock_args->locker);
13459 
13460 	lock_args->offset = flk->l_start;
13461 	lock_args->length = flk->l_len;
13462 	if (flk->l_len == 0)
13463 		lock_args->length = ~lock_args->length;
13464 	*lock_argsp = lock_args;
13465 out:
13466 	*oopp = oop;
13467 	*ospp = osp;
13468 	*lopp = lop;
13469 }
13470 
13471 /*
13472  * After we get the reply from the server, record the proper information
13473  * for possible resend lock requests.
13474  *
13475  * Allocates memory for the saved_rqstp if we have a lost lock to save.
13476  */
13477 static void
13478 nfs4frlock_save_lost_rqst(nfs4_lock_call_type_t ctype, int error,
13479     nfs_lock_type4 locktype, nfs4_open_owner_t *oop,
13480     nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
13481     nfs4_lost_rqst_t *lost_rqstp, cred_t *cr, vnode_t *vp)
13482 {
13483 	bool_t unlock = (flk->l_type == F_UNLCK);
13484 
13485 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13486 	ASSERT(ctype == NFS4_LCK_CTYPE_NORM ||
13487 	    ctype == NFS4_LCK_CTYPE_REINSTATE);
13488 
13489 	if (error != 0 && !unlock) {
13490 		NFS4_DEBUG((nfs4_lost_rqst_debug ||
13491 		    nfs4_client_lock_debug), (CE_NOTE,
13492 		    "nfs4frlock_save_lost_rqst: set lo_pending_rqsts to 1 "
13493 		    " for lop %p", (void *)lop));
13494 		ASSERT(lop != NULL);
13495 		mutex_enter(&lop->lo_lock);
13496 		lop->lo_pending_rqsts = 1;
13497 		mutex_exit(&lop->lo_lock);
13498 	}
13499 
13500 	lost_rqstp->lr_putfirst = FALSE;
13501 	lost_rqstp->lr_op = 0;
13502 
13503 	/*
13504 	 * For lock/locku requests, we treat EINTR as ETIMEDOUT for
13505 	 * recovery purposes so that the lock request that was sent
13506 	 * can be saved and re-issued later.  Ditto for EIO from a forced
13507 	 * unmount.  This is done to have the client's local locking state
13508 	 * match the v4 server's state; that is, the request was
13509 	 * potentially received and accepted by the server but the client
13510 	 * thinks it was not.
13511 	 */
13512 	if (error == ETIMEDOUT || error == EINTR ||
13513 	    NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
13514 		NFS4_DEBUG((nfs4_lost_rqst_debug ||
13515 		    nfs4_client_lock_debug), (CE_NOTE,
13516 		    "nfs4frlock_save_lost_rqst: got a lost %s lock for "
13517 		    "lop %p oop %p osp %p", unlock ? "LOCKU" : "LOCK",
13518 		    (void *)lop, (void *)oop, (void *)osp));
13519 		if (unlock)
13520 			lost_rqstp->lr_op = OP_LOCKU;
13521 		else {
13522 			lost_rqstp->lr_op = OP_LOCK;
13523 			lost_rqstp->lr_locktype = locktype;
13524 		}
13525 		/*
13526 		 * Objects are held and rele'd via the recovery code.
13527 		 * See nfs4_save_lost_rqst.
13528 		 */
13529 		lost_rqstp->lr_vp = vp;
13530 		lost_rqstp->lr_dvp = NULL;
13531 		lost_rqstp->lr_oop = oop;
13532 		lost_rqstp->lr_osp = osp;
13533 		lost_rqstp->lr_lop = lop;
13534 		lost_rqstp->lr_cr = cr;
13535 		switch (ctype) {
13536 		case NFS4_LCK_CTYPE_NORM:
13537 			flk->l_pid = ttoproc(curthread)->p_pid;
13538 			lost_rqstp->lr_ctype = NFS4_LCK_CTYPE_RESEND;
13539 			break;
13540 		case NFS4_LCK_CTYPE_REINSTATE:
13541 			lost_rqstp->lr_putfirst = TRUE;
13542 			lost_rqstp->lr_ctype = ctype;
13543 			break;
13544 		default:
13545 			break;
13546 		}
13547 		lost_rqstp->lr_flk = flk;
13548 	}
13549 }
13550 
13551 /*
13552  * Update lop's seqid.  Also update the seqid stored in a resend request,
13553  * if any.  (Some recovery errors increment the seqid, and we may have to
13554  * send the resend request again.)
13555  */
13556 
13557 static void
13558 nfs4frlock_bump_seqid(LOCK4args *lock_args, LOCKU4args *locku_args,
13559     nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop, nfs4_tag_type_t tag_type)
13560 {
13561 	if (lock_args) {
13562 		if (lock_args->locker.new_lock_owner == TRUE)
13563 			nfs4_get_and_set_next_open_seqid(oop, tag_type);
13564 		else {
13565 			ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13566 			nfs4_set_lock_seqid(lop->lock_seqid + 1, lop);
13567 		}
13568 	} else if (locku_args) {
13569 		ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
13570 		nfs4_set_lock_seqid(lop->lock_seqid +1, lop);
13571 	}
13572 }
13573 
13574 /*
13575  * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13576  * COMPOUND4 args/res for calls that need to retry.
13577  * Switches the *cred_otwp to base_cr.
13578  */
13579 static void
13580 nfs4frlock_check_access(vnode_t *vp, nfs4_op_hint_t op_hint,
13581     nfs4_recov_state_t *recov_statep, int needrecov, bool_t *did_start_fop,
13582     COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp, int error,
13583     nfs4_lock_owner_t **lopp, nfs4_open_owner_t **oopp,
13584     nfs4_open_stream_t **ospp, cred_t *base_cr, cred_t **cred_otwp)
13585 {
13586 	nfs4_open_owner_t	*oop = *oopp;
13587 	nfs4_open_stream_t	*osp = *ospp;
13588 	nfs4_lock_owner_t	*lop = *lopp;
13589 	nfs_argop4		*argop = (*argspp)->array;
13590 
13591 	if (*did_start_fop) {
13592 		nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
13593 		    needrecov);
13594 		*did_start_fop = FALSE;
13595 	}
13596 	ASSERT((*argspp)->array_len == 2);
13597 	if (argop[1].argop == OP_LOCK)
13598 		nfs4args_lock_free(&argop[1]);
13599 	else if (argop[1].argop == OP_LOCKT)
13600 		nfs4args_lockt_free(&argop[1]);
13601 	kmem_free(argop, 2 * sizeof (nfs_argop4));
13602 	if (!error)
13603 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
13604 	*argspp = NULL;
13605 	*respp = NULL;
13606 
13607 	if (lop) {
13608 		nfs4_end_lock_seqid_sync(lop);
13609 		lock_owner_rele(lop);
13610 		*lopp = NULL;
13611 	}
13612 
13613 	/* need to free up the reference on osp for lock args */
13614 	if (osp != NULL) {
13615 		open_stream_rele(osp, VTOR4(vp));
13616 		*ospp = NULL;
13617 	}
13618 
13619 	/* need to free up the reference on oop for lock args */
13620 	if (oop != NULL) {
13621 		nfs4_end_open_seqid_sync(oop);
13622 		open_owner_rele(oop);
13623 		*oopp = NULL;
13624 	}
13625 
13626 	crfree(*cred_otwp);
13627 	*cred_otwp = base_cr;
13628 	crhold(*cred_otwp);
13629 }
13630 
13631 /*
13632  * Function to process the client's recovery for nfs4frlock.
13633  * Returns TRUE if we should retry the lock request; FALSE otherwise.
13634  *
13635  * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13636  * COMPOUND4 args/res for calls that need to retry.
13637  *
13638  * Note: the rp's r_lkserlock is *not* dropped during this path.
13639  */
13640 static bool_t
13641 nfs4frlock_recovery(int needrecov, nfs4_error_t *ep,
13642     COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
13643     LOCK4args *lock_args, LOCKU4args *locku_args,
13644     nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
13645     nfs4_lock_owner_t **lopp, rnode4_t *rp, vnode_t *vp,
13646     nfs4_recov_state_t *recov_statep, nfs4_op_hint_t op_hint,
13647     bool_t *did_start_fop, nfs4_lost_rqst_t *lost_rqstp, flock64_t *flk)
13648 {
13649 	nfs4_open_owner_t	*oop = *oopp;
13650 	nfs4_open_stream_t	*osp = *ospp;
13651 	nfs4_lock_owner_t	*lop = *lopp;
13652 
13653 	bool_t abort, retry;
13654 
13655 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13656 	ASSERT((*argspp) != NULL);
13657 	ASSERT((*respp) != NULL);
13658 	if (lock_args || locku_args)
13659 		ASSERT(lop != NULL);
13660 
13661 	NFS4_DEBUG((nfs4_client_lock_debug || nfs4_client_recov_debug),
13662 	    (CE_NOTE, "nfs4frlock_recovery: initiating recovery\n"));
13663 
13664 	retry = TRUE;
13665 	abort = FALSE;
13666 	if (needrecov) {
13667 		nfs4_bseqid_entry_t *bsep = NULL;
13668 		nfs_opnum4 op;
13669 
13670 		op = lock_args ? OP_LOCK : locku_args ? OP_LOCKU : OP_LOCKT;
13671 
13672 		if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID) {
13673 			seqid4 seqid;
13674 
13675 			if (lock_args) {
13676 				if (lock_args->locker.new_lock_owner == TRUE)
13677 					seqid = lock_args->locker.locker4_u.
13678 					    open_owner.open_seqid;
13679 				else
13680 					seqid = lock_args->locker.locker4_u.
13681 					    lock_owner.lock_seqid;
13682 			} else if (locku_args) {
13683 				seqid = locku_args->seqid;
13684 			} else {
13685 				seqid = 0;
13686 			}
13687 
13688 			bsep = nfs4_create_bseqid_entry(oop, lop, vp,
13689 			    flk->l_pid, (*argspp)->ctag, seqid);
13690 		}
13691 
13692 		abort = nfs4_start_recovery(ep, VTOMI4(vp), vp, NULL, NULL,
13693 		    (lost_rqstp && (lost_rqstp->lr_op == OP_LOCK ||
13694 		    lost_rqstp->lr_op == OP_LOCKU)) ? lost_rqstp :
13695 		    NULL, op, bsep, NULL, NULL);
13696 
13697 		if (bsep)
13698 			kmem_free(bsep, sizeof (*bsep));
13699 	}
13700 
13701 	/*
13702 	 * Return that we do not want to retry the request for 3 cases:
13703 	 * 1. If we received EINTR or are bailing out because of a forced
13704 	 *    unmount, we came into this code path just for the sake of
13705 	 *    initiating recovery, we now need to return the error.
13706 	 * 2. If we have aborted recovery.
13707 	 * 3. We received NFS4ERR_BAD_SEQID.
13708 	 */
13709 	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp) ||
13710 	    abort == TRUE || (ep->error == 0 && ep->stat == NFS4ERR_BAD_SEQID))
13711 		retry = FALSE;
13712 
13713 	if (*did_start_fop == TRUE) {
13714 		nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint, recov_statep,
13715 		    needrecov);
13716 		*did_start_fop = FALSE;
13717 	}
13718 
13719 	if (retry == TRUE) {
13720 		nfs_argop4	*argop;
13721 
13722 		argop = (*argspp)->array;
13723 		ASSERT((*argspp)->array_len == 2);
13724 
13725 		if (argop[1].argop == OP_LOCK)
13726 			nfs4args_lock_free(&argop[1]);
13727 		else if (argop[1].argop == OP_LOCKT)
13728 			nfs4args_lockt_free(&argop[1]);
13729 		kmem_free(argop, 2 * sizeof (nfs_argop4));
13730 		if (!ep->error)
13731 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)*respp);
13732 		*respp = NULL;
13733 		*argspp = NULL;
13734 	}
13735 
13736 	if (lop != NULL) {
13737 		nfs4_end_lock_seqid_sync(lop);
13738 		lock_owner_rele(lop);
13739 	}
13740 
13741 	*lopp = NULL;
13742 
13743 	/* need to free up the reference on osp for lock args */
13744 	if (osp != NULL) {
13745 		open_stream_rele(osp, rp);
13746 		*ospp = NULL;
13747 	}
13748 
13749 	/* need to free up the reference on oop for lock args */
13750 	if (oop != NULL) {
13751 		nfs4_end_open_seqid_sync(oop);
13752 		open_owner_rele(oop);
13753 		*oopp = NULL;
13754 	}
13755 
13756 	return (retry);
13757 }
13758 
13759 /*
13760  * Handles the successful reply from the server for nfs4frlock.
13761  */
13762 static void
13763 nfs4frlock_results_ok(nfs4_lock_call_type_t ctype, int cmd, flock64_t *flk,
13764     vnode_t *vp, int flag, u_offset_t offset,
13765     nfs4_lost_rqst_t *resend_rqstp)
13766 {
13767 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13768 	if ((cmd == F_SETLK || cmd == F_SETLKW) &&
13769 	    (flk->l_type == F_RDLCK || flk->l_type == F_WRLCK)) {
13770 		if (ctype == NFS4_LCK_CTYPE_NORM) {
13771 			flk->l_pid = ttoproc(curthread)->p_pid;
13772 			/*
13773 			 * We do not register lost locks locally in
13774 			 * the 'resend' case since the user/application
13775 			 * doesn't think we have the lock.
13776 			 */
13777 			ASSERT(!resend_rqstp);
13778 			nfs4_register_lock_locally(vp, flk, flag, offset);
13779 		}
13780 	}
13781 }
13782 
13783 /*
13784  * Handle the DENIED reply from the server for nfs4frlock.
13785  * Returns TRUE if we should retry the request; FALSE otherwise.
13786  *
13787  * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
13788  * COMPOUND4 args/res for calls that need to retry.  Can also
13789  * drop and regrab the r_lkserlock.
13790  */
13791 static bool_t
13792 nfs4frlock_results_denied(nfs4_lock_call_type_t ctype, LOCK4args *lock_args,
13793     LOCKT4args *lockt_args, nfs4_open_owner_t **oopp,
13794     nfs4_open_stream_t **ospp, nfs4_lock_owner_t **lopp, int cmd,
13795     vnode_t *vp, flock64_t *flk, nfs4_op_hint_t op_hint,
13796     nfs4_recov_state_t *recov_statep, int needrecov,
13797     COMPOUND4args_clnt **argspp, COMPOUND4res_clnt **respp,
13798     clock_t *tick_delayp, short *whencep, int *errorp,
13799     nfs_resop4 *resop, cred_t *cr, bool_t *did_start_fop,
13800     bool_t *skip_get_err)
13801 {
13802 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13803 
13804 	if (lock_args) {
13805 		nfs4_open_owner_t	*oop = *oopp;
13806 		nfs4_open_stream_t	*osp = *ospp;
13807 		nfs4_lock_owner_t	*lop = *lopp;
13808 		int			intr;
13809 
13810 		/*
13811 		 * Blocking lock needs to sleep and retry from the request.
13812 		 *
13813 		 * Do not block and wait for 'resend' or 'reinstate'
13814 		 * lock requests, just return the error.
13815 		 *
13816 		 * Note: reclaim requests have cmd == F_SETLK, not F_SETLKW.
13817 		 */
13818 		if (cmd == F_SETLKW) {
13819 			rnode4_t *rp = VTOR4(vp);
13820 			nfs_argop4 *argop = (*argspp)->array;
13821 
13822 			ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
13823 
13824 			nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
13825 			    recov_statep, needrecov);
13826 			*did_start_fop = FALSE;
13827 			ASSERT((*argspp)->array_len == 2);
13828 			if (argop[1].argop == OP_LOCK)
13829 				nfs4args_lock_free(&argop[1]);
13830 			else if (argop[1].argop == OP_LOCKT)
13831 				nfs4args_lockt_free(&argop[1]);
13832 			kmem_free(argop, 2 * sizeof (nfs_argop4));
13833 			if (*respp)
13834 				(void) xdr_free(xdr_COMPOUND4res_clnt,
13835 				    (caddr_t)*respp);
13836 			*argspp = NULL;
13837 			*respp = NULL;
13838 			nfs4_end_lock_seqid_sync(lop);
13839 			lock_owner_rele(lop);
13840 			*lopp = NULL;
13841 			if (osp != NULL) {
13842 				open_stream_rele(osp, rp);
13843 				*ospp = NULL;
13844 			}
13845 			if (oop != NULL) {
13846 				nfs4_end_open_seqid_sync(oop);
13847 				open_owner_rele(oop);
13848 				*oopp = NULL;
13849 			}
13850 
13851 			nfs_rw_exit(&rp->r_lkserlock);
13852 
13853 			intr = nfs4_block_and_wait(tick_delayp, rp);
13854 
13855 			if (intr) {
13856 				(void) nfs_rw_enter_sig(&rp->r_lkserlock,
13857 				    RW_WRITER, FALSE);
13858 				*errorp = EINTR;
13859 				return (FALSE);
13860 			}
13861 
13862 			(void) nfs_rw_enter_sig(&rp->r_lkserlock,
13863 			    RW_WRITER, FALSE);
13864 
13865 			/*
13866 			 * Make sure we are still safe to lock with
13867 			 * regards to mmapping.
13868 			 */
13869 			if (!nfs4_safelock(vp, flk, cr)) {
13870 				*errorp = EAGAIN;
13871 				return (FALSE);
13872 			}
13873 
13874 			return (TRUE);
13875 		}
13876 		if (ctype == NFS4_LCK_CTYPE_NORM)
13877 			*errorp = EAGAIN;
13878 		*skip_get_err = TRUE;
13879 		flk->l_whence = 0;
13880 		*whencep = 0;
13881 		return (FALSE);
13882 	} else if (lockt_args) {
13883 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13884 		    "nfs4frlock_results_denied: OP_LOCKT DENIED"));
13885 
13886 		denied_to_flk(&resop->nfs_resop4_u.oplockt.denied,
13887 		    flk, lockt_args);
13888 
13889 		/* according to NLM code */
13890 		*errorp = 0;
13891 		*whencep = 0;
13892 		*skip_get_err = TRUE;
13893 		return (FALSE);
13894 	}
13895 	return (FALSE);
13896 }
13897 
13898 /*
13899  * Handles all NFS4 errors besides NFS4_OK and NFS4ERR_DENIED for nfs4frlock.
13900  */
13901 static void
13902 nfs4frlock_results_default(COMPOUND4res_clnt *resp, int *errorp)
13903 {
13904 	switch (resp->status) {
13905 	case NFS4ERR_ACCESS:
13906 	case NFS4ERR_ADMIN_REVOKED:
13907 	case NFS4ERR_BADHANDLE:
13908 	case NFS4ERR_BAD_RANGE:
13909 	case NFS4ERR_BAD_SEQID:
13910 	case NFS4ERR_BAD_STATEID:
13911 	case NFS4ERR_BADXDR:
13912 	case NFS4ERR_DEADLOCK:
13913 	case NFS4ERR_DELAY:
13914 	case NFS4ERR_EXPIRED:
13915 	case NFS4ERR_FHEXPIRED:
13916 	case NFS4ERR_GRACE:
13917 	case NFS4ERR_INVAL:
13918 	case NFS4ERR_ISDIR:
13919 	case NFS4ERR_LEASE_MOVED:
13920 	case NFS4ERR_LOCK_NOTSUPP:
13921 	case NFS4ERR_LOCK_RANGE:
13922 	case NFS4ERR_MOVED:
13923 	case NFS4ERR_NOFILEHANDLE:
13924 	case NFS4ERR_NO_GRACE:
13925 	case NFS4ERR_OLD_STATEID:
13926 	case NFS4ERR_OPENMODE:
13927 	case NFS4ERR_RECLAIM_BAD:
13928 	case NFS4ERR_RECLAIM_CONFLICT:
13929 	case NFS4ERR_RESOURCE:
13930 	case NFS4ERR_SERVERFAULT:
13931 	case NFS4ERR_STALE:
13932 	case NFS4ERR_STALE_CLIENTID:
13933 	case NFS4ERR_STALE_STATEID:
13934 		return;
13935 	default:
13936 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
13937 		    "nfs4frlock_results_default: got unrecognizable "
13938 		    "res.status %d", resp->status));
13939 		*errorp = NFS4ERR_INVAL;
13940 	}
13941 }
13942 
13943 /*
13944  * The lock request was successful, so update the client's state.
13945  */
13946 static void
13947 nfs4frlock_update_state(LOCK4args *lock_args, LOCKU4args *locku_args,
13948     LOCKT4args *lockt_args, nfs_resop4 *resop, nfs4_lock_owner_t *lop,
13949     vnode_t *vp, flock64_t *flk, cred_t *cr,
13950     nfs4_lost_rqst_t *resend_rqstp)
13951 {
13952 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
13953 
13954 	if (lock_args) {
13955 		LOCK4res *lock_res;
13956 
13957 		lock_res = &resop->nfs_resop4_u.oplock;
13958 		/* update the stateid with server's response */
13959 
13960 		if (lock_args->locker.new_lock_owner == TRUE) {
13961 			mutex_enter(&lop->lo_lock);
13962 			lop->lo_just_created = NFS4_PERM_CREATED;
13963 			mutex_exit(&lop->lo_lock);
13964 		}
13965 
13966 		nfs4_set_lock_stateid(lop, lock_res->LOCK4res_u.lock_stateid);
13967 
13968 		/*
13969 		 * If the lock was the result of a resending a lost
13970 		 * request, we've synched up the stateid and seqid
13971 		 * with the server, but now the server might be out of sync
13972 		 * with what the application thinks it has for locks.
13973 		 * Clean that up here.  It's unclear whether we should do
13974 		 * this even if the filesystem has been forcibly unmounted.
13975 		 * For most servers, it's probably wasted effort, but
13976 		 * RFC3530 lets servers require that unlocks exactly match
13977 		 * the locks that are held.
13978 		 */
13979 		if (resend_rqstp != NULL &&
13980 		    resend_rqstp->lr_ctype != NFS4_LCK_CTYPE_REINSTATE) {
13981 			nfs4_reinstitute_local_lock_state(vp, flk, cr, lop);
13982 		} else {
13983 			flk->l_whence = 0;
13984 		}
13985 	} else if (locku_args) {
13986 		LOCKU4res *locku_res;
13987 
13988 		locku_res = &resop->nfs_resop4_u.oplocku;
13989 
13990 		/* Update the stateid with the server's response */
13991 		nfs4_set_lock_stateid(lop, locku_res->lock_stateid);
13992 	} else if (lockt_args) {
13993 		/* Switch the lock type to express success, see fcntl */
13994 		flk->l_type = F_UNLCK;
13995 		flk->l_whence = 0;
13996 	}
13997 }
13998 
13999 /*
14000  * Do final cleanup before exiting nfs4frlock.
14001  * Calls nfs4_end_fop, drops the seqid syncs, and frees up the
14002  * COMPOUND4 args/res for calls that haven't already.
14003  */
14004 static void
14005 nfs4frlock_final_cleanup(nfs4_lock_call_type_t ctype, COMPOUND4args_clnt *argsp,
14006     COMPOUND4res_clnt *resp, vnode_t *vp, nfs4_op_hint_t op_hint,
14007     nfs4_recov_state_t *recov_statep, int needrecov, nfs4_open_owner_t *oop,
14008     nfs4_open_stream_t *osp, nfs4_lock_owner_t *lop, flock64_t *flk,
14009     short whence, u_offset_t offset, struct lm_sysid *ls,
14010     int *errorp, LOCK4args *lock_args, LOCKU4args *locku_args,
14011     bool_t did_start_fop, bool_t skip_get_err,
14012     cred_t *cred_otw, cred_t *cred)
14013 {
14014 	mntinfo4_t	*mi = VTOMI4(vp);
14015 	rnode4_t	*rp = VTOR4(vp);
14016 	int		error = *errorp;
14017 	nfs_argop4	*argop;
14018 	int	do_flush_pages = 0;
14019 
14020 	ASSERT(nfs_zone() == mi->mi_zone);
14021 	/*
14022 	 * The client recovery code wants the raw status information,
14023 	 * so don't map the NFS status code to an errno value for
14024 	 * non-normal call types.
14025 	 */
14026 	if (ctype == NFS4_LCK_CTYPE_NORM) {
14027 		if (*errorp == 0 && resp != NULL && skip_get_err == FALSE)
14028 			*errorp = geterrno4(resp->status);
14029 		if (did_start_fop == TRUE)
14030 			nfs4_end_fop(mi, vp, NULL, op_hint, recov_statep,
14031 			    needrecov);
14032 
14033 		/*
14034 		 * We've established a new lock on the server, so invalidate
14035 		 * the pages associated with the vnode to get the most up to
14036 		 * date pages from the server after acquiring the lock. We
14037 		 * want to be sure that the read operation gets the newest data.
14038 		 * N.B.
14039 		 * We used to do this in nfs4frlock_results_ok but that doesn't
14040 		 * work since VOP_PUTPAGE can call nfs4_commit which calls
14041 		 * nfs4_start_fop. We flush the pages below after calling
14042 		 * nfs4_end_fop above
14043 		 * The flush of the page cache must be done after
14044 		 * nfs4_end_open_seqid_sync() to avoid a 4-way hang.
14045 		 */
14046 		if (!error && resp && resp->status == NFS4_OK)
14047 			do_flush_pages = 1;
14048 	}
14049 	if (argsp) {
14050 		ASSERT(argsp->array_len == 2);
14051 		argop = argsp->array;
14052 		if (argop[1].argop == OP_LOCK)
14053 			nfs4args_lock_free(&argop[1]);
14054 		else if (argop[1].argop == OP_LOCKT)
14055 			nfs4args_lockt_free(&argop[1]);
14056 		kmem_free(argop, 2 * sizeof (nfs_argop4));
14057 		if (resp)
14058 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)resp);
14059 	}
14060 
14061 	/* free the reference on the lock owner */
14062 	if (lop != NULL) {
14063 		nfs4_end_lock_seqid_sync(lop);
14064 		lock_owner_rele(lop);
14065 	}
14066 
14067 	/* need to free up the reference on osp for lock args */
14068 	if (osp != NULL)
14069 		open_stream_rele(osp, rp);
14070 
14071 	/* need to free up the reference on oop for lock args */
14072 	if (oop != NULL) {
14073 		nfs4_end_open_seqid_sync(oop);
14074 		open_owner_rele(oop);
14075 	}
14076 
14077 	if (do_flush_pages)
14078 		nfs4_flush_pages(vp, cred);
14079 
14080 	(void) convoff(vp, flk, whence, offset);
14081 
14082 	lm_rel_sysid(ls);
14083 
14084 	/*
14085 	 * Record debug information in the event we get EINVAL.
14086 	 */
14087 	mutex_enter(&mi->mi_lock);
14088 	if (*errorp == EINVAL && (lock_args || locku_args) &&
14089 	    (!(mi->mi_flags & MI4_POSIX_LOCK))) {
14090 		if (!(mi->mi_flags & MI4_LOCK_DEBUG)) {
14091 			zcmn_err(getzoneid(), CE_NOTE,
14092 			    "%s operation failed with "
14093 			    "EINVAL probably since the server, %s,"
14094 			    " doesn't support POSIX style locking",
14095 			    lock_args ? "LOCK" : "LOCKU",
14096 			    mi->mi_curr_serv->sv_hostname);
14097 			mi->mi_flags |= MI4_LOCK_DEBUG;
14098 		}
14099 	}
14100 	mutex_exit(&mi->mi_lock);
14101 
14102 	if (cred_otw)
14103 		crfree(cred_otw);
14104 }
14105 
/*
 * This calls the server and the local locking code.
 *
 * Client locks are registered locally by oring the sysid with
 * LM_SYSID_CLIENT. The server registers locks locally using just the sysid.
 * We need to distinguish between the two to avoid collision in case one
 * machine is used as both client and server.
 *
 * Blocking lock requests will continually retry to acquire the lock
 * forever.
 *
 * The ctype is defined as follows:
 * NFS4_LCK_CTYPE_NORM: normal lock request.
 *
 * NFS4_LCK_CTYPE_RECLAIM:  bypass the usual calls for synchronizing with client
 * recovery, get the pid from flk instead of curproc, and don't reregister
 * the lock locally.
 *
 * NFS4_LCK_CTYPE_RESEND: same as NFS4_LCK_CTYPE_RECLAIM, with the addition
 * that we will use the information passed in via resend_rqstp to setup the
 * lock/locku request.  This resend is the exact same request as the 'lost
 * lock', and is initiated by the recovery framework. A successful resend
 * request can initiate one or more reinstate requests.
 *
 * NFS4_LCK_CTYPE_REINSTATE: same as NFS4_LCK_CTYPE_RESEND, except that it
 * does not trigger additional reinstate requests.  This lock call type is
 * set for setting the v4 server's locking state back to match what the
 * client's local locking state is in the event of a received 'lost lock'.
 *
 * Errors are returned via the nfs4_error_t parameter (ep), which is
 * zero-initialized on entry.  An EINTR result for an F_SETLK/F_SETLKW
 * unlock is converted to success just before returning.
 *
 * resend_rqstp is only consulted for the RESEND and REINSTATE call types;
 * it is asserted to be NULL on the other path.
 *
 * did_reclaimp, when non-NULL, is set to 0 if a reclaim had to be retried
 * as an ordinary (non-reclaim) lock after the server answered
 * NFS4ERR_NO_GRACE.  It is not written otherwise by this function.
 */
void
nfs4frlock(nfs4_lock_call_type_t ctype, vnode_t *vp, int cmd, flock64_t *flk,
    int flag, u_offset_t offset, cred_t *cr, nfs4_error_t *ep,
    nfs4_lost_rqst_t *resend_rqstp, int *did_reclaimp)
{
	COMPOUND4args_clnt	args, *argsp = NULL;
	COMPOUND4res_clnt	res, *resp = NULL;
	nfs_argop4	*argop;
	nfs_resop4	*resop;
	rnode4_t	*rp;
	int		doqueue = 1;
	clock_t		tick_delay;  /* delay in clock ticks */
	struct lm_sysid	*ls;
	LOCK4args	*lock_args = NULL;
	LOCKU4args	*locku_args = NULL;
	LOCKT4args	*lockt_args = NULL;
	nfs4_open_owner_t *oop = NULL;
	nfs4_open_stream_t *osp = NULL;
	nfs4_lock_owner_t *lop = NULL;
	bool_t		needrecov = FALSE;
	nfs4_recov_state_t recov_state;
	short		whence;
	nfs4_op_hint_t	op_hint;
	nfs4_lost_rqst_t lost_rqst;
	bool_t		retry = FALSE;
	bool_t		did_start_fop = FALSE;
	bool_t		skip_get_err = FALSE;
	cred_t		*cred_otw = NULL;
	bool_t		recovonly;	/* just queue request */
	int		frc_no_reclaim = 0;	/* retry reclaim as plain lock */
#ifdef DEBUG
	char *name;
#endif

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

#ifdef DEBUG
	name = fn_name(VTOSV(vp)->sv_name);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4frlock: "
	    "%s: cmd %d, type %d, offset %llu, start %"PRIx64", "
	    "length %"PRIu64", pid %d, sysid %d, call type %s, "
	    "resend request %s", name, cmd, flk->l_type, offset, flk->l_start,
	    flk->l_len, ctype == NFS4_LCK_CTYPE_NORM ? curproc->p_pid :
	    flk->l_pid, flk->l_sysid, nfs4frlock_get_call_type(ctype),
	    resend_rqstp ? "TRUE" : "FALSE"));
	kmem_free(name, MAXNAMELEN);
#endif

	/*
	 * These two early failures return directly; they do not go through
	 * the common cleanup at 'out'.
	 */
	nfs4_error_zinit(ep);
	ep->error = nfs4frlock_validate_args(cmd, flk, flag, vp, offset);
	if (ep->error)
		return;
	ep->error = nfs4frlock_get_sysid(&ls, vp, flk);
	if (ep->error)
		return;
	nfs4frlock_pre_setup(&tick_delay, &recov_state, flk, &whence,
	    vp, cr, &cred_otw);

	/*
	 * Every retry path below jumps back here, so the per-attempt state
	 * handed to nfs4frlock_call_init() is set up again on each pass.
	 */
recov_retry:
	nfs4frlock_call_init(&args, &argsp, &argop, &op_hint, flk, cmd,
	    &retry, &did_start_fop, &resp, &skip_get_err, &lost_rqst);
	rp = VTOR4(vp);

	ep->error = nfs4frlock_start_call(ctype, vp, op_hint, &recov_state,
	    &did_start_fop, &recovonly);

	if (ep->error)
		goto out;

	if (recovonly) {
		/*
		 * Leave the request for the recovery system to deal with.
		 * The asserts show this path is only expected for a normal
		 * unlock request.  The EINTR set here is turned into
		 * success at the end of this function.
		 */
		ASSERT(ctype == NFS4_LCK_CTYPE_NORM);
		ASSERT(cmd != F_GETLK);
		ASSERT(flk->l_type == F_UNLCK);

		nfs4_error_init(ep, EINTR);
		needrecov = TRUE;
		lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
		if (lop != NULL) {
			nfs4frlock_save_lost_rqst(ctype, ep->error, READ_LT,
			    NULL, NULL, lop, flk, &lost_rqst, cr, vp);
			(void) nfs4_start_recovery(ep,
			    VTOMI4(vp), vp, NULL, NULL,
			    (lost_rqst.lr_op == OP_LOCK ||
			    lost_rqst.lr_op == OP_LOCKU) ?
			    &lost_rqst : NULL, OP_LOCKU, NULL, NULL, NULL);
			lock_owner_rele(lop);
			lop = NULL;
		}
		/* Still apply the unlock to the local lock manager state. */
		flk->l_pid = curproc->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
		goto out;
	}

	/* PUTFH of the file handle of the file being locked (rp->r_fh) */
	argop[0].argop = OP_CPUTFH;
	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;

	/*
	 * Set up the over-the-wire arguments and get references to the
	 * open owner, etc.
	 */

	if (ctype == NFS4_LCK_CTYPE_RESEND ||
	    ctype == NFS4_LCK_CTYPE_REINSTATE) {
		nfs4frlock_setup_resend_lock_args(resend_rqstp, argsp,
		    &argop[1], &lop, &oop, &osp, &lock_args, &locku_args);
	} else {
		/* cleared by the unlock setup to skip the OTW call */
		bool_t go_otw = TRUE;

		ASSERT(resend_rqstp == NULL);

		switch (cmd) {
		case F_GETLK:
		case F_O_GETLK:
			nfs4frlock_setup_lockt_args(ctype, &argop[1],
			    &lockt_args, argsp, flk, rp);
			break;
		case F_SETLKW:
		case F_SETLK:
			if (flk->l_type == F_UNLCK)
				nfs4frlock_setup_locku_args(ctype,
				    &argop[1], &locku_args, flk,
				    &lop, ep, argsp,
				    vp, flag, offset, cr,
				    &skip_get_err, &go_otw);
			else
				nfs4frlock_setup_lock_args(ctype,
				    &lock_args, &oop, &osp, &lop, &argop[1],
				    argsp, flk, cmd, vp, cr, ep);

			if (ep->error)
				goto out;

			switch (ep->stat) {
			case NFS4_OK:
				break;
			case NFS4ERR_DELAY:
				/* recov thread never gets this error */
				ASSERT(resend_rqstp == NULL);
				ASSERT(did_start_fop);

				/*
				 * End the fop and free this attempt's
				 * argument array by hand, then clear argsp
				 * so the freed array is not referenced
				 * again before the retry rebuilds it.
				 */
				nfs4_end_fop(VTOMI4(vp), vp, NULL, op_hint,
				    &recov_state, TRUE);
				did_start_fop = FALSE;
				if (argop[1].argop == OP_LOCK)
					nfs4args_lock_free(&argop[1]);
				else if (argop[1].argop == OP_LOCKT)
					nfs4args_lockt_free(&argop[1]);
				kmem_free(argop, 2 * sizeof (nfs_argop4));
				argsp = NULL;
				goto recov_retry;
			default:
				ep->error = EIO;
				goto out;
			}
			break;
		default:
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4_frlock: invalid cmd %d", cmd));
			ep->error = EINVAL;
			goto out;
		}

		if (!go_otw)
			goto out;
	}

	/* XXX should we use the local reclock as a cache ? */
	/*
	 * Unregister the lock with the local locking code before
	 * contacting the server.  This avoids a potential race where
	 * another process gets notified that it has been granted a lock
	 * before we can unregister ourselves locally.
	 */
	if ((cmd == F_SETLK || cmd == F_SETLKW) && flk->l_type == F_UNLCK) {
		if (ctype == NFS4_LCK_CTYPE_NORM)
			flk->l_pid = ttoproc(curthread)->p_pid;
		nfs4_register_lock_locally(vp, flk, flag, offset);
	}

	/*
	 * Send the server the lock request.  Continually loop with a delay
	 * if get error NFS4ERR_DENIED (for blocking locks) or NFS4ERR_GRACE.
	 */
	resp = &res;

	NFS4_DEBUG((nfs4_client_call_debug || nfs4_client_lock_debug),
	    (CE_NOTE,
	    "nfs4frlock: %s call, rp %s", needrecov ? "recov" : "first",
	    rnode4info(rp)));

	/*
	 * frc_no_reclaim is set by the NFS4ERR_NO_GRACE case below on an
	 * earlier pass: send this attempt as an ordinary lock rather than
	 * a reclaim, and tell the caller no reclaim took place.
	 */
	if (lock_args && frc_no_reclaim) {
		ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
		    "nfs4frlock: frc_no_reclaim: clearing reclaim"));
		lock_args->reclaim = FALSE;
		if (did_reclaimp)
			*did_reclaimp = 0;
	}

	/*
	 * Do the OTW call.
	 */
	rfs4call(VTOMI4(vp), argsp, resp, cred_otw, &doqueue, 0, ep);

	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: error %d, status %d", ep->error, resp->status));

	needrecov = nfs4_needs_recovery(ep, TRUE, vp->v_vfsp);
	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
	    "nfs4frlock: needrecov %d", needrecov));

	if (ep->error == 0 && nfs4_need_to_bump_seqid(resp))
		nfs4frlock_bump_seqid(lock_args, locku_args, oop, lop,
		    args.ctag);

	/*
	 * Check if one of these mutually exclusive error cases has
	 * happened:
	 *   need to swap credentials due to access error
	 *   recovery is needed
	 *   different error (only known case is missing Kerberos ticket)
	 */

	if ((ep->error == EACCES ||
	    (ep->error == 0 && resp->status == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		nfs4frlock_check_access(vp, op_hint, &recov_state, needrecov,
		    &did_start_fop, &argsp, &resp, ep->error, &lop, &oop, &osp,
		    cr, &cred_otw);
		goto recov_retry;
	}

	if (needrecov) {
		/*
		 * LOCKT requests don't need to recover from lost
		 * requests since they don't create/modify state.
		 */
		if ((ep->error == EINTR ||
		    NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) &&
		    lockt_args)
			goto out;
		/*
		 * Do not attempt recovery for requests initiated by
		 * the recovery framework.  Let the framework redrive them.
		 */
		if (ctype != NFS4_LCK_CTYPE_NORM)
			goto out;
		else {
			ASSERT(resend_rqstp == NULL);
		}

		nfs4frlock_save_lost_rqst(ctype, ep->error,
		    flk_to_locktype(cmd, flk->l_type),
		    oop, osp, lop, flk, &lost_rqst, cred_otw, vp);

		retry = nfs4frlock_recovery(needrecov, ep, &argsp,
		    &resp, lock_args, locku_args, &oop, &osp, &lop,
		    rp, vp, &recov_state, op_hint, &did_start_fop,
		    cmd != F_GETLK ? &lost_rqst : NULL, flk);

		if (retry) {
			/* the owner/stream references must be gone by now */
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		goto out;
	}

	/*
	 * Bail out if have reached this point with ep->error set. Can
	 * happen if (ep->error == EACCES && !needrecov && cred_otw == cr).
	 * This happens if Kerberos ticket has expired or has been
	 * destroyed.
	 */
	if (ep->error != 0)
		goto out;

	/*
	 * Process the reply.  array[1] is the result of the lock operation
	 * (array[0] answers the PUTFH).
	 */
	switch (resp->status) {
	case NFS4_OK:
		resop = &resp->array[1];
		nfs4frlock_results_ok(ctype, cmd, flk, vp, flag, offset,
		    resend_rqstp);
		/*
		 * Have a successful lock operation, now update state.
		 */
		nfs4frlock_update_state(lock_args, locku_args, lockt_args,
		    resop, lop, vp, flk, cr, resend_rqstp);
		break;

	case NFS4ERR_DENIED:
		resop = &resp->array[1];
		retry = nfs4frlock_results_denied(ctype, lock_args, lockt_args,
		    &oop, &osp, &lop, cmd, vp, flk, op_hint,
		    &recov_state, needrecov, &argsp, &resp,
		    &tick_delay, &whence, &ep->error, resop, cr,
		    &did_start_fop, &skip_get_err);

		if (retry) {
			ASSERT(oop == NULL);
			ASSERT(osp == NULL);
			ASSERT(lop == NULL);
			goto recov_retry;
		}
		break;
	/*
	 * If the server won't let us reclaim, fall-back to trying to lock
	 * the file from scratch. Code elsewhere will check the changeinfo
	 * to ensure the file hasn't been changed.
	 */
	case NFS4ERR_NO_GRACE:
		if (lock_args && lock_args->reclaim == TRUE) {
			ASSERT(ctype == NFS4_LCK_CTYPE_RECLAIM);
			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
			    "nfs4frlock: reclaim: NFS4ERR_NO_GRACE"));
			frc_no_reclaim = 1;
			/*
			 * clean up before retrying; nfs4frlock_recovery() is
			 * called with needrecov == 0 and its return value is
			 * ignored
			 */
			needrecov = 0;
			(void) nfs4frlock_recovery(needrecov, ep, &argsp, &resp,
			    lock_args, locku_args, &oop, &osp, &lop, rp, vp,
			    &recov_state, op_hint, &did_start_fop, NULL, flk);
			goto recov_retry;
		}
		/* FALLTHROUGH */

	default:
		nfs4frlock_results_default(resp, &ep->error);
		break;
	}
out:
	/*
	 * Process and cleanup from error.  Make interrupted unlock
	 * requests look successful, since they will be handled by the
	 * client recovery code.
	 */
	nfs4frlock_final_cleanup(ctype, argsp, resp, vp, op_hint, &recov_state,
	    needrecov, oop, osp, lop, flk, whence, offset, ls, &ep->error,
	    lock_args, locku_args, did_start_fop,
	    skip_get_err, cred_otw, cr);

	if (ep->error == EINTR && flk->l_type == F_UNLCK &&
	    (cmd == F_SETLK || cmd == F_SETLKW))
		ep->error = 0;
}
14488 
14489 /*
14490  * nfs4_safelock:
14491  *
14492  * Return non-zero if the given lock request can be handled without
14493  * violating the constraints on concurrent mapping and locking.
14494  */
14495 
14496 static int
14497 nfs4_safelock(vnode_t *vp, const struct flock64 *bfp, cred_t *cr)
14498 {
14499 	rnode4_t *rp = VTOR4(vp);
14500 	struct vattr va;
14501 	int error;
14502 
14503 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14504 	ASSERT(rp->r_mapcnt >= 0);
14505 	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock %s: "
14506 	    "(%"PRIx64", %"PRIx64"); mapcnt = %ld", bfp->l_type == F_WRLCK ?
14507 	    "write" : bfp->l_type == F_RDLCK ? "read" : "unlock",
14508 	    bfp->l_start, bfp->l_len, rp->r_mapcnt));
14509 
14510 	if (rp->r_mapcnt == 0)
14511 		return (1);		/* always safe if not mapped */
14512 
14513 	/*
14514 	 * If the file is already mapped and there are locks, then they
14515 	 * should be all safe locks.  So adding or removing a lock is safe
14516 	 * as long as the new request is safe (i.e., whole-file, meaning
14517 	 * length and starting offset are both zero).
14518 	 */
14519 
14520 	if (bfp->l_start != 0 || bfp->l_len != 0) {
14521 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14522 		    "cannot lock a memory mapped file unless locking the "
14523 		    "entire file: start %"PRIx64", len %"PRIx64,
14524 		    bfp->l_start, bfp->l_len));
14525 		return (0);
14526 	}
14527 
14528 	/* mandatory locking and mapping don't mix */
14529 	va.va_mask = AT_MODE;
14530 	error = VOP_GETATTR(vp, &va, 0, cr, NULL);
14531 	if (error != 0) {
14532 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14533 		    "getattr error %d", error));
14534 		return (0);		/* treat errors conservatively */
14535 	}
14536 	if (MANDLOCK(vp, va.va_mode)) {
14537 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_safelock: "
14538 		    "cannot mandatory lock and mmap a file"));
14539 		return (0);
14540 	}
14541 
14542 	return (1);
14543 }
14544 
14545 
14546 /*
14547  * Register the lock locally within Solaris.
14548  * As the client, we "or" the sysid with LM_SYSID_CLIENT when
14549  * recording locks locally.
14550  *
14551  * This should handle conflicts/cooperation with NFS v2/v3 since all locks
14552  * are registered locally.
14553  */
14554 void
14555 nfs4_register_lock_locally(vnode_t *vp, struct flock64 *flk, int flag,
14556     u_offset_t offset)
14557 {
14558 	int oldsysid;
14559 	int error;
14560 #ifdef DEBUG
14561 	char *name;
14562 #endif
14563 
14564 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14565 
14566 #ifdef DEBUG
14567 	name = fn_name(VTOSV(vp)->sv_name);
14568 	NFS4_DEBUG(nfs4_client_lock_debug,
14569 	    (CE_NOTE, "nfs4_register_lock_locally: %s: type %d, "
14570 	    "start %"PRIx64", length %"PRIx64", pid %ld, sysid %d",
14571 	    name, flk->l_type, flk->l_start, flk->l_len, (long)flk->l_pid,
14572 	    flk->l_sysid));
14573 	kmem_free(name, MAXNAMELEN);
14574 #endif
14575 
14576 	/* register the lock with local locking */
14577 	oldsysid = flk->l_sysid;
14578 	flk->l_sysid |= LM_SYSID_CLIENT;
14579 	error = reclock(vp, flk, SETFLCK, flag, offset, NULL);
14580 #ifdef DEBUG
14581 	if (error != 0) {
14582 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14583 		    "nfs4_register_lock_locally: could not register with"
14584 		    " local locking"));
14585 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14586 		    "error %d, vp 0x%p, pid %d, sysid 0x%x",
14587 		    error, (void *)vp, flk->l_pid, flk->l_sysid));
14588 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14589 		    "type %d off 0x%" PRIx64 " len 0x%" PRIx64,
14590 		    flk->l_type, flk->l_start, flk->l_len));
14591 		(void) reclock(vp, flk, 0, flag, offset, NULL);
14592 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_CONT,
14593 		    "blocked by pid %d sysid 0x%x type %d "
14594 		    "off 0x%" PRIx64 " len 0x%" PRIx64,
14595 		    flk->l_pid, flk->l_sysid, flk->l_type, flk->l_start,
14596 		    flk->l_len));
14597 	}
14598 #endif
14599 	flk->l_sysid = oldsysid;
14600 }
14601 
14602 /*
14603  * nfs4_lockrelease:
14604  *
14605  * Release any locks on the given vnode that are held by the current
14606  * process.  Also removes the lock owner (if one exists) from the rnode's
14607  * list.
14608  */
14609 static int
14610 nfs4_lockrelease(vnode_t *vp, int flag, offset_t offset, cred_t *cr)
14611 {
14612 	flock64_t ld;
14613 	int ret, error;
14614 	rnode4_t *rp;
14615 	nfs4_lock_owner_t *lop;
14616 	nfs4_recov_state_t recov_state;
14617 	mntinfo4_t *mi;
14618 	bool_t possible_orphan = FALSE;
14619 	bool_t recovonly;
14620 
14621 	ASSERT((uintptr_t)vp > KERNELBASE);
14622 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14623 
14624 	rp = VTOR4(vp);
14625 	mi = VTOMI4(vp);
14626 
14627 	/*
14628 	 * If we have not locked anything then we can
14629 	 * just return since we have no work to do.
14630 	 */
14631 	if (rp->r_lo_head.lo_next_rnode == &rp->r_lo_head) {
14632 		return (0);
14633 	}
14634 
14635 	/*
14636 	 * We need to comprehend that another thread may
14637 	 * kick off recovery and the lock_owner we have stashed
14638 	 * in lop might be invalid so we should NOT cache it
14639 	 * locally!
14640 	 */
14641 	recov_state.rs_flags = 0;
14642 	recov_state.rs_num_retry_despite_err = 0;
14643 	error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
14644 	    &recovonly);
14645 	if (error) {
14646 		mutex_enter(&rp->r_statelock);
14647 		rp->r_flags |= R4LODANGLERS;
14648 		mutex_exit(&rp->r_statelock);
14649 		return (error);
14650 	}
14651 
14652 	lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
14653 
14654 	/*
14655 	 * Check if the lock owner might have a lock (request was sent but
14656 	 * no response was received).  Also check if there are any remote
14657 	 * locks on the file.  (In theory we shouldn't have to make this
14658 	 * second check if there's no lock owner, but for now we'll be
14659 	 * conservative and do it anyway.)  If either condition is true,
14660 	 * send an unlock for the entire file to the server.
14661 	 *
14662 	 * Note that no explicit synchronization is needed here.  At worst,
14663 	 * flk_has_remote_locks() will return a false positive, in which case
14664 	 * the unlock call wastes time but doesn't harm correctness.
14665 	 */
14666 
14667 	if (lop) {
14668 		mutex_enter(&lop->lo_lock);
14669 		possible_orphan = lop->lo_pending_rqsts;
14670 		mutex_exit(&lop->lo_lock);
14671 		lock_owner_rele(lop);
14672 	}
14673 
14674 	nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
14675 
14676 	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14677 	    "nfs4_lockrelease: possible orphan %d, remote locks %d, for "
14678 	    "lop %p.", possible_orphan, flk_has_remote_locks(vp),
14679 	    (void *)lop));
14680 
14681 	if (possible_orphan || flk_has_remote_locks(vp)) {
14682 		ld.l_type = F_UNLCK;    /* set to unlock entire file */
14683 		ld.l_whence = 0;	/* unlock from start of file */
14684 		ld.l_start = 0;
14685 		ld.l_len = 0;		/* do entire file */
14686 
14687 		ret = VOP_FRLOCK(vp, F_SETLK, &ld, flag, offset, NULL,
14688 		    cr, NULL);
14689 
14690 		if (ret != 0) {
14691 			/*
14692 			 * If VOP_FRLOCK fails, make sure we unregister
14693 			 * local locks before we continue.
14694 			 */
14695 			ld.l_pid = ttoproc(curthread)->p_pid;
14696 			nfs4_register_lock_locally(vp, &ld, flag, offset);
14697 			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
14698 			    "nfs4_lockrelease: lock release error on vp"
14699 			    " %p: error %d.\n", (void *)vp, ret));
14700 		}
14701 	}
14702 
14703 	recov_state.rs_flags = 0;
14704 	recov_state.rs_num_retry_despite_err = 0;
14705 	error = nfs4_start_fop(mi, vp, NULL, OH_LOCKU, &recov_state,
14706 	    &recovonly);
14707 	if (error) {
14708 		mutex_enter(&rp->r_statelock);
14709 		rp->r_flags |= R4LODANGLERS;
14710 		mutex_exit(&rp->r_statelock);
14711 		return (error);
14712 	}
14713 
14714 	/*
14715 	 * So, here we're going to need to retrieve the lock-owner
14716 	 * again (in case recovery has done a switch-a-roo) and
14717 	 * remove it because we can.
14718 	 */
14719 	lop = find_lock_owner(rp, curproc->p_pid, LOWN_ANY);
14720 
14721 	if (lop) {
14722 		nfs4_rnode_remove_lock_owner(rp, lop);
14723 		lock_owner_rele(lop);
14724 	}
14725 
14726 	nfs4_end_fop(mi, vp, NULL, OH_LOCKU, &recov_state, 0);
14727 	return (0);
14728 }
14729 
14730 /*
14731  * Wait for 'tick_delay' clock ticks.
14732  * Implement exponential backoff until hit the lease_time of this nfs4_server.
14733  * NOTE: lock_lease_time is in seconds.
14734  *
14735  * XXX For future improvements, should implement a waiting queue scheme.
14736  */
14737 static int
14738 nfs4_block_and_wait(clock_t *tick_delay, rnode4_t *rp)
14739 {
14740 	long milliseconds_delay;
14741 	time_t lock_lease_time;
14742 
14743 	/* wait tick_delay clock ticks or siginteruptus */
14744 	if (delay_sig(*tick_delay)) {
14745 		return (EINTR);
14746 	}
14747 	NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE, "nfs4_block_and_wait: "
14748 	    "reissue the lock request: blocked for %ld clock ticks: %ld "
14749 	    "milliseconds", *tick_delay, drv_hztousec(*tick_delay) / 1000));
14750 
14751 	/* get the lease time */
14752 	lock_lease_time = r2lease_time(rp);
14753 
14754 	/* drv_hztousec converts ticks to microseconds */
14755 	milliseconds_delay = drv_hztousec(*tick_delay) / 1000;
14756 	if (milliseconds_delay < lock_lease_time * 1000) {
14757 		*tick_delay = 2 * *tick_delay;
14758 		if (drv_hztousec(*tick_delay) > lock_lease_time * 1000 * 1000)
14759 			*tick_delay = drv_usectohz(lock_lease_time*1000*1000);
14760 	}
14761 	return (0);
14762 }
14763 
14764 
/*
 * Initialization hook for the NFSv4 vnode operations code.  The body is
 * empty: no state is set up here.
 * NOTE(review): presumably paired with nfs4_vnops_fini() and called from the
 * client's init path -- confirm against the callers.
 */
void
nfs4_vnops_init(void)
{
}
14769 
/*
 * Teardown hook for the NFSv4 vnode operations code.  The body is empty:
 * there is nothing to release here.
 * NOTE(review): presumably the counterpart of nfs4_vnops_init() -- confirm
 * against the callers.
 */
void
nfs4_vnops_fini(void)
{
}
14774 
14775 /*
14776  * Return a reference to the directory (parent) vnode for a given vnode,
14777  * using the saved pathname information and the directory file handle.  The
14778  * caller is responsible for disposing of the reference.
14779  * Returns zero or an errno value.
14780  *
14781  * Caller should set need_start_op to FALSE if it is the recovery
14782  * thread, or if a start_fop has already been done.  Otherwise, TRUE.
14783  */
14784 int
14785 vtodv(vnode_t *vp, vnode_t **dvpp, cred_t *cr, bool_t need_start_op)
14786 {
14787 	svnode_t *svnp;
14788 	vnode_t *dvp = NULL;
14789 	servinfo4_t *svp;
14790 	nfs4_fname_t *mfname;
14791 	int error;
14792 
14793 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14794 
14795 	if (vp->v_flag & VROOT) {
14796 		nfs4_sharedfh_t *sfh;
14797 		nfs_fh4 fh;
14798 		mntinfo4_t *mi;
14799 
14800 		ASSERT(vp->v_type == VREG);
14801 
14802 		mi = VTOMI4(vp);
14803 		svp = mi->mi_curr_serv;
14804 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
14805 		fh.nfs_fh4_len = svp->sv_pfhandle.fh_len;
14806 		fh.nfs_fh4_val = svp->sv_pfhandle.fh_buf;
14807 		sfh = sfh4_get(&fh, VTOMI4(vp));
14808 		nfs_rw_exit(&svp->sv_lock);
14809 		mfname = mi->mi_fname;
14810 		fn_hold(mfname);
14811 		dvp = makenfs4node_by_fh(sfh, NULL, &mfname, NULL, mi, cr, 0);
14812 		sfh4_rele(&sfh);
14813 
14814 		if (dvp->v_type == VNON)
14815 			dvp->v_type = VDIR;
14816 		*dvpp = dvp;
14817 		return (0);
14818 	}
14819 
14820 	svnp = VTOSV(vp);
14821 
14822 	if (svnp == NULL) {
14823 		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14824 		    "shadow node is NULL"));
14825 		return (EINVAL);
14826 	}
14827 
14828 	if (svnp->sv_name == NULL || svnp->sv_dfh == NULL) {
14829 		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14830 		    "shadow node name or dfh val == NULL"));
14831 		return (EINVAL);
14832 	}
14833 
14834 	error = nfs4_make_dotdot(svnp->sv_dfh, 0, vp, cr, &dvp,
14835 	    (int)need_start_op);
14836 	if (error != 0) {
14837 		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14838 		    "nfs4_make_dotdot returned %d", error));
14839 		return (error);
14840 	}
14841 	if (!dvp) {
14842 		NFS4_DEBUG(nfs4_client_shadow_debug, (CE_NOTE, "vtodv: "
14843 		    "nfs4_make_dotdot returned a NULL dvp"));
14844 		return (EIO);
14845 	}
14846 	if (dvp->v_type == VNON)
14847 		dvp->v_type = VDIR;
14848 	ASSERT(dvp->v_type == VDIR);
14849 	if (VTOR4(vp)->r_flags & R4ISXATTR) {
14850 		mutex_enter(&dvp->v_lock);
14851 		dvp->v_flag |= V_XATTRDIR;
14852 		mutex_exit(&dvp->v_lock);
14853 	}
14854 	*dvpp = dvp;
14855 	return (0);
14856 }
14857 
14858 /*
14859  * Copy the (final) component name of vp to fnamep.  maxlen is the maximum
14860  * length that fnamep can accept, including the trailing null.
14861  * Returns 0 if okay, returns an errno value if there was a problem.
14862  */
14863 
14864 int
14865 vtoname(vnode_t *vp, char *fnamep, ssize_t maxlen)
14866 {
14867 	char *fn;
14868 	int err = 0;
14869 	servinfo4_t *svp;
14870 	svnode_t *shvp;
14871 
14872 	/*
14873 	 * If the file being opened has VROOT set, then this is
14874 	 * a "file" mount.  sv_name will not be interesting, so
14875 	 * go back to the servinfo4 to get the original mount
14876 	 * path and strip off all but the final edge.  Otherwise
14877 	 * just return the name from the shadow vnode.
14878 	 */
14879 
14880 	if (vp->v_flag & VROOT) {
14881 
14882 		svp = VTOMI4(vp)->mi_curr_serv;
14883 		(void) nfs_rw_enter_sig(&svp->sv_lock, RW_READER, 0);
14884 
14885 		fn = strrchr(svp->sv_path, '/');
14886 		if (fn == NULL)
14887 			err = EINVAL;
14888 		else
14889 			fn++;
14890 	} else {
14891 		shvp = VTOSV(vp);
14892 		fn = fn_name(shvp->sv_name);
14893 	}
14894 
14895 	if (err == 0)
14896 		if (strlen(fn) < maxlen)
14897 			(void) strcpy(fnamep, fn);
14898 		else
14899 			err = ENAMETOOLONG;
14900 
14901 	if (vp->v_flag & VROOT)
14902 		nfs_rw_exit(&svp->sv_lock);
14903 	else
14904 		kmem_free(fn, MAXNAMELEN);
14905 
14906 	return (err);
14907 }
14908 
14909 /*
14910  * Bookkeeping for a close that doesn't need to go over the wire.
14911  * *have_lockp is set to 0 if 'os_sync_lock' is released; otherwise
14912  * it is left at 1.
14913  */
14914 void
14915 nfs4close_notw(vnode_t *vp, nfs4_open_stream_t *osp, int *have_lockp)
14916 {
14917 	rnode4_t		*rp;
14918 	mntinfo4_t		*mi;
14919 
14920 	mi = VTOMI4(vp);
14921 	rp = VTOR4(vp);
14922 
14923 	NFS4_DEBUG(nfs4close_notw_debug, (CE_NOTE, "nfs4close_notw: "
14924 	    "rp=%p osp=%p", (void *)rp, (void *)osp));
14925 	ASSERT(nfs_zone() == mi->mi_zone);
14926 	ASSERT(mutex_owned(&osp->os_sync_lock));
14927 	ASSERT(*have_lockp);
14928 
14929 	if (!osp->os_valid ||
14930 	    osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
14931 		return;
14932 	}
14933 
14934 	/*
14935 	 * This removes the reference obtained at OPEN; ie,
14936 	 * when the open stream structure was created.
14937 	 *
14938 	 * We don't have to worry about calling 'open_stream_rele'
14939 	 * since we our currently holding a reference to this
14940 	 * open stream which means the count can not go to 0 with
14941 	 * this decrement.
14942 	 */
14943 	ASSERT(osp->os_ref_count >= 2);
14944 	osp->os_ref_count--;
14945 	osp->os_valid = 0;
14946 	mutex_exit(&osp->os_sync_lock);
14947 	*have_lockp = 0;
14948 
14949 	nfs4_dec_state_ref_count(mi);
14950 }
14951 
14952 /*
14953  * Close all remaining open streams on the rnode.  These open streams
14954  * could be here because:
14955  * - The close attempted at either close or delmap failed
14956  * - Some kernel entity did VOP_OPEN but never did VOP_CLOSE
14957  * - Someone did mknod on a regular file but never opened it
14958  */
14959 int
14960 nfs4close_all(vnode_t *vp, cred_t *cr)
14961 {
14962 	nfs4_open_stream_t *osp;
14963 	int error;
14964 	nfs4_error_t e = { 0, NFS4_OK, RPC_SUCCESS };
14965 	rnode4_t *rp;
14966 
14967 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
14968 
14969 	error = 0;
14970 	rp = VTOR4(vp);
14971 
14972 	/*
14973 	 * At this point, all we know is that the last time
14974 	 * someone called vn_rele, the count was 1.  Since then,
14975 	 * the vnode could have been re-activated.  We want to
14976 	 * loop through the open streams and close each one, but
14977 	 * we have to be careful since once we release the rnode
14978 	 * hash bucket lock, someone else is free to come in and
14979 	 * re-activate the rnode and add new open streams.  The
14980 	 * strategy is take the rnode hash bucket lock, verify that
14981 	 * the count is still 1, grab the open stream off the
14982 	 * head of the list and mark it invalid, then release the
14983 	 * rnode hash bucket lock and proceed with that open stream.
14984 	 * This is ok because nfs4close_one() will acquire the proper
14985 	 * open/create to close/destroy synchronization for open
14986 	 * streams, and will ensure that if someone has reopened
14987 	 * the open stream after we've dropped the hash bucket lock
14988 	 * then we'll just simply return without destroying the
14989 	 * open stream.
14990 	 * Repeat until the list is empty.
14991 	 */
14992 
14993 	for (;;) {
14994 
14995 		/* make sure vnode hasn't been reactivated */
14996 		rw_enter(&rp->r_hashq->r_lock, RW_READER);
14997 		mutex_enter(&vp->v_lock);
14998 		if (vp->v_count > 1) {
14999 			mutex_exit(&vp->v_lock);
15000 			rw_exit(&rp->r_hashq->r_lock);
15001 			break;
15002 		}
15003 		/*
15004 		 * Grabbing r_os_lock before releasing v_lock prevents
15005 		 * a window where the rnode/open stream could get
15006 		 * reactivated (and os_force_close set to 0) before we
15007 		 * had a chance to set os_force_close to 1.
15008 		 */
15009 		mutex_enter(&rp->r_os_lock);
15010 		mutex_exit(&vp->v_lock);
15011 
15012 		osp = list_head(&rp->r_open_streams);
15013 		if (!osp) {
15014 			/* nothing left to CLOSE OTW, so return */
15015 			mutex_exit(&rp->r_os_lock);
15016 			rw_exit(&rp->r_hashq->r_lock);
15017 			break;
15018 		}
15019 
15020 		mutex_enter(&rp->r_statev4_lock);
15021 		/* the file can't still be mem mapped */
15022 		ASSERT(rp->r_mapcnt == 0);
15023 		if (rp->created_v4)
15024 			rp->created_v4 = 0;
15025 		mutex_exit(&rp->r_statev4_lock);
15026 
15027 		/*
15028 		 * Grab a ref on this open stream; nfs4close_one
15029 		 * will mark it as invalid
15030 		 */
15031 		mutex_enter(&osp->os_sync_lock);
15032 		osp->os_ref_count++;
15033 		osp->os_force_close = 1;
15034 		mutex_exit(&osp->os_sync_lock);
15035 		mutex_exit(&rp->r_os_lock);
15036 		rw_exit(&rp->r_hashq->r_lock);
15037 
15038 		nfs4close_one(vp, osp, cr, 0, NULL, &e, CLOSE_FORCE, 0, 0, 0);
15039 
15040 		/* Update error if it isn't already non-zero */
15041 		if (error == 0) {
15042 			if (e.error)
15043 				error = e.error;
15044 			else if (e.stat)
15045 				error = geterrno4(e.stat);
15046 		}
15047 
15048 #ifdef	DEBUG
15049 		nfs4close_all_cnt++;
15050 #endif
15051 		/* Release the ref on osp acquired above. */
15052 		open_stream_rele(osp, rp);
15053 
15054 		/* Proceed to the next open stream, if any */
15055 	}
15056 	return (error);
15057 }
15058 
/*
 * nfs4close_one - close one open stream for a file if needed.
 *
 * "close_type" indicates which close path this is:
 * CLOSE_NORM: close initiated via VOP_CLOSE.
 * CLOSE_DELMAP: close initiated via VOP_DELMAP.
 * CLOSE_FORCE: close initiated via VOP_INACTIVE.  This path forces
 *	the close and release of client state for this open stream
 *	(unless someone else has the open stream open).
 * CLOSE_RESEND: indicates the request is a replay of an earlier request
 *	(e.g., due to abort because of a signal).
 * CLOSE_AFTER_RESEND: close initiated to "undo" a successful resent OPEN.
 *
 * CLOSE_RESEND and CLOSE_AFTER_RESEND will not attempt to retry after client
 * recovery.  Instead, the caller is expected to deal with retries.
 *
 * The caller can either pass in the osp ('provided_osp') or not.  When it
 * is not passed in, the open owner is looked up from 'cr' and the open
 * stream from that open owner and the rnode; if either lookup fails,
 * ep->error is set to EIO.
 *
 * 'access_bits' represents the access we are closing/downgrading.
 *
 * 'lrp' is passed through to nfs4_open_downgrade() and selects the
 * TAG_OPEN_DG_LOST tag for a bad-seqid entry; it must be NULL whenever
 * the close ends up being queued for recovery (see the 'recovonly' case).
 *
 * 'len', 'maxprot', and 'mmap_flags' are used for CLOSE_DELMAP.  'len' is
 * the number of bytes we are unmapping, 'maxprot' is the mmap protection,
 * and 'mmap_flags' tells us the type of sharing (MAP_PRIVATE or MAP_SHARED).
 *
 * Errors are returned via the nfs4_error_t.
 */
void
nfs4close_one(vnode_t *vp, nfs4_open_stream_t *provided_osp, cred_t *cr,
    int access_bits, nfs4_lost_rqst_t *lrp, nfs4_error_t *ep,
    nfs4_close_type_t close_type, size_t len, uint_t maxprot,
    uint_t mmap_flags)
{
	nfs4_open_owner_t *oop;
	nfs4_open_stream_t *osp = NULL;
	int retry = 0;
	/* Retry budget; not reset at recov_retry, so shared by all passes. */
	int num_retries = NFS4_NUM_RECOV_RETRIES;
	rnode4_t *rp;
	mntinfo4_t *mi;
	nfs4_recov_state_t recov_state;
	cred_t *cred_otw = NULL;
	bool_t recovonly = FALSE;
	int isrecov;
	int force_close;
	int close_failed = 0;
	/*
	 * Set once the open/mmap counts have been adjusted.  Not reset at
	 * recov_retry, so the counts are adjusted at most once per call.
	 */
	int did_dec_count = 0;
	/* The flags below record what must be undone at 'out'. */
	int did_start_op = 0;
	int did_force_recovlock = 0;
	int did_start_seqid_sync = 0;
	int have_sync_lock = 0;

	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);

	NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE, "closing vp %p osp %p, "
	    "lrp %p, close type %d len %ld prot %x mmap flags %x bits %x",
	    (void *)vp, (void *)provided_osp, (void *)lrp, close_type,
	    len, maxprot, mmap_flags, access_bits));

	nfs4_error_zinit(ep);
	rp = VTOR4(vp);
	mi = VTOMI4(vp);
	/* The two resend flavors skip start_fop and never loop on retry. */
	isrecov = (close_type == CLOSE_RESEND ||
	    close_type == CLOSE_AFTER_RESEND);

	/*
	 * First get the open owner.
	 */
	if (!provided_osp) {
		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
	} else {
		oop = provided_osp->os_open_owner;
		ASSERT(oop != NULL);
		/* Matched by the open_owner_rele() at 'out'. */
		open_owner_hold(oop);
	}

	if (!oop) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_one: no oop, rp %p, mi %p, cr %p, osp %p, "
		    "close type %d", (void *)rp, (void *)mi, (void *)cr,
		    (void *)provided_osp, close_type));
		ep->error = EIO;
		goto out;
	}

	cred_otw = nfs4_get_otw_cred(cr, mi, oop);
recov_retry:
	/*
	 * Per-pass state.  Every path that jumps back here has already
	 * released the open stream, the seqid sync, the fop and the
	 * recovery lock it was holding.
	 */
	osp = NULL;
	close_failed = 0;
	force_close = (close_type == CLOSE_FORCE);
	retry = 0;
	did_start_op = 0;
	did_force_recovlock = 0;
	did_start_seqid_sync = 0;
	have_sync_lock = 0;
	recovonly = FALSE;
	recov_state.rs_flags = 0;
	recov_state.rs_num_retry_despite_err = 0;

	/*
	 * Second synchronize with recovery.
	 */
	if (!isrecov) {
		ep->error = nfs4_start_fop(mi, vp, NULL, OH_CLOSE,
		    &recov_state, &recovonly);
		if (!ep->error) {
			did_start_op = 1;
		} else {
			close_failed = 1;
			/*
			 * If we couldn't get start_fop, but have to
			 * cleanup state, then at least acquire the
			 * mi_recovlock so we can synchronize with
			 * recovery.
			 */
			if (close_type == CLOSE_FORCE) {
				(void) nfs_rw_enter_sig(&mi->mi_recovlock,
				    RW_READER, FALSE);
				did_force_recovlock = 1;
			} else
				goto out;
		}
	}

	/*
	 * We cannot attempt to get the open seqid sync if nfs4_start_fop
	 * set 'recovonly' to TRUE since most likely this is due to
	 * recovery being active (MI4_RECOV_ACTIV).  If recovery is active,
	 * nfs4_start_open_seqid_sync() will fail with EAGAIN asking us
	 * to retry, causing us to loop until recovery finishes.  Plus we
	 * don't need protection over the open seqid since we're not going
	 * OTW, hence don't need to use the seqid.
	 */
	if (recovonly == FALSE) {
		/* need to grab the open owner sync before 'os_sync_lock' */
		ep->error = nfs4_start_open_seqid_sync(oop, mi);
		if (ep->error == EAGAIN) {
			ASSERT(!isrecov);
			/* Back out of step two before starting over. */
			if (did_start_op)
				nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
				    &recov_state, TRUE);
			if (did_force_recovlock)
				nfs_rw_exit(&mi->mi_recovlock);
			goto recov_retry;
		}
		did_start_seqid_sync = 1;
	}

	/*
	 * Third get an open stream and acquire 'os_sync_lock' to
	 * synchronize the opening/creating of an open stream with the
	 * closing/destroying of an open stream.
	 */
	if (!provided_osp) {
		/* returns with 'os_sync_lock' held */
		osp = find_open_stream(oop, rp);
		if (!osp) {
			ep->error = EIO;
			goto out;
		}
	} else {
		osp = provided_osp;
		open_stream_hold(osp);
		mutex_enter(&osp->os_sync_lock);
	}
	have_sync_lock = 1;

	ASSERT(oop == osp->os_open_owner);

	/*
	 * Fourth, do any special pre-OTW CLOSE processing
	 * based on the specific close type.
	 */
	if ((close_type == CLOSE_NORM || close_type == CLOSE_AFTER_RESEND) &&
	    !did_dec_count) {
		ASSERT(osp->os_open_ref_count > 0);
		osp->os_open_ref_count--;
		did_dec_count = 1;
		if (osp->os_open_ref_count == 0)
			osp->os_final_close = 1;
	}

	if (close_type == CLOSE_FORCE) {
		/* see if somebody reopened the open stream. */
		if (!osp->os_force_close) {
			NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
			    "nfs4close_one: skip CLOSE_FORCE as osp %p "
			    "was reopened, vp %p", (void *)osp, (void *)vp));
			ep->error = 0;
			ep->stat = NFS4_OK;
			goto out;
		}

		if (!osp->os_final_close && !did_dec_count) {
			osp->os_open_ref_count--;
			did_dec_count = 1;
		}

		/*
		 * We can't depend on os_open_ref_count being 0 due to the
		 * way executables are opened (VN_RELE to match a VOP_OPEN).
		 */
#ifdef	NOTYET
		ASSERT(osp->os_open_ref_count == 0);
#endif
		if (osp->os_open_ref_count != 0) {
			NFS4_DEBUG(nfs4close_one_debug, (CE_NOTE,
			    "nfs4close_one: should panic here on an "
			    "ASSERT(osp->os_open_ref_count == 0). Ignoring "
			    "since this is probably the exec problem."));

			osp->os_open_ref_count = 0;
		}

		/*
		 * There is the possibility that nfs4close_one()
		 * for close_type == CLOSE_DELMAP couldn't find the
		 * open stream, thus couldn't decrement its os_mapcnt;
		 * therefore we can't use this ASSERT yet.
		 */
#ifdef	NOTYET
		ASSERT(osp->os_mapcnt == 0);
#endif
		osp->os_mapcnt = 0;
	}

	if (close_type == CLOSE_DELMAP && !did_dec_count) {
		ASSERT(osp->os_mapcnt >= btopr(len));

		/* All mmap counters are kept in pages (btopr(len)). */
		if ((mmap_flags & MAP_SHARED) && (maxprot & PROT_WRITE))
			osp->os_mmap_write -= btopr(len);
		if (maxprot & PROT_READ)
			osp->os_mmap_read -= btopr(len);
		if (maxprot & PROT_EXEC)
			osp->os_mmap_read -= btopr(len);
		/* mirror the PROT_NONE check in nfs4_addmap() */
		if (!(maxprot & PROT_READ) && !(maxprot & PROT_WRITE) &&
		    !(maxprot & PROT_EXEC))
			osp->os_mmap_read -= btopr(len);
		osp->os_mapcnt -= btopr(len);
		did_dec_count = 1;
	}

	/*
	 * nfs4_start_fop() said only recovery may proceed: don't go OTW.
	 * Fail the close with EINTR and hand it to recovery, passing the
	 * saved lost request only if it was recorded as an OP_CLOSE.
	 */
	if (recovonly) {
		nfs4_lost_rqst_t lost_rqst;

		/* request should not already be in recovery queue */
		ASSERT(lrp == NULL);
		nfs4_error_init(ep, EINTR);
		nfs4close_save_lost_rqst(ep->error, &lost_rqst, oop,
		    osp, cred_otw, vp);
		mutex_exit(&osp->os_sync_lock);
		have_sync_lock = 0;
		(void) nfs4_start_recovery(ep, mi, vp, NULL, NULL,
		    lost_rqst.lr_op == OP_CLOSE ?
		    &lost_rqst : NULL, OP_CLOSE, NULL, NULL, NULL);
		close_failed = 1;
		/* Clearing force_close makes close_cleanup leave the state. */
		force_close = 0;
		goto close_cleanup;
	}

	/*
	 * If a previous OTW call got NFS4ERR_BAD_SEQID, then
	 * we stopped operating on the open owner's <old oo_name, old seqid>
	 * space, which means we stopped operating on the open stream
	 * too.  So don't go OTW (as the seqid is likely bad, and the
	 * stateid could be stale, potentially triggering a false
	 * setclientid), and just clean up the client's internal state.
	 */
	if (osp->os_orig_oo_name != oop->oo_name) {
		NFS4_DEBUG(nfs4close_one_debug || nfs4_client_recov_debug,
		    (CE_NOTE, "nfs4close_one: skip OTW close for osp %p "
		    "oop %p due to bad seqid (orig oo_name %" PRIx64 " current "
		    "oo_name %" PRIx64")",
		    (void *)osp, (void *)oop, osp->os_orig_oo_name,
		    oop->oo_name));
		close_failed = 1;
	}

	/* If the file failed recovery, just quit. */
	mutex_enter(&rp->r_statelock);
	if (rp->r_flags & R4RECOVERR) {
		close_failed = 1;
	}
	mutex_exit(&rp->r_statelock);

	/*
	 * Skip the OTW close whenever close_failed has been set by now:
	 * the force close path failed to obtain start_fop, the open owner
	 * name changed (bad seqid), or the file failed recovery.
	 * close_cleanup removes the state only when force_close is set.
	 */
	if (close_failed)
		goto close_cleanup;

	/*
	 * Fifth, check to see if there are still mapped pages or other
	 * opens using this open stream.  If there are then we can't
	 * close yet but we can see if an OPEN_DOWNGRADE is necessary.
	 */
	if (osp->os_open_ref_count > 0 || osp->os_mapcnt > 0) {
		nfs4_lost_rqst_t	new_lost_rqst;
		bool_t			needrecov = FALSE;
		cred_t			*odg_cred_otw = NULL;
		seqid4			open_dg_seqid = 0;

		if (osp->os_delegation) {
			/*
			 * If this open stream was never OPENed OTW then we
			 * surely can't DOWNGRADE it (especially since the
			 * osp->open_stateid is really a delegation stateid
			 * when os_delegation is 1).
			 */
			if (access_bits & FREAD)
				osp->os_share_acc_read--;
			if (access_bits & FWRITE)
				osp->os_share_acc_write--;
			osp->os_share_deny_none--;
			nfs4_error_zinit(ep);
			goto out;
		}
		nfs4_open_downgrade(access_bits, 0, oop, osp, vp, cr,
		    lrp, ep, &odg_cred_otw, &open_dg_seqid);
		needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
		if (needrecov && !isrecov) {
			bool_t abort;
			nfs4_bseqid_entry_t *bsep = NULL;

			if (!ep->error && ep->stat == NFS4ERR_BAD_SEQID)
				bsep = nfs4_create_bseqid_entry(oop, NULL,
				    vp, 0,
				    lrp ? TAG_OPEN_DG_LOST : TAG_OPEN_DG,
				    open_dg_seqid);

			nfs4open_dg_save_lost_rqst(ep->error, &new_lost_rqst,
			    oop, osp, odg_cred_otw, vp, access_bits, 0);
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
			abort = nfs4_start_recovery(ep, mi, vp, NULL, NULL,
			    new_lost_rqst.lr_op == OP_OPEN_DOWNGRADE ?
			    &new_lost_rqst : NULL, OP_OPEN_DOWNGRADE,
			    bsep, NULL, NULL);
			if (odg_cred_otw)
				crfree(odg_cred_otw);
			if (bsep)
				kmem_free(bsep, sizeof (*bsep));

			if (abort == TRUE)
				goto out;

			/*
			 * Not aborting: release everything taken during
			 * this pass and start over from recov_retry.
			 */
			if (did_start_seqid_sync) {
				nfs4_end_open_seqid_sync(oop);
				did_start_seqid_sync = 0;
			}
			open_stream_rele(osp, rp);

			if (did_start_op)
				nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
				    &recov_state, FALSE);
			if (did_force_recovlock)
				nfs_rw_exit(&mi->mi_recovlock);

			goto recov_retry;
		} else {
			if (odg_cred_otw)
				crfree(odg_cred_otw);
		}
		goto out;
	}

	/*
	 * If this open stream was created as the results of an open
	 * while holding a delegation, then just release it; no need
	 * to do an OTW close.  Otherwise do a "normal" OTW close.
	 */
	if (osp->os_delegation) {
		nfs4close_notw(vp, osp, &have_sync_lock);
		nfs4_error_zinit(ep);
		goto out;
	}

	/*
	 * If this stream is not valid, we're done.
	 */
	if (!osp->os_valid) {
		nfs4_error_zinit(ep);
		goto out;
	}

	/*
	 * Last open or mmap ref has vanished, need to do an OTW close.
	 * First check to see if a close is still necessary.
	 */
	if (osp->os_failed_reopen) {
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "don't close OTW osp %p since reopen failed.",
		    (void *)osp));
		/*
		 * Reopen of the open stream failed, hence the
		 * stateid of the open stream is invalid/stale, and
		 * sending this OTW would incorrectly cause another
		 * round of recovery.  In this case, we need to set
		 * the 'os_valid' bit to 0 so another thread doesn't
		 * come in and re-open this open stream before
		 * this "closing" thread cleans up state (decrementing
		 * the nfs4_server_t's state_ref_count and decrementing
		 * the os_ref_count).
		 */
		osp->os_valid = 0;
		/*
		 * This removes the reference obtained at OPEN; ie,
		 * when the open stream structure was created.
		 *
		 * We don't have to worry about calling 'open_stream_rele'
		 * since we are currently holding a reference to this
		 * open stream which means the count can not go to 0 with
		 * this decrement.
		 */
		ASSERT(osp->os_ref_count >= 2);
		osp->os_ref_count--;
		nfs4_error_zinit(ep);
		/* Take the success branch of close_cleanup. */
		close_failed = 0;
		goto close_cleanup;
	}

	ASSERT(osp->os_ref_count > 1);

	/*
	 * Sixth, try the CLOSE OTW.
	 */
	nfs4close_otw(rp, cred_otw, oop, osp, &retry, &did_start_seqid_sync,
	    close_type, ep, &have_sync_lock);

	if (ep->error == EINTR || NFS4_FRC_UNMT_ERR(ep->error, vp->v_vfsp)) {
		/*
		 * Let the recovery thread be responsible for
		 * removing the state for CLOSE.
		 */
		close_failed = 1;
		force_close = 0;
		retry = 0;
	}

	/* See if we need to retry with a different cred */
	if ((ep->error == EACCES ||
	    (ep->error == 0 && ep->stat == NFS4ERR_ACCESS)) &&
	    cred_otw != cr) {
		/* Swap the OTW cred for the caller's cred, with its own hold. */
		crfree(cred_otw);
		cred_otw = cr;
		crhold(cred_otw);
		retry = 1;
	}

	if (ep->error || ep->stat)
		close_failed = 1;

	if (retry && !isrecov && num_retries-- > 0) {
		/* Release everything taken during this pass, then retry. */
		if (have_sync_lock) {
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
		}
		if (did_start_seqid_sync) {
			nfs4_end_open_seqid_sync(oop);
			did_start_seqid_sync = 0;
		}
		open_stream_rele(osp, rp);

		if (did_start_op)
			nfs4_end_fop(mi, vp, NULL, OH_CLOSE,
			    &recov_state, FALSE);
		if (did_force_recovlock)
			nfs_rw_exit(&mi->mi_recovlock);
		NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
		    "nfs4close_one: need to retry the close "
		    "operation"));
		goto recov_retry;
	}
close_cleanup:
	/*
	 * Seventh and lastly, process our results.
	 */
	if (close_failed && force_close) {
		/*
		 * It's ok to drop and regrab the 'os_sync_lock' since
		 * nfs4close_notw() will recheck to make sure the
		 * "close"/removal of state should happen.
		 */
		if (!have_sync_lock) {
			mutex_enter(&osp->os_sync_lock);
			have_sync_lock = 1;
		}
		/*
		 * This is last call, remove the ref on the open
		 * stream created by open and clean everything up.
		 */
		osp->os_pending_close = 0;
		nfs4close_notw(vp, osp, &have_sync_lock);
		nfs4_error_zinit(ep);
	}

	if (!close_failed) {
		/* os_pending_close is only written under os_sync_lock. */
		if (have_sync_lock) {
			osp->os_pending_close = 0;
			mutex_exit(&osp->os_sync_lock);
			have_sync_lock = 0;
		} else {
			mutex_enter(&osp->os_sync_lock);
			osp->os_pending_close = 0;
			mutex_exit(&osp->os_sync_lock);
		}
		/*
		 * Use the server recorded in recov_state when start_fop
		 * supplied one; otherwise let nfs4_dec_state_ref_count()
		 * work from 'mi'.
		 */
		if (did_start_op && recov_state.rs_sp != NULL) {
			mutex_enter(&recov_state.rs_sp->s_lock);
			nfs4_dec_state_ref_count_nolock(recov_state.rs_sp, mi);
			mutex_exit(&recov_state.rs_sp->s_lock);
		} else {
			nfs4_dec_state_ref_count(mi);
		}
		nfs4_error_zinit(ep);
	}

out:
	/*
	 * Common exit: undo whatever this pass still holds, as recorded
	 * by the have_/did_ flags and the non-NULL pointers.
	 */
	if (have_sync_lock)
		mutex_exit(&osp->os_sync_lock);
	if (did_start_op)
		nfs4_end_fop(mi, vp, NULL, OH_CLOSE, &recov_state,
		    recovonly ? TRUE : FALSE);
	if (did_force_recovlock)
		nfs_rw_exit(&mi->mi_recovlock);
	if (cred_otw)
		crfree(cred_otw);
	if (osp)
		open_stream_rele(osp, rp);
	if (oop) {
		if (did_start_seqid_sync)
			nfs4_end_open_seqid_sync(oop);
		open_owner_rele(oop);
	}
}
15593 
15594 /*
15595  * Convert information returned by the server in the LOCK4denied
15596  * structure to the form required by fcntl.
15597  */
15598 static void
15599 denied_to_flk(LOCK4denied *lockt_denied, flock64_t *flk, LOCKT4args *lockt_args)
15600 {
15601 	nfs4_lo_name_t *lo;
15602 
15603 #ifdef	DEBUG
15604 	if (denied_to_flk_debug) {
15605 		lockt_denied_debug = lockt_denied;
15606 		debug_enter("lockt_denied");
15607 	}
15608 #endif
15609 
15610 	flk->l_type = lockt_denied->locktype == READ_LT ? F_RDLCK : F_WRLCK;
15611 	flk->l_whence = 0;	/* aka SEEK_SET */
15612 	flk->l_start = lockt_denied->offset;
15613 	flk->l_len = lockt_denied->length;
15614 
15615 	/*
15616 	 * If the blocking clientid matches our client id, then we can
15617 	 * interpret the lockowner (since we built it).  If not, then
15618 	 * fabricate a sysid and pid.  Note that the l_sysid field
15619 	 * in *flk already has the local sysid.
15620 	 */
15621 
15622 	if (lockt_denied->owner.clientid == lockt_args->owner.clientid) {
15623 
15624 		if (lockt_denied->owner.owner_len == sizeof (*lo)) {
15625 			lo = (nfs4_lo_name_t *)
15626 			    lockt_denied->owner.owner_val;
15627 
15628 			flk->l_pid = lo->ln_pid;
15629 		} else {
15630 			NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15631 			    "denied_to_flk: bad lock owner length\n"));
15632 
15633 			flk->l_pid = lo_to_pid(&lockt_denied->owner);
15634 		}
15635 	} else {
15636 		NFS4_DEBUG(nfs4_client_lock_debug, (CE_NOTE,
15637 		"denied_to_flk: foreign clientid\n"));
15638 
15639 		/*
15640 		 * Construct a new sysid which should be different from
15641 		 * sysids of other systems.
15642 		 */
15643 
15644 		flk->l_sysid++;
15645 		flk->l_pid = lo_to_pid(&lockt_denied->owner);
15646 	}
15647 }
15648 
15649 static pid_t
15650 lo_to_pid(lock_owner4 *lop)
15651 {
15652 	pid_t pid = 0;
15653 	uchar_t *cp;
15654 	int i;
15655 
15656 	cp = (uchar_t *)&lop->clientid;
15657 
15658 	for (i = 0; i < sizeof (lop->clientid); i++)
15659 		pid += (pid_t)*cp++;
15660 
15661 	cp = (uchar_t *)lop->owner_val;
15662 
15663 	for (i = 0; i < lop->owner_len; i++)
15664 		pid += (pid_t)*cp++;
15665 
15666 	return (pid);
15667 }
15668 
15669 /*
15670  * Given a lock pointer, returns the length of that lock.
15671  * "end" is the last locked offset the "l_len" covers from
15672  * the start of the lock.
15673  */
15674 static off64_t
15675 lock_to_end(flock64_t *lock)
15676 {
15677 	off64_t lock_end;
15678 
15679 	if (lock->l_len == 0)
15680 		lock_end = (off64_t)MAXEND;
15681 	else
15682 		lock_end = lock->l_start + lock->l_len - 1;
15683 
15684 	return (lock_end);
15685 }
15686 
15687 /*
15688  * Given the end of a lock, it will return you the length "l_len" for that lock.
15689  */
15690 static off64_t
15691 end_to_len(off64_t start, off64_t end)
15692 {
15693 	off64_t lock_len;
15694 
15695 	ASSERT(end >= start);
15696 	if (end == MAXEND)
15697 		lock_len = 0;
15698 	else
15699 		lock_len = end - start + 1;
15700 
15701 	return (lock_len);
15702 }
15703 
15704 /*
15705  * On given end for a lock it determines if it is the last locked offset
15706  * or not, if so keeps it as is, else adds one to return the length for
15707  * valid start.
15708  */
15709 static off64_t
15710 start_check(off64_t x)
15711 {
15712 	if (x == MAXEND)
15713 		return (x);
15714 	else
15715 		return (x + 1);
15716 }
15717 
15718 /*
15719  * See if these two locks overlap, and if so return 1;
15720  * otherwise, return 0.
15721  */
15722 static int
15723 locks_intersect(flock64_t *llfp, flock64_t *curfp)
15724 {
15725 	off64_t llfp_end, curfp_end;
15726 
15727 	llfp_end = lock_to_end(llfp);
15728 	curfp_end = lock_to_end(curfp);
15729 
15730 	if (((llfp_end >= curfp->l_start) &&
15731 	    (llfp->l_start <= curfp->l_start)) ||
15732 	    ((curfp->l_start <= llfp->l_start) && (curfp_end >= llfp->l_start)))
15733 		return (1);
15734 	return (0);
15735 }
15736 
15737 /*
15738  * Determine what the intersecting lock region is, and add that to the
15739  * 'nl_llpp' locklist in increasing order (by l_start).
15740  */
15741 static void
15742 nfs4_add_lock_range(flock64_t *lost_flp, flock64_t *local_flp,
15743     locklist_t **nl_llpp, vnode_t *vp)
15744 {
15745 	locklist_t *intersect_llp, *tmp_fllp, *cur_fllp;
15746 	off64_t lost_flp_end, local_flp_end, len, start;
15747 
15748 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range:"));
15749 
15750 	if (!locks_intersect(lost_flp, local_flp))
15751 		return;
15752 
15753 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
15754 	    "locks intersect"));
15755 
15756 	lost_flp_end = lock_to_end(lost_flp);
15757 	local_flp_end = lock_to_end(local_flp);
15758 
15759 	/* Find the starting point of the intersecting region */
15760 	if (local_flp->l_start > lost_flp->l_start)
15761 		start = local_flp->l_start;
15762 	else
15763 		start = lost_flp->l_start;
15764 
15765 	/* Find the lenght of the intersecting region */
15766 	if (lost_flp_end < local_flp_end)
15767 		len = end_to_len(start, lost_flp_end);
15768 	else
15769 		len = end_to_len(start, local_flp_end);
15770 
15771 	/*
15772 	 * Prepare the flock structure for the intersection found and insert
15773 	 * it into the new list in increasing l_start order. This list contains
15774 	 * intersections of locks registered by the client with the local host
15775 	 * and the lost lock.
15776 	 * The lock type of this lock is the same as that of the local_flp.
15777 	 */
15778 	intersect_llp = (locklist_t *)kmem_alloc(sizeof (locklist_t), KM_SLEEP);
15779 	intersect_llp->ll_flock.l_start = start;
15780 	intersect_llp->ll_flock.l_len = len;
15781 	intersect_llp->ll_flock.l_type = local_flp->l_type;
15782 	intersect_llp->ll_flock.l_pid = local_flp->l_pid;
15783 	intersect_llp->ll_flock.l_sysid = local_flp->l_sysid;
15784 	intersect_llp->ll_flock.l_whence = 0;	/* aka SEEK_SET */
15785 	intersect_llp->ll_vp = vp;
15786 
15787 	tmp_fllp = *nl_llpp;
15788 	cur_fllp = NULL;
15789 	while (tmp_fllp != NULL && tmp_fllp->ll_flock.l_start <
15790 	    intersect_llp->ll_flock.l_start) {
15791 			cur_fllp = tmp_fllp;
15792 			tmp_fllp = tmp_fllp->ll_next;
15793 	}
15794 	if (cur_fllp == NULL) {
15795 		/* first on the list */
15796 		intersect_llp->ll_next = *nl_llpp;
15797 		*nl_llpp = intersect_llp;
15798 	} else {
15799 		intersect_llp->ll_next = cur_fllp->ll_next;
15800 		cur_fllp->ll_next = intersect_llp;
15801 	}
15802 
15803 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE, "nfs4_add_lock_range: "
15804 	    "created lock region: start %"PRIx64" end %"PRIx64" : %s\n",
15805 	    intersect_llp->ll_flock.l_start,
15806 	    intersect_llp->ll_flock.l_start + intersect_llp->ll_flock.l_len,
15807 	    intersect_llp->ll_flock.l_type == F_RDLCK ? "READ" : "WRITE"));
15808 }
15809 
15810 /*
15811  * Our local locking current state is potentially different than
15812  * what the NFSv4 server thinks we have due to a lost lock that was
15813  * resent and then received.  We need to reset our "NFSv4" locking
15814  * state to match the current local locking state for this pid since
15815  * that is what the user/application sees as what the world is.
15816  *
15817  * We cannot afford to drop the open/lock seqid sync since then we can
15818  * get confused about what the current local locking state "is" versus
15819  * "was".
15820  *
15821  * If we are unable to fix up the locks, we send SIGLOST to the affected
15822  * process.  This is not done if the filesystem has been forcibly
15823  * unmounted, in case the process has already exited and a new process
15824  * exists with the same pid.
15825  */
15826 static void
15827 nfs4_reinstitute_local_lock_state(vnode_t *vp, flock64_t *lost_flp, cred_t *cr,
15828     nfs4_lock_owner_t *lop)
15829 {
15830 	locklist_t *locks, *llp, *ri_llp, *tmp_llp;
15831 	mntinfo4_t *mi = VTOMI4(vp);
15832 	const int cmd = F_SETLK;
15833 	off64_t cur_start, llp_ll_flock_end, lost_flp_end;
15834 	flock64_t ul_fl;
15835 
15836 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15837 	    "nfs4_reinstitute_local_lock_state"));
15838 
15839 	/*
15840 	 * Find active locks for this vp from the local locking code.
15841 	 * Scan through this list and find out the locks that intersect with
15842 	 * the lost lock. Once we find the lock that intersects, add the
15843 	 * intersection area as a new lock to a new list "ri_llp". The lock
15844 	 * type of the intersection region lock added to ri_llp is the same
15845 	 * as that found in the active lock list, "list". The intersecting
15846 	 * region locks are added to ri_llp in increasing l_start order.
15847 	 */
15848 	ASSERT(nfs_zone() == mi->mi_zone);
15849 
15850 	locks = flk_active_locks_for_vp(vp);
15851 	ri_llp = NULL;
15852 
15853 	for (llp = locks; llp != NULL; llp = llp->ll_next) {
15854 		ASSERT(llp->ll_vp == vp);
15855 		/*
15856 		 * Pick locks that belong to this pid/lockowner
15857 		 */
15858 		if (llp->ll_flock.l_pid != lost_flp->l_pid)
15859 			continue;
15860 
15861 		nfs4_add_lock_range(lost_flp, &llp->ll_flock, &ri_llp, vp);
15862 	}
15863 
15864 	/*
15865 	 * Now we have the list of intersections with the lost lock. These are
15866 	 * the locks that were/are active before the server replied to the
15867 	 * last/lost lock. Issue these locks to the server here. Playing these
15868 	 * locks to the server will re-establish aur current local locking state
15869 	 * with the v4 server.
15870 	 * If we get an error, send SIGLOST to the application for that lock.
15871 	 */
15872 
15873 	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
15874 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15875 		    "nfs4_reinstitute_local_lock_state: need to issue "
15876 		    "flock: [%"PRIx64" - %"PRIx64"] : %s",
15877 		    llp->ll_flock.l_start,
15878 		    llp->ll_flock.l_start + llp->ll_flock.l_len,
15879 		    llp->ll_flock.l_type == F_RDLCK ? "READ" :
15880 		    llp->ll_flock.l_type == F_WRLCK ? "WRITE" : "INVALID"));
15881 		/*
15882 		 * No need to relock what we already have
15883 		 */
15884 		if (llp->ll_flock.l_type == lost_flp->l_type)
15885 			continue;
15886 
15887 		push_reinstate(vp, cmd, &llp->ll_flock, cr, lop);
15888 	}
15889 
15890 	/*
15891 	 * Now keeping the start of the lost lock as our reference parse the
15892 	 * newly created ri_llp locklist to find the ranges that we have locked
15893 	 * with the v4 server but not in the current local locking. We need
15894 	 * to unlock these ranges.
15895 	 * These ranges can also be reffered to as those ranges, where the lost
15896 	 * lock does not overlap with the locks in the ri_llp but are locked
15897 	 * since the server replied to the lost lock.
15898 	 */
15899 	cur_start = lost_flp->l_start;
15900 	lost_flp_end = lock_to_end(lost_flp);
15901 
15902 	ul_fl.l_type = F_UNLCK;
15903 	ul_fl.l_whence = 0;	/* aka SEEK_SET */
15904 	ul_fl.l_sysid = lost_flp->l_sysid;
15905 	ul_fl.l_pid = lost_flp->l_pid;
15906 
15907 	for (llp = ri_llp; llp != NULL; llp = llp->ll_next) {
15908 		llp_ll_flock_end = lock_to_end(&llp->ll_flock);
15909 
15910 		if (llp->ll_flock.l_start <= cur_start) {
15911 			cur_start = start_check(llp_ll_flock_end);
15912 			continue;
15913 		}
15914 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15915 		    "nfs4_reinstitute_local_lock_state: "
15916 		    "UNLOCK [%"PRIx64" - %"PRIx64"]",
15917 		    cur_start, llp->ll_flock.l_start));
15918 
15919 		ul_fl.l_start = cur_start;
15920 		ul_fl.l_len = end_to_len(cur_start,
15921 		    (llp->ll_flock.l_start - 1));
15922 
15923 		push_reinstate(vp, cmd, &ul_fl, cr, lop);
15924 		cur_start = start_check(llp_ll_flock_end);
15925 	}
15926 
15927 	/*
15928 	 * In the case where the lost lock ends after all intersecting locks,
15929 	 * unlock the last part of the lost lock range.
15930 	 */
15931 	if (cur_start != start_check(lost_flp_end)) {
15932 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
15933 		    "nfs4_reinstitute_local_lock_state: UNLOCK end of the "
15934 		    "lost lock region [%"PRIx64" - %"PRIx64"]",
15935 		    cur_start, lost_flp->l_start + lost_flp->l_len));
15936 
15937 		ul_fl.l_start = cur_start;
15938 		/*
15939 		 * Is it a to-EOF lock? If so, unlock till the end
15940 		 */
15941 		if (lost_flp->l_len == 0)
15942 			ul_fl.l_len = 0;
15943 		else
15944 			ul_fl.l_len = start_check(lost_flp_end) - cur_start;
15945 
15946 		push_reinstate(vp, cmd, &ul_fl, cr, lop);
15947 	}
15948 
15949 	if (locks != NULL)
15950 		flk_free_locklist(locks);
15951 
15952 	/* Free up our newly created locklist */
15953 	for (llp = ri_llp; llp != NULL; ) {
15954 		tmp_llp = llp->ll_next;
15955 		kmem_free(llp, sizeof (locklist_t));
15956 		llp = tmp_llp;
15957 	}
15958 
15959 	/*
15960 	 * Now return back to the original calling nfs4frlock()
15961 	 * and let us naturally drop our seqid syncs.
15962 	 */
15963 }
15964 
15965 /*
15966  * Create a lost state record for the given lock reinstantiation request
15967  * and push it onto the lost state queue.
15968  */
15969 static void
15970 push_reinstate(vnode_t *vp, int cmd, flock64_t *flk, cred_t *cr,
15971     nfs4_lock_owner_t *lop)
15972 {
15973 	nfs4_lost_rqst_t req;
15974 	nfs_lock_type4 locktype;
15975 	nfs4_error_t e = { EINTR, NFS4_OK, RPC_SUCCESS };
15976 
15977 	ASSERT(nfs_zone() == VTOMI4(vp)->mi_zone);
15978 
15979 	locktype = flk_to_locktype(cmd, flk->l_type);
15980 	nfs4frlock_save_lost_rqst(NFS4_LCK_CTYPE_REINSTATE, EINTR, locktype,
15981 	    NULL, NULL, lop, flk, &req, cr, vp);
15982 	(void) nfs4_start_recovery(&e, VTOMI4(vp), vp, NULL, NULL,
15983 	    (req.lr_op == OP_LOCK || req.lr_op == OP_LOCKU) ?
15984 	    &req : NULL, flk->l_type == F_UNLCK ? OP_LOCKU : OP_LOCK,
15985 	    NULL, NULL, NULL);
15986 }
15987