xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_client_state.c (revision 22ff04516c85a5caac614d46031edbc085ba3a9e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
27 /* All Rights Reserved */
28 
29 
30 #include <nfs/nfs4_clnt.h>
31 #include <nfs/rnode4.h>
32 #include <sys/systm.h>
33 #include <sys/cmn_err.h>
34 #include <sys/atomic.h>
35 
36 static void	nfs4_free_open_owner(nfs4_open_owner_t *, mntinfo4_t *);
37 static nfs4_open_owner_t *find_freed_open_owner(cred_t *,
38 				nfs4_oo_hash_bucket_t *, mntinfo4_t *);
39 static open_delegation_type4 get_dtype(rnode4_t *);
40 
#ifdef DEBUG
/*
 * DEBUG-only tunables.  Both start out cleared; their consumers are not
 * part of this section of the file.
 */
int nfs4_client_foo_debug = 0;
int nfs4_client_open_dg = 0;

/*
 * Fault-injection switch.  While this is non-zero, the lockowner and
 * openowner seqid sync primitives intermittently return errors.
 */
static int seqid_sync_faults = 0;
#endif
50 
51 stateid4 clnt_special0 = {
52 	0,
53 	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
54 };
55 
56 stateid4 clnt_special1 = {
57 	0xffffffff,
58 	{
59 		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
60 		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
61 		(char)0xff, (char)0xff, (char)0xff, (char)0xff
62 	}
63 };
64 
65 /* finds hash bucket and locks it */
66 static nfs4_oo_hash_bucket_t *
67 lock_bucket(cred_t *cr, mntinfo4_t *mi)
68 {
69 	nfs4_oo_hash_bucket_t *bucketp;
70 	uint32_t hash_key;
71 
72 	hash_key = (uint32_t)(crgetuid(cr) + crgetruid(cr))
73 	    % NFS4_NUM_OO_BUCKETS;
74 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE, "lock_bucket: "
75 	    "hash_key %d for cred %p", hash_key, (void*)cr));
76 
77 	ASSERT(hash_key >= 0 && hash_key < NFS4_NUM_OO_BUCKETS);
78 	ASSERT(mi != NULL);
79 	ASSERT(mutex_owned(&mi->mi_lock));
80 
81 	bucketp = &(mi->mi_oo_list[hash_key]);
82 	mutex_enter(&bucketp->b_lock);
83 	return (bucketp);
84 }
85 
86 /* unlocks hash bucket pointed by bucket_ptr */
87 static void
88 unlock_bucket(nfs4_oo_hash_bucket_t *bucketp)
89 {
90 	mutex_exit(&bucketp->b_lock);
91 }
92 
93 /*
94  * Removes the lock owner from the rnode's lock_owners list and frees the
95  * corresponding reference.
96  */
97 void
98 nfs4_rnode_remove_lock_owner(rnode4_t *rp, nfs4_lock_owner_t *lop)
99 {
100 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
101 	    "nfs4_rnode_remove_lock_owner"));
102 
103 	mutex_enter(&rp->r_statev4_lock);
104 
105 	if (lop->lo_next_rnode == NULL) {
106 		/* already removed from list */
107 		mutex_exit(&rp->r_statev4_lock);
108 		return;
109 	}
110 
111 	ASSERT(lop->lo_prev_rnode != NULL);
112 
113 	lop->lo_prev_rnode->lo_next_rnode = lop->lo_next_rnode;
114 	lop->lo_next_rnode->lo_prev_rnode = lop->lo_prev_rnode;
115 
116 	lop->lo_next_rnode = lop->lo_prev_rnode = NULL;
117 
118 	mutex_exit(&rp->r_statev4_lock);
119 
120 	/*
121 	 * This would be an appropriate place for
122 	 * RELEASE_LOCKOWNER.  For now, this is overkill
123 	 * because in the common case, close is going to
124 	 * release any lockowners anyway.
125 	 */
126 	lock_owner_rele(lop);
127 }
128 
129 /*
130  * Remove all lock owners from the rnode's lock_owners list.  Frees up
131  * their references from the list.
132  */
133 
134 void
135 nfs4_flush_lock_owners(rnode4_t *rp)
136 {
137 	nfs4_lock_owner_t *lop;
138 
139 	mutex_enter(&rp->r_statev4_lock);
140 	while (rp->r_lo_head.lo_next_rnode != &rp->r_lo_head) {
141 		lop = rp->r_lo_head.lo_next_rnode;
142 		lop->lo_prev_rnode->lo_next_rnode = lop->lo_next_rnode;
143 		lop->lo_next_rnode->lo_prev_rnode = lop->lo_prev_rnode;
144 		lop->lo_next_rnode = lop->lo_prev_rnode = NULL;
145 		lock_owner_rele(lop);
146 	}
147 	mutex_exit(&rp->r_statev4_lock);
148 }
149 
150 void
151 nfs4_clear_open_streams(rnode4_t *rp)
152 {
153 	nfs4_open_stream_t *osp;
154 
155 	mutex_enter(&rp->r_os_lock);
156 	while ((osp = list_head(&rp->r_open_streams)) != NULL) {
157 		open_owner_rele(osp->os_open_owner);
158 		list_remove(&rp->r_open_streams, osp);
159 		mutex_destroy(&osp->os_sync_lock);
160 		osp->os_open_owner = NULL;
161 		kmem_free(osp, sizeof (*osp));
162 	}
163 	mutex_exit(&rp->r_os_lock);
164 }
165 
166 void
167 open_owner_hold(nfs4_open_owner_t *oop)
168 {
169 	mutex_enter(&oop->oo_lock);
170 	oop->oo_ref_count++;
171 	mutex_exit(&oop->oo_lock);
172 }
173 
174 /*
175  * Frees the open owner if the ref count hits zero.
176  */
177 void
178 open_owner_rele(nfs4_open_owner_t *oop)
179 {
180 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
181 	    "open_owner_rele"));
182 
183 	mutex_enter(&oop->oo_lock);
184 	oop->oo_ref_count--;
185 	if (oop->oo_ref_count == 0) {
186 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
187 		    "open_owner_rele: freeing open owner"));
188 		oop->oo_valid = 0;
189 		mutex_exit(&oop->oo_lock);
190 		/*
191 		 * Ok, we don't destroy the open owner, nor do we put it on
192 		 * the mntinfo4's free list just yet.  We are lazy about it
193 		 * and let callers to find_open_owner() do that to keep locking
194 		 * simple.
195 		 */
196 	} else {
197 		mutex_exit(&oop->oo_lock);
198 	}
199 }
200 
201 void
202 open_stream_hold(nfs4_open_stream_t *osp)
203 {
204 	mutex_enter(&osp->os_sync_lock);
205 	osp->os_ref_count++;
206 	mutex_exit(&osp->os_sync_lock);
207 }
208 
209 /*
210  * Frees the open stream and removes it from the rnode4's open streams list if
211  * the ref count drops to zero.
212  */
213 void
214 open_stream_rele(nfs4_open_stream_t *osp, rnode4_t *rp)
215 {
216 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
217 	    "open_stream_rele"));
218 
219 	ASSERT(!mutex_owned(&rp->r_os_lock));
220 
221 	mutex_enter(&osp->os_sync_lock);
222 	ASSERT(osp->os_ref_count > 0);
223 	osp->os_ref_count--;
224 	if (osp->os_ref_count == 0) {
225 		nfs4_open_owner_t *tmp_oop;
226 
227 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
228 		    "open_stream_rele: freeing open stream"));
229 		osp->os_valid = 0;
230 		tmp_oop = osp->os_open_owner;
231 		mutex_exit(&osp->os_sync_lock);
232 
233 		/* now see if we need to destroy the open owner */
234 		open_owner_rele(tmp_oop);
235 
236 		mutex_enter(&rp->r_os_lock);
237 		list_remove(&rp->r_open_streams, osp);
238 		mutex_exit(&rp->r_os_lock);
239 
240 		/* free up osp */
241 		mutex_destroy(&osp->os_sync_lock);
242 		osp->os_open_owner = NULL;
243 		kmem_free(osp, sizeof (*osp));
244 	} else {
245 		mutex_exit(&osp->os_sync_lock);
246 	}
247 }
248 
249 void
250 lock_owner_hold(nfs4_lock_owner_t *lop)
251 {
252 	mutex_enter(&lop->lo_lock);
253 	lop->lo_ref_count++;
254 	mutex_exit(&lop->lo_lock);
255 }
256 
257 /*
258  * Frees the lock owner if the ref count hits zero and
259  * the structure no longer has no locks.
260  */
261 void
262 lock_owner_rele(nfs4_lock_owner_t *lop)
263 {
264 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
265 	    "lock_owner_rele"));
266 
267 	mutex_enter(&lop->lo_lock);
268 	lop->lo_ref_count--;
269 	if (lop->lo_ref_count == 0) {
270 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
271 		    "lock_owner_rele: freeing lock owner: "
272 		    "%x", lop->lo_pid));
273 		lop->lo_valid = 0;
274 		/*
275 		 * If there are no references, the lock_owner should
276 		 * already be off the rnode's list.
277 		 */
278 		ASSERT(lop->lo_next_rnode == NULL);
279 		ASSERT(lop->lo_prev_rnode == NULL);
280 		ASSERT(!(lop->lo_flags & NFS4_LOCK_SEQID_INUSE));
281 		ASSERT(lop->lo_seqid_holder == NULL);
282 		mutex_exit(&lop->lo_lock);
283 
284 		/* free up lop */
285 		cv_destroy(&lop->lo_cv_seqid_sync);
286 		mutex_destroy(&lop->lo_lock);
287 		kmem_free(lop, sizeof (*lop));
288 	} else {
289 		mutex_exit(&lop->lo_lock);
290 	}
291 }
292 
293 /*
294  * This increments the open owner ref count if found.
295  * The argument 'just_created' determines whether we are looking for open
296  * owners with the 'oo_just_created' flag set or not.
297  */
298 nfs4_open_owner_t *
299 find_open_owner_nolock(cred_t *cr, int just_created, mntinfo4_t *mi)
300 {
301 	nfs4_open_owner_t	*oop = NULL, *next_oop;
302 	nfs4_oo_hash_bucket_t	*bucketp;
303 
304 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
305 	    "find_open_owner: cred %p, just_created %d",
306 	    (void*)cr, just_created));
307 
308 	ASSERT(mi != NULL);
309 	ASSERT(mutex_owned(&mi->mi_lock));
310 
311 	bucketp = lock_bucket(cr, mi);
312 
313 	/* got hash bucket, search through open owners */
314 	for (oop = list_head(&bucketp->b_oo_hash_list); oop != NULL; ) {
315 		mutex_enter(&oop->oo_lock);
316 		if (!crcmp(oop->oo_cred, cr) &&
317 		    (oop->oo_just_created == just_created ||
318 		    just_created == NFS4_JUST_CREATED)) {
319 			/* match */
320 			if (oop->oo_valid == 0) {
321 				/* reactivate the open owner */
322 				oop->oo_valid = 1;
323 				ASSERT(oop->oo_ref_count == 0);
324 			}
325 			oop->oo_ref_count++;
326 			mutex_exit(&oop->oo_lock);
327 			unlock_bucket(bucketp);
328 			return (oop);
329 		}
330 		next_oop = list_next(&bucketp->b_oo_hash_list, oop);
331 		if (oop->oo_valid == 0) {
332 			list_remove(&bucketp->b_oo_hash_list, oop);
333 
334 			/*
335 			 * Now we go ahead and put this open owner
336 			 * on the freed list.  This is our lazy method.
337 			 */
338 			nfs4_free_open_owner(oop, mi);
339 		}
340 
341 		mutex_exit(&oop->oo_lock);
342 		oop = next_oop;
343 	}
344 
345 	/* search through recently freed open owners */
346 	oop = find_freed_open_owner(cr, bucketp, mi);
347 
348 	unlock_bucket(bucketp);
349 
350 	return (oop);
351 }
352 
353 nfs4_open_owner_t *
354 find_open_owner(cred_t *cr, int just_created, mntinfo4_t *mi)
355 {
356 	nfs4_open_owner_t *oop;
357 
358 	mutex_enter(&mi->mi_lock);
359 	oop = find_open_owner_nolock(cr, just_created, mi);
360 	mutex_exit(&mi->mi_lock);
361 
362 	return (oop);
363 }
364 
365 /*
366  * This increments osp's ref count if found.
367  * Returns with 'os_sync_lock' held.
368  */
369 nfs4_open_stream_t *
370 find_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp)
371 {
372 	nfs4_open_stream_t	*osp;
373 
374 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
375 	    "find_open_stream"));
376 
377 	mutex_enter(&rp->r_os_lock);
378 	/* Now, no one can add or delete to rp's open streams list */
379 	for (osp = list_head(&rp->r_open_streams); osp != NULL;
380 	    osp = list_next(&rp->r_open_streams, osp)) {
381 		mutex_enter(&osp->os_sync_lock);
382 		if (osp->os_open_owner == oop && osp->os_valid != 0) {
383 			/* match */
384 			NFS4_DEBUG(nfs4_client_state_debug,
385 			    (CE_NOTE, "find_open_stream "
386 			    "got a match"));
387 
388 			osp->os_ref_count++;
389 			mutex_exit(&rp->r_os_lock);
390 			return (osp);
391 		}
392 		mutex_exit(&osp->os_sync_lock);
393 	}
394 
395 	mutex_exit(&rp->r_os_lock);
396 	return (NULL);
397 }
398 
399 /*
400  * Find the lock owner for the given file and process ID.  If "which" is
401  * LOWN_VALID_STATEID, require that the lock owner contain a valid stateid
402  * from the server.
403  *
404  * This increments the lock owner's ref count if found.  Returns NULL if
405  * there was no match.
406  */
407 nfs4_lock_owner_t *
408 find_lock_owner(rnode4_t *rp, pid_t pid, lown_which_t which)
409 {
410 	nfs4_lock_owner_t	*lop, *next_lop;
411 
412 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
413 	    "find_lock_owner: pid %x, which %d", pid, which));
414 
415 	ASSERT(which == LOWN_ANY || which == LOWN_VALID_STATEID);
416 
417 	/* search by pid */
418 	mutex_enter(&rp->r_statev4_lock);
419 
420 	lop = rp->r_lo_head.lo_next_rnode;
421 	while (lop != &rp->r_lo_head) {
422 		mutex_enter(&lop->lo_lock);
423 		if (lop->lo_pid == pid && lop->lo_valid != 0 &&
424 		    !(lop->lo_flags & NFS4_BAD_SEQID_LOCK)) {
425 			if (which == LOWN_ANY ||
426 			    lop->lo_just_created != NFS4_JUST_CREATED) {
427 				/* Found a matching lock owner */
428 				NFS4_DEBUG(nfs4_client_state_debug,
429 				    (CE_NOTE, "find_lock_owner: "
430 				    "got a match"));
431 
432 				lop->lo_ref_count++;
433 				mutex_exit(&lop->lo_lock);
434 				mutex_exit(&rp->r_statev4_lock);
435 				return (lop);
436 			}
437 		}
438 		next_lop = lop->lo_next_rnode;
439 		mutex_exit(&lop->lo_lock);
440 		lop = next_lop;
441 	}
442 
443 	mutex_exit(&rp->r_statev4_lock);
444 	return (NULL);
445 }
446 
447 /*
448  * This returns the delegation stateid as 'sid'. Returns 1 if a successful
449  * delegation stateid was found, otherwise returns 0.
450  */
451 
452 static int
453 nfs4_get_deleg_stateid(rnode4_t *rp, nfs_opnum4 op, stateid4 *sid)
454 {
455 	ASSERT(!mutex_owned(&rp->r_statev4_lock));
456 
457 	mutex_enter(&rp->r_statev4_lock);
458 	if (((rp->r_deleg_type == OPEN_DELEGATE_WRITE && op == OP_WRITE) ||
459 	    (rp->r_deleg_type != OPEN_DELEGATE_NONE && op != OP_WRITE)) &&
460 	    !rp->r_deleg_return_pending) {
461 
462 		*sid = rp->r_deleg_stateid;
463 		mutex_exit(&rp->r_statev4_lock);
464 		return (1);
465 	}
466 	mutex_exit(&rp->r_statev4_lock);
467 	return (0);
468 }
469 
470 /*
471  * This returns the lock stateid as 'sid'. Returns 1 if a successful lock
472  * stateid was found, otherwise returns 0.
473  */
474 static int
475 nfs4_get_lock_stateid(rnode4_t *rp, pid_t pid, stateid4 *sid)
476 {
477 	nfs4_lock_owner_t *lop;
478 
479 	lop = find_lock_owner(rp, pid, LOWN_VALID_STATEID);
480 
481 	if (lop) {
482 		/*
483 		 * Found a matching lock owner, so use a lock
484 		 * stateid rather than an open stateid.
485 		 */
486 		mutex_enter(&lop->lo_lock);
487 		*sid = lop->lock_stateid;
488 		mutex_exit(&lop->lo_lock);
489 		lock_owner_rele(lop);
490 		return (1);
491 	}
492 
493 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
494 	    "nfs4_get_lock_stateid: no lop"));
495 	return (0);
496 }
497 
498 /*
499  * This returns the open stateid as 'sid'. Returns 1 if a successful open
500  * stateid was found, otherwise returns 0.
501  *
502  * Once the stateid is returned to the caller, it is no longer protected;
503  * so the caller must be prepared to handle OLD/BAD_STATEID where
504  * appropiate.
505  */
506 static int
507 nfs4_get_open_stateid(rnode4_t *rp, cred_t *cr, mntinfo4_t *mi, stateid4 *sid)
508 {
509 	nfs4_open_owner_t *oop;
510 	nfs4_open_stream_t *osp;
511 
512 	ASSERT(mi != NULL);
513 
514 	oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
515 	if (!oop) {
516 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
517 		    "nfs4_get_open_stateid: no oop"));
518 		return (0);
519 	}
520 
521 	osp = find_open_stream(oop, rp);
522 	open_owner_rele(oop);
523 	if (!osp) {
524 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
525 		    "nfs4_get_open_stateid: no osp"));
526 		return (0);
527 	}
528 
529 	if (osp->os_failed_reopen) {
530 		NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
531 		    "nfs4_get_open_stateid: osp %p failed reopen",
532 		    (void *)osp));
533 		mutex_exit(&osp->os_sync_lock);
534 		open_stream_rele(osp, rp);
535 		return (0);
536 	}
537 	*sid = osp->open_stateid;
538 	mutex_exit(&osp->os_sync_lock);
539 	open_stream_rele(osp, rp);
540 	return (1);
541 }
542 
543 /*
544  * Returns the delegation stateid if this 'op' is OP_WRITE and the
545  * delegation we hold is a write delegation, OR this 'op' is not
546  * OP_WRITE and we have a delegation held (read or write), otherwise
547  * returns the lock stateid if there is a lock owner, otherwise
548  * returns the open stateid if there is a open stream, otherwise
549  * returns special stateid <seqid = 0, other = 0>.
550  *
551  * Used for WRITE operations.
552  */
553 stateid4
554 nfs4_get_w_stateid(cred_t *cr, rnode4_t *rp, pid_t pid, mntinfo4_t *mi,
555 	nfs_opnum4 op, nfs4_stateid_types_t *sid_tp)
556 {
557 	stateid4 sid;
558 
559 	if (nfs4_get_deleg_stateid(rp, op, &sid)) {
560 		if (!stateid4_cmp(&sid, &sid_tp->d_sid)) {
561 			sid_tp->cur_sid_type = DEL_SID;
562 			return (sid);
563 		}
564 	}
565 	if (nfs4_get_lock_stateid(rp, pid, &sid)) {
566 		if (!stateid4_cmp(&sid, &sid_tp->l_sid)) {
567 			sid_tp->cur_sid_type = LOCK_SID;
568 			return (sid);
569 		}
570 	}
571 	if (nfs4_get_open_stateid(rp, cr, mi, &sid)) {
572 		if (!stateid4_cmp(&sid, &sid_tp->o_sid)) {
573 			sid_tp->cur_sid_type = OPEN_SID;
574 			return (sid);
575 		}
576 	}
577 	bzero(&sid, sizeof (stateid4));
578 	sid_tp->cur_sid_type = SPEC_SID;
579 	return (sid);
580 }
581 
582 /*
583  * Returns the delegation stateid if this 'op' is OP_WRITE and the
584  * delegation we hold is a write delegation, OR this 'op' is not
585  * OP_WRITE and we have a delegation held (read or write), otherwise
586  * returns the lock stateid if there is a lock owner, otherwise
587  * returns the open stateid if there is a open stream, otherwise
588  * returns special stateid <seqid = 0, other = 0>.
589  *
590  * This also updates which stateid we are using in 'sid_tp', skips
591  * previously attempted stateids, and skips checking higher priority
592  * stateids than the current level as dictated by 'sid_tp->cur_sid_type'
593  * for async reads.
594  *
595  * Used for READ and SETATTR operations.
596  */
597 stateid4
598 nfs4_get_stateid(cred_t *cr, rnode4_t *rp, pid_t pid, mntinfo4_t *mi,
599 	nfs_opnum4 op, nfs4_stateid_types_t *sid_tp, bool_t async_read)
600 {
601 	stateid4 sid;
602 
603 	/*
604 	 * For asynchronous READs, do not attempt to retry from the start of
605 	 * the stateid priority list, just continue from where you last left
606 	 * off.
607 	 */
608 	if (async_read) {
609 		switch (sid_tp->cur_sid_type) {
610 		case NO_SID:
611 			break;
612 		case DEL_SID:
613 			goto lock_stateid;
614 		case LOCK_SID:
615 			goto open_stateid;
616 		case OPEN_SID:
617 			goto special_stateid;
618 		case SPEC_SID:
619 		default:
620 			cmn_err(CE_PANIC, "nfs4_get_stateid: illegal current "
621 			    "stateid type %d", sid_tp->cur_sid_type);
622 		}
623 	}
624 
625 	if (nfs4_get_deleg_stateid(rp, op, &sid)) {
626 		if (!stateid4_cmp(&sid, &sid_tp->d_sid)) {
627 			sid_tp->cur_sid_type = DEL_SID;
628 			return (sid);
629 		}
630 	}
631 lock_stateid:
632 	if (nfs4_get_lock_stateid(rp, pid, &sid)) {
633 		if (!stateid4_cmp(&sid, &sid_tp->l_sid)) {
634 			sid_tp->cur_sid_type = LOCK_SID;
635 			return (sid);
636 		}
637 	}
638 open_stateid:
639 	if (nfs4_get_open_stateid(rp, cr, mi, &sid)) {
640 		if (!stateid4_cmp(&sid, &sid_tp->o_sid)) {
641 			sid_tp->cur_sid_type = OPEN_SID;
642 			return (sid);
643 		}
644 	}
645 special_stateid:
646 	bzero(&sid, sizeof (stateid4));
647 	sid_tp->cur_sid_type = SPEC_SID;
648 	return	(sid);
649 }
650 
651 void
652 nfs4_set_lock_stateid(nfs4_lock_owner_t *lop, stateid4 stateid)
653 {
654 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
655 	    "nfs4_set_lock_stateid"));
656 
657 	ASSERT(lop);
658 	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
659 
660 	mutex_enter(&lop->lo_lock);
661 	lop->lock_stateid = stateid;
662 	mutex_exit(&lop->lo_lock);
663 }
664 
665 /*
666  * Sequence number used when a new open owner is needed.
667  * This is used so as to not confuse the server.  Since a open owner
668  * is based off of cred, a cred could be re-used quickly, and the server
669  * may not release all state for a cred.
670  */
671 static uint64_t open_owner_seq_num = 0;
672 
673 uint64_t
674 nfs4_get_new_oo_name(void)
675 {
676 	return (atomic_inc_64_nv(&open_owner_seq_num));
677 }
678 
679 /*
680  * Create a new open owner and add it to the open owner hash table.
681  */
682 nfs4_open_owner_t *
683 create_open_owner(cred_t *cr, mntinfo4_t *mi)
684 {
685 	nfs4_open_owner_t	*oop;
686 	nfs4_oo_hash_bucket_t	*bucketp;
687 
688 	oop = kmem_alloc(sizeof (nfs4_open_owner_t), KM_SLEEP);
689 	/*
690 	 * Make sure the cred doesn't go away when we put this open owner
691 	 * on the free list, as well as make crcmp() a valid check.
692 	 */
693 	crhold(cr);
694 	oop->oo_cred = cr;
695 	mutex_init(&oop->oo_lock, NULL, MUTEX_DEFAULT, NULL);
696 	oop->oo_ref_count = 1;
697 	oop->oo_valid = 1;
698 	oop->oo_just_created = NFS4_JUST_CREATED;
699 	oop->oo_seqid = 0;
700 	oop->oo_seqid_inuse = 0;
701 	oop->oo_last_good_seqid = 0;
702 	oop->oo_last_good_op = TAG_NONE;
703 	oop->oo_cred_otw = NULL;
704 	cv_init(&oop->oo_cv_seqid_sync, NULL, CV_DEFAULT, NULL);
705 
706 	/*
707 	 * A Solaris open_owner is <oo_seq_num>
708 	 */
709 	oop->oo_name = nfs4_get_new_oo_name();
710 
711 	/* now add the struct into the cred hash table */
712 	ASSERT(mutex_owned(&mi->mi_lock));
713 	bucketp = lock_bucket(cr, mi);
714 	list_insert_head(&bucketp->b_oo_hash_list, oop);
715 	unlock_bucket(bucketp);
716 
717 	return (oop);
718 }
719 
720 /*
721  * Create a new open stream and it to the rnode's list.
722  * Increments the ref count on oop.
723  * Returns with 'os_sync_lock' held.
724  */
725 nfs4_open_stream_t *
726 create_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp)
727 {
728 	nfs4_open_stream_t	*osp;
729 
730 #ifdef DEBUG
731 	mutex_enter(&oop->oo_lock);
732 	ASSERT(oop->oo_seqid_inuse);
733 	mutex_exit(&oop->oo_lock);
734 #endif
735 
736 	osp = kmem_alloc(sizeof (nfs4_open_stream_t), KM_SLEEP);
737 	osp->os_open_ref_count = 1;
738 	osp->os_mapcnt = 0;
739 	osp->os_ref_count = 2;
740 	osp->os_valid = 1;
741 	osp->os_open_owner = oop;
742 	osp->os_orig_oo_name = oop->oo_name;
743 	bzero(&osp->open_stateid, sizeof (stateid4));
744 	osp->os_share_acc_read = 0;
745 	osp->os_share_acc_write = 0;
746 	osp->os_mmap_read = 0;
747 	osp->os_mmap_write = 0;
748 	osp->os_share_deny_none = 0;
749 	osp->os_share_deny_read = 0;
750 	osp->os_share_deny_write = 0;
751 	osp->os_delegation = 0;
752 	osp->os_dc_openacc = 0;
753 	osp->os_final_close = 0;
754 	osp->os_pending_close = 0;
755 	osp->os_failed_reopen = 0;
756 	osp->os_force_close = 0;
757 	mutex_init(&osp->os_sync_lock, NULL, MUTEX_DEFAULT, NULL);
758 
759 	/* open owner gets a reference */
760 	open_owner_hold(oop);
761 
762 	/* now add the open stream to rp */
763 	mutex_enter(&rp->r_os_lock);
764 	mutex_enter(&osp->os_sync_lock);
765 	list_insert_head(&rp->r_open_streams, osp);
766 	mutex_exit(&rp->r_os_lock);
767 
768 	return (osp);
769 }
770 
771 /*
772  * Returns an open stream with 'os_sync_lock' held.
773  * If the open stream is found (rather than created), its
774  * 'os_open_ref_count' is bumped.
775  *
776  * There is no race with two threads entering this function
777  * and creating two open streams for the same <oop, rp> pair.
778  * This is because the open seqid sync must be acquired, thus
779  * only allowing one thread in at a time.
780  */
781 nfs4_open_stream_t *
782 find_or_create_open_stream(nfs4_open_owner_t *oop, rnode4_t *rp,
783 	int *created_osp)
784 {
785 	nfs4_open_stream_t *osp;
786 
787 #ifdef DEBUG
788 	mutex_enter(&oop->oo_lock);
789 	ASSERT(oop->oo_seqid_inuse);
790 	mutex_exit(&oop->oo_lock);
791 #endif
792 
793 	osp = find_open_stream(oop, rp);
794 	if (!osp) {
795 		osp = create_open_stream(oop, rp);
796 		if (osp)
797 			*created_osp = 1;
798 	} else {
799 		*created_osp = 0;
800 		osp->os_open_ref_count++;
801 	}
802 
803 	return (osp);
804 }
805 
806 static uint64_t lock_owner_seq_num = 0;
807 
808 /*
809  * Create a new lock owner and add it to the rnode's list.
810  * Assumes the rnode's r_statev4_lock is held.
811  * The created lock owner has a reference count of 2: one for the list and
812  * one for the caller to use.  Returns the lock owner locked down.
813  */
814 nfs4_lock_owner_t *
815 create_lock_owner(rnode4_t *rp, pid_t pid)
816 {
817 	nfs4_lock_owner_t	*lop;
818 
819 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
820 	    "create_lock_owner: pid %x", pid));
821 
822 	ASSERT(mutex_owned(&rp->r_statev4_lock));
823 
824 	lop = kmem_alloc(sizeof (nfs4_lock_owner_t), KM_SLEEP);
825 	lop->lo_ref_count = 2;
826 	lop->lo_valid = 1;
827 	bzero(&lop->lock_stateid, sizeof (stateid4));
828 	lop->lo_pid = pid;
829 	lop->lock_seqid = 0;
830 	lop->lo_pending_rqsts = 0;
831 	lop->lo_just_created = NFS4_JUST_CREATED;
832 	lop->lo_flags = 0;
833 	lop->lo_seqid_holder = NULL;
834 
835 	/*
836 	 * A Solaris lock_owner is <seq_num><pid>
837 	 */
838 	lop->lock_owner_name.ln_seq_num =
839 	    atomic_inc_64_nv(&lock_owner_seq_num);
840 	lop->lock_owner_name.ln_pid = pid;
841 
842 	cv_init(&lop->lo_cv_seqid_sync, NULL, CV_DEFAULT, NULL);
843 	mutex_init(&lop->lo_lock, NULL, MUTEX_DEFAULT, NULL);
844 
845 	mutex_enter(&lop->lo_lock);
846 
847 	/* now add the lock owner to rp */
848 	lop->lo_prev_rnode = &rp->r_lo_head;
849 	lop->lo_next_rnode = rp->r_lo_head.lo_next_rnode;
850 	rp->r_lo_head.lo_next_rnode->lo_prev_rnode = lop;
851 	rp->r_lo_head.lo_next_rnode = lop;
852 
853 	return (lop);
854 
855 }
856 
857 /*
858  * This sets the lock seqid of a lock owner.
859  */
860 void
861 nfs4_set_lock_seqid(seqid4 seqid, nfs4_lock_owner_t *lop)
862 {
863 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
864 	    "nfs4_set_lock_seqid"));
865 
866 	ASSERT(lop != NULL);
867 	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
868 
869 	lop->lock_seqid = seqid;
870 }
871 
872 static void
873 nfs4_set_new_lock_owner_args(lock_owner4 *owner, pid_t pid)
874 {
875 	nfs4_lo_name_t *cast_namep;
876 
877 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
878 	    "nfs4_set_new_lock_owner_args"));
879 
880 	owner->owner_len = sizeof (*cast_namep);
881 	owner->owner_val = kmem_alloc(owner->owner_len, KM_SLEEP);
882 	/*
883 	 * A Solaris lock_owner is <seq_num><pid>
884 	 */
885 	cast_namep = (nfs4_lo_name_t *)owner->owner_val;
886 	cast_namep->ln_seq_num = atomic_inc_64_nv(&lock_owner_seq_num);
887 	cast_namep->ln_pid = pid;
888 }
889 
890 /*
891  * Fill in the lock owner args.
892  */
893 void
894 nfs4_setlockowner_args(lock_owner4 *owner, rnode4_t *rp, pid_t pid)
895 {
896 	nfs4_lock_owner_t *lop;
897 
898 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
899 	    "nfs4_setlockowner_args"));
900 
901 	/* This increments lop's ref count */
902 	lop = find_lock_owner(rp, pid, LOWN_VALID_STATEID);
903 
904 	if (!lop)
905 		goto make_up_args;
906 
907 	mutex_enter(&lop->lo_lock);
908 	owner->owner_len = sizeof (lop->lock_owner_name);
909 	owner->owner_val = kmem_alloc(owner->owner_len, KM_SLEEP);
910 	bcopy(&lop->lock_owner_name, owner->owner_val,
911 	    owner->owner_len);
912 	mutex_exit(&lop->lo_lock);
913 	lock_owner_rele(lop);
914 	return;
915 
916 make_up_args:
917 	nfs4_set_new_lock_owner_args(owner, pid);
918 }
919 
920 /*
921  * This ends our use of the open owner's open seqid by setting
922  * the appropiate flags and issuing a cv_signal to wake up another
923  * thread waiting to use the open seqid.
924  */
925 
926 void
927 nfs4_end_open_seqid_sync(nfs4_open_owner_t *oop)
928 {
929 	mutex_enter(&oop->oo_lock);
930 	ASSERT(oop->oo_seqid_inuse);
931 	oop->oo_seqid_inuse = 0;
932 	cv_broadcast(&oop->oo_cv_seqid_sync);
933 	mutex_exit(&oop->oo_lock);
934 }
935 
936 /*
937  * This starts our use of the open owner's open seqid by setting
938  * the oo_seqid_inuse to true.  We will wait (forever) with a
939  * cv_wait() until we are woken up.
940  *
941  * Return values:
942  * 0		no problems
943  * EAGAIN	caller should retry (like a recovery retry)
944  */
945 int
946 nfs4_start_open_seqid_sync(nfs4_open_owner_t *oop, mntinfo4_t *mi)
947 {
948 	int error = 0;
949 #ifdef DEBUG
950 	static int ops = 0;		/* fault injection */
951 #endif
952 
953 #ifdef DEBUG
954 	if (seqid_sync_faults && curthread != mi->mi_recovthread &&
955 	    ++ops % 5 == 0)
956 		return (EAGAIN);
957 #endif
958 
959 	mutex_enter(&mi->mi_lock);
960 	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
961 	    curthread != mi->mi_recovthread)
962 		error = EAGAIN;
963 	mutex_exit(&mi->mi_lock);
964 	if (error != 0)
965 		goto done;
966 
967 	mutex_enter(&oop->oo_lock);
968 
969 	while (oop->oo_seqid_inuse) {
970 		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
971 		    "nfs4_start_open_seqid_sync waiting on cv"));
972 
973 		cv_wait(&oop->oo_cv_seqid_sync, &oop->oo_lock);
974 	}
975 
976 	oop->oo_seqid_inuse = 1;
977 
978 	mutex_exit(&oop->oo_lock);
979 
980 	mutex_enter(&mi->mi_lock);
981 	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
982 	    curthread != mi->mi_recovthread)
983 		error = EAGAIN;
984 	mutex_exit(&mi->mi_lock);
985 
986 	if (error == EAGAIN)
987 		nfs4_end_open_seqid_sync(oop);
988 
989 	NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
990 	    "nfs4_start_open_seqid_sync: error=%d", error));
991 
992 done:
993 	return (error);
994 }
995 
996 #ifdef	DEBUG
997 int bypass_otw[2];
998 #endif
999 
1000 /*
1001  * Checks to see if the OPEN OTW is necessary that is, if it's already
1002  * been opened with the same access and deny bits we are now asking for.
1003  * Note, this assumes that *vpp is a rnode.
1004  */
1005 int
1006 nfs4_is_otw_open_necessary(nfs4_open_owner_t *oop, int flag, vnode_t *vp,
1007 	int just_been_created, int *errorp, int acc, nfs4_recov_state_t *rsp)
1008 {
1009 	rnode4_t *rp;
1010 	nfs4_open_stream_t *osp;
1011 	open_delegation_type4 dt;
1012 
1013 	rp = VTOR4(vp);
1014 
1015 	/*
1016 	 * Grab the delegation type.  This function is protected against
1017 	 * the delegation being returned by virtue of start_op (called
1018 	 * by nfs4open_otw) taking the r_deleg_recall_lock in read mode,
1019 	 * delegreturn requires this lock in write mode to proceed.
1020 	 */
1021 	ASSERT(nfs_rw_lock_held(&rp->r_deleg_recall_lock, RW_READER));
1022 	dt = get_dtype(rp);
1023 
1024 	/* returns with 'os_sync_lock' held */
1025 	osp = find_open_stream(oop, rp);
1026 
1027 	if (osp) {
1028 		uint32_t	do_otw = 0;
1029 
1030 		if (osp->os_failed_reopen) {
1031 			NFS4_DEBUG(nfs4_open_stream_debug, (CE_NOTE,
1032 			    "nfs4_is_otw_open_necessary: os_failed_reopen "
1033 			    "set on osp %p, cr %p, rp %s", (void *)osp,
1034 			    (void *)osp->os_open_owner->oo_cred,
1035 			    rnode4info(rp)));
1036 			do_otw = 1;
1037 		}
1038 
1039 		/*
1040 		 * check access/deny bits
1041 		 */
1042 		if (!do_otw && (flag & FREAD))
1043 			if (osp->os_share_acc_read == 0 &&
1044 			    dt == OPEN_DELEGATE_NONE)
1045 				do_otw = 1;
1046 
1047 		if (!do_otw && (flag & FWRITE))
1048 			if (osp->os_share_acc_write == 0 &&
1049 			    dt != OPEN_DELEGATE_WRITE)
1050 				do_otw = 1;
1051 
1052 		if (!do_otw) {
1053 			NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1054 			    "nfs4_is_otw_open_necessary: can skip this "
1055 			    "open OTW"));
1056 			if (!just_been_created) {
1057 				osp->os_open_ref_count++;
1058 				if (flag & FREAD)
1059 					osp->os_share_acc_read++;
1060 				if (flag & FWRITE)
1061 					osp->os_share_acc_write++;
1062 				osp->os_share_deny_none++;
1063 			}
1064 
1065 			/*
1066 			 * Need to reset this bitfield for the possible case
1067 			 * where we were going to OTW CLOSE the file, got a
1068 			 * non-recoverable error, and before we could retry
1069 			 * the CLOSE, OPENed the file again.
1070 			 */
1071 			ASSERT(osp->os_open_owner->oo_seqid_inuse);
1072 			osp->os_final_close = 0;
1073 			osp->os_force_close = 0;
1074 
1075 			mutex_exit(&osp->os_sync_lock);
1076 			open_stream_rele(osp, rp);
1077 
1078 #ifdef	DEBUG
1079 			bypass_otw[0]++;
1080 #endif
1081 
1082 			*errorp = 0;
1083 			return (0);
1084 		}
1085 		mutex_exit(&osp->os_sync_lock);
1086 		open_stream_rele(osp, rp);
1087 
1088 	} else if (dt != OPEN_DELEGATE_NONE) {
1089 		/*
1090 		 * Even if there isn't an open_stream yet, we may still be
1091 		 * able to bypass the otw open if the client owns a delegation.
1092 		 *
1093 		 * If you are asking for WRITE, but I only have
1094 		 * a read delegation, then you still have to go otw.
1095 		 */
1096 
1097 		if (flag & FWRITE && dt == OPEN_DELEGATE_READ)
1098 			return (1);
1099 
1100 		/*
1101 		 * TODO - evaluate the nfsace4
1102 		 */
1103 
1104 		/*
1105 		 * Check the access flags to make sure the caller
1106 		 * had permission.
1107 		 */
1108 		if (flag & FREAD && !(acc & VREAD))
1109 			return (1);
1110 
1111 		if (flag & FWRITE && !(acc & VWRITE))
1112 			return (1);
1113 
1114 		/*
1115 		 * create_open_stream will add a reference to oop,
1116 		 * this will prevent the open_owner_rele done in
1117 		 * nfs4open_otw from destroying the open_owner.
1118 		 */
1119 
1120 		/* returns with 'os_sync_lock' held */
1121 		osp = create_open_stream(oop, rp);
1122 		if (osp == NULL)
1123 			return (1);
1124 
1125 		osp->open_stateid = rp->r_deleg_stateid;
1126 		osp->os_delegation = 1;
1127 
1128 		if (flag & FREAD)
1129 			osp->os_share_acc_read++;
1130 		if (flag & FWRITE)
1131 			osp->os_share_acc_write++;
1132 
1133 		osp->os_share_deny_none++;
1134 		mutex_exit(&osp->os_sync_lock);
1135 
1136 		open_stream_rele(osp, rp);
1137 
1138 		mutex_enter(&oop->oo_lock);
1139 		oop->oo_just_created = NFS4_PERM_CREATED;
1140 		mutex_exit(&oop->oo_lock);
1141 
1142 		ASSERT(rsp != NULL);
1143 		if (rsp->rs_sp != NULL) {
1144 			mutex_enter(&rsp->rs_sp->s_lock);
1145 			nfs4_inc_state_ref_count_nolock(rsp->rs_sp,
1146 			    VTOMI4(vp));
1147 			mutex_exit(&rsp->rs_sp->s_lock);
1148 		}
1149 #ifdef	DEBUG
1150 		bypass_otw[1]++;
1151 #endif
1152 
1153 		*errorp = 0;
1154 		return (0);
1155 	}
1156 
1157 	return (1);
1158 }
1159 
1160 static open_delegation_type4
1161 get_dtype(rnode4_t *rp)
1162 {
1163 	open_delegation_type4 dt;
1164 
1165 	mutex_enter(&rp->r_statev4_lock);
1166 	ASSERT(!rp->r_deleg_return_inprog);
1167 	if (rp->r_deleg_return_pending)
1168 		dt = OPEN_DELEGATE_NONE;
1169 	else
1170 		dt = rp->r_deleg_type;
1171 	mutex_exit(&rp->r_statev4_lock);
1172 
1173 	return (dt);
1174 }
1175 
1176 /*
1177  * Fill in *locker with the lock state arguments for a LOCK call.  If
1178  * lop->lo_just_created == NFS4_JUST_CREATED, oop and osp must be non-NULL.
1179  * Caller must already hold the necessary seqid sync lock(s).
1180  */
1181 
1182 void
1183 nfs4_setup_lock_args(nfs4_lock_owner_t *lop, nfs4_open_owner_t *oop,
1184 	nfs4_open_stream_t *osp, clientid4 clientid, locker4 *locker)
1185 {
1186 	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
1187 	if (lop->lo_just_created == NFS4_JUST_CREATED) {
1188 		/* this is a new lock request */
1189 		open_to_lock_owner4 *nown;
1190 
1191 		ASSERT(oop != NULL);
1192 		ASSERT(osp != NULL);
1193 
1194 		locker->new_lock_owner = TRUE;
1195 		nown = &locker->locker4_u.open_owner;
1196 		nown->open_seqid = nfs4_get_open_seqid(oop) + 1;
1197 		mutex_enter(&osp->os_sync_lock);
1198 		nown->open_stateid = osp->open_stateid;
1199 		mutex_exit(&osp->os_sync_lock);
1200 		nown->lock_seqid = lop->lock_seqid; /* initial, so no +1 */
1201 
1202 		nown->lock_owner.clientid = clientid;
1203 		nown->lock_owner.owner_len = sizeof (lop->lock_owner_name);
1204 		nown->lock_owner.owner_val =
1205 		    kmem_alloc(nown->lock_owner.owner_len, KM_SLEEP);
1206 		bcopy(&lop->lock_owner_name, nown->lock_owner.owner_val,
1207 		    nown->lock_owner.owner_len);
1208 	} else {
1209 		exist_lock_owner4 *eown;
1210 		/* have an existing lock owner */
1211 
1212 		locker->new_lock_owner = FALSE;
1213 		eown = &locker->locker4_u.lock_owner;
1214 		mutex_enter(&lop->lo_lock);
1215 		eown->lock_stateid = lop->lock_stateid;
1216 		mutex_exit(&lop->lo_lock);
1217 		eown->lock_seqid = lop->lock_seqid + 1;
1218 	}
1219 }
1220 
1221 /*
1222  * This starts our use of the lock owner's lock seqid by setting
1223  * the lo_flags to NFS4_LOCK_SEQID_INUSE.  We will wait (forever)
1224  * with a cv_wait() until we are woken up.
1225  *
1226  * Return values:
1227  * 0		no problems
1228  * EAGAIN	caller should retry (like a recovery retry)
1229  */
1230 int
1231 nfs4_start_lock_seqid_sync(nfs4_lock_owner_t *lop, mntinfo4_t *mi)
1232 {
1233 	int error = 0;
1234 #ifdef DEBUG
1235 	static int ops = 0;		/* fault injection */
1236 #endif
1237 
1238 #ifdef DEBUG
1239 	if (seqid_sync_faults && curthread != mi->mi_recovthread &&
1240 	    ++ops % 7 == 0)
1241 		return (EAGAIN);
1242 #endif
1243 
1244 	mutex_enter(&mi->mi_lock);
1245 	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
1246 	    curthread != mi->mi_recovthread)
1247 		error = EAGAIN;
1248 	mutex_exit(&mi->mi_lock);
1249 	if (error != 0)
1250 		goto done;
1251 
1252 	mutex_enter(&lop->lo_lock);
1253 
1254 	ASSERT(lop->lo_seqid_holder != curthread);
1255 	while (lop->lo_flags & NFS4_LOCK_SEQID_INUSE) {
1256 		NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
1257 		    "nfs4_start_lock_seqid_sync: waiting on cv"));
1258 
1259 		cv_wait(&lop->lo_cv_seqid_sync, &lop->lo_lock);
1260 	}
1261 	NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE, "nfs4_start_lock_seqid_sync: "
1262 	    "NFS4_LOCK_SEQID_INUSE"));
1263 
1264 	lop->lo_flags |= NFS4_LOCK_SEQID_INUSE;
1265 	lop->lo_seqid_holder = curthread;
1266 	mutex_exit(&lop->lo_lock);
1267 
1268 	mutex_enter(&mi->mi_lock);
1269 	if ((mi->mi_flags & MI4_RECOV_ACTIV) &&
1270 	    curthread != mi->mi_recovthread)
1271 		error = EAGAIN;
1272 	mutex_exit(&mi->mi_lock);
1273 
1274 	if (error == EAGAIN)
1275 		nfs4_end_lock_seqid_sync(lop);
1276 
1277 	NFS4_DEBUG(nfs4_seqid_sync, (CE_NOTE,
1278 	    "nfs4_start_lock_seqid_sync: error=%d", error));
1279 
1280 done:
1281 	return (error);
1282 }
1283 
1284 /*
1285  * This ends our use of the lock owner's lock seqid by setting
1286  * the appropiate flags and issuing a cv_signal to wake up another
1287  * thread waiting to use the lock seqid.
1288  */
1289 void
1290 nfs4_end_lock_seqid_sync(nfs4_lock_owner_t *lop)
1291 {
1292 	mutex_enter(&lop->lo_lock);
1293 	ASSERT(lop->lo_flags & NFS4_LOCK_SEQID_INUSE);
1294 	ASSERT(lop->lo_seqid_holder == curthread);
1295 	lop->lo_flags &= ~NFS4_LOCK_SEQID_INUSE;
1296 	lop->lo_seqid_holder = NULL;
1297 	cv_broadcast(&lop->lo_cv_seqid_sync);
1298 	mutex_exit(&lop->lo_lock);
1299 }
1300 
1301 /*
1302  * Returns a reference to a lock owner via lopp, which has its lock seqid
1303  * synchronization started.
1304  * If the lock owner is in the 'just_created' state, then we return its open
1305  * owner and open stream and start the open seqid synchronization.
1306  *
1307  * Return value:
1308  * NFS4_OK		no problems
1309  * NFS4ERR_DELAY	there is lost state to recover; caller should retry
1310  * NFS4ERR_IO		no open stream
1311  */
1312 nfsstat4
1313 nfs4_find_or_create_lock_owner(pid_t pid, rnode4_t *rp, cred_t *cr,
1314 	nfs4_open_owner_t **oopp, nfs4_open_stream_t **ospp,
1315 	nfs4_lock_owner_t **lopp)
1316 {
1317 	nfs4_lock_owner_t *lop, *next_lop;
1318 	mntinfo4_t *mi;
1319 	int error = 0;
1320 	nfsstat4 stat;
1321 
1322 	mi = VTOMI4(RTOV4(rp));
1323 
1324 	mutex_enter(&rp->r_statev4_lock);
1325 
1326 	lop = rp->r_lo_head.lo_next_rnode;
1327 	while (lop != &rp->r_lo_head) {
1328 		mutex_enter(&lop->lo_lock);
1329 		if (lop->lo_pid == pid && lop->lo_valid != 0) {
1330 			/* Found a matching lock owner */
1331 			NFS4_DEBUG(nfs4_client_state_debug,
1332 			    (CE_NOTE, "nfs4_find_or_create_lock_owner: "
1333 			    "got a match"));
1334 			lop->lo_ref_count++;
1335 			break;
1336 		}
1337 		next_lop = lop->lo_next_rnode;
1338 		mutex_exit(&lop->lo_lock);
1339 		lop = next_lop;
1340 	}
1341 
1342 	if (lop == &rp->r_lo_head) {
1343 		/* create temporary lock owner */
1344 		lop = create_lock_owner(rp, pid);
1345 	}
1346 	mutex_exit(&rp->r_statev4_lock);
1347 
1348 	/* Have a locked down lock owner struct now */
1349 	if (lop->lo_just_created != NFS4_JUST_CREATED) {
1350 		/* This is an existing lock owner */
1351 		*oopp = NULL;
1352 		*ospp = NULL;
1353 	} else {
1354 		/* Lock owner doesn't exist yet */
1355 
1356 		/* First grab open owner seqid synchronization */
1357 		mutex_exit(&lop->lo_lock);
1358 		*oopp = find_open_owner(cr, NFS4_PERM_CREATED, mi);
1359 		if (*oopp == NULL)
1360 			goto kill_new_lop;
1361 		error = nfs4_start_open_seqid_sync(*oopp, mi);
1362 		if (error == EAGAIN) {
1363 			stat = NFS4ERR_DELAY;
1364 			goto failed;
1365 		}
1366 		*ospp = find_open_stream(*oopp, rp);
1367 		if (*ospp == NULL) {
1368 			nfs4_end_open_seqid_sync(*oopp);
1369 			goto kill_new_lop;
1370 		}
1371 		if ((*ospp)->os_failed_reopen) {
1372 			mutex_exit(&(*ospp)->os_sync_lock);
1373 			NFS4_DEBUG((nfs4_open_stream_debug ||
1374 			    nfs4_client_lock_debug), (CE_NOTE,
1375 			    "nfs4_find_or_create_lock_owner: os_failed_reopen;"
1376 			    "osp %p, cr %p, rp %s", (void *)(*ospp),
1377 			    (void *)cr, rnode4info(rp)));
1378 			nfs4_end_open_seqid_sync(*oopp);
1379 			stat = NFS4ERR_IO;
1380 			goto failed;
1381 		}
1382 		mutex_exit(&(*ospp)->os_sync_lock);
1383 
1384 		/*
1385 		 * Now see if the lock owner has become permanent while we
1386 		 * had released our lock.
1387 		 */
1388 		mutex_enter(&lop->lo_lock);
1389 		if (lop->lo_just_created != NFS4_JUST_CREATED) {
1390 			nfs4_end_open_seqid_sync(*oopp);
1391 			open_stream_rele(*ospp, rp);
1392 			open_owner_rele(*oopp);
1393 			*oopp = NULL;
1394 			*ospp = NULL;
1395 		}
1396 	}
1397 	mutex_exit(&lop->lo_lock);
1398 
1399 	error = nfs4_start_lock_seqid_sync(lop, mi);
1400 	if (error == EAGAIN) {
1401 		if (*oopp != NULL)
1402 			nfs4_end_open_seqid_sync(*oopp);
1403 		stat = NFS4ERR_DELAY;
1404 		goto failed;
1405 	}
1406 	ASSERT(error == 0);
1407 
1408 	*lopp = lop;
1409 	return (NFS4_OK);
1410 
1411 kill_new_lop:
1412 	/*
1413 	 * A previous CLOSE was attempted but got EINTR, but the application
1414 	 * continued to use the unspecified state file descriptor.  But now the
1415 	 * open stream is gone (which could also destroy the open owner), hence
1416 	 * we can no longer continue.  The calling function should return EIO
1417 	 * to the application.
1418 	 */
1419 	NFS4_DEBUG(nfs4_lost_rqst_debug || nfs4_client_lock_debug,
1420 	    (CE_NOTE, "nfs4_find_or_create_lock_owner: destroy newly created "
1421 	    "lop %p, oop %p, osp %p", (void *)lop, (void *)(*oopp),
1422 	    (void *)(*ospp)));
1423 
1424 	nfs4_rnode_remove_lock_owner(rp, lop);
1425 	stat = NFS4ERR_IO;
1426 
1427 failed:
1428 	lock_owner_rele(lop);
1429 	if (*oopp) {
1430 		open_owner_rele(*oopp);
1431 		*oopp = NULL;
1432 	}
1433 	if (*ospp) {
1434 		open_stream_rele(*ospp, rp);
1435 		*ospp = NULL;
1436 	}
1437 	return (stat);
1438 }
1439 
1440 /*
1441  * This function grabs a recently freed open owner off of the freed open
1442  * owner list if there is a match on the cred 'cr'.  It returns NULL if no
1443  * such match is found.  It will set the 'oo_ref_count' and 'oo_valid' back
1444  * to both 1 (sane values) in the case a match is found.
1445  */
1446 static nfs4_open_owner_t *
1447 find_freed_open_owner(cred_t *cr, nfs4_oo_hash_bucket_t *bucketp,
1448 	mntinfo4_t *mi)
1449 {
1450 	nfs4_open_owner_t		*foop;
1451 
1452 	NFS4_DEBUG(nfs4_client_state_debug, (CE_NOTE,
1453 	    "find_freed_open_owner: cred %p", (void*)cr));
1454 
1455 	ASSERT(mutex_owned(&mi->mi_lock));
1456 	ASSERT(mutex_owned(&bucketp->b_lock));
1457 
1458 	/* got hash bucket, search through freed open owners */
1459 	for (foop = list_head(&mi->mi_foo_list); foop != NULL;
1460 	    foop = list_next(&mi->mi_foo_list, foop)) {
1461 		if (!crcmp(foop->oo_cred, cr)) {
1462 			NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
1463 			    "find_freed_open_owner: got a match open owner "
1464 			    "%p", (void *)foop));
1465 			foop->oo_ref_count = 1;
1466 			foop->oo_valid = 1;
1467 			list_remove(&mi->mi_foo_list, foop);
1468 			mi->mi_foo_num--;
1469 
1470 			/* now add the struct into the cred hash table */
1471 			list_insert_head(&bucketp->b_oo_hash_list, foop);
1472 			return (foop);
1473 		}
1474 	}
1475 
1476 	return (NULL);
1477 }
1478 
1479 /*
1480  * Insert the newly freed 'oop' into the mi's freed oop list,
1481  * always at the head of the list.  If we've already reached
1482  * our maximum allowed number of freed open owners (mi_foo_max),
1483  * then remove the LRU open owner on the list (namely the tail).
1484  */
1485 static void
1486 nfs4_free_open_owner(nfs4_open_owner_t *oop, mntinfo4_t *mi)
1487 {
1488 	nfs4_open_owner_t *lru_foop;
1489 
1490 	if (mi->mi_foo_num < mi->mi_foo_max) {
1491 		NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
1492 		    "nfs4_free_open_owner: num free %d, max free %d, "
1493 		    "insert open owner %p for mntinfo4 %p",
1494 		    mi->mi_foo_num, mi->mi_foo_max, (void *)oop,
1495 		    (void *)mi));
1496 		list_insert_head(&mi->mi_foo_list, oop);
1497 		mi->mi_foo_num++;
1498 		return;
1499 	}
1500 
1501 	/* need to replace a freed open owner */
1502 
1503 	lru_foop = list_tail(&mi->mi_foo_list);
1504 
1505 	NFS4_DEBUG(nfs4_client_foo_debug, (CE_NOTE,
1506 	    "nfs4_free_open_owner: destroy %p, insert %p",
1507 	    (void *)lru_foop, (void *)oop));
1508 
1509 	list_remove(&mi->mi_foo_list, lru_foop);
1510 	nfs4_destroy_open_owner(lru_foop);
1511 
1512 	/* head always has latest freed oop */
1513 	list_insert_head(&mi->mi_foo_list, oop);
1514 }
1515 
1516 void
1517 nfs4_destroy_open_owner(nfs4_open_owner_t *oop)
1518 {
1519 	ASSERT(oop != NULL);
1520 
1521 	crfree(oop->oo_cred);
1522 	if (oop->oo_cred_otw)
1523 		crfree(oop->oo_cred_otw);
1524 	mutex_destroy(&oop->oo_lock);
1525 	cv_destroy(&oop->oo_cv_seqid_sync);
1526 	kmem_free(oop, sizeof (*oop));
1527 }
1528 
1529 seqid4
1530 nfs4_get_open_seqid(nfs4_open_owner_t *oop)
1531 {
1532 	ASSERT(oop->oo_seqid_inuse);
1533 	return (oop->oo_seqid);
1534 }
1535 
1536 /*
1537  * This set's the open seqid for a <open owner/ mntinfo4> pair.
1538  */
1539 void
1540 nfs4_set_open_seqid(seqid4 seqid, nfs4_open_owner_t *oop,
1541 	nfs4_tag_type_t tag_type)
1542 {
1543 	ASSERT(oop->oo_seqid_inuse);
1544 	oop->oo_seqid = seqid;
1545 	oop->oo_last_good_seqid = seqid;
1546 	oop->oo_last_good_op = tag_type;
1547 }
1548 
1549 /*
1550  * This bumps the current open seqid for the open owner 'oop'.
1551  */
1552 void
1553 nfs4_get_and_set_next_open_seqid(nfs4_open_owner_t *oop,
1554     nfs4_tag_type_t tag_type)
1555 {
1556 	ASSERT(oop->oo_seqid_inuse);
1557 	oop->oo_seqid++;
1558 	oop->oo_last_good_seqid = oop->oo_seqid;
1559 	oop->oo_last_good_op = tag_type;
1560 }
1561 
1562 /*
1563  * If no open owner was provided, this function takes the cred to find an
1564  * open owner within the given mntinfo4_t.  Either way we return the
1565  * open owner's OTW credential if it exists; otherwise returns the
1566  * supplied 'cr'.
1567  *
1568  * A hold is put on the returned credential, and it is up to the caller
1569  * to free the cred.
1570  */
1571 cred_t *
1572 nfs4_get_otw_cred(cred_t *cr, mntinfo4_t *mi, nfs4_open_owner_t *provided_oop)
1573 {
1574 	cred_t *ret_cr;
1575 	nfs4_open_owner_t *oop = provided_oop;
1576 
1577 	if (oop == NULL)
1578 		oop = find_open_owner(cr, NFS4_PERM_CREATED, mi);
1579 	if (oop != NULL) {
1580 		mutex_enter(&oop->oo_lock);
1581 		if (oop->oo_cred_otw)
1582 			ret_cr = oop->oo_cred_otw;
1583 		else
1584 			ret_cr = cr;
1585 		crhold(ret_cr);
1586 		mutex_exit(&oop->oo_lock);
1587 		if (provided_oop == NULL)
1588 			open_owner_rele(oop);
1589 	} else {
1590 		ret_cr = cr;
1591 		crhold(ret_cr);
1592 	}
1593 	return (ret_cr);
1594 }
1595 
1596 /*
1597  * Retrieves the next open stream in the rnode's list if an open stream
1598  * is provided; otherwise gets the first open stream in the list.
1599  * The open owner for that open stream is then retrieved, and if its
1600  * oo_cred_otw exists then it is returned; otherwise the provided 'cr'
1601  * is returned.  *osp is set to the 'found' open stream.
1602  *
1603  * Note: we don't set *osp to the open stream retrieved via the
1604  * optimized check since that won't necessarily be at the beginning
1605  * of the rnode list, and if that osp doesn't work we'd like to
1606  * check _all_ open streams (starting from the beginning of the
1607  * rnode list).
1608  */
1609 cred_t *
1610 nfs4_get_otw_cred_by_osp(rnode4_t *rp, cred_t *cr,
1611 	nfs4_open_stream_t **osp, bool_t *first_time, bool_t *last_time)
1612 {
1613 	nfs4_open_stream_t *next_osp = NULL;
1614 	cred_t *ret_cr;
1615 
1616 	ASSERT(cr != NULL);
1617 	/*
1618 	 * As an optimization, try to find the open owner
1619 	 * for the cred provided since that's most likely
1620 	 * to work.
1621 	 */
1622 	if (*first_time) {
1623 		nfs4_open_owner_t *oop;
1624 
1625 		oop = find_open_owner(cr, NFS4_PERM_CREATED, VTOMI4(RTOV4(rp)));
1626 		if (oop) {
1627 			next_osp = find_open_stream(oop, rp);
1628 			if (next_osp)
1629 				mutex_exit(&next_osp->os_sync_lock);
1630 			open_owner_rele(oop);
1631 		}
1632 	}
1633 	if (next_osp == NULL) {
1634 		int delay_rele = 0;
1635 		*first_time = FALSE;
1636 
1637 		/* return the next open stream for this rnode */
1638 		mutex_enter(&rp->r_os_lock);
1639 		/* Now, no one can add or delete to rp's open streams list */
1640 
1641 		if (*osp) {
1642 			next_osp = list_next(&rp->r_open_streams, *osp);
1643 			/*
1644 			 * Delay the rele of *osp until after we drop
1645 			 * r_os_lock to not deadlock with oo_lock
1646 			 * via an open_stream_rele()->open_owner_rele().
1647 			 */
1648 			delay_rele = 1;
1649 		} else {
1650 			next_osp = list_head(&rp->r_open_streams);
1651 		}
1652 		if (next_osp) {
1653 			nfs4_open_stream_t *tmp_osp;
1654 
1655 			/* find the next valid open stream */
1656 			mutex_enter(&next_osp->os_sync_lock);
1657 			while (next_osp && !next_osp->os_valid) {
1658 				tmp_osp =
1659 				    list_next(&rp->r_open_streams, next_osp);
1660 				mutex_exit(&next_osp->os_sync_lock);
1661 				next_osp = tmp_osp;
1662 				if (next_osp)
1663 					mutex_enter(&next_osp->os_sync_lock);
1664 			}
1665 			if (next_osp) {
1666 				next_osp->os_ref_count++;
1667 				mutex_exit(&next_osp->os_sync_lock);
1668 			}
1669 		}
1670 		mutex_exit(&rp->r_os_lock);
1671 		if (delay_rele)
1672 			open_stream_rele(*osp, rp);
1673 	}
1674 
1675 	if (next_osp) {
1676 		nfs4_open_owner_t *oop;
1677 
1678 		oop = next_osp->os_open_owner;
1679 		mutex_enter(&oop->oo_lock);
1680 		if (oop->oo_cred_otw)
1681 			ret_cr = oop->oo_cred_otw;
1682 		else
1683 			ret_cr = cr;
1684 		crhold(ret_cr);
1685 		mutex_exit(&oop->oo_lock);
1686 		if (*first_time) {
1687 			open_stream_rele(next_osp, rp);
1688 			*osp = NULL;
1689 		} else
1690 			*osp = next_osp;
1691 	} else {
1692 		/* just return the cred provided to us */
1693 		*last_time = TRUE;
1694 		*osp = NULL;
1695 		ret_cr = cr;
1696 		crhold(ret_cr);
1697 	}
1698 
1699 	*first_time = FALSE;
1700 	return (ret_cr);
1701 }
1702 
1703 void
1704 nfs4_init_stateid_types(nfs4_stateid_types_t *sid_tp)
1705 {
1706 	bzero(&sid_tp->d_sid, sizeof (stateid4));
1707 	bzero(&sid_tp->l_sid, sizeof (stateid4));
1708 	bzero(&sid_tp->o_sid, sizeof (stateid4));
1709 	sid_tp->cur_sid_type = NO_SID;
1710 }
1711 
1712 void
1713 nfs4_save_stateid(stateid4 *s1, nfs4_stateid_types_t *sid_tp)
1714 {
1715 	NFS4_DEBUG(nfs4_client_recov_debug, (CE_NOTE,
1716 	    "nfs4_save_stateid: saved %s stateid",
1717 	    sid_tp->cur_sid_type == DEL_SID ? "delegation" :
1718 	    sid_tp->cur_sid_type == LOCK_SID ? "lock" :
1719 	    sid_tp->cur_sid_type == OPEN_SID ? "open" : "special"));
1720 
1721 	switch (sid_tp->cur_sid_type) {
1722 	case DEL_SID:
1723 		sid_tp->d_sid = *s1;
1724 		break;
1725 	case LOCK_SID:
1726 		sid_tp->l_sid = *s1;
1727 		break;
1728 	case OPEN_SID:
1729 		sid_tp->o_sid = *s1;
1730 		break;
1731 	case SPEC_SID:
1732 	default:
1733 		cmn_err(CE_PANIC, "nfs4_save_stateid: illegal "
1734 		    "stateid type %d", sid_tp->cur_sid_type);
1735 	}
1736 }
1737 
1738 /*
1739  * We got NFS4ERR_BAD_SEQID.  Setup some arguments to pass to recovery.
1740  * Caller is responsible for freeing.
1741  */
1742 nfs4_bseqid_entry_t *
1743 nfs4_create_bseqid_entry(nfs4_open_owner_t *oop, nfs4_lock_owner_t *lop,
1744     vnode_t *vp, pid_t pid, nfs4_tag_type_t tag, seqid4 seqid)
1745 {
1746 	nfs4_bseqid_entry_t	*bsep;
1747 
1748 	bsep = kmem_alloc(sizeof (*bsep), KM_SLEEP);
1749 	bsep->bs_oop = oop;
1750 	bsep->bs_lop = lop;
1751 	bsep->bs_vp = vp;
1752 	bsep->bs_pid = pid;
1753 	bsep->bs_tag = tag;
1754 	bsep->bs_seqid = seqid;
1755 
1756 	return (bsep);
1757 }
1758 
1759 void
1760 nfs4open_dg_save_lost_rqst(int error, nfs4_lost_rqst_t *lost_rqstp,
1761 	nfs4_open_owner_t *oop, nfs4_open_stream_t *osp, cred_t *cr,
1762 	vnode_t *vp, int access_close, int deny_close)
1763 {
1764 	lost_rqstp->lr_putfirst = FALSE;
1765 
1766 	ASSERT(vp != NULL);
1767 	if (error == ETIMEDOUT || error == EINTR ||
1768 	    NFS4_FRC_UNMT_ERR(error, vp->v_vfsp)) {
1769 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
1770 		    "nfs4open_dg_save_lost_rqst: error %d", error));
1771 
1772 		lost_rqstp->lr_op = OP_OPEN_DOWNGRADE;
1773 		/*
1774 		 * The vp is held and rele'd via the recovery code.
1775 		 * See nfs4_save_lost_rqst.
1776 		 */
1777 		lost_rqstp->lr_vp = vp;
1778 		lost_rqstp->lr_dvp = NULL;
1779 		lost_rqstp->lr_oop = oop;
1780 		lost_rqstp->lr_osp = osp;
1781 		lost_rqstp->lr_lop = NULL;
1782 		lost_rqstp->lr_cr = cr;
1783 		lost_rqstp->lr_flk = NULL;
1784 		lost_rqstp->lr_dg_acc = access_close;
1785 		lost_rqstp->lr_dg_deny = deny_close;
1786 		lost_rqstp->lr_putfirst = FALSE;
1787 	} else {
1788 		lost_rqstp->lr_op = 0;
1789 	}
1790 }
1791 
1792 /*
1793  * Change the access and deny bits of an OPEN.
1794  * If recovery is needed, *recov_credpp is set to the cred used OTW,
1795  * a hold is placed on it, and *recov_seqidp is set to the seqid used OTW.
1796  */
1797 void
1798 nfs4_open_downgrade(int access_close, int deny_close, nfs4_open_owner_t *oop,
1799 	nfs4_open_stream_t *osp, vnode_t *vp, cred_t *cr, nfs4_lost_rqst_t *lrp,
1800 	nfs4_error_t *ep, cred_t **recov_credpp, seqid4 *recov_seqidp)
1801 {
1802 	mntinfo4_t		*mi;
1803 	int			downgrade_acc, downgrade_deny;
1804 	int			new_acc, new_deny;
1805 	COMPOUND4args_clnt	args;
1806 	COMPOUND4res_clnt	res;
1807 	OPEN_DOWNGRADE4res	*odg_res;
1808 	nfs_argop4		argop[3];
1809 	nfs_resop4		*resop;
1810 	rnode4_t		*rp;
1811 	bool_t			needrecov = FALSE;
1812 	int			doqueue = 1;
1813 	seqid4			seqid = 0;
1814 	cred_t			*cred_otw;
1815 	hrtime_t		t;
1816 
1817 	ASSERT(mutex_owned(&osp->os_sync_lock));
1818 #if DEBUG
1819 	mutex_enter(&oop->oo_lock);
1820 	ASSERT(oop->oo_seqid_inuse);
1821 	mutex_exit(&oop->oo_lock);
1822 #endif
1823 
1824 
1825 	if (access_close == 0 && deny_close == 0) {
1826 		nfs4_error_zinit(ep);
1827 		return;
1828 	}
1829 
1830 	cred_otw = nfs4_get_otw_cred(cr, VTOMI4(vp), oop);
1831 
1832 cred_retry:
1833 	nfs4_error_zinit(ep);
1834 	downgrade_acc = 0;
1835 	downgrade_deny = 0;
1836 	mi = VTOMI4(vp);
1837 	rp = VTOR4(vp);
1838 
1839 	/*
1840 	 * Check to see if the open stream got closed before we go OTW,
1841 	 * now that we have acquired the 'os_sync_lock'.
1842 	 */
1843 	if (!osp->os_valid) {
1844 		NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
1845 		    " open stream has already been closed, return success"));
1846 		/* error has already been set */
1847 		goto no_args_out;
1848 	}
1849 
1850 	/* If the file failed recovery, just quit. */
1851 	mutex_enter(&rp->r_statelock);
1852 	if (rp->r_flags & R4RECOVERR) {
1853 		mutex_exit(&rp->r_statelock);
1854 		ep->error = EIO;
1855 		goto no_args_out;
1856 	}
1857 	mutex_exit(&rp->r_statelock);
1858 
1859 	seqid = nfs4_get_open_seqid(oop) + 1;
1860 
1861 	NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
1862 	    "access_close %d, acc_read %"PRIu64" acc_write %"PRIu64"",
1863 	    access_close, osp->os_share_acc_read, osp->os_share_acc_write));
1864 
1865 	/* If we're closing the last READ, need to downgrade */
1866 	if ((access_close & FREAD) && (osp->os_share_acc_read == 1))
1867 		downgrade_acc |= OPEN4_SHARE_ACCESS_READ;
1868 
1869 	/* if we're closing the last WRITE, need to downgrade */
1870 	if ((access_close & FWRITE) && (osp->os_share_acc_write == 1))
1871 		downgrade_acc |= OPEN4_SHARE_ACCESS_WRITE;
1872 
1873 	downgrade_deny = OPEN4_SHARE_DENY_NONE;
1874 
1875 	new_acc = 0;
1876 	new_deny = 0;
1877 
1878 	/* set our new access and deny share bits */
1879 	if ((osp->os_share_acc_read > 0) &&
1880 	    !(downgrade_acc & OPEN4_SHARE_ACCESS_READ))
1881 		new_acc |= OPEN4_SHARE_ACCESS_READ;
1882 	if ((osp->os_share_acc_write > 0) &&
1883 	    !(downgrade_acc & OPEN4_SHARE_ACCESS_WRITE))
1884 		new_acc |= OPEN4_SHARE_ACCESS_WRITE;
1885 
1886 	new_deny = OPEN4_SHARE_DENY_NONE;
1887 
1888 	NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
1889 	    "downgrade acc 0x%x deny 0x%x", downgrade_acc, downgrade_deny));
1890 	NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE, "nfs4_open_downgrade:"
1891 	    "new acc 0x%x deny 0x%x", new_acc, new_deny));
1892 
1893 	/*
1894 	 * Check to see if we aren't actually doing any downgrade or
1895 	 * if this is the last 'close' but the file is still mmapped.
1896 	 * Skip this if this a lost request resend so we don't decrement
1897 	 * the osp's share counts more than once.
1898 	 */
1899 	if (!lrp &&
1900 	    ((downgrade_acc == 0 && downgrade_deny == 0) ||
1901 	    (new_acc == 0 && new_deny == 0))) {
1902 		/*
1903 		 * No downgrade to do, but still need to
1904 		 * update osp's os_share_* counts.
1905 		 */
1906 		NFS4_DEBUG(nfs4_client_open_dg, (CE_NOTE,
1907 		    "nfs4_open_downgrade: just lower the osp's count by %s",
1908 		    (access_close & FREAD) && (access_close & FWRITE) ?
1909 		    "read and write" : (access_close & FREAD) ? "read" :
1910 		    (access_close & FWRITE) ? "write" : "bogus"));
1911 		if (access_close & FREAD)
1912 			osp->os_share_acc_read--;
1913 		if (access_close & FWRITE)
1914 			osp->os_share_acc_write--;
1915 		osp->os_share_deny_none--;
1916 		nfs4_error_zinit(ep);
1917 
1918 		goto no_args_out;
1919 	}
1920 
1921 	if (osp->os_orig_oo_name != oop->oo_name) {
1922 		ep->error = EIO;
1923 		goto no_args_out;
1924 	}
1925 
1926 	/* setup the COMPOUND args */
1927 	if (lrp)
1928 		args.ctag = TAG_OPEN_DG_LOST;
1929 	else
1930 		args.ctag = TAG_OPEN_DG;
1931 
1932 	args.array_len = 3;
1933 	args.array = argop;
1934 
1935 	/* putfh */
1936 	argop[0].argop = OP_CPUTFH;
1937 	argop[0].nfs_argop4_u.opcputfh.sfh = rp->r_fh;
1938 
1939 	argop[1].argop = OP_GETATTR;
1940 	argop[1].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
1941 	argop[1].nfs_argop4_u.opgetattr.mi = mi;
1942 
1943 	ASSERT(mutex_owned(&osp->os_sync_lock));
1944 	ASSERT(osp->os_delegation == FALSE);
1945 
1946 	/* open downgrade */
1947 	argop[2].argop = OP_OPEN_DOWNGRADE;
1948 	argop[2].nfs_argop4_u.opopen_downgrade.open_stateid = osp->open_stateid;
1949 	argop[2].nfs_argop4_u.opopen_downgrade.share_access = new_acc;
1950 	argop[2].nfs_argop4_u.opopen_downgrade.share_deny = new_deny;
1951 	argop[2].nfs_argop4_u.opopen_downgrade.seqid = seqid;
1952 
1953 	t = gethrtime();
1954 
1955 	rfs4call(mi, &args, &res, cred_otw, &doqueue, 0, ep);
1956 
1957 	if (ep->error == 0 && nfs4_need_to_bump_seqid(&res))
1958 		nfs4_set_open_seqid(seqid, oop, args.ctag);
1959 
1960 	if ((ep->error == EACCES ||
1961 	    (ep->error == 0 && res.status == NFS4ERR_ACCESS)) &&
1962 	    cred_otw != cr) {
1963 		crfree(cred_otw);
1964 		cred_otw = cr;
1965 		crhold(cred_otw);
1966 		if (!ep->error)
1967 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
1968 		goto cred_retry;
1969 	}
1970 
1971 	needrecov = nfs4_needs_recovery(ep, TRUE, mi->mi_vfsp);
1972 
1973 	if (needrecov && recov_credpp) {
1974 		*recov_credpp = cred_otw;
1975 		crhold(*recov_credpp);
1976 		if (recov_seqidp)
1977 			*recov_seqidp = seqid;
1978 	}
1979 
1980 	if (!ep->error && !res.status) {
1981 		/* get the open downgrade results */
1982 		resop = &res.array[2];
1983 		odg_res = &resop->nfs_resop4_u.opopen_downgrade;
1984 
1985 		osp->open_stateid = odg_res->open_stateid;
1986 
1987 		/* set the open streams new access/deny bits */
1988 		if (access_close & FREAD)
1989 			osp->os_share_acc_read--;
1990 		if (access_close & FWRITE)
1991 			osp->os_share_acc_write--;
1992 		osp->os_share_deny_none--;
1993 		osp->os_dc_openacc = new_acc;
1994 
1995 		nfs4_attr_cache(vp,
1996 		    &res.array[1].nfs_resop4_u.opgetattr.ga_res,
1997 		    t, cred_otw, TRUE, NULL);
1998 	}
1999 
2000 	if (!ep->error)
2001 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2002 
2003 no_args_out:
2004 	crfree(cred_otw);
2005 }
2006 
2007 /*
2008  * If an OPEN request gets ETIMEDOUT or EINTR (that includes bailing out
2009  * because the filesystem was forcibly unmounted) then we don't know if we
2010  * potentially left state dangling on the server, therefore the recovery
2011  * framework makes this call to resend the OPEN request and then undo it.
2012  */
2013 void
2014 nfs4_resend_open_otw(vnode_t **vpp, nfs4_lost_rqst_t *resend_rqstp,
2015 	nfs4_error_t *ep)
2016 {
2017 	COMPOUND4args_clnt	args;
2018 	COMPOUND4res_clnt	res;
2019 	nfs_argop4		argop[4];
2020 	GETFH4res		*gf_res = NULL;
2021 	OPEN4cargs		*open_args;
2022 	OPEN4res		*op_res;
2023 	char			*destcfp;
2024 	int			destclen;
2025 	nfs4_ga_res_t		*garp;
2026 	vnode_t			*dvp = NULL, *vp = NULL;
2027 	rnode4_t		*rp = NULL, *drp = NULL;
2028 	cred_t			*cr = NULL;
2029 	seqid4			seqid;
2030 	nfs4_open_owner_t	*oop = NULL;
2031 	nfs4_open_stream_t	*osp = NULL;
2032 	component4		*srcfp;
2033 	open_claim_type4	claim;
2034 	mntinfo4_t		*mi;
2035 	int			doqueue = 1;
2036 	bool_t			retry_open = FALSE;
2037 	int			created_osp = 0;
2038 	hrtime_t		t;
2039 	char 			*failed_msg = "";
2040 	int			fh_different;
2041 	int			reopen = 0;
2042 
2043 	nfs4_error_zinit(ep);
2044 
2045 	cr = resend_rqstp->lr_cr;
2046 	dvp = resend_rqstp->lr_dvp;
2047 
2048 	vp = *vpp;
2049 	if (vp) {
2050 		ASSERT(nfs4_consistent_type(vp));
2051 		rp = VTOR4(vp);
2052 	}
2053 
2054 	if (rp) {
2055 		/* If the file failed recovery, just quit. */
2056 		mutex_enter(&rp->r_statelock);
2057 		if (rp->r_flags & R4RECOVERR) {
2058 			mutex_exit(&rp->r_statelock);
2059 			ep->error = EIO;
2060 			return;
2061 		}
2062 		mutex_exit(&rp->r_statelock);
2063 	}
2064 
2065 	if (dvp) {
2066 		drp = VTOR4(dvp);
2067 		/* If the parent directory failed recovery, just quit. */
2068 		mutex_enter(&drp->r_statelock);
2069 		if (drp->r_flags & R4RECOVERR) {
2070 			mutex_exit(&drp->r_statelock);
2071 			ep->error = EIO;
2072 			return;
2073 		}
2074 		mutex_exit(&drp->r_statelock);
2075 	} else
2076 		reopen = 1;	/* NULL dvp means this is a reopen */
2077 
2078 	claim = resend_rqstp->lr_oclaim;
2079 	ASSERT(claim == CLAIM_NULL || claim == CLAIM_DELEGATE_CUR);
2080 
2081 	args.ctag = TAG_OPEN_LOST;
2082 	args.array_len = 4;
2083 	args.array = argop;
2084 
2085 	argop[0].argop = OP_CPUTFH;
2086 	if (reopen) {
2087 		ASSERT(vp != NULL);
2088 
2089 		mi = VTOMI4(vp);
2090 		/*
2091 		 * if this is a file mount then
2092 		 * use the mntinfo parentfh
2093 		 */
2094 		argop[0].nfs_argop4_u.opcputfh.sfh =
2095 		    (vp->v_flag & VROOT) ? mi->mi_srvparentfh :
2096 		    VTOSV(vp)->sv_dfh;
2097 		args.ctag = TAG_REOPEN_LOST;
2098 	} else {
2099 		argop[0].nfs_argop4_u.opcputfh.sfh = VTOR4(dvp)->r_fh;
2100 		mi = VTOMI4(dvp);
2101 	}
2102 
2103 	argop[1].argop = OP_COPEN;
2104 	open_args = &argop[1].nfs_argop4_u.opcopen;
2105 	open_args->claim = claim;
2106 
2107 	/*
2108 	 * If we sent over a OPEN with CREATE then the only
2109 	 * thing we care about is to not leave dangling state
2110 	 * on the server, not whether the file we potentially
2111 	 * created remains on the server.  So even though the
2112 	 * lost open request specified a CREATE, we only wish
2113 	 * to do a non-CREATE OPEN.
2114 	 */
2115 	open_args->opentype = OPEN4_NOCREATE;
2116 
2117 	srcfp = &resend_rqstp->lr_ofile;
2118 	destclen = srcfp->utf8string_len;
2119 	destcfp = kmem_alloc(destclen + 1, KM_SLEEP);
2120 	bcopy(srcfp->utf8string_val, destcfp, destclen);
2121 	destcfp[destclen] = '\0';
2122 	if (claim == CLAIM_DELEGATE_CUR) {
2123 		open_args->open_claim4_u.delegate_cur_info.delegate_stateid =
2124 		    resend_rqstp->lr_ostateid;
2125 		open_args->open_claim4_u.delegate_cur_info.cfile = destcfp;
2126 	} else {
2127 		open_args->open_claim4_u.cfile = destcfp;
2128 	}
2129 
2130 	open_args->share_access = resend_rqstp->lr_oacc;
2131 	open_args->share_deny = resend_rqstp->lr_odeny;
2132 	oop = resend_rqstp->lr_oop;
2133 	ASSERT(oop != NULL);
2134 
2135 	open_args->owner.clientid = mi2clientid(mi);
2136 	/* this length never changes */
2137 	open_args->owner.owner_len = sizeof (oop->oo_name);
2138 	open_args->owner.owner_val =
2139 	    kmem_alloc(open_args->owner.owner_len, KM_SLEEP);
2140 
2141 	ep->error = nfs4_start_open_seqid_sync(oop, mi);
2142 	ASSERT(ep->error == 0);		/* recov thread always succeeds */
2143 	/*
2144 	 * We can get away with not saving the seqid upon detection
2145 	 * of a lost request, and now just use the open owner's current
2146 	 * seqid since we only allow one op OTW per seqid and lost
2147 	 * requests are saved FIFO.
2148 	 */
2149 	seqid = nfs4_get_open_seqid(oop) + 1;
2150 	open_args->seqid = seqid;
2151 
2152 	bcopy(&oop->oo_name, open_args->owner.owner_val,
2153 	    open_args->owner.owner_len);
2154 
2155 	/* getfh */
2156 	argop[2].argop = OP_GETFH;
2157 
2158 	/* Construct the getattr part of the compound */
2159 	argop[3].argop = OP_GETATTR;
2160 	argop[3].nfs_argop4_u.opgetattr.attr_request = NFS4_VATTR_MASK;
2161 	argop[3].nfs_argop4_u.opgetattr.mi = mi;
2162 
2163 	res.array = NULL;
2164 
2165 	t = gethrtime();
2166 
2167 	rfs4call(mi, &args, &res, cr, &doqueue, 0, ep);
2168 
2169 	if (ep->error == 0 && nfs4_need_to_bump_seqid(&res))
2170 		nfs4_set_open_seqid(seqid, oop, args.ctag);
2171 
2172 	NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2173 	    "nfs4_resend_open_otw: error %d stat %d", ep->error, res.status));
2174 
2175 	if (ep->error || res.status)
2176 		goto err_out;
2177 
2178 	op_res = &res.array[1].nfs_resop4_u.opopen;
2179 	gf_res = &res.array[2].nfs_resop4_u.opgetfh;
2180 	garp = &res.array[3].nfs_resop4_u.opgetattr.ga_res;
2181 
2182 	if (!vp) {
2183 		int rnode_err = 0;
2184 		nfs4_sharedfh_t *sfh;
2185 
2186 		/*
2187 		 * If we can't decode all the attributes they are not usable,
2188 		 * just make the vnode.
2189 		 */
2190 
2191 		sfh = sfh4_get(&gf_res->object, VTOMI4(dvp));
2192 		*vpp = makenfs4node(sfh, garp, dvp->v_vfsp, t, cr, dvp,
2193 		    fn_get(VTOSV(dvp)->sv_name,
2194 		    open_args->open_claim4_u.cfile, sfh));
2195 		sfh4_rele(&sfh);
2196 		NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2197 		    "nfs4_resend_open_otw: made vp %p for file %s",
2198 		    (void *)(*vpp), open_args->open_claim4_u.cfile));
2199 
2200 		if (ep->error)
2201 			PURGE_ATTRCACHE4(*vpp);
2202 
2203 		/*
2204 		 * For the newly created *vpp case, make sure the rnode
2205 		 * isn't bad before using it.
2206 		 */
2207 		mutex_enter(&(VTOR4(*vpp))->r_statelock);
2208 		if (VTOR4(*vpp)->r_flags & R4RECOVERR)
2209 			rnode_err = EIO;
2210 		mutex_exit(&(VTOR4(*vpp))->r_statelock);
2211 
2212 		if (rnode_err) {
2213 			NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2214 			    "nfs4_resend_open_otw: rp %p is bad",
2215 			    (void *)VTOR4(*vpp)));
2216 			ep->error = rnode_err;
2217 			goto err_out;
2218 		}
2219 
2220 		vp = *vpp;
2221 		rp = VTOR4(vp);
2222 	}
2223 
2224 	if (reopen) {
2225 		/*
2226 		 * Check if the path we reopened really is the same
2227 		 * file. We could end up in a situation where the file
2228 		 * was removed and a new file created with the same name.
2229 		 */
2230 		(void) nfs_rw_enter_sig(&mi->mi_fh_lock, RW_READER, 0);
2231 		fh_different =
2232 		    (nfs4cmpfh(&rp->r_fh->sfh_fh, &gf_res->object) != 0);
2233 		if (fh_different) {
2234 			if (mi->mi_fh_expire_type == FH4_PERSISTENT ||
2235 			    mi->mi_fh_expire_type & FH4_NOEXPIRE_WITH_OPEN) {
2236 				/* Oops, we don't have the same file */
2237 				if (mi->mi_fh_expire_type == FH4_PERSISTENT)
2238 					failed_msg =
2239 					    "Couldn't reopen: Persistant "
2240 					    "file handle changed";
2241 				else
2242 					failed_msg =
2243 					    "Couldn't reopen: Volatile "
2244 					    "(no expire on open) file handle "
2245 					    "changed";
2246 
2247 				nfs4_end_open_seqid_sync(oop);
2248 				kmem_free(destcfp, destclen + 1);
2249 				nfs4args_copen_free(open_args);
2250 				(void) xdr_free(xdr_COMPOUND4res_clnt,
2251 				    (caddr_t)&res);
2252 				nfs_rw_exit(&mi->mi_fh_lock);
2253 				nfs4_fail_recov(vp, failed_msg, ep->error,
2254 				    ep->stat);
2255 				return;
2256 			} else {
2257 				/*
2258 				 * We have volatile file handles that don't
2259 				 * compare.  If the fids are the same then we
2260 				 * assume that the file handle expired but the
2261 				 * rnode still refers to the same file object.
2262 				 *
2263 				 * First check that we have fids or not.
2264 				 * If we don't we have a dumb server so we will
2265 				 * just assume everything is ok for now.
2266 				 */
2267 				if (!ep->error &&
2268 				    garp->n4g_va.va_mask & AT_NODEID &&
2269 				    rp->r_attr.va_mask & AT_NODEID &&
2270 				    rp->r_attr.va_nodeid !=
2271 				    garp->n4g_va.va_nodeid) {
2272 					/*
2273 					 * We have fids, but they don't
2274 					 * compare. So kill the file.
2275 					 */
2276 					failed_msg =
2277 					    "Couldn't reopen: file handle "
2278 					    "changed due to mismatched fids";
2279 					nfs4_end_open_seqid_sync(oop);
2280 					kmem_free(destcfp, destclen + 1);
2281 					nfs4args_copen_free(open_args);
2282 					(void) xdr_free(xdr_COMPOUND4res_clnt,
2283 					    (caddr_t)&res);
2284 					nfs_rw_exit(&mi->mi_fh_lock);
2285 					nfs4_fail_recov(vp, failed_msg,
2286 					    ep->error, ep->stat);
2287 					return;
2288 				} else {
2289 					/*
2290 					 * We have volatile file handles that
2291 					 * refer to the same file (at least
2292 					 * they have the same fid) or we don't
2293 					 * have fids so we can't tell. :(. We'll
2294 					 * be a kind and accepting client so
2295 					 * we'll update the rnode's file
2296 					 * handle with the otw handle.
2297 					 *
2298 					 * We need to drop mi->mi_fh_lock since
2299 					 * sfh4_update acquires it. Since there
2300 					 * is only one recovery thread there is
2301 					 * no race.
2302 					 */
2303 					nfs_rw_exit(&mi->mi_fh_lock);
2304 					sfh4_update(rp->r_fh, &gf_res->object);
2305 				}
2306 			}
2307 		} else {
2308 			nfs_rw_exit(&mi->mi_fh_lock);
2309 		}
2310 	}
2311 
2312 	ASSERT(nfs4_consistent_type(vp));
2313 
2314 	if (op_res->rflags & OPEN4_RESULT_CONFIRM)
2315 		nfs4open_confirm(vp, &seqid, &op_res->stateid, cr, TRUE,
2316 		    &retry_open, oop, TRUE, ep, NULL);
2317 	if (ep->error || ep->stat) {
2318 		nfs4_end_open_seqid_sync(oop);
2319 		kmem_free(destcfp, destclen + 1);
2320 		nfs4args_copen_free(open_args);
2321 		if (!ep->error)
2322 			(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2323 		return;
2324 	}
2325 
2326 	if (reopen) {
2327 		/*
2328 		 * Doing a reopen here so the osp should already exist.
2329 		 * If not, something changed or went very wrong.
2330 		 *
2331 		 * returns with 'os_sync_lock' held
2332 		 */
2333 		osp = find_open_stream(oop, rp);
2334 		if (!osp) {
2335 			NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2336 			    "nfs4_resend_open_otw: couldn't find osp"));
2337 			ep->error = EINVAL;
2338 			goto err_out;
2339 		}
2340 		osp->os_open_ref_count++;
2341 	} else {
2342 		mutex_enter(&oop->oo_lock);
2343 		oop->oo_just_created = NFS4_PERM_CREATED;
2344 		mutex_exit(&oop->oo_lock);
2345 
2346 		/* returns with 'os_sync_lock' held */
2347 		osp = find_or_create_open_stream(oop, rp, &created_osp);
2348 		if (!osp) {
2349 			NFS4_DEBUG(nfs4_lost_rqst_debug, (CE_NOTE,
2350 			    "nfs4_resend_open_otw: couldn't create osp"));
2351 			ep->error = EINVAL;
2352 			goto err_out;
2353 		}
2354 	}
2355 
2356 	osp->open_stateid = op_res->stateid;
2357 	osp->os_delegation = FALSE;
2358 	/*
2359 	 * Need to reset this bitfield for the possible case where we were
2360 	 * going to OTW CLOSE the file, got a non-recoverable error, and before
2361 	 * we could retry the CLOSE, OPENed the file again.
2362 	 */
2363 	ASSERT(osp->os_open_owner->oo_seqid_inuse);
2364 	osp->os_final_close = 0;
2365 	osp->os_force_close = 0;
2366 
2367 	if (!reopen) {
2368 		if (open_args->share_access & OPEN4_SHARE_ACCESS_READ)
2369 			osp->os_share_acc_read++;
2370 		if (open_args->share_access & OPEN4_SHARE_ACCESS_WRITE)
2371 			osp->os_share_acc_write++;
2372 		osp->os_share_deny_none++;
2373 	}
2374 
2375 	mutex_exit(&osp->os_sync_lock);
2376 	if (created_osp)
2377 		nfs4_inc_state_ref_count(mi);
2378 	open_stream_rele(osp, rp);
2379 
2380 	nfs4_end_open_seqid_sync(oop);
2381 
2382 	/* accept delegation, if any */
2383 	nfs4_delegation_accept(rp, claim, op_res, garp, cr);
2384 
2385 	kmem_free(destcfp, destclen + 1);
2386 	nfs4args_copen_free(open_args);
2387 
2388 	if (claim == CLAIM_DELEGATE_CUR)
2389 		nfs4_attr_cache(vp, garp, t, cr, TRUE, NULL);
2390 	else
2391 		PURGE_ATTRCACHE4(vp);
2392 
2393 	(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2394 
2395 	ASSERT(nfs4_consistent_type(vp));
2396 
2397 	return;
2398 
2399 err_out:
2400 	nfs4_end_open_seqid_sync(oop);
2401 	kmem_free(destcfp, destclen + 1);
2402 	nfs4args_copen_free(open_args);
2403 	if (!ep->error)
2404 		(void) xdr_free(xdr_COMPOUND4res_clnt, (caddr_t)&res);
2405 }
2406