xref: /titanic_41/usr/src/uts/common/fs/nfs/nfs4_state.c (revision 99653d4ee642c6528e88224f12409a5f23060994)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/systm.h>
29 #include <sys/kmem.h>
30 #include <sys/cmn_err.h>
31 #include <sys/atomic.h>
32 #include <sys/clconf.h>
33 #include <sys/cladm.h>
34 #include <sys/flock.h>
35 #include <nfs/export.h>
36 #include <nfs/nfs.h>
37 #include <nfs/nfs4.h>
38 #include <nfs/nfssys.h>
39 #include <nfs/lm.h>
40 #include <sys/pathname.h>
41 #include <sys/nvpair.h>
42 
43 
44 extern time_t rfs4_start_time;
45 
/*
 * The all-zeros special stateid: first member 0 and all twelve bytes of
 * the second member 0.
 */
stateid4 special0 = {
	0,
	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
};

/*
 * The all-ones special stateid: first member 0xffffffff and all twelve
 * bytes of the second member 0xff.
 */
stateid4 special1 = {
	0xffffffff,
	{
		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
		(char)0xff, (char)0xff, (char)0xff, (char)0xff
	}
};


/*
 * True when stateid4_cmp() reports a match between "id" and either of the
 * two special stateids above.  "id" is evaluated twice.
 */
#define	ISSPECIAL(id)  (stateid4_cmp(id, &special0) || \
			stateid4_cmp(id, &special1))
63 
64 /* For embedding the cluster nodeid into our clientid */
65 #define	CLUSTER_NODEID_SHIFT	24
66 #define	CLUSTER_MAX_NODEID	255
67 
68 #ifdef DEBUG
69 int rfs4_debug;
70 #endif
71 
72 static uint32_t rfs4_database_debug = 0x00;
73 
74 static void rfs4_ss_clid_write(rfs4_client_t *cp, char *leaf);
75 static void rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dir, char *leaf);
76 static void rfs4_dss_clear_oldstate(rfs4_servinst_t *sip);
77 static void rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip);
78 
79 /*
80  * Couple of simple init/destroy functions for a general waiter
81  */
82 void
83 rfs4_sw_init(rfs4_state_wait_t *swp)
84 {
85 	mutex_init(swp->sw_cv_lock, NULL, MUTEX_DEFAULT, NULL);
86 	cv_init(swp->sw_cv, NULL, CV_DEFAULT, NULL);
87 	swp->sw_active = FALSE;
88 	swp->sw_wait_count = 0;
89 }
90 
91 void
92 rfs4_sw_destroy(rfs4_state_wait_t *swp)
93 {
94 	mutex_destroy(swp->sw_cv_lock);
95 	cv_destroy(swp->sw_cv);
96 }
97 
98 void
99 rfs4_sw_enter(rfs4_state_wait_t *swp)
100 {
101 	mutex_enter(swp->sw_cv_lock);
102 	while (swp->sw_active) {
103 		swp->sw_wait_count++;
104 		cv_wait(swp->sw_cv, swp->sw_cv_lock);
105 		swp->sw_wait_count--;
106 	}
107 	ASSERT(swp->sw_active == FALSE);
108 	swp->sw_active = TRUE;
109 	mutex_exit(swp->sw_cv_lock);
110 }
111 
112 void
113 rfs4_sw_exit(rfs4_state_wait_t *swp)
114 {
115 	mutex_enter(swp->sw_cv_lock);
116 	ASSERT(swp->sw_active == TRUE);
117 	swp->sw_active = FALSE;
118 	if (swp->sw_wait_count != 0)
119 		cv_broadcast(swp->sw_cv);
120 	mutex_exit(swp->sw_cv_lock);
121 }
122 
123 /*
124  * CPR callback id -- not related to v4 callbacks
125  */
126 static callb_id_t cpr_id = 0;
127 
128 static void
129 deep_lock_copy(LOCK4res *dres, LOCK4res *sres)
130 {
131 	lock_owner4 *slo = &sres->LOCK4res_u.denied.owner;
132 	lock_owner4 *dlo = &dres->LOCK4res_u.denied.owner;
133 
134 	if (sres->status == NFS4ERR_DENIED) {
135 		dlo->owner_val = kmem_alloc(slo->owner_len, KM_SLEEP);
136 		bcopy(slo->owner_val, dlo->owner_val, slo->owner_len);
137 	}
138 }
139 
140 static void
141 deep_lock_free(LOCK4res *res)
142 {
143 	lock_owner4 *lo = &res->LOCK4res_u.denied.owner;
144 
145 	if (res->status == NFS4ERR_DENIED)
146 		kmem_free(lo->owner_val, lo->owner_len);
147 }
148 
149 static void
150 deep_open_copy(OPEN4res *dres, OPEN4res *sres)
151 {
152 	nfsace4 *sacep, *dacep;
153 
154 	if (sres->status != NFS4_OK) {
155 		return;
156 	}
157 
158 	dres->attrset = sres->attrset;
159 
160 	switch (sres->delegation.delegation_type) {
161 	case OPEN_DELEGATE_NONE:
162 		return;
163 	case OPEN_DELEGATE_READ:
164 		sacep = &sres->delegation.open_delegation4_u.read.permissions;
165 		dacep = &dres->delegation.open_delegation4_u.read.permissions;
166 		break;
167 	case OPEN_DELEGATE_WRITE:
168 		sacep = &sres->delegation.open_delegation4_u.write.permissions;
169 		dacep = &dres->delegation.open_delegation4_u.write.permissions;
170 		break;
171 	}
172 	dacep->who.utf8string_val =
173 		kmem_alloc(sacep->who.utf8string_len, KM_SLEEP);
174 	bcopy(sacep->who.utf8string_val, dacep->who.utf8string_val,
175 	    sacep->who.utf8string_len);
176 }
177 
178 static void
179 deep_open_free(OPEN4res *res)
180 {
181 	nfsace4 *acep;
182 	if (res->status != NFS4_OK)
183 		return;
184 
185 	switch (res->delegation.delegation_type) {
186 	case OPEN_DELEGATE_NONE:
187 		return;
188 	case OPEN_DELEGATE_READ:
189 		acep = &res->delegation.open_delegation4_u.read.permissions;
190 		break;
191 	case OPEN_DELEGATE_WRITE:
192 		acep = &res->delegation.open_delegation4_u.write.permissions;
193 		break;
194 	}
195 
196 	if (acep->who.utf8string_val) {
197 		kmem_free(acep->who.utf8string_val, acep->who.utf8string_len);
198 		acep->who.utf8string_val = NULL;
199 	}
200 }
201 
202 void
203 rfs4_free_reply(nfs_resop4 *rp)
204 {
205 	switch (rp->resop) {
206 	case OP_LOCK:
207 		deep_lock_free(&rp->nfs_resop4_u.oplock);
208 		break;
209 	case OP_OPEN:
210 		deep_open_free(&rp->nfs_resop4_u.opopen);
211 	default:
212 		break;
213 	}
214 }
215 
216 void
217 rfs4_copy_reply(nfs_resop4 *dst, nfs_resop4 *src)
218 {
219 	*dst = *src;
220 
221 	/* Handle responses that need deep copy */
222 	switch (src->resop) {
223 	case OP_LOCK:
224 		deep_lock_copy(&dst->nfs_resop4_u.oplock,
225 			    &src->nfs_resop4_u.oplock);
226 		break;
227 	case OP_OPEN:
228 		deep_open_copy(&dst->nfs_resop4_u.opopen,
229 			    &src->nfs_resop4_u.opopen);
230 		break;
231 	default:
232 		break;
233 	};
234 }
235 
236 /*
237  * This is the implementation of the underlying state engine. The
238  * public interface to this engine is described by
239  * nfs4_state.h. Callers to the engine should hold no state engine
240  * locks when they call in to it. If the protocol needs to lock data
241  * structures it should do so after acquiring all references to them
242  * first and then follow the following lock order:
243  *
244  *	client > openowner > state > lo_state > lockowner > file.
245  *
246  * Internally we only allow a thread to hold one hash bucket lock at a
247  * time and the lock is higher in the lock order (must be acquired
248  * first) than the data structure that is on that hash list.
249  *
250  * If a new reference was acquired by the caller, that reference needs
251  * to be released after releasing all acquired locks with the
252  * corresponding rfs4_*_rele routine.
253  */
254 
/*
 * This code is somewhat prototypical for now. Its purpose currently is to
 * implement the interfaces sufficiently to finish the higher protocol
 * elements. This will be replaced by dynamically resizeable tables
 * backed by the kmem_cache allocator. However, synchronization is handled
 * correctly (I hope) and will not change by much.  The mutexes for
 * the hash buckets that can be used to create new instances of data
 * structures might be good candidates to evolve into reader/writer
 * locks. If it has to do a creation, it would be holding the
 * mutex across a kmem_alloc with KM_SLEEP specified.
 */
266 
267 #ifdef DEBUG
268 #define	TABSIZE 17
269 #else
270 #define	TABSIZE 2047
271 #endif
272 
273 #define	ADDRHASH(key) ((unsigned long)(key) >> 3)
274 
275 /* Used to serialize create/destroy of rfs4_server_state database */
276 kmutex_t	rfs4_state_lock;
277 static rfs4_database_t *rfs4_server_state = NULL;
278 
279 /* Used to serialize lookups of clientids */
280 static	krwlock_t	rfs4_findclient_lock;
281 
282 /*
283  * For now this "table" is exposed so that the CPR callback
284  * function can tromp through it..
285  */
286 rfs4_table_t *rfs4_client_tab;
287 
288 static rfs4_index_t *rfs4_clientid_idx;
289 static rfs4_index_t *rfs4_nfsclnt_idx;
290 static rfs4_table_t *rfs4_openowner_tab;
291 static rfs4_index_t *rfs4_openowner_idx;
292 static rfs4_table_t *rfs4_state_tab;
293 static rfs4_index_t *rfs4_state_idx;
294 static rfs4_index_t *rfs4_state_owner_file_idx;
295 static rfs4_index_t *rfs4_state_file_idx;
296 static rfs4_table_t *rfs4_lo_state_tab;
297 static rfs4_index_t *rfs4_lo_state_idx;
298 static rfs4_index_t *rfs4_lo_state_owner_idx;
299 static rfs4_table_t *rfs4_lockowner_tab;
300 static rfs4_index_t *rfs4_lockowner_idx;
301 static rfs4_index_t *rfs4_lockowner_pid_idx;
302 static rfs4_table_t *rfs4_file_tab;
303 static rfs4_index_t *rfs4_file_idx;
304 static rfs4_table_t *rfs4_deleg_state_tab;
305 static rfs4_index_t *rfs4_deleg_idx;
306 static rfs4_index_t *rfs4_deleg_state_idx;
307 
/* Parenthesized so the product survives use inside larger expressions */
#define	MAXTABSZ (1024*1024)
309 
310 /* The values below are rfs4_lease_time units */
311 
312 #ifdef DEBUG
313 #define	CLIENT_CACHE_TIME 1
314 #define	OPENOWNER_CACHE_TIME 1
315 #define	STATE_CACHE_TIME 1
316 #define	LO_STATE_CACHE_TIME 1
317 #define	LOCKOWNER_CACHE_TIME 1
318 #define	FILE_CACHE_TIME 3
319 #define	DELEG_STATE_CACHE_TIME 1
320 #else
321 #define	CLIENT_CACHE_TIME 10
322 #define	OPENOWNER_CACHE_TIME 5
323 #define	STATE_CACHE_TIME 1
324 #define	LO_STATE_CACHE_TIME 1
325 #define	LOCKOWNER_CACHE_TIME 3
326 #define	FILE_CACHE_TIME 40
327 #define	DELEG_STATE_CACHE_TIME 1
328 #endif
329 
330 
331 static time_t rfs4_client_cache_time = 0;
332 static time_t rfs4_openowner_cache_time = 0;
333 static time_t rfs4_state_cache_time = 0;
334 static time_t rfs4_lo_state_cache_time = 0;
335 static time_t rfs4_lockowner_cache_time = 0;
336 static time_t rfs4_file_cache_time = 0;
337 static time_t rfs4_deleg_state_cache_time = 0;
338 
339 static bool_t rfs4_client_create(rfs4_entry_t, void *);
340 static void rfs4_dss_remove_cpleaf(rfs4_client_t *);
341 static void rfs4_dss_remove_leaf(rfs4_servinst_t *, char *, char *);
342 static void rfs4_client_destroy(rfs4_entry_t);
343 static bool_t rfs4_client_expiry(rfs4_entry_t);
344 static uint32_t clientid_hash(void *);
345 static bool_t clientid_compare(rfs4_entry_t, void *);
346 static void *clientid_mkkey(rfs4_entry_t);
347 static uint32_t nfsclnt_hash(void *);
348 static bool_t nfsclnt_compare(rfs4_entry_t, void *);
349 static void *nfsclnt_mkkey(rfs4_entry_t);
350 static bool_t rfs4_openowner_create(rfs4_entry_t, void *);
351 static void rfs4_openowner_destroy(rfs4_entry_t);
352 static bool_t rfs4_openowner_expiry(rfs4_entry_t);
353 static uint32_t openowner_hash(void *);
354 static bool_t openowner_compare(rfs4_entry_t, void *);
355 static void *openowner_mkkey(rfs4_entry_t);
356 static bool_t rfs4_state_create(rfs4_entry_t, void *);
357 static void rfs4_state_destroy(rfs4_entry_t);
358 static bool_t rfs4_state_expiry(rfs4_entry_t);
359 static uint32_t state_hash(void *);
360 static bool_t state_compare(rfs4_entry_t, void *);
361 static void *state_mkkey(rfs4_entry_t);
362 static uint32_t state_owner_file_hash(void *);
363 static bool_t state_owner_file_compare(rfs4_entry_t, void *);
364 static void *state_owner_file_mkkey(rfs4_entry_t);
365 static uint32_t state_file_hash(void *);
366 static bool_t state_file_compare(rfs4_entry_t, void *);
367 static void *state_file_mkkey(rfs4_entry_t);
368 static bool_t rfs4_lo_state_create(rfs4_entry_t, void *);
369 static void rfs4_lo_state_destroy(rfs4_entry_t);
370 static bool_t rfs4_lo_state_expiry(rfs4_entry_t);
371 static uint32_t lo_state_hash(void *);
372 static bool_t lo_state_compare(rfs4_entry_t, void *);
373 static void *lo_state_mkkey(rfs4_entry_t);
374 static uint32_t lo_state_lo_hash(void *);
375 static bool_t lo_state_lo_compare(rfs4_entry_t, void *);
376 static void *lo_state_lo_mkkey(rfs4_entry_t);
377 static bool_t rfs4_lockowner_create(rfs4_entry_t, void *);
378 static void rfs4_lockowner_destroy(rfs4_entry_t);
379 static bool_t rfs4_lockowner_expiry(rfs4_entry_t);
380 static uint32_t lockowner_hash(void *);
381 static bool_t lockowner_compare(rfs4_entry_t, void *);
382 static void *lockowner_mkkey(rfs4_entry_t);
383 static uint32_t pid_hash(void *);
384 static bool_t pid_compare(rfs4_entry_t, void *);
385 static void *pid_mkkey(rfs4_entry_t);
386 static bool_t rfs4_file_create(rfs4_entry_t, void *);
387 static void rfs4_file_destroy(rfs4_entry_t);
388 static uint32_t file_hash(void *);
389 static bool_t file_compare(rfs4_entry_t, void *);
390 static void *file_mkkey(rfs4_entry_t);
391 static bool_t rfs4_deleg_state_create(rfs4_entry_t, void *);
392 static void rfs4_deleg_state_destroy(rfs4_entry_t);
393 static bool_t rfs4_deleg_state_expiry(rfs4_entry_t);
394 static uint32_t deleg_hash(void *);
395 static bool_t deleg_compare(rfs4_entry_t, void *);
396 static void *deleg_mkkey(rfs4_entry_t);
397 static uint32_t deleg_state_hash(void *);
398 static bool_t deleg_state_compare(rfs4_entry_t, void *);
399 static void *deleg_state_mkkey(rfs4_entry_t);
400 
401 static void rfs4_state_rele_nounlock(rfs4_state_t *);
402 
403 static int rfs4_ss_enabled = 0;
404 
405 extern void (*rfs4_client_clrst)(struct nfs4clrst_args *);
406 
407 void
408 rfs4_ss_pnfree(rfs4_ss_pn_t *ss_pn)
409 {
410 	kmem_free(ss_pn, sizeof (rfs4_ss_pn_t));
411 }
412 
413 static rfs4_ss_pn_t *
414 rfs4_ss_pnalloc(char *dir, char *leaf)
415 {
416 	rfs4_ss_pn_t *ss_pn;
417 	int 	dir_len, leaf_len;
418 
419 	/*
420 	 * validate we have a resonable path
421 	 * (account for the '/' and trailing null)
422 	 */
423 	if ((dir_len = strlen(dir)) > MAXPATHLEN ||
424 		(leaf_len = strlen(leaf)) > MAXNAMELEN ||
425 		(dir_len + leaf_len + 2) > MAXPATHLEN) {
426 		return (NULL);
427 	}
428 
429 	ss_pn = kmem_alloc(sizeof (rfs4_ss_pn_t), KM_SLEEP);
430 
431 	(void) snprintf(ss_pn->pn, MAXPATHLEN, "%s/%s", dir, leaf);
432 	/* Handy pointer to just the leaf name */
433 	ss_pn->leaf = ss_pn->pn + dir_len + 1;
434 	return (ss_pn);
435 }
436 
437 
438 /*
439  * Move the "leaf" filename from "sdir" directory
440  * to the "ddir" directory. Return the pathname of
441  * the destination unless the rename fails in which
442  * case we need to return the source pathname.
443  */
444 static rfs4_ss_pn_t *
445 rfs4_ss_movestate(char *sdir, char *ddir, char *leaf)
446 {
447 	rfs4_ss_pn_t *src, *dst;
448 
449 	if ((src = rfs4_ss_pnalloc(sdir, leaf)) == NULL)
450 		return (NULL);
451 
452 	if ((dst = rfs4_ss_pnalloc(ddir, leaf)) == NULL) {
453 		rfs4_ss_pnfree(src);
454 		return (NULL);
455 	}
456 
457 	/*
458 	 * If the rename fails we shall return the src
459 	 * pathname and free the dst. Otherwise we need
460 	 * to free the src and return the dst pathanme.
461 	 */
462 	if (vn_rename(src->pn, dst->pn, UIO_SYSSPACE)) {
463 		rfs4_ss_pnfree(dst);
464 		return (src);
465 	}
466 	rfs4_ss_pnfree(src);
467 	return (dst);
468 }
469 
470 
471 static rfs4_oldstate_t *
472 rfs4_ss_getstate(vnode_t *dvp, rfs4_ss_pn_t *ss_pn)
473 {
474 	struct uio uio;
475 	struct iovec iov[3];
476 
477 	rfs4_oldstate_t *cl_ss = NULL;
478 	vnode_t *vp;
479 	vattr_t va;
480 	uint_t id_len;
481 	int err, kill_file, file_vers;
482 
483 	if (ss_pn == NULL)
484 		return (NULL);
485 
486 	/*
487 	 * open the state file.
488 	 */
489 	if (vn_open(ss_pn->pn, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0) != 0) {
490 		return (NULL);
491 	}
492 
493 	if (vp->v_type != VREG) {
494 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
495 		VN_RELE(vp);
496 		return (NULL);
497 	}
498 
499 	err = VOP_ACCESS(vp, VREAD, 0, CRED());
500 	if (err) {
501 		/*
502 		 * We don't have read access? better get the heck out.
503 		 */
504 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
505 		VN_RELE(vp);
506 		return (NULL);
507 	}
508 
509 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
510 	/*
511 	 * get the file size to do some basic validation
512 	 */
513 	va.va_mask = AT_SIZE;
514 	err = VOP_GETATTR(vp, &va, 0, CRED());
515 
516 	kill_file = (va.va_size == 0 || va.va_size <
517 		(NFS4_VERIFIER_SIZE + sizeof (uint_t)+1));
518 
519 	if (err || kill_file) {
520 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
521 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
522 		VN_RELE(vp);
523 		if (kill_file) {
524 			(void) VOP_REMOVE(dvp, ss_pn->leaf, CRED());
525 		}
526 		return (NULL);
527 	}
528 
529 	cl_ss = kmem_alloc(sizeof (rfs4_oldstate_t), KM_SLEEP);
530 
531 	/*
532 	 * build iovecs to read in the file_version, verifier and id_len
533 	 */
534 	iov[0].iov_base = (caddr_t)&file_vers;
535 	iov[0].iov_len = sizeof (int);
536 	iov[1].iov_base = (caddr_t)&cl_ss->cl_id4.verifier;
537 	iov[1].iov_len = NFS4_VERIFIER_SIZE;
538 	iov[2].iov_base = (caddr_t)&id_len;
539 	iov[2].iov_len = sizeof (uint_t);
540 
541 	uio.uio_iov = iov;
542 	uio.uio_iovcnt = 3;
543 	uio.uio_segflg = UIO_SYSSPACE;
544 	uio.uio_loffset = 0;
545 	uio.uio_resid = sizeof (int) + NFS4_VERIFIER_SIZE + sizeof (uint_t);
546 
547 	if (err = VOP_READ(vp, &uio, FREAD, CRED(), NULL)) {
548 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
549 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
550 		VN_RELE(vp);
551 		kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
552 		return (NULL);
553 	}
554 
555 	/*
556 	 * if the file_version doesn't match or if the
557 	 * id_len is zero or the combination of the verifier,
558 	 * id_len and id_val is bigger than the file we have
559 	 * a problem. If so ditch the file.
560 	 */
561 	kill_file = (file_vers != NFS4_SS_VERSION || id_len == 0 ||
562 	    (id_len + NFS4_VERIFIER_SIZE + sizeof (uint_t)) > va.va_size);
563 
564 	if (err || kill_file) {
565 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
566 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
567 		VN_RELE(vp);
568 		kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
569 		if (kill_file) {
570 			(void) VOP_REMOVE(dvp, ss_pn->leaf, CRED());
571 		}
572 		return (NULL);
573 	}
574 
575 	/*
576 	 * now get the client id value
577 	 */
578 	cl_ss->cl_id4.id_val = kmem_alloc(id_len, KM_SLEEP);
579 	iov[0].iov_base = cl_ss->cl_id4.id_val;
580 	iov[0].iov_len = id_len;
581 
582 	uio.uio_iov = iov;
583 	uio.uio_iovcnt = 1;
584 	uio.uio_segflg = UIO_SYSSPACE;
585 	uio.uio_resid = cl_ss->cl_id4.id_len = id_len;
586 
587 	if (err = VOP_READ(vp, &uio, FREAD, CRED(), NULL)) {
588 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
589 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
590 		VN_RELE(vp);
591 		kmem_free(cl_ss->cl_id4.id_val, id_len);
592 		kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
593 		return (NULL);
594 	}
595 
596 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
597 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
598 	VN_RELE(vp);
599 	return (cl_ss);
600 }
601 
/*
 * Step from one dirent64 to the next within a readdir buffer by
 * advancing d_reclen bytes.  Any earlier definition of nextdp is
 * discarded first.
 */
#ifdef	nextdp
#undef nextdp
#endif
#define	nextdp(dp)	((struct dirent64 *)((char *)(dp) + (dp)->d_reclen))
606 
607 /*
608  * Add entries from statedir to supplied oldstate list.
609  * Optionally, move all entries from statedir -> destdir.
610  */
611 void
612 rfs4_ss_oldstate(rfs4_oldstate_t *oldstate, char *statedir, char *destdir)
613 {
614 	rfs4_ss_pn_t *ss_pn;
615 	rfs4_oldstate_t *cl_ss = NULL;
616 	char	*dirt = NULL;
617 	int	err, dir_eof = 0, size = 0;
618 	vnode_t *dvp;
619 	struct iovec iov;
620 	struct uio uio;
621 	struct dirent64 *dep;
622 	offset_t dirchunk_offset = 0;
623 
624 	/*
625 	 * open the state directory
626 	 */
627 	if (vn_open(statedir, UIO_SYSSPACE, FREAD, 0, &dvp, 0, 0))
628 		return;
629 
630 	if (dvp->v_type != VDIR || VOP_ACCESS(dvp, VREAD, 0, CRED()))
631 		goto out;
632 
633 	dirt = kmem_alloc(RFS4_SS_DIRSIZE, KM_SLEEP);
634 
635 	/*
636 	 * Get and process the directory entries
637 	 */
638 	while (!dir_eof) {
639 		(void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL);
640 		iov.iov_base = dirt;
641 		iov.iov_len = RFS4_SS_DIRSIZE;
642 		uio.uio_iov = &iov;
643 		uio.uio_iovcnt = 1;
644 		uio.uio_segflg = UIO_SYSSPACE;
645 		uio.uio_loffset = dirchunk_offset;
646 		uio.uio_resid = RFS4_SS_DIRSIZE;
647 
648 		err = VOP_READDIR(dvp, &uio, CRED(), &dir_eof);
649 		VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL);
650 		if (err)
651 			goto out;
652 
653 		size = RFS4_SS_DIRSIZE - uio.uio_resid;
654 
655 		/*
656 		 * Process all the directory entries in this
657 		 * readdir chunk
658 		 */
659 		for (dep = (struct dirent64 *)dirt; size > 0;
660 			dep = nextdp(dep)) {
661 
662 			size -= dep->d_reclen;
663 			dirchunk_offset = dep->d_off;
664 
665 			/*
666 			 * Skip '.' and '..'
667 			 */
668 			if (NFS_IS_DOTNAME(dep->d_name))
669 				continue;
670 
671 			ss_pn = rfs4_ss_pnalloc(statedir, dep->d_name);
672 			if (ss_pn == NULL)
673 				continue;
674 
675 			if (cl_ss = rfs4_ss_getstate(dvp, ss_pn)) {
676 				if (destdir != NULL) {
677 					rfs4_ss_pnfree(ss_pn);
678 					cl_ss->ss_pn = rfs4_ss_movestate(
679 						statedir, destdir, dep->d_name);
680 				} else {
681 					cl_ss->ss_pn = ss_pn;
682 				}
683 				insque(cl_ss, oldstate);
684 			} else {
685 				rfs4_ss_pnfree(ss_pn);
686 			}
687 		}
688 	}
689 
690 out:
691 	(void) VOP_CLOSE(dvp, FREAD, 1, (offset_t)0, CRED());
692 	VN_RELE(dvp);
693 	if (dirt)
694 		kmem_free((caddr_t)dirt, RFS4_SS_DIRSIZE);
695 }
696 
697 static void
698 rfs4_ss_init(void)
699 {
700 	int npaths = 1;
701 	char *default_dss_path = NFS4_DSS_VAR_DIR;
702 
703 	/* read the default stable storage state */
704 	rfs4_dss_readstate(npaths, &default_dss_path);
705 
706 	rfs4_ss_enabled = 1;
707 }
708 
709 static void
710 rfs4_ss_fini(void)
711 {
712 	rfs4_servinst_t *sip;
713 
714 	mutex_enter(&rfs4_servinst_lock);
715 	sip = rfs4_cur_servinst;
716 	while (sip != NULL) {
717 		rfs4_dss_clear_oldstate(sip);
718 		sip = sip->next;
719 	}
720 	mutex_exit(&rfs4_servinst_lock);
721 }
722 
723 /*
724  * Remove all oldstate files referenced by this servinst.
725  */
726 static void
727 rfs4_dss_clear_oldstate(rfs4_servinst_t *sip)
728 {
729 	rfs4_oldstate_t *os_head, *osp;
730 
731 	rw_enter(&sip->oldstate_lock, RW_WRITER);
732 	os_head = sip->oldstate;
733 
734 	if (os_head == NULL)
735 		return;
736 
737 	/* skip dummy entry */
738 	osp = os_head->next;
739 	while (osp != os_head) {
740 		char *leaf = osp->ss_pn->leaf;
741 		rfs4_oldstate_t *os_next;
742 
743 		rfs4_dss_remove_leaf(sip, NFS4_DSS_OLDSTATE_LEAF, leaf);
744 
745 		if (osp->cl_id4.id_val)
746 			kmem_free(osp->cl_id4.id_val, osp->cl_id4.id_len);
747 		if (osp->ss_pn)
748 			kmem_free(osp->ss_pn, sizeof (rfs4_ss_pn_t));
749 
750 		os_next = osp->next;
751 		remque(osp);
752 		kmem_free(osp, sizeof (rfs4_oldstate_t));
753 		osp = os_next;
754 	}
755 
756 	/* free dummy entry */
757 	kmem_free(osp, sizeof (rfs4_oldstate_t));
758 
759 	sip->oldstate = NULL;
760 
761 	rw_exit(&sip->oldstate_lock);
762 }
763 
764 /*
765  * Form the state and oldstate paths, and read in the stable storage files.
766  */
767 void
768 rfs4_dss_readstate(int npaths, char **paths)
769 {
770 	int i;
771 	char *state, *oldstate;
772 
773 	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
774 	oldstate = kmem_alloc(MAXPATHLEN, KM_SLEEP);
775 
776 	for (i = 0; i < npaths; i++) {
777 		char *path = paths[i];
778 
779 		(void) sprintf(state, "%s/%s", path, NFS4_DSS_STATE_LEAF);
780 		(void) sprintf(oldstate, "%s/%s", path, NFS4_DSS_OLDSTATE_LEAF);
781 
782 		/*
783 		 * Populate the current server instance's oldstate list.
784 		 *
785 		 * 1. Read stable storage data from old state directory,
786 		 *    leaving its contents alone.
787 		 *
788 		 * 2. Read stable storage data from state directory,
789 		 *    and move the latter's contents to old state
790 		 *    directory.
791 		 */
792 		rfs4_ss_oldstate(rfs4_cur_servinst->oldstate, oldstate, NULL);
793 		rfs4_ss_oldstate(rfs4_cur_servinst->oldstate, state, oldstate);
794 	}
795 
796 	kmem_free(state, MAXPATHLEN);
797 	kmem_free(oldstate, MAXPATHLEN);
798 }
799 
800 
801 /*
802  * Check if we are still in grace and if the client can be
803  * granted permission to perform reclaims.
804  */
805 void
806 rfs4_ss_chkclid(rfs4_client_t *cp)
807 {
808 	rfs4_servinst_t *sip;
809 
810 	/*
811 	 * It should be sufficient to check the oldstate data for just
812 	 * this client's instance. However, since our per-instance
813 	 * client grouping is solely temporal, HA-NFSv4 RG failover
814 	 * might result in clients of the same RG being partitioned into
815 	 * separate instances.
816 	 *
817 	 * Until the client grouping is improved, we must check the
818 	 * oldstate data for all instances with an active grace period.
819 	 *
820 	 * This also serves as the mechanism to remove stale oldstate data.
821 	 * The first time we check an instance after its grace period has
822 	 * expired, the oldstate data should be cleared.
823 	 *
824 	 * Start at the current instance, and walk the list backwards
825 	 * to the first.
826 	 */
827 	mutex_enter(&rfs4_servinst_lock);
828 	for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) {
829 		rfs4_ss_chkclid_sip(cp, sip);
830 
831 		/* if the above check found this client, we're done */
832 		if (cp->can_reclaim)
833 			break;
834 	}
835 	mutex_exit(&rfs4_servinst_lock);
836 }
837 
838 static void
839 rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip)
840 {
841 	rfs4_oldstate_t *osp, *os_head;
842 
843 	/* short circuit everything if this server instance has no oldstate */
844 	rw_enter(&sip->oldstate_lock, RW_READER);
845 	os_head = sip->oldstate;
846 	rw_exit(&sip->oldstate_lock);
847 	if (os_head == NULL)
848 		return;
849 
850 	/*
851 	 * If this server instance is no longer in a grace period then
852 	 * the client won't be able to reclaim. No further need for this
853 	 * instance's oldstate data, so it can be cleared.
854 	 */
855 	if (!rfs4_servinst_in_grace(sip))
856 		return;
857 
858 	/* this instance is still in grace; search for the clientid */
859 
860 	rw_enter(&sip->oldstate_lock, RW_READER);
861 
862 	os_head = sip->oldstate;
863 	/* skip dummy entry */
864 	osp = os_head->next;
865 	while (osp != os_head) {
866 		if (osp->cl_id4.id_len == cp->nfs_client.id_len) {
867 			if (bcmp(osp->cl_id4.id_val, cp->nfs_client.id_val,
868 					osp->cl_id4.id_len) == 0) {
869 				cp->can_reclaim = 1;
870 				break;
871 			}
872 		}
873 		osp = osp->next;
874 	}
875 
876 	rw_exit(&sip->oldstate_lock);
877 }
878 
879 /*
880  * Place client information into stable storage: 1/3.
881  * First, generate the leaf filename, from the client's IP address and
882  * the server-generated short-hand clientid.
883  */
884 void
885 rfs4_ss_clid(rfs4_client_t *cp, struct svc_req *req)
886 {
887 	const char *kinet_ntop6(uchar_t *, char *, size_t);
888 	char leaf[MAXNAMELEN], buf[INET6_ADDRSTRLEN];
889 	struct sockaddr *ca;
890 	uchar_t *b;
891 
892 	if (rfs4_ss_enabled == 0) {
893 		return;
894 	}
895 
896 	buf[0] = 0;
897 
898 
899 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
900 	if (ca == NULL) {
901 		return;
902 	}
903 
904 	/*
905 	 * Convert the caller's IP address to a dotted string
906 	 */
907 	if (ca->sa_family == AF_INET) {
908 
909 		bcopy(svc_getrpccaller(req->rq_xprt)->buf, &cp->cl_addr,
910 			sizeof (struct sockaddr_in));
911 		b = (uchar_t *)&((struct sockaddr_in *)ca)->sin_addr;
912 		(void) sprintf(buf, "%03d.%03d.%03d.%03d", b[0] & 0xFF,
913 				b[1] & 0xFF, b[2] & 0xFF, b[3] & 0xFF);
914 	} else if (ca->sa_family == AF_INET6) {
915 		struct sockaddr_in6 *sin6;
916 
917 		sin6 = (struct sockaddr_in6 *)ca;
918 		bcopy(svc_getrpccaller(req->rq_xprt)->buf, &cp->cl_addr,
919 				sizeof (struct sockaddr_in6));
920 		(void) kinet_ntop6((uchar_t *)&sin6->sin6_addr,
921 				buf, INET6_ADDRSTRLEN);
922 	}
923 
924 	(void) snprintf(leaf, MAXNAMELEN, "%s-%llx", buf,
925 	    (longlong_t)cp->clientid);
926 	rfs4_ss_clid_write(cp, leaf);
927 }
928 
929 /*
930  * Place client information into stable storage: 2/3.
931  * DSS: distributed stable storage: the file may need to be written to
932  * multiple directories.
933  */
934 static void
935 rfs4_ss_clid_write(rfs4_client_t *cp, char *leaf)
936 {
937 	rfs4_servinst_t *sip;
938 
939 	/*
940 	 * It should be sufficient to write the leaf file to (all) DSS paths
941 	 * associated with just this client's instance. However, since our
942 	 * per-instance client grouping is solely temporal, HA-NFSv4 RG
943 	 * failover might result in us losing DSS data.
944 	 *
945 	 * Until the client grouping is improved, we must write the DSS data
946 	 * to all instances' paths. Start at the current instance, and
947 	 * walk the list backwards to the first.
948 	 */
949 	mutex_enter(&rfs4_servinst_lock);
950 	for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) {
951 		int i, npaths = sip->dss_npaths;
952 
953 		/* write the leaf file to all DSS paths */
954 		for (i = 0; i < npaths; i++) {
955 			rfs4_dss_path_t *dss_path = sip->dss_paths[i];
956 
957 			/* HA-NFSv4 path might have been failed-away from us */
958 			if (dss_path == NULL)
959 				continue;
960 
961 			rfs4_ss_clid_write_one(cp, dss_path->path, leaf);
962 		}
963 	}
964 	mutex_exit(&rfs4_servinst_lock);
965 }
966 
967 /*
968  * Place client information into stable storage: 3/3.
969  * Write the stable storage data to the requested file.
970  */
971 static void
972 rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dss_path, char *leaf)
973 {
974 	int ioflag;
975 	int file_vers = NFS4_SS_VERSION;
976 	struct uio uio;
977 	struct iovec iov[4];
978 	char *dir;
979 	rfs4_ss_pn_t *ss_pn;
980 	vnode_t *vp;
981 	nfs_client_id4 *cl_id4 = &(cp->nfs_client);
982 
983 	/* allow 2 extra bytes for '/' & NUL */
984 	dir = kmem_alloc(strlen(dss_path) + strlen(NFS4_DSS_STATE_LEAF) + 2,
985 	    KM_SLEEP);
986 	(void) sprintf(dir, "%s/%s", dss_path, NFS4_DSS_STATE_LEAF);
987 
988 	if ((ss_pn = rfs4_ss_pnalloc(dir, leaf)) == NULL)
989 		return;
990 
991 	if (vn_open(ss_pn->pn, UIO_SYSSPACE, FCREAT|FWRITE, 0600, &vp,
992 			    CRCREAT, 0)) {
993 		rfs4_ss_pnfree(ss_pn);
994 		return;
995 	}
996 
997 	/*
998 	 * We need to record leaf - i.e. the filename - so that we know
999 	 * what to remove, in the future. However, the dir part of cp->ss_pn
1000 	 * should never be referenced directly, since it's potentially only
1001 	 * one of several paths with this leaf in it.
1002 	 */
1003 	if (cp->ss_pn != NULL) {
1004 		if (strcmp(cp->ss_pn->leaf, leaf) == 0) {
1005 			/* we've already recorded *this* leaf */
1006 			rfs4_ss_pnfree(ss_pn);
1007 		} else {
1008 			/* replace with this leaf */
1009 			rfs4_ss_pnfree(cp->ss_pn);
1010 			cp->ss_pn = ss_pn;
1011 		}
1012 	} else {
1013 		cp->ss_pn = ss_pn;
1014 	}
1015 
1016 	/*
1017 	 * Build a scatter list that points to the nfs_client_id4
1018 	 */
1019 	iov[0].iov_base = (caddr_t)&file_vers;
1020 	iov[0].iov_len = sizeof (int);
1021 	iov[1].iov_base = (caddr_t)&(cl_id4->verifier);
1022 	iov[1].iov_len = NFS4_VERIFIER_SIZE;
1023 	iov[2].iov_base = (caddr_t)&(cl_id4->id_len);
1024 	iov[2].iov_len = sizeof (uint_t);
1025 	iov[3].iov_base = (caddr_t)cl_id4->id_val;
1026 	iov[3].iov_len = cl_id4->id_len;
1027 
1028 	uio.uio_iov = iov;
1029 	uio.uio_iovcnt = 4;
1030 	uio.uio_loffset = 0;
1031 	uio.uio_segflg = UIO_SYSSPACE;
1032 	uio.uio_llimit = (rlim64_t)MAXOFFSET_T;
1033 	uio.uio_resid = cl_id4->id_len + sizeof (int) +
1034 		NFS4_VERIFIER_SIZE + sizeof (uint_t);
1035 
1036 	ioflag = uio.uio_fmode = (FWRITE|FSYNC);
1037 	uio.uio_extflg = UIO_COPY_DEFAULT;
1038 
1039 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1040 	/* write the full client id to the file. */
1041 	(void) VOP_WRITE(vp, &uio, ioflag, CRED(), NULL);
1042 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1043 
1044 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED());
1045 	VN_RELE(vp);
1046 }
1047 
1048 /*
1049  * DSS: distributed stable storage.
1050  * Unpack the list of paths passed by nfsd.
1051  * Use nvlist_alloc(9F) to manage the data.
1052  * The caller is responsible for allocating and freeing the buffer.
1053  */
1054 int
1055 rfs4_dss_setpaths(char *buf, size_t buflen)
1056 {
1057 	int error;
1058 
1059 	/*
1060 	 * If this is a "warm start", i.e. we previously had DSS paths,
1061 	 * preserve the old paths.
1062 	 */
1063 	if (rfs4_dss_paths != NULL) {
1064 		/*
1065 		 * Before we lose the ptr, destroy the nvlist and pathnames
1066 		 * array from the warm start before this one.
1067 		 */
1068 		if (rfs4_dss_oldpaths)
1069 			nvlist_free(rfs4_dss_oldpaths);
1070 		rfs4_dss_oldpaths = rfs4_dss_paths;
1071 	}
1072 
1073 	/* unpack the buffer into a searchable nvlist */
1074 	error = nvlist_unpack(buf, buflen, &rfs4_dss_paths, KM_SLEEP);
1075 	if (error)
1076 		return (error);
1077 
1078 	/*
1079 	 * Search the nvlist for the pathnames nvpair (which is the only nvpair
1080 	 * in the list, and record its location.
1081 	 */
1082 	error = nvlist_lookup_string_array(rfs4_dss_paths, NFS4_DSS_NVPAIR_NAME,
1083 	    &rfs4_dss_newpaths, &rfs4_dss_numnewpaths);
1084 	return (error);
1085 }
1086 
1087 /*
1088  * Ultimately the nfssys() call NFS4_CLR_STATE endsup here
1089  * to find and mark the client for forced expire.
1090  */
1091 static void
1092 rfs4_client_scrub(rfs4_entry_t ent, void *arg)
1093 {
1094 	rfs4_client_t *cp = (rfs4_client_t *)ent;
1095 	struct nfs4clrst_args *clr = arg;
1096 	struct sockaddr_in6 *ent_sin6;
1097 	struct in6_addr  clr_in6;
1098 	struct sockaddr_in  *ent_sin;
1099 	struct in_addr   clr_in;
1100 
1101 	if (clr->addr_type != cp->cl_addr.ss_family) {
1102 		return;
1103 	}
1104 
1105 	switch (clr->addr_type) {
1106 
1107 	case AF_INET6:
1108 		/* copyin the address from user space */
1109 		if (copyin(clr->ap, &clr_in6, sizeof (clr_in6))) {
1110 			break;
1111 		}
1112 
1113 		ent_sin6 = (struct sockaddr_in6 *)&cp->cl_addr;
1114 
1115 		/*
1116 		 * now compare, and if equivalent mark entry
1117 		 * for forced expiration
1118 		 */
1119 		if (IN6_ARE_ADDR_EQUAL(&ent_sin6->sin6_addr, &clr_in6)) {
1120 			cp->forced_expire = 1;
1121 		}
1122 		break;
1123 
1124 	case AF_INET:
1125 		/* copyin the address from user space */
1126 		if (copyin(clr->ap, &clr_in, sizeof (clr_in))) {
1127 			break;
1128 		}
1129 
1130 		ent_sin = (struct sockaddr_in *)&cp->cl_addr;
1131 
1132 		/*
1133 		 * now compare, and if equivalent mark entry
1134 		 * for forced expiration
1135 		 */
1136 		if (ent_sin->sin_addr.s_addr == clr_in.s_addr) {
1137 			cp->forced_expire = 1;
1138 		}
1139 		break;
1140 
1141 	default:
1142 		/* force this assert to fail */
1143 		ASSERT(clr->addr_type != clr->addr_type);
1144 	}
1145 }
1146 
1147 /*
1148  * This is called from nfssys() in order to clear server state
1149  * for the specified client IP Address.
1150  */
1151 void
1152 rfs4_clear_client_state(struct nfs4clrst_args *clr)
1153 {
1154 	(void) rfs4_dbe_walk(rfs4_client_tab, rfs4_client_scrub, clr);
1155 }
1156 
1157 /*
1158  * Used to initialize the NFSv4 server's state or database.  All of
1159  * the tables are created and timers are set. Only called when NFSv4
1160  * service is provided.
1161  */
1162 void
1163 rfs4_state_init()
1164 {
1165 	int start_grace;
1166 	extern boolean_t rfs4_cpr_callb(void *, int);
1167 	char *dss_path = NFS4_DSS_VAR_DIR;
1168 
1169 	mutex_enter(&rfs4_state_lock);
1170 
1171 	/*
1172 	 * If the server state database has already been initialized,
1173 	 * skip it
1174 	 */
1175 	if (rfs4_server_state != NULL) {
1176 		mutex_exit(&rfs4_state_lock);
1177 		return;
1178 	}
1179 
1180 	rw_init(&rfs4_findclient_lock, NULL, RW_DEFAULT, NULL);
1181 
1182 	/*
1183 	 * Set the boot time.  If the server
1184 	 * has been restarted quickly and has had the opportunity to
1185 	 * service clients, then the start_time needs to be bumped
1186 	 * regardless.  A small window but it exists...
1187 	 */
1188 	if (rfs4_start_time != gethrestime_sec())
1189 		rfs4_start_time = gethrestime_sec();
1190 	else
1191 		rfs4_start_time++;
1192 
1193 	/* DSS: distributed stable storage: initialise served paths list */
1194 	rfs4_dss_pathlist = NULL;
1195 
1196 	/*
1197 	 * Create the first server instance, or a new one if the server has
1198 	 * been restarted; see above comments on rfs4_start_time. Don't
1199 	 * start its grace period; that will be done later, to maximise the
1200 	 * clients' recovery window.
1201 	 */
1202 	start_grace = 0;
1203 	rfs4_servinst_create(start_grace, 1, &dss_path);
1204 
1205 	/* reset the "first NFSv4 request" status */
1206 	rfs4_seen_first_compound = 0;
1207 
1208 	/*
1209 	 * Add a CPR callback so that we can update client
1210 	 * access times to extend the lease after a suspend
1211 	 * and resume (using the same class as rpcmod/connmgr)
1212 	 */
1213 	cpr_id = callb_add(rfs4_cpr_callb, 0, CB_CL_CPR_RPC, "rfs4");
1214 
1215 	/* set the various cache timers for table creation */
1216 	if (rfs4_client_cache_time == 0)
1217 		rfs4_client_cache_time = CLIENT_CACHE_TIME;
1218 	if (rfs4_openowner_cache_time == 0)
1219 		rfs4_openowner_cache_time = OPENOWNER_CACHE_TIME;
1220 	if (rfs4_state_cache_time == 0)
1221 		rfs4_state_cache_time = STATE_CACHE_TIME;
1222 	if (rfs4_lo_state_cache_time == 0)
1223 		rfs4_lo_state_cache_time = LO_STATE_CACHE_TIME;
1224 	if (rfs4_lockowner_cache_time == 0)
1225 		rfs4_lockowner_cache_time = LOCKOWNER_CACHE_TIME;
1226 	if (rfs4_file_cache_time == 0)
1227 		rfs4_file_cache_time = FILE_CACHE_TIME;
1228 	if (rfs4_deleg_state_cache_time == 0)
1229 		rfs4_deleg_state_cache_time = DELEG_STATE_CACHE_TIME;
1230 
1231 	/* Create the overall database to hold all server state */
1232 	rfs4_server_state = rfs4_database_create(rfs4_database_debug);
1233 
1234 	/* Now create the individual tables */
1235 	rfs4_client_cache_time *= rfs4_lease_time;
1236 	rfs4_client_tab = rfs4_table_create(rfs4_server_state,
1237 					    "Client",
1238 					    rfs4_client_cache_time,
1239 					    2,
1240 					    rfs4_client_create,
1241 					    rfs4_client_destroy,
1242 					    rfs4_client_expiry,
1243 					    sizeof (rfs4_client_t),
1244 					    TABSIZE,
1245 					    MAXTABSZ/8, 100);
1246 	rfs4_nfsclnt_idx = rfs4_index_create(rfs4_client_tab,
1247 					    "nfs_client_id4", nfsclnt_hash,
1248 					    nfsclnt_compare, nfsclnt_mkkey,
1249 					    TRUE);
1250 	rfs4_clientid_idx = rfs4_index_create(rfs4_client_tab,
1251 					    "client_id", clientid_hash,
1252 					    clientid_compare, clientid_mkkey,
1253 					    FALSE);
1254 
1255 	rfs4_openowner_cache_time *= rfs4_lease_time;
1256 	rfs4_openowner_tab = rfs4_table_create(rfs4_server_state,
1257 					    "OpenOwner",
1258 					    rfs4_openowner_cache_time,
1259 					    1,
1260 					    rfs4_openowner_create,
1261 					    rfs4_openowner_destroy,
1262 					    rfs4_openowner_expiry,
1263 					    sizeof (rfs4_openowner_t),
1264 					    TABSIZE,
1265 					    MAXTABSZ, 100);
1266 	rfs4_openowner_idx = rfs4_index_create(rfs4_openowner_tab,
1267 					    "open_owner4", openowner_hash,
1268 					    openowner_compare,
1269 					    openowner_mkkey, TRUE);
1270 
1271 	rfs4_state_cache_time *= rfs4_lease_time;
1272 	rfs4_state_tab = rfs4_table_create(rfs4_server_state,
1273 					"OpenStateID",
1274 					rfs4_state_cache_time,
1275 					3,
1276 					rfs4_state_create,
1277 					rfs4_state_destroy,
1278 					rfs4_state_expiry,
1279 					sizeof (rfs4_state_t),
1280 					TABSIZE,
1281 					MAXTABSZ, 100);
1282 	rfs4_state_owner_file_idx = rfs4_index_create(rfs4_state_tab,
1283 						"Openowner-File",
1284 						state_owner_file_hash,
1285 						state_owner_file_compare,
1286 						state_owner_file_mkkey, TRUE);
1287 	rfs4_state_idx = rfs4_index_create(rfs4_state_tab,
1288 					"State-id", state_hash,
1289 					state_compare, state_mkkey, FALSE);
1290 	rfs4_state_file_idx = rfs4_index_create(rfs4_state_tab,
1291 					"File", state_file_hash,
1292 					state_file_compare, state_file_mkkey,
1293 					FALSE);
1294 
1295 	rfs4_lo_state_cache_time *= rfs4_lease_time;
1296 	rfs4_lo_state_tab = rfs4_table_create(rfs4_server_state,
1297 					    "LockStateID",
1298 					    rfs4_lo_state_cache_time,
1299 					    2,
1300 					    rfs4_lo_state_create,
1301 					    rfs4_lo_state_destroy,
1302 					    rfs4_lo_state_expiry,
1303 					    sizeof (rfs4_lo_state_t),
1304 					    TABSIZE,
1305 					    MAXTABSZ, 100);
1306 	rfs4_lo_state_owner_idx = rfs4_index_create(rfs4_lo_state_tab,
1307 						    "lockownerxstate",
1308 						    lo_state_lo_hash,
1309 						    lo_state_lo_compare,
1310 						    lo_state_lo_mkkey, TRUE);
1311 	rfs4_lo_state_idx = rfs4_index_create(rfs4_lo_state_tab,
1312 					    "State-id",
1313 					    lo_state_hash, lo_state_compare,
1314 					    lo_state_mkkey, FALSE);
1315 
1316 	rfs4_lockowner_cache_time *= rfs4_lease_time;
1317 	rfs4_lockowner_tab = rfs4_table_create(rfs4_server_state,
1318 					    "Lockowner",
1319 					    rfs4_lockowner_cache_time,
1320 					    2,
1321 					    rfs4_lockowner_create,
1322 					    rfs4_lockowner_destroy,
1323 					    rfs4_lockowner_expiry,
1324 					    sizeof (rfs4_lockowner_t),
1325 					    TABSIZE,
1326 					    MAXTABSZ, 100);
1327 	rfs4_lockowner_idx = rfs4_index_create(rfs4_lockowner_tab,
1328 					    "lock_owner4", lockowner_hash,
1329 					    lockowner_compare,
1330 					    lockowner_mkkey, TRUE);
1331 	rfs4_lockowner_pid_idx = rfs4_index_create(rfs4_lockowner_tab,
1332 						"pid", pid_hash,
1333 						pid_compare, pid_mkkey,
1334 						FALSE);
1335 
1336 	rfs4_file_cache_time *= rfs4_lease_time;
1337 	rfs4_file_tab = rfs4_table_create(rfs4_server_state,
1338 					"File",
1339 					rfs4_file_cache_time,
1340 					1,
1341 					rfs4_file_create,
1342 					rfs4_file_destroy,
1343 					NULL,
1344 					sizeof (rfs4_file_t),
1345 					TABSIZE,
1346 					MAXTABSZ, -1);
1347 	rfs4_file_idx = rfs4_index_create(rfs4_file_tab,
1348 					"Filehandle", file_hash,
1349 					file_compare, file_mkkey, TRUE);
1350 
1351 	rfs4_deleg_state_cache_time *= rfs4_lease_time;
1352 	rfs4_deleg_state_tab = rfs4_table_create(rfs4_server_state,
1353 					"DelegStateID",
1354 					rfs4_deleg_state_cache_time,
1355 					2,
1356 					rfs4_deleg_state_create,
1357 					rfs4_deleg_state_destroy,
1358 					rfs4_deleg_state_expiry,
1359 					sizeof (rfs4_deleg_state_t),
1360 					TABSIZE,
1361 					MAXTABSZ, 100);
1362 	rfs4_deleg_idx = rfs4_index_create(rfs4_deleg_state_tab,
1363 						"DelegByFileClient",
1364 						deleg_hash,
1365 						deleg_compare,
1366 						deleg_mkkey, TRUE);
1367 	rfs4_deleg_state_idx = rfs4_index_create(rfs4_deleg_state_tab,
1368 						"DelegState",
1369 						deleg_state_hash,
1370 						deleg_state_compare,
1371 						deleg_state_mkkey, FALSE);
1372 
1373 	/*
1374 	 * Init the stable storage.
1375 	 */
1376 	rfs4_ss_init();
1377 
1378 	rfs4_client_clrst = rfs4_clear_client_state;
1379 
1380 	mutex_exit(&rfs4_state_lock);
1381 }
1382 
1383 
1384 /*
1385  * Used at server shutdown to cleanup all of the NFSv4 server's structures
1386  * and other state.
1387  */
1388 void
1389 rfs4_state_fini()
1390 {
1391 	rfs4_database_t *dbp;
1392 
1393 	mutex_enter(&rfs4_state_lock);
1394 
1395 	if (rfs4_server_state == NULL) {
1396 		mutex_exit(&rfs4_state_lock);
1397 		return;
1398 	}
1399 
1400 	rfs4_client_clrst = NULL;
1401 
1402 	rfs4_set_deleg_policy(SRV_NEVER_DELEGATE);
1403 	dbp = rfs4_server_state;
1404 	rfs4_server_state = NULL;
1405 
1406 	/*
1407 	 * Cleanup the CPR callback.
1408 	 */
1409 	if (cpr_id)
1410 		(void) callb_delete(cpr_id);
1411 
1412 	rw_destroy(&rfs4_findclient_lock);
1413 
1414 	/* First stop all of the reaper threads in the database */
1415 	rfs4_database_shutdown(dbp);
1416 	/* clean up any dangling stable storage structures */
1417 	rfs4_ss_fini();
1418 	/* Now actually destroy/release the database and its tables */
1419 	rfs4_database_destroy(dbp);
1420 
1421 	/* Reset the cache timers for next time */
1422 	rfs4_client_cache_time = 0;
1423 	rfs4_openowner_cache_time = 0;
1424 	rfs4_state_cache_time = 0;
1425 	rfs4_lo_state_cache_time = 0;
1426 	rfs4_lockowner_cache_time = 0;
1427 	rfs4_file_cache_time = 0;
1428 	rfs4_deleg_state_cache_time = 0;
1429 
1430 	mutex_exit(&rfs4_state_lock);
1431 
1432 	/* destroy server instances and current instance ptr */
1433 	rfs4_servinst_destroy_all();
1434 
1435 	/* reset the "first NFSv4 request" status */
1436 	rfs4_seen_first_compound = 0;
1437 
1438 	/* DSS: distributed stable storage */
1439 	if (rfs4_dss_oldpaths)
1440 		nvlist_free(rfs4_dss_oldpaths);
1441 	if (rfs4_dss_paths)
1442 		nvlist_free(rfs4_dss_paths);
1443 	rfs4_dss_paths = rfs4_dss_oldpaths = NULL;
1444 }
1445 
1446 typedef union {
1447 	struct {
1448 		uint32_t start_time;
1449 		uint32_t c_id;
1450 	} impl_id;
1451 	clientid4 id4;
1452 } cid;
1453 
1454 static int foreign_stateid(stateid_t *id);
1455 static int foreign_clientid(cid *cidp);
1456 static void embed_nodeid(cid *cidp);
1457 
1458 typedef union {
1459 	struct {
1460 		uint32_t c_id;
1461 		uint32_t gen_num;
1462 	} cv_impl;
1463 	verifier4	confirm_verf;
1464 } scid_confirm_verf;
1465 
1466 static uint32_t
1467 clientid_hash(void *key)
1468 {
1469 	cid *idp = key;
1470 
1471 	return (idp->impl_id.c_id);
1472 }
1473 
1474 static bool_t
1475 clientid_compare(rfs4_entry_t entry, void *key)
1476 {
1477 	rfs4_client_t *client = (rfs4_client_t *)entry;
1478 	clientid4 *idp = key;
1479 
1480 	return (*idp == client->clientid);
1481 }
1482 
1483 static void *
1484 clientid_mkkey(rfs4_entry_t entry)
1485 {
1486 	rfs4_client_t *client = (rfs4_client_t *)entry;
1487 
1488 	return (&client->clientid);
1489 }
1490 
1491 static uint32_t
1492 nfsclnt_hash(void *key)
1493 {
1494 	nfs_client_id4 *client = key;
1495 	int i;
1496 	uint32_t hash = 0;
1497 
1498 	for (i = 0; i < client->id_len; i++) {
1499 		hash <<= 1;
1500 		hash += (uint_t)client->id_val[i];
1501 	}
1502 	return (hash);
1503 }
1504 
1505 
1506 static bool_t
1507 nfsclnt_compare(rfs4_entry_t entry, void *key)
1508 {
1509 	rfs4_client_t *client = (rfs4_client_t *)entry;
1510 	nfs_client_id4 *nfs_client = key;
1511 
1512 	if (client->nfs_client.id_len != nfs_client->id_len)
1513 		return (FALSE);
1514 
1515 	return (bcmp(client->nfs_client.id_val, nfs_client->id_val,
1516 						nfs_client->id_len) == 0);
1517 }
1518 
1519 static void *
1520 nfsclnt_mkkey(rfs4_entry_t entry)
1521 {
1522 	rfs4_client_t *client = (rfs4_client_t *)entry;
1523 
1524 	return (&client->nfs_client);
1525 }
1526 
1527 static bool_t
1528 rfs4_client_expiry(rfs4_entry_t u_entry)
1529 {
1530 	rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1531 	bool_t cp_expired;
1532 
1533 	if (rfs4_dbe_is_invalid(cp->dbe))
1534 		return (TRUE);
1535 	/*
1536 	 * If the sysadmin has used clear_locks for this
1537 	 * entry then forced_expire will be set and we
1538 	 * want this entry to be reaped. Or the entry
1539 	 * has exceeded its lease period.
1540 	 */
1541 	cp_expired = (cp->forced_expire ||
1542 		(gethrestime_sec() - cp->last_access
1543 			> rfs4_lease_time));
1544 
1545 	if (!cp->ss_remove && cp_expired)
1546 		cp->ss_remove = 1;
1547 	return (cp_expired);
1548 }
1549 
1550 /*
1551  * Remove the leaf file from all distributed stable storage paths.
1552  */
1553 static void
1554 rfs4_dss_remove_cpleaf(rfs4_client_t *cp)
1555 {
1556 	char *leaf = cp->ss_pn->leaf;
1557 
1558 	rfs4_dss_remove_leaf(cp->server_instance, NFS4_DSS_STATE_LEAF, leaf);
1559 }
1560 
1561 static void
1562 rfs4_dss_remove_leaf(rfs4_servinst_t *sip, char *dir_leaf, char *leaf)
1563 {
1564 	int i, npaths = sip->dss_npaths;
1565 
1566 	for (i = 0; i < npaths; i++) {
1567 		rfs4_dss_path_t *dss_path = sip->dss_paths[i];
1568 		char *path, *dir;
1569 		size_t pathlen;
1570 
1571 		/* the HA-NFSv4 path might have been failed-over away from us */
1572 		if (dss_path == NULL)
1573 			continue;
1574 
1575 		dir = dss_path->path;
1576 
1577 		/* allow 3 extra bytes for two '/' & a NUL */
1578 		pathlen = strlen(dir) + strlen(dir_leaf) + strlen(leaf) + 3;
1579 		path = kmem_alloc(pathlen, KM_SLEEP);
1580 		(void) sprintf(path, "%s/%s/%s", dir, dir_leaf, leaf);
1581 
1582 		(void) vn_remove(path, UIO_SYSSPACE, RMFILE);
1583 
1584 		kmem_free(path, pathlen);
1585 	}
1586 }
1587 
1588 static void
1589 rfs4_client_destroy(rfs4_entry_t u_entry)
1590 {
1591 	rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1592 
1593 	mutex_destroy(cp->cbinfo.cb_lock);
1594 	cv_destroy(cp->cbinfo.cb_cv);
1595 	cv_destroy(cp->cbinfo.cb_cv_nullcaller);
1596 
1597 	/* free callback info */
1598 	rfs4_cbinfo_free(&cp->cbinfo);
1599 
1600 	if (cp->cp_confirmed)
1601 		rfs4_client_rele(cp->cp_confirmed);
1602 
1603 	if (cp->ss_pn) {
1604 		/* check if the stable storage files need to be removed */
1605 		if (cp->ss_remove)
1606 			rfs4_dss_remove_cpleaf(cp);
1607 		rfs4_ss_pnfree(cp->ss_pn);
1608 	}
1609 
1610 	/* Free the client supplied client id */
1611 	kmem_free(cp->nfs_client.id_val, cp->nfs_client.id_len);
1612 
1613 	if (cp->sysidt != LM_NOSYSID)
1614 		lm_free_sysidt(cp->sysidt);
1615 }
1616 
1617 static bool_t
1618 rfs4_client_create(rfs4_entry_t u_entry, void *arg)
1619 {
1620 	rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1621 	nfs_client_id4 *client = (nfs_client_id4 *)arg;
1622 	cid *cidp;
1623 	scid_confirm_verf *scvp;
1624 
1625 	/* Get a clientid to give to the client */
1626 	cidp = (cid *)&cp->clientid;
1627 	cidp->impl_id.start_time = rfs4_start_time;
1628 	cidp->impl_id.c_id = (uint32_t)rfs4_dbe_getid(cp->dbe);
1629 
1630 	/* If we are booted as a cluster node, embed our nodeid */
1631 	if (cluster_bootflags & CLUSTER_BOOTED)
1632 		embed_nodeid(cidp);
1633 
1634 	/* Allocate and copy client's client id value */
1635 	cp->nfs_client.id_val = kmem_alloc(client->id_len, KM_SLEEP);
1636 	cp->nfs_client.id_len = client->id_len;
1637 	bcopy(client->id_val, cp->nfs_client.id_val, client->id_len);
1638 	cp->nfs_client.verifier = client->verifier;
1639 
1640 	/* Init the value for the SETCLIENTID_CONFIRM verifier */
1641 	scvp = (scid_confirm_verf *)&cp->confirm_verf;
1642 	scvp->cv_impl.c_id = cidp->impl_id.c_id;
1643 	scvp->cv_impl.gen_num = 0;
1644 
1645 	/* An F_UNLKSYS has been done for this client */
1646 	cp->unlksys_completed = FALSE;
1647 
1648 	/* We need the client to ack us */
1649 	cp->need_confirm = TRUE;
1650 	cp->cp_confirmed = NULL;
1651 
1652 	/* TRUE all the time until the callback path actually fails */
1653 	cp->cbinfo.cb_notified_of_cb_path_down = TRUE;
1654 
1655 	/* Initialize the access time to now */
1656 	cp->last_access = gethrestime_sec();
1657 
1658 	cp->cr_set = NULL;
1659 	/* Initialize list for insque/remque */
1660 	cp->openownerlist.next = cp->openownerlist.prev = &cp->openownerlist;
1661 	cp->openownerlist.oop = NULL; /* This is not an openowner */
1662 
1663 	cp->sysidt = LM_NOSYSID;
1664 
1665 	cp->clientdeleglist.next = cp->clientdeleglist.prev =
1666 		&cp->clientdeleglist;
1667 	cp->clientdeleglist.dsp = NULL;
1668 
1669 	/* set up the callback control structure */
1670 	cp->cbinfo.cb_state = CB_UNINIT;
1671 	mutex_init(cp->cbinfo.cb_lock, NULL, MUTEX_DEFAULT, NULL);
1672 	cv_init(cp->cbinfo.cb_cv, NULL, CV_DEFAULT, NULL);
1673 	cv_init(cp->cbinfo.cb_cv_nullcaller, NULL, CV_DEFAULT, NULL);
1674 
1675 	/*
1676 	 * Associate the client_t with the current server instance.
1677 	 * The hold is solely to satisfy the calling requirement of
1678 	 * rfs4_servinst_assign(). In this case it's not strictly necessary.
1679 	 */
1680 	rfs4_dbe_hold(cp->dbe);
1681 	rfs4_servinst_assign(cp, rfs4_cur_servinst);
1682 	rfs4_dbe_rele(cp->dbe);
1683 
1684 	return (TRUE);
1685 }
1686 
1687 /*
1688  * Caller wants to generate/update the setclientid_confirm verifier
1689  * associated with a client.  This is done during the SETCLIENTID
1690  * processing.
1691  */
1692 void
1693 rfs4_client_scv_next(rfs4_client_t *cp)
1694 {
1695 	scid_confirm_verf *scvp;
1696 
1697 	/* Init the value for the SETCLIENTID_CONFIRM verifier */
1698 	scvp = (scid_confirm_verf *)&cp->confirm_verf;
1699 	scvp->cv_impl.gen_num++;
1700 }
1701 
1702 void
1703 rfs4_client_rele(rfs4_client_t *cp)
1704 {
1705 	rfs4_dbe_rele(cp->dbe);
1706 }
1707 
1708 rfs4_client_t *
1709 rfs4_findclient(nfs_client_id4 *client, bool_t *create,	rfs4_client_t *oldcp)
1710 {
1711 	rfs4_client_t *cp;
1712 
1713 
1714 	if (oldcp) {
1715 		rw_enter(&rfs4_findclient_lock, RW_WRITER);
1716 		rfs4_dbe_hide(oldcp->dbe);
1717 	} else {
1718 		rw_enter(&rfs4_findclient_lock, RW_READER);
1719 	}
1720 
1721 	cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_nfsclnt_idx, client,
1722 					create, (void *)client, RFS4_DBS_VALID);
1723 
1724 	if (oldcp)
1725 		rfs4_dbe_unhide(oldcp->dbe);
1726 
1727 	rw_exit(&rfs4_findclient_lock);
1728 
1729 	return (cp);
1730 }
1731 
1732 rfs4_client_t *
1733 rfs4_findclient_by_id(clientid4 clientid, bool_t find_unconfirmed)
1734 {
1735 	rfs4_client_t *cp;
1736 	bool_t create = FALSE;
1737 	cid *cidp = (cid *)&clientid;
1738 
1739 	/* If we're a cluster and the nodeid isn't right, short-circuit */
1740 	if (cluster_bootflags & CLUSTER_BOOTED && foreign_clientid(cidp))
1741 		return (NULL);
1742 
1743 	rw_enter(&rfs4_findclient_lock, RW_READER);
1744 
1745 	cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_clientid_idx, &clientid,
1746 					&create, NULL, RFS4_DBS_VALID);
1747 
1748 	rw_exit(&rfs4_findclient_lock);
1749 
1750 	if (cp && cp->need_confirm && find_unconfirmed == FALSE) {
1751 		rfs4_client_rele(cp);
1752 		return (NULL);
1753 	} else {
1754 		return (cp);
1755 	}
1756 }
1757 
1758 bool_t
1759 rfs4_lease_expired(rfs4_client_t *cp)
1760 {
1761 	bool_t rc;
1762 
1763 	rfs4_dbe_lock(cp->dbe);
1764 
1765 	/*
1766 	 * If the admin has executed clear_locks for this
1767 	 * client id, force expire will be set, so no need
1768 	 * to calculate anything because it's "outa here".
1769 	 */
1770 	if (cp->forced_expire) {
1771 		rc = TRUE;
1772 	} else {
1773 		rc = (gethrestime_sec() - cp->last_access > rfs4_lease_time);
1774 	}
1775 
1776 	/*
1777 	 * If the lease has expired we will also want
1778 	 * to remove any stable storage state data. So
1779 	 * mark the client id accordingly.
1780 	 */
1781 	if (!cp->ss_remove)
1782 		cp->ss_remove = (rc == TRUE);
1783 
1784 	rfs4_dbe_unlock(cp->dbe);
1785 
1786 	return (rc);
1787 }
1788 
1789 void
1790 rfs4_update_lease(rfs4_client_t *cp)
1791 {
1792 	rfs4_dbe_lock(cp->dbe);
1793 	if (!cp->forced_expire)
1794 		cp->last_access = gethrestime_sec();
1795 	rfs4_dbe_unlock(cp->dbe);
1796 }
1797 
1798 
1799 static bool_t
1800 EQOPENOWNER(open_owner4 *a, open_owner4 *b)
1801 {
1802 	bool_t rc;
1803 
1804 	if (a->clientid != b->clientid)
1805 		return (FALSE);
1806 
1807 	if (a->owner_len != b->owner_len)
1808 		return (FALSE);
1809 
1810 	rc = (bcmp(a->owner_val, b->owner_val, a->owner_len) == 0);
1811 
1812 	return (rc);
1813 }
1814 
1815 static uint_t
1816 openowner_hash(void *key)
1817 {
1818 	int i;
1819 	open_owner4 *openowner = key;
1820 	uint_t hash = 0;
1821 
1822 	for (i = 0; i < openowner->owner_len; i++) {
1823 		hash <<= 4;
1824 		hash += (uint_t)openowner->owner_val[i];
1825 	}
1826 	hash += (uint_t)openowner->clientid;
1827 	hash |= (openowner->clientid >> 32);
1828 
1829 	return (hash);
1830 }
1831 
1832 static bool_t
1833 openowner_compare(rfs4_entry_t u_entry, void *key)
1834 {
1835 	rfs4_openowner_t *op = (rfs4_openowner_t *)u_entry;
1836 	open_owner4 *arg = key;
1837 
1838 	return (EQOPENOWNER(&op->owner, arg));
1839 }
1840 
1841 void *
1842 openowner_mkkey(rfs4_entry_t u_entry)
1843 {
1844 	rfs4_openowner_t *op = (rfs4_openowner_t *)u_entry;
1845 
1846 	return (&op->owner);
1847 }
1848 
1849 static bool_t
1850 rfs4_openowner_expiry(rfs4_entry_t u_entry)
1851 {
1852 	rfs4_openowner_t *op = (rfs4_openowner_t *)u_entry;
1853 
1854 	if (rfs4_dbe_is_invalid(op->dbe))
1855 		return (TRUE);
1856 	return ((gethrestime_sec() - op->client->last_access
1857 		> rfs4_lease_time));
1858 }
1859 
1860 static void
1861 rfs4_openowner_destroy(rfs4_entry_t u_entry)
1862 {
1863 	rfs4_openowner_t *op = (rfs4_openowner_t *)u_entry;
1864 
1865 	rfs4_sw_destroy(&op->oo_sw);
1866 
1867 	/* Remove open owner from client's lists of open owners */
1868 	rfs4_dbe_lock(op->client->dbe);
1869 
1870 	remque(&op->openownerlist);
1871 	op->openownerlist.next = op->openownerlist.prev = &op->openownerlist;
1872 
1873 	rfs4_dbe_unlock(op->client->dbe);
1874 
1875 	/* One less reference to the client */
1876 	rfs4_client_rele(op->client);
1877 	op->client = NULL;
1878 
1879 	/* Free the last reply for this lock owner */
1880 	rfs4_free_reply(op->reply);
1881 
1882 	if (op->reply_fh.nfs_fh4_val) {
1883 		kmem_free(op->reply_fh.nfs_fh4_val, op->reply_fh.nfs_fh4_len);
1884 		op->reply_fh.nfs_fh4_val = NULL;
1885 		op->reply_fh.nfs_fh4_len = 0;
1886 	}
1887 
1888 	/* Free the lock owner id */
1889 	kmem_free(op->owner.owner_val, op->owner.owner_len);
1890 }
1891 
1892 void
1893 rfs4_openowner_rele(rfs4_openowner_t *op)
1894 {
1895 	rfs4_dbe_rele(op->dbe);
1896 }
1897 
1898 static bool_t
1899 rfs4_openowner_create(rfs4_entry_t u_entry, void *arg)
1900 {
1901 	rfs4_openowner_t *op = (rfs4_openowner_t *)u_entry;
1902 	rfs4_openowner_t *argp = (rfs4_openowner_t *)arg;
1903 	open_owner4 *openowner = &argp->owner;
1904 	seqid4 seqid = argp->open_seqid;
1905 	rfs4_client_t *cp;
1906 	bool_t create = FALSE;
1907 
1908 	rw_enter(&rfs4_findclient_lock, RW_READER);
1909 
1910 	cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_clientid_idx,
1911 					&openowner->clientid,
1912 					&create, NULL, RFS4_DBS_VALID);
1913 
1914 	rw_exit(&rfs4_findclient_lock);
1915 
1916 	if (cp == NULL)
1917 		return (FALSE);
1918 
1919 	op->reply_fh.nfs_fh4_len = 0;
1920 	op->reply_fh.nfs_fh4_val = NULL;
1921 
1922 	op->owner.clientid = openowner->clientid;
1923 	op->owner.owner_val =
1924 		kmem_alloc(openowner->owner_len, KM_SLEEP);
1925 	bcopy(openowner->owner_val,
1926 	    op->owner.owner_val, openowner->owner_len);
1927 	op->owner.owner_len = openowner->owner_len;
1928 
1929 	op->need_confirm = TRUE;
1930 
1931 	rfs4_sw_init(&op->oo_sw);
1932 
1933 	op->open_seqid = seqid;
1934 	bzero(op->reply, sizeof (nfs_resop4));
1935 	op->client = cp;
1936 	op->cr_set = NULL;
1937 	/* Init lists for remque/insque */
1938 	op->ownerstateids.next = op->ownerstateids.prev = &op->ownerstateids;
1939 	op->ownerstateids.sp = NULL; /* NULL since this is the state list */
1940 	op->openownerlist.next = op->openownerlist.prev = &op->openownerlist;
1941 	op->openownerlist.oop = op; /* ourselves */
1942 
1943 	/* Insert openowner into client's open owner list */
1944 	rfs4_dbe_lock(cp->dbe);
1945 
1946 	insque(&op->openownerlist, cp->openownerlist.prev);
1947 
1948 	rfs4_dbe_unlock(cp->dbe);
1949 
1950 	return (TRUE);
1951 }
1952 
1953 rfs4_openowner_t *
1954 rfs4_findopenowner(open_owner4 *openowner, bool_t *create, seqid4 seqid)
1955 {
1956 	rfs4_openowner_t *op;
1957 	rfs4_openowner_t arg;
1958 
1959 	arg.owner = *openowner;
1960 	arg.open_seqid = seqid;
1961 	op = (rfs4_openowner_t *)rfs4_dbsearch(rfs4_openowner_idx, openowner,
1962 					    create, &arg, RFS4_DBS_VALID);
1963 
1964 	return (op);
1965 }
1966 
1967 void
1968 rfs4_update_open_sequence(rfs4_openowner_t *op)
1969 {
1970 
1971 	rfs4_dbe_lock(op->dbe);
1972 
1973 	op->open_seqid++;
1974 
1975 	rfs4_dbe_unlock(op->dbe);
1976 }
1977 
1978 void
1979 rfs4_update_open_resp(rfs4_openowner_t *op, nfs_resop4 *resp, nfs_fh4 *fh)
1980 {
1981 
1982 	rfs4_dbe_lock(op->dbe);
1983 
1984 	rfs4_free_reply(op->reply);
1985 
1986 	rfs4_copy_reply(op->reply, resp);
1987 
1988 	/* Save the filehandle if provided and free if not used */
1989 	if (resp->nfs_resop4_u.opopen.status == NFS4_OK &&
1990 	    fh && fh->nfs_fh4_len) {
1991 		if (op->reply_fh.nfs_fh4_val == NULL)
1992 			op->reply_fh.nfs_fh4_val =
1993 				kmem_alloc(fh->nfs_fh4_len, KM_SLEEP);
1994 		nfs_fh4_copy(fh, &op->reply_fh);
1995 	} else {
1996 		if (op->reply_fh.nfs_fh4_val) {
1997 			kmem_free(op->reply_fh.nfs_fh4_val,
1998 				op->reply_fh.nfs_fh4_len);
1999 			op->reply_fh.nfs_fh4_val = NULL;
2000 			op->reply_fh.nfs_fh4_len = 0;
2001 		}
2002 	}
2003 
2004 	rfs4_dbe_unlock(op->dbe);
2005 }
2006 
2007 static bool_t
2008 lockowner_compare(rfs4_entry_t u_entry, void *key)
2009 {
2010 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2011 	lock_owner4 *b = (lock_owner4 *)key;
2012 
2013 	if (lo->owner.clientid != b->clientid)
2014 		return (FALSE);
2015 
2016 	if (lo->owner.owner_len != b->owner_len)
2017 		return (FALSE);
2018 
2019 	return (bcmp(lo->owner.owner_val, b->owner_val,
2020 					lo->owner.owner_len) == 0);
2021 }
2022 
2023 void *
2024 lockowner_mkkey(rfs4_entry_t u_entry)
2025 {
2026 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2027 
2028 	return (&lo->owner);
2029 }
2030 
2031 static uint32_t
2032 lockowner_hash(void *key)
2033 {
2034 	int i;
2035 	lock_owner4 *lockowner = key;
2036 	uint_t hash = 0;
2037 
2038 	for (i = 0; i < lockowner->owner_len; i++) {
2039 		hash <<= 4;
2040 		hash += (uint_t)lockowner->owner_val[i];
2041 	}
2042 	hash += (uint_t)lockowner->clientid;
2043 	hash |= (lockowner->clientid >> 32);
2044 
2045 	return (hash);
2046 }
2047 
/*
 * Hash for the "pid" index.  The key is a pid carried in the pointer
 * value itself; the hash is that value truncated to 32 bits.
 */
static uint32_t
pid_hash(void *key)
{
	uintptr_t raw = (uintptr_t)key;

	return ((uint32_t)raw);
}
2053 
2054 static void *
2055 pid_mkkey(rfs4_entry_t u_entry)
2056 {
2057 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2058 
2059 	return ((void *)(uintptr_t)lo->pid);
2060 }
2061 
2062 static bool_t
2063 pid_compare(rfs4_entry_t u_entry, void *key)
2064 {
2065 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2066 
2067 	return (lo->pid == (pid_t)(uintptr_t)key);
2068 }
2069 
2070 static void
2071 rfs4_lockowner_destroy(rfs4_entry_t u_entry)
2072 {
2073 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2074 
2075 	/* Free the lock owner id */
2076 	kmem_free(lo->owner.owner_val, lo->owner.owner_len);
2077 	rfs4_client_rele(lo->client);
2078 }
2079 
2080 void
2081 rfs4_lockowner_rele(rfs4_lockowner_t *lo)
2082 {
2083 	rfs4_dbe_rele(lo->dbe);
2084 }
2085 
2086 /* ARGSUSED */
2087 static bool_t
2088 rfs4_lockowner_expiry(rfs4_entry_t u_entry)
2089 {
2090 	/*
2091 	 * Since expiry is called with no other references on
2092 	 * this struct, go ahead and have it removed.
2093 	 */
2094 	return (TRUE);
2095 }
2096 
2097 static bool_t
2098 rfs4_lockowner_create(rfs4_entry_t u_entry, void *arg)
2099 {
2100 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2101 	lock_owner4 *lockowner = (lock_owner4 *)arg;
2102 	rfs4_client_t *cp;
2103 	bool_t create = FALSE;
2104 
2105 	rw_enter(&rfs4_findclient_lock, RW_READER);
2106 
2107 	cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_clientid_idx,
2108 					&lockowner->clientid,
2109 					&create, NULL, RFS4_DBS_VALID);
2110 
2111 	rw_exit(&rfs4_findclient_lock);
2112 
2113 	if (cp == NULL)
2114 		return (FALSE);
2115 
2116 	/* Reference client */
2117 	lo->client = cp;
2118 	lo->owner.clientid = lockowner->clientid;
2119 	lo->owner.owner_val = kmem_alloc(lockowner->owner_len, KM_SLEEP);
2120 	bcopy(lockowner->owner_val, lo->owner.owner_val, lockowner->owner_len);
2121 	lo->owner.owner_len = lockowner->owner_len;
2122 	lo->pid = rfs4_dbe_getid(lo->dbe);
2123 
2124 	return (TRUE);
2125 }
2126 
2127 rfs4_lockowner_t *
2128 rfs4_findlockowner(lock_owner4 *lockowner, bool_t *create)
2129 {
2130 	rfs4_lockowner_t *lo;
2131 
2132 	lo = (rfs4_lockowner_t *)rfs4_dbsearch(rfs4_lockowner_idx, lockowner,
2133 					    create, lockowner, RFS4_DBS_VALID);
2134 
2135 	return (lo);
2136 }
2137 
2138 rfs4_lockowner_t *
2139 rfs4_findlockowner_by_pid(pid_t pid)
2140 {
2141 	rfs4_lockowner_t *lo;
2142 	bool_t create = FALSE;
2143 
2144 	lo = (rfs4_lockowner_t *)rfs4_dbsearch(rfs4_lockowner_pid_idx,
2145 		(void *)(uintptr_t)pid, &create, NULL, RFS4_DBS_VALID);
2146 
2147 	return (lo);
2148 }
2149 
2150 
2151 static uint32_t
2152 file_hash(void *key)
2153 {
2154 	return (ADDRHASH(key));
2155 }
2156 
2157 static void *
2158 file_mkkey(rfs4_entry_t u_entry)
2159 {
2160 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2161 
2162 	return (fp->vp);
2163 }
2164 
2165 static bool_t
2166 file_compare(rfs4_entry_t u_entry, void *key)
2167 {
2168 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2169 
2170 	return (fp->vp == (vnode_t *)key);
2171 }
2172 
2173 static void
2174 rfs4_file_destroy(rfs4_entry_t u_entry)
2175 {
2176 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2177 
2178 	ASSERT(fp->delegationlist.next == &fp->delegationlist);
2179 	if (fp->filehandle.nfs_fh4_val)
2180 		kmem_free(fp->filehandle.nfs_fh4_val,
2181 			fp->filehandle.nfs_fh4_len);
2182 	cv_destroy(fp->dinfo->recall_cv);
2183 	if (fp->vp) {
2184 		VN_RELE(fp->vp);
2185 		fp->vp = NULL;
2186 	}
2187 	rw_destroy(&fp->file_rwlock);
2188 }
2189 
2190 /*
2191  * Used to unlock the underlying dbe struct only
2192  */
2193 void
2194 rfs4_file_rele(rfs4_file_t *fp)
2195 {
2196 	rfs4_dbe_rele(fp->dbe);
2197 }
2198 
2199 /*
2200  * Used to unlock the file rw lock and the file's dbe entry
2201  * Only used to pair with rfs4_findfile_withlock()
2202  */
2203 void
2204 rfs4_file_rele_withunlock(rfs4_file_t *fp)
2205 {
2206 	rw_exit(&fp->file_rwlock);
2207 	rfs4_dbe_rele(fp->dbe);
2208 }
2209 
2210 typedef struct {
2211     vnode_t *vp;
2212     nfs_fh4 *fh;
2213 } rfs4_fcreate_arg;
2214 
2215 static bool_t
2216 rfs4_file_create(rfs4_entry_t u_entry, void *arg)
2217 {
2218 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2219 	rfs4_fcreate_arg *ap = (rfs4_fcreate_arg *)arg;
2220 	vnode_t *vp = ap->vp;
2221 	nfs_fh4 *fh = ap->fh;
2222 
2223 	VN_HOLD(vp);
2224 
2225 	fp->filehandle.nfs_fh4_len = 0;
2226 	fp->filehandle.nfs_fh4_val = NULL;
2227 	ASSERT(fh && fh->nfs_fh4_len);
2228 	if (fh && fh->nfs_fh4_len) {
2229 		fp->filehandle.nfs_fh4_val =
2230 			kmem_alloc(fh->nfs_fh4_len, KM_SLEEP);
2231 		nfs_fh4_copy(fh, &fp->filehandle);
2232 	}
2233 	fp->vp = vp;
2234 
2235 	/* Init list for remque/insque */
2236 	fp->delegationlist.next = fp->delegationlist.prev =
2237 		&fp->delegationlist;
2238 	fp->delegationlist.dsp = NULL; /* NULL since this is state list */
2239 
2240 	fp->share_deny = fp->share_access = fp->access_read = 0;
2241 	fp->access_write = fp->deny_read = fp->deny_write = 0;
2242 
2243 	mutex_init(fp->dinfo->recall_lock, NULL, MUTEX_DEFAULT, NULL);
2244 	cv_init(fp->dinfo->recall_cv, NULL, CV_DEFAULT, NULL);
2245 
2246 	fp->dinfo->dtype = OPEN_DELEGATE_NONE;
2247 
2248 	rw_init(&fp->file_rwlock, NULL, RW_DEFAULT, NULL);
2249 
2250 	return (TRUE);
2251 }
2252 
2253 rfs4_file_t *
2254 rfs4_findfile(vnode_t *vp, nfs_fh4 *fh, bool_t *create)
2255 {
2256 	rfs4_file_t *fp;
2257 	rfs4_fcreate_arg arg;
2258 
2259 	arg.vp = vp;
2260 	arg.fh = fh;
2261 
2262 	fp = (rfs4_file_t *)rfs4_dbsearch(rfs4_file_idx, vp, create,
2263 					&arg, RFS4_DBS_VALID);
2264 	return (fp);
2265 }
2266 
2267 /*
2268  * Find a file in the db and once it is located, take the rw lock.
2269  * Need to check the vnode pointer and if it does not exist (it was
2270  * removed between the db location and check) redo the find.  This
2271  * assumes that a file struct that has a NULL vnode pointer is marked
2272  * at 'invalid' and will not be found in the db the second time
2273  * around.
2274  */
2275 rfs4_file_t *
2276 rfs4_findfile_withlock(vnode_t *vp, nfs_fh4 *fh, bool_t *create)
2277 {
2278 	rfs4_file_t *fp;
2279 	rfs4_fcreate_arg arg;
2280 	bool_t screate = *create;
2281 
2282 retry:
2283 	arg.vp = vp;
2284 	arg.fh = fh;
2285 
2286 	fp = (rfs4_file_t *)rfs4_dbsearch(rfs4_file_idx, vp, create,
2287 					&arg, RFS4_DBS_VALID);
2288 	if (fp != NULL) {
2289 		rw_enter(&fp->file_rwlock, RW_WRITER);
2290 		if (fp->vp == NULL) {
2291 			rw_exit(&fp->file_rwlock);
2292 			rfs4_file_rele(fp);
2293 			*create = screate;
2294 			goto retry;
2295 		}
2296 	}
2297 
2298 	return (fp);
2299 }
2300 
2301 static uint32_t
2302 lo_state_hash(void *key)
2303 {
2304 	stateid_t *id = key;
2305 
2306 	return (id->bits.ident+id->bits.pid);
2307 }
2308 
2309 static bool_t
2310 lo_state_compare(rfs4_entry_t u_entry, void *key)
2311 {
2312 	rfs4_lo_state_t *lop = (rfs4_lo_state_t *)u_entry;
2313 	stateid_t *id = key;
2314 	bool_t rc;
2315 
2316 	rc = (lop->lockid.bits.boottime == id->bits.boottime &&
2317 	    lop->lockid.bits.type == id->bits.type &&
2318 	    lop->lockid.bits.ident == id->bits.ident &&
2319 	    lop->lockid.bits.pid == id->bits.pid);
2320 
2321 	return (rc);
2322 }
2323 
2324 static void *
2325 lo_state_mkkey(rfs4_entry_t u_entry)
2326 {
2327 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2328 
2329 	return (&lsp->lockid);
2330 }
2331 
2332 static bool_t
2333 rfs4_lo_state_expiry(rfs4_entry_t u_entry)
2334 {
2335 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2336 
2337 	if (rfs4_dbe_is_invalid(lsp->dbe))
2338 		return (TRUE);
2339 	if (lsp->state->closed)
2340 		return (TRUE);
2341 	return ((gethrestime_sec() - lsp->state->owner->client->last_access
2342 		> rfs4_lease_time));
2343 }
2344 
2345 static void
2346 rfs4_lo_state_destroy(rfs4_entry_t u_entry)
2347 {
2348 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2349 
2350 	rfs4_sw_destroy(&lsp->ls_sw);
2351 
2352 	/* Make sure to release the file locks */
2353 	if (lsp->locks_cleaned == FALSE) {
2354 		lsp->locks_cleaned = TRUE;
2355 		if (lsp->locker->client->sysidt != LM_NOSYSID) {
2356 			/* Is the PxFS kernel module loaded? */
2357 			if (lm_remove_file_locks != NULL) {
2358 				int new_sysid;
2359 
2360 				/* Encode the cluster nodeid in new sysid */
2361 				new_sysid = lsp->locker->client->sysidt;
2362 				lm_set_nlmid_flk(&new_sysid);
2363 
2364 				/*
2365 				 * This PxFS routine removes file locks for a
2366 				 * client over all nodes of a cluster.
2367 				 */
2368 				NFS4_DEBUG(rfs4_debug, (CE_NOTE,
2369 				    "lm_remove_file_locks(sysid=0x%x)\n",
2370 				    new_sysid));
2371 				(*lm_remove_file_locks)(new_sysid);
2372 			} else {
2373 				(void) cleanlocks(lsp->state->finfo->vp,
2374 				    lsp->locker->pid,
2375 				    lsp->locker->client->sysidt);
2376 			}
2377 		}
2378 	}
2379 
2380 	rfs4_dbe_lock(lsp->state->dbe);
2381 
2382 	remque(&lsp->lockownerlist);
2383 	lsp->lockownerlist.next = lsp->lockownerlist.prev =
2384 		&lsp->lockownerlist;
2385 
2386 	rfs4_dbe_unlock(lsp->state->dbe);
2387 
2388 	/* Free the last reply for this state */
2389 	rfs4_free_reply(lsp->reply);
2390 
2391 	rfs4_lockowner_rele(lsp->locker);
2392 	lsp->locker = NULL;
2393 
2394 	rfs4_state_rele_nounlock(lsp->state);
2395 	lsp->state = NULL;
2396 }
2397 
2398 static bool_t
2399 rfs4_lo_state_create(rfs4_entry_t u_entry, void *arg)
2400 {
2401 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2402 	rfs4_lo_state_t *argp = (rfs4_lo_state_t *)arg;
2403 	rfs4_lockowner_t *lo = argp->locker;
2404 	rfs4_state_t *sp = argp->state;
2405 
2406 	lsp->state = sp;
2407 
2408 	lsp->lockid = sp->stateid;
2409 	lsp->lockid.bits.type = LOCKID;
2410 	lsp->lockid.bits.chgseq = 0;
2411 	lsp->lockid.bits.pid = lo->pid;
2412 
2413 	lsp->locks_cleaned = FALSE;
2414 	lsp->lock_completed = FALSE;
2415 
2416 	rfs4_sw_init(&lsp->ls_sw);
2417 
2418 	/* Attached the supplied lock owner */
2419 	rfs4_dbe_hold(lo->dbe);
2420 	lsp->locker = lo;
2421 
2422 	lsp->lockownerlist.next = lsp->lockownerlist.prev =
2423 		&lsp->lockownerlist;
2424 	lsp->lockownerlist.lsp = lsp;
2425 
2426 	rfs4_dbe_lock(sp->dbe);
2427 
2428 	insque(&lsp->lockownerlist, sp->lockownerlist.prev);
2429 
2430 	rfs4_dbe_hold(sp->dbe);
2431 
2432 	rfs4_dbe_unlock(sp->dbe);
2433 
2434 	return (TRUE);
2435 }
2436 
2437 void
2438 rfs4_lo_state_rele(rfs4_lo_state_t *lsp, bool_t unlock_fp)
2439 {
2440 	if (unlock_fp == TRUE)
2441 		rw_exit(&lsp->state->finfo->file_rwlock);
2442 	rfs4_dbe_rele(lsp->dbe);
2443 }
2444 
2445 static rfs4_lo_state_t *
2446 rfs4_findlo_state(stateid_t *id, bool_t lock_fp)
2447 {
2448 	rfs4_lo_state_t *lsp;
2449 	bool_t create = FALSE;
2450 
2451 	lsp = (rfs4_lo_state_t *)rfs4_dbsearch(rfs4_lo_state_idx, id,
2452 					    &create, NULL, RFS4_DBS_VALID);
2453 	if (lock_fp == TRUE && lsp != NULL)
2454 		rw_enter(&lsp->state->finfo->file_rwlock, RW_READER);
2455 
2456 	return (lsp);
2457 }
2458 
2459 
2460 static uint32_t
2461 lo_state_lo_hash(void *key)
2462 {
2463 	rfs4_lo_state_t *lop = key;
2464 
2465 	return (ADDRHASH(lop->locker) ^ ADDRHASH(lop->state));
2466 }
2467 
2468 static bool_t
2469 lo_state_lo_compare(rfs4_entry_t u_entry, void *key)
2470 {
2471 	rfs4_lo_state_t *lop = (rfs4_lo_state_t *)u_entry;
2472 	rfs4_lo_state_t *keyp = key;
2473 
2474 	return (keyp->locker == lop->locker && keyp->state == lop->state);
2475 }
2476 
2477 static void *
2478 lo_state_lo_mkkey(rfs4_entry_t u_entry)
2479 {
2480 	return (u_entry);
2481 }
2482 
2483 rfs4_lo_state_t *
2484 rfs4_findlo_state_by_owner(rfs4_lockowner_t *lo,
2485 			rfs4_state_t *sp, bool_t *create)
2486 {
2487 	rfs4_lo_state_t *lsp;
2488 	rfs4_lo_state_t arg;
2489 
2490 	arg.locker = lo;
2491 	arg.state = sp;
2492 
2493 	lsp = (rfs4_lo_state_t *)rfs4_dbsearch(rfs4_lo_state_owner_idx, &arg,
2494 					    create, &arg, RFS4_DBS_VALID);
2495 
2496 	return (lsp);
2497 }
2498 
2499 static stateid_t
2500 get_stateid(id_t eid)
2501 {
2502 	stateid_t id;
2503 
2504 	id.bits.boottime = rfs4_start_time;
2505 	id.bits.ident = eid;
2506 	id.bits.chgseq = 0;
2507 	id.bits.type = 0;
2508 	id.bits.pid = 0;
2509 
2510 	/*
2511 	 * If we are booted as a cluster node, embed our nodeid.
2512 	 * We've already done sanity checks in rfs4_client_create() so no
2513 	 * need to repeat them here.
2514 	 */
2515 	id.bits.clnodeid = (cluster_bootflags & CLUSTER_BOOTED) ?
2516 	    clconf_get_nodeid() : 0;
2517 
2518 	return (id);
2519 }
2520 
2521 /*
2522  * For use only when booted as a cluster node.
2523  * Returns TRUE if the embedded nodeid indicates that this stateid was
2524  * generated on another node.
2525  */
2526 static int
2527 foreign_stateid(stateid_t *id)
2528 {
2529 	ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2530 	return (id->bits.clnodeid != (uint32_t)clconf_get_nodeid());
2531 }
2532 
2533 /*
2534  * For use only when booted as a cluster node.
2535  * Returns TRUE if the embedded nodeid indicates that this clientid was
2536  * generated on another node.
2537  */
2538 static int
2539 foreign_clientid(cid *cidp)
2540 {
2541 	ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2542 	return (cidp->impl_id.c_id >> CLUSTER_NODEID_SHIFT !=
2543 	    (uint32_t)clconf_get_nodeid());
2544 }
2545 
2546 /*
2547  * For use only when booted as a cluster node.
2548  * Embed our cluster nodeid into the clientid.
2549  */
2550 static void
2551 embed_nodeid(cid *cidp)
2552 {
2553 	int clnodeid;
2554 	/*
2555 	 * Currently, our state tables are small enough that their
2556 	 * ids will leave enough bits free for the nodeid. If the
2557 	 * tables become larger, we mustn't overwrite the id.
2558 	 * Equally, we only have room for so many bits of nodeid, so
2559 	 * must check that too.
2560 	 */
2561 	ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2562 	ASSERT(cidp->impl_id.c_id >> CLUSTER_NODEID_SHIFT == 0);
2563 	clnodeid = clconf_get_nodeid();
2564 	ASSERT(clnodeid <= CLUSTER_MAX_NODEID);
2565 	ASSERT(clnodeid != NODEID_UNKNOWN);
2566 	cidp->impl_id.c_id |= (clnodeid << CLUSTER_NODEID_SHIFT);
2567 }
2568 
2569 static uint32_t
2570 state_hash(void *key)
2571 {
2572 	stateid_t *ip = (stateid_t *)key;
2573 
2574 	return (ip->bits.ident);
2575 }
2576 
2577 static bool_t
2578 state_compare(rfs4_entry_t u_entry, void *key)
2579 {
2580 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2581 	stateid_t *id = (stateid_t *)key;
2582 	bool_t rc;
2583 
2584 	rc = (sp->stateid.bits.boottime == id->bits.boottime &&
2585 	    sp->stateid.bits.ident == id->bits.ident);
2586 
2587 	return (rc);
2588 }
2589 
2590 static void *
2591 state_mkkey(rfs4_entry_t u_entry)
2592 {
2593 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2594 
2595 	return (&sp->stateid);
2596 }
2597 
2598 static void
2599 rfs4_state_destroy(rfs4_entry_t u_entry)
2600 {
2601 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2602 
2603 	ASSERT(&sp->lockownerlist == sp->lockownerlist.next);
2604 
2605 	/* release any share locks for this stateid if it's still open */
2606 	if (!sp->closed)
2607 		rfs4_unshare(sp);
2608 
2609 	/* Were done with the file */
2610 	rfs4_file_rele(sp->finfo);
2611 	sp->finfo = NULL;
2612 
2613 	/* And now with the openowner */
2614 	rfs4_dbe_lock(sp->owner->dbe);
2615 
2616 	remque(&sp->ownerstateids);
2617 	sp->ownerstateids.next = sp->ownerstateids.prev = &sp->ownerstateids;
2618 
2619 	rfs4_dbe_unlock(sp->owner->dbe);
2620 
2621 	rfs4_openowner_rele(sp->owner);
2622 	sp->owner = NULL;
2623 }
2624 
2625 static void
2626 rfs4_state_rele_nounlock(rfs4_state_t *sp)
2627 {
2628 	rfs4_dbe_rele(sp->dbe);
2629 }
2630 
2631 void
2632 rfs4_state_rele(rfs4_state_t *sp)
2633 {
2634 	rw_exit(&sp->finfo->file_rwlock);
2635 	rfs4_dbe_rele(sp->dbe);
2636 }
2637 
2638 static uint32_t
2639 deleg_hash(void *key)
2640 {
2641 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)key;
2642 
2643 	return (ADDRHASH(dsp->client) ^ ADDRHASH(dsp->finfo));
2644 }
2645 
2646 static bool_t
2647 deleg_compare(rfs4_entry_t u_entry, void *key)
2648 {
2649 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2650 	rfs4_deleg_state_t *kdsp = (rfs4_deleg_state_t *)key;
2651 
2652 	return (dsp->client == kdsp->client && dsp->finfo == kdsp->finfo);
2653 }
2654 
2655 static void *
2656 deleg_mkkey(rfs4_entry_t u_entry)
2657 {
2658 	return (u_entry);
2659 }
2660 
2661 static uint32_t
2662 deleg_state_hash(void *key)
2663 {
2664 	stateid_t *ip = (stateid_t *)key;
2665 
2666 	return (ip->bits.ident);
2667 }
2668 
2669 static bool_t
2670 deleg_state_compare(rfs4_entry_t u_entry, void *key)
2671 {
2672 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2673 	stateid_t *id = (stateid_t *)key;
2674 	bool_t rc;
2675 
2676 	if (id->bits.type != DELEGID)
2677 		return (FALSE);
2678 
2679 	rc = (dsp->delegid.bits.boottime == id->bits.boottime &&
2680 	    dsp->delegid.bits.ident == id->bits.ident);
2681 
2682 	return (rc);
2683 }
2684 
2685 static void *
2686 deleg_state_mkkey(rfs4_entry_t u_entry)
2687 {
2688 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2689 
2690 	return (&dsp->delegid);
2691 }
2692 
2693 static bool_t
2694 rfs4_deleg_state_expiry(rfs4_entry_t u_entry)
2695 {
2696 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2697 
2698 	if (rfs4_dbe_is_invalid(dsp->dbe))
2699 		return (TRUE);
2700 	return ((gethrestime_sec() - dsp->client->last_access
2701 		> rfs4_lease_time));
2702 
2703 }
2704 
2705 static bool_t
2706 rfs4_deleg_state_create(rfs4_entry_t u_entry, void *argp)
2707 {
2708 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2709 	rfs4_file_t *fp = ((rfs4_deleg_state_t *)argp)->finfo;
2710 	rfs4_client_t *cp = ((rfs4_deleg_state_t *)argp)->client;
2711 
2712 	rfs4_dbe_hold(fp->dbe);
2713 	rfs4_dbe_hold(cp->dbe);
2714 
2715 	dsp->delegid = get_stateid(rfs4_dbe_getid(dsp->dbe));
2716 	dsp->delegid.bits.type = DELEGID;
2717 	dsp->finfo = fp;
2718 	dsp->client = cp;
2719 	dsp->dtype = OPEN_DELEGATE_NONE;
2720 
2721 	dsp->time_granted = gethrestime_sec();	/* observability */
2722 	dsp->time_revoked = 0;
2723 
2724 	/* Init lists for remque/insque */
2725 	dsp->delegationlist.next = dsp->delegationlist.prev =
2726 		&dsp->delegationlist;
2727 	dsp->delegationlist.dsp = dsp;
2728 
2729 	dsp->clientdeleglist.next = dsp->clientdeleglist.prev =
2730 		&dsp->clientdeleglist;
2731 	dsp->clientdeleglist.dsp = dsp;
2732 
2733 	/* Insert state on per open owner's list */
2734 	rfs4_dbe_lock(cp->dbe);
2735 
2736 	insque(&dsp->clientdeleglist, cp->clientdeleglist.prev);
2737 
2738 	rfs4_dbe_unlock(cp->dbe);
2739 
2740 	return (TRUE);
2741 }
2742 
2743 static void
2744 rfs4_deleg_state_destroy(rfs4_entry_t u_entry)
2745 {
2746 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2747 
2748 	if (&dsp->delegationlist != dsp->delegationlist.next)
2749 		rfs4_return_deleg(dsp, FALSE);
2750 
2751 	/* Were done with the file */
2752 	rfs4_file_rele(dsp->finfo);
2753 	dsp->finfo = NULL;
2754 
2755 	/* And now with the openowner */
2756 	rfs4_dbe_lock(dsp->client->dbe);
2757 
2758 	remque(&dsp->clientdeleglist);
2759 	dsp->clientdeleglist.next = dsp->clientdeleglist.prev =
2760 		&dsp->clientdeleglist;
2761 
2762 	rfs4_dbe_unlock(dsp->client->dbe);
2763 
2764 	rfs4_client_rele(dsp->client);
2765 	dsp->client = NULL;
2766 }
2767 
2768 rfs4_deleg_state_t *
2769 rfs4_finddeleg(rfs4_state_t *sp, bool_t *create)
2770 {
2771 	rfs4_deleg_state_t ds, *dsp;
2772 
2773 	ds.client = sp->owner->client;
2774 	ds.finfo = sp->finfo;
2775 
2776 	dsp = (rfs4_deleg_state_t *)rfs4_dbsearch(rfs4_deleg_idx, &ds,
2777 					create, &ds, RFS4_DBS_VALID);
2778 
2779 	return (dsp);
2780 }
2781 
2782 rfs4_deleg_state_t *
2783 rfs4_finddelegstate(stateid_t *id)
2784 {
2785 	rfs4_deleg_state_t *dsp;
2786 	bool_t create = FALSE;
2787 
2788 	dsp = (rfs4_deleg_state_t *)rfs4_dbsearch(rfs4_deleg_state_idx, id,
2789 					&create, NULL, RFS4_DBS_VALID);
2790 
2791 	return (dsp);
2792 }
2793 
2794 void
2795 rfs4_deleg_state_rele(rfs4_deleg_state_t *dsp)
2796 {
2797 	rfs4_dbe_rele(dsp->dbe);
2798 }
2799 
2800 void
2801 rfs4_update_lock_sequence(rfs4_lo_state_t *lsp)
2802 {
2803 
2804 	rfs4_dbe_lock(lsp->dbe);
2805 
2806 	/*
2807 	 * If we are skipping sequence id checking, this means that
2808 	 * this is the first lock request and therefore the sequence
2809 	 * id does not need to be updated.  This only happens on the
2810 	 * first lock request for a lockowner
2811 	 */
2812 	if (!lsp->skip_seqid_check)
2813 		lsp->seqid++;
2814 
2815 	rfs4_dbe_unlock(lsp->dbe);
2816 }
2817 
2818 void
2819 rfs4_update_lock_resp(rfs4_lo_state_t *lsp, nfs_resop4 *resp)
2820 {
2821 
2822 	rfs4_dbe_lock(lsp->dbe);
2823 
2824 	rfs4_free_reply(lsp->reply);
2825 
2826 	rfs4_copy_reply(lsp->reply, resp);
2827 
2828 	rfs4_dbe_unlock(lsp->dbe);
2829 }
2830 
2831 void
2832 rfs4_free_opens(rfs4_openowner_t *op, bool_t invalidate,
2833 	bool_t close_of_client)
2834 {
2835 	rfs4_state_t *sp;
2836 
2837 	rfs4_dbe_lock(op->dbe);
2838 
2839 	for (sp = op->ownerstateids.next->sp; sp != NULL;
2840 		sp = sp->ownerstateids.next->sp) {
2841 		rfs4_state_close(sp, FALSE, close_of_client, CRED());
2842 		if (invalidate == TRUE)
2843 			rfs4_dbe_invalidate(sp->dbe);
2844 	}
2845 
2846 	rfs4_dbe_unlock(op->dbe);
2847 	rfs4_dbe_invalidate(op->dbe);
2848 }
2849 
2850 static uint32_t
2851 state_owner_file_hash(void *key)
2852 {
2853 	rfs4_state_t *sp = key;
2854 
2855 	return (ADDRHASH(sp->owner) ^ ADDRHASH(sp->finfo));
2856 }
2857 
2858 static bool_t
2859 state_owner_file_compare(rfs4_entry_t u_entry, void *key)
2860 {
2861 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2862 	rfs4_state_t *arg = key;
2863 
2864 	if (sp->closed == TRUE)
2865 		return (FALSE);
2866 
2867 	return (arg->owner == sp->owner && arg->finfo == sp->finfo);
2868 }
2869 
2870 static void *
2871 state_owner_file_mkkey(rfs4_entry_t u_entry)
2872 {
2873 	return (u_entry);
2874 }
2875 
2876 static uint32_t
2877 state_file_hash(void *key)
2878 {
2879 	return (ADDRHASH(key));
2880 }
2881 
2882 static bool_t
2883 state_file_compare(rfs4_entry_t u_entry, void *key)
2884 {
2885 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2886 	rfs4_file_t *fp = key;
2887 
2888 	if (sp->closed == TRUE)
2889 		return (FALSE);
2890 
2891 	return (fp == sp->finfo);
2892 }
2893 
2894 static void *
2895 state_file_mkkey(rfs4_entry_t u_entry)
2896 {
2897 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2898 
2899 	return (sp->finfo);
2900 }
2901 
2902 rfs4_state_t *
2903 rfs4_findstate_by_owner_file(rfs4_openowner_t *op, rfs4_file_t *file,
2904 	bool_t *create)
2905 {
2906 	rfs4_state_t *sp;
2907 	rfs4_state_t key;
2908 
2909 	key.owner = op;
2910 	key.finfo = file;
2911 
2912 	sp = (rfs4_state_t *)rfs4_dbsearch(rfs4_state_owner_file_idx, &key,
2913 					create, &key, RFS4_DBS_VALID);
2914 
2915 	return (sp);
2916 }
2917 
2918 /* This returns ANY state struct that refers to this file */
2919 static rfs4_state_t *
2920 rfs4_findstate_by_file(rfs4_file_t *fp)
2921 {
2922 	bool_t create = FALSE;
2923 
2924 	return ((rfs4_state_t *)rfs4_dbsearch(rfs4_state_file_idx, fp,
2925 		&create, fp, RFS4_DBS_VALID));
2926 }
2927 
2928 static bool_t
2929 rfs4_state_expiry(rfs4_entry_t u_entry)
2930 {
2931 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2932 
2933 	if (rfs4_dbe_is_invalid(sp->dbe))
2934 		return (TRUE);
2935 
2936 	if (sp->closed == TRUE &&
2937 	    ((gethrestime_sec() - rfs4_dbe_get_timerele(sp->dbe))
2938 		> rfs4_lease_time))
2939 		return (TRUE);
2940 
2941 	return ((gethrestime_sec() - sp->owner->client->last_access
2942 		> rfs4_lease_time));
2943 }
2944 
2945 static bool_t
2946 rfs4_state_create(rfs4_entry_t u_entry, void *argp)
2947 {
2948 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2949 	rfs4_file_t *fp = ((rfs4_state_t *)argp)->finfo;
2950 	rfs4_openowner_t *op = ((rfs4_state_t *)argp)->owner;
2951 
2952 	rfs4_dbe_hold(fp->dbe);
2953 	rfs4_dbe_hold(op->dbe);
2954 	sp->stateid = get_stateid(rfs4_dbe_getid(sp->dbe));
2955 	sp->stateid.bits.type = OPENID;
2956 	sp->owner = op;
2957 	sp->finfo = fp;
2958 
2959 	/* Init lists for remque/insque */
2960 	sp->ownerstateids.next = sp->ownerstateids.prev = &sp->ownerstateids;
2961 	sp->ownerstateids.sp = sp;
2962 	sp->lockownerlist.next = sp->lockownerlist.prev = &sp->lockownerlist;
2963 	sp->lockownerlist.lsp = NULL;
2964 
2965 	/* Insert state on per open owner's list */
2966 	rfs4_dbe_lock(op->dbe);
2967 
2968 	insque(&sp->ownerstateids, op->ownerstateids.prev);
2969 
2970 	rfs4_dbe_unlock(op->dbe);
2971 
2972 	return (TRUE);
2973 }
2974 
2975 static rfs4_state_t *
2976 rfs4_findstate(stateid_t *id, rfs4_dbsearch_type_t find_invalid,
2977 		bool_t lock_fp)
2978 {
2979 	rfs4_state_t *sp;
2980 	bool_t create = FALSE;
2981 
2982 	sp = (rfs4_state_t *)rfs4_dbsearch(rfs4_state_idx, id,
2983 					&create, NULL, find_invalid);
2984 	if (lock_fp == TRUE && sp != NULL)
2985 		rw_enter(&sp->finfo->file_rwlock, RW_READER);
2986 
2987 	return (sp);
2988 }
2989 
2990 void
2991 rfs4_state_close(rfs4_state_t *sp, bool_t lock_held,
2992 			bool_t close_of_client, cred_t *cr)
2993 {
2994 	/* Remove the associated lo_state owners */
2995 	if (!lock_held)
2996 		rfs4_dbe_lock(sp->dbe);
2997 	if (sp->closed == FALSE) {
2998 		sp->closed = TRUE;
2999 
3000 		rfs4_release_share_lock_state(sp, cr, close_of_client);
3001 	}
3002 	if (!lock_held)
3003 		rfs4_dbe_unlock(sp->dbe);
3004 }
3005 
3006 /*
3007  * Remove all state associated with the given client.
3008  */
3009 void
3010 rfs4_client_state_remove(rfs4_client_t *cp)
3011 {
3012 	rfs4_openowner_t *oop;
3013 
3014 	rfs4_dbe_lock(cp->dbe);
3015 
3016 	for (oop = cp->openownerlist.next->oop;  oop != NULL;
3017 		oop = oop->openownerlist.next->oop) {
3018 		rfs4_free_opens(oop, TRUE, TRUE);
3019 	}
3020 
3021 	rfs4_dbe_unlock(cp->dbe);
3022 }
3023 
3024 void
3025 rfs4_client_close(rfs4_client_t *cp)
3026 {
3027 	/* Mark client as going away. */
3028 	rfs4_dbe_lock(cp->dbe);
3029 	rfs4_dbe_invalidate(cp->dbe);
3030 	rfs4_dbe_unlock(cp->dbe);
3031 
3032 	rfs4_client_state_remove(cp);
3033 
3034 	/* Release the client */
3035 	rfs4_client_rele(cp);
3036 }
3037 
3038 nfsstat4
3039 rfs4_check_clientid(clientid4 *cp, int setclid_confirm)
3040 {
3041 	cid *cidp = (cid *) cp;
3042 
3043 	/*
3044 	 * If we are booted as a cluster node, check the embedded nodeid.
3045 	 * If it indicates that this clientid was generated on another node,
3046 	 * inform the client accordingly.
3047 	 */
3048 	if (cluster_bootflags & CLUSTER_BOOTED && foreign_clientid(cidp))
3049 		return (NFS4ERR_STALE_CLIENTID);
3050 
3051 	/*
3052 	 * If the server start time matches the time provided
3053 	 * by the client (via the clientid) and this is NOT a
3054 	 * setclientid_confirm then return EXPIRED.
3055 	 */
3056 	if (!setclid_confirm && cidp->impl_id.start_time == rfs4_start_time)
3057 		return (NFS4ERR_EXPIRED);
3058 
3059 	return (NFS4ERR_STALE_CLIENTID);
3060 }
3061 
3062 /*
3063  * This is used when a stateid has not been found amongst the
3064  * current server's state.  Check the stateid to see if it
3065  * was from this server instantiation or not.
3066  */
3067 static nfsstat4
3068 what_stateid_error(stateid_t *id, stateid_type_t type)
3069 {
3070 	/* If we are booted as a cluster node, was stateid locally generated? */
3071 	if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3072 		return (NFS4ERR_STALE_STATEID);
3073 
3074 	/* If types don't match then no use checking further */
3075 	if (type != id->bits.type)
3076 		return (NFS4ERR_BAD_STATEID);
3077 
3078 	/* From a previous server instantiation, return STALE */
3079 	if (id->bits.boottime < rfs4_start_time)
3080 		return (NFS4ERR_STALE_STATEID);
3081 
3082 	/*
3083 	 * From this server but the state is most likely beyond lease
3084 	 * timeout: return NFS4ERR_EXPIRED.  However, there is the
3085 	 * case of a delegation stateid.  For delegations, there is a
3086 	 * case where the state can be removed without the client's
3087 	 * knowledge/consent: revocation.  In the case of delegation
3088 	 * revocation, the delegation state will be removed and will
3089 	 * not be found.  If the client does something like a
3090 	 * DELEGRETURN or even a READ/WRITE with a delegatoin stateid
3091 	 * that has been revoked, the server should return BAD_STATEID
3092 	 * instead of the more common EXPIRED error.
3093 	 */
3094 	if (id->bits.boottime == rfs4_start_time) {
3095 		if (type == DELEGID)
3096 			return (NFS4ERR_BAD_STATEID);
3097 		else
3098 			return (NFS4ERR_EXPIRED);
3099 	}
3100 
3101 	return (NFS4ERR_BAD_STATEID);
3102 }
3103 
3104 /*
3105  * Used later on to find the various state structs.  When called from
3106  * rfs4_check_stateid()->rfs4_get_all_state(), no file struct lock is
3107  * taken (it is not needed) and helps on the read/write path with
3108  * respect to performance.
3109  */
3110 static nfsstat4
3111 rfs4_get_state_lockit(stateid4 *stateid, rfs4_state_t **spp,
3112 		rfs4_dbsearch_type_t find_invalid, bool_t lock_fp)
3113 {
3114 	stateid_t *id = (stateid_t *)stateid;
3115 	rfs4_state_t *sp;
3116 
3117 	*spp = NULL;
3118 
3119 	/* If we are booted as a cluster node, was stateid locally generated? */
3120 	if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3121 		return (NFS4ERR_STALE_STATEID);
3122 
3123 	sp = rfs4_findstate(id, find_invalid, lock_fp);
3124 	if (sp == NULL) {
3125 		return (what_stateid_error(id, OPENID));
3126 	}
3127 
3128 	if (rfs4_lease_expired(sp->owner->client)) {
3129 		if (lock_fp == TRUE)
3130 			rfs4_state_rele(sp);
3131 		else
3132 			rfs4_state_rele_nounlock(sp);
3133 		return (NFS4ERR_EXPIRED);
3134 	}
3135 
3136 	*spp = sp;
3137 
3138 	return (NFS4_OK);
3139 }
3140 
3141 nfsstat4
3142 rfs4_get_state(stateid4 *stateid, rfs4_state_t **spp,
3143 		rfs4_dbsearch_type_t find_invalid)
3144 {
3145 	return (rfs4_get_state_lockit(stateid, spp, find_invalid, TRUE));
3146 }
3147 
3148 int
3149 rfs4_check_stateid_seqid(rfs4_state_t *sp, stateid4 *stateid)
3150 {
3151 	stateid_t *id = (stateid_t *)stateid;
3152 
3153 	if (rfs4_lease_expired(sp->owner->client))
3154 		return (NFS4_CHECK_STATEID_EXPIRED);
3155 
3156 	/* Stateid is some time in the future - that's bad */
3157 	if (sp->stateid.bits.chgseq < id->bits.chgseq)
3158 		return (NFS4_CHECK_STATEID_BAD);
3159 
3160 	if (sp->stateid.bits.chgseq == id->bits.chgseq + 1)
3161 		return (NFS4_CHECK_STATEID_REPLAY);
3162 
3163 	/* Stateid is some time in the past - that's old */
3164 	if (sp->stateid.bits.chgseq > id->bits.chgseq)
3165 		return (NFS4_CHECK_STATEID_OLD);
3166 
3167 	/* Caller needs to know about confirmation before closure */
3168 	if (sp->owner->need_confirm)
3169 		return (NFS4_CHECK_STATEID_UNCONFIRMED);
3170 
3171 	if (sp->closed == TRUE)
3172 		return (NFS4_CHECK_STATEID_CLOSED);
3173 
3174 	return (NFS4_CHECK_STATEID_OKAY);
3175 }
3176 
3177 int
3178 rfs4_check_lo_stateid_seqid(rfs4_lo_state_t *lsp, stateid4 *stateid)
3179 {
3180 	stateid_t *id = (stateid_t *)stateid;
3181 
3182 	if (rfs4_lease_expired(lsp->state->owner->client))
3183 		return (NFS4_CHECK_STATEID_EXPIRED);
3184 
3185 	/* Stateid is some time in the future - that's bad */
3186 	if (lsp->lockid.bits.chgseq < id->bits.chgseq)
3187 		return (NFS4_CHECK_STATEID_BAD);
3188 
3189 	if (lsp->lockid.bits.chgseq == id->bits.chgseq + 1)
3190 		return (NFS4_CHECK_STATEID_REPLAY);
3191 
3192 	/* Stateid is some time in the past - that's old */
3193 	if (lsp->lockid.bits.chgseq > id->bits.chgseq)
3194 		return (NFS4_CHECK_STATEID_OLD);
3195 
3196 	return (NFS4_CHECK_STATEID_OKAY);
3197 }
3198 
3199 nfsstat4
3200 rfs4_get_deleg_state(stateid4 *stateid, rfs4_deleg_state_t **dspp)
3201 {
3202 	stateid_t *id = (stateid_t *)stateid;
3203 	rfs4_deleg_state_t *dsp;
3204 
3205 	*dspp = NULL;
3206 
3207 	/* If we are booted as a cluster node, was stateid locally generated? */
3208 	if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3209 		return (NFS4ERR_STALE_STATEID);
3210 
3211 	dsp = rfs4_finddelegstate(id);
3212 	if (dsp == NULL) {
3213 		return (what_stateid_error(id, DELEGID));
3214 	}
3215 
3216 	if (rfs4_lease_expired(dsp->client)) {
3217 		rfs4_deleg_state_rele(dsp);
3218 		return (NFS4ERR_EXPIRED);
3219 	}
3220 
3221 	*dspp = dsp;
3222 
3223 	return (NFS4_OK);
3224 }
3225 
3226 nfsstat4
3227 rfs4_get_lo_state(stateid4 *stateid, rfs4_lo_state_t **lspp, bool_t lock_fp)
3228 {
3229 	stateid_t *id = (stateid_t *)stateid;
3230 	rfs4_lo_state_t *lsp;
3231 
3232 	*lspp = NULL;
3233 
3234 	/* If we are booted as a cluster node, was stateid locally generated? */
3235 	if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3236 		return (NFS4ERR_STALE_STATEID);
3237 
3238 	lsp = rfs4_findlo_state(id, lock_fp);
3239 	if (lsp == NULL) {
3240 		return (what_stateid_error(id, LOCKID));
3241 	}
3242 
3243 	if (rfs4_lease_expired(lsp->state->owner->client)) {
3244 		rfs4_lo_state_rele(lsp, lock_fp);
3245 		return (NFS4ERR_EXPIRED);
3246 	}
3247 
3248 	*lspp = lsp;
3249 
3250 	return (NFS4_OK);
3251 }
3252 
3253 static nfsstat4
3254 rfs4_get_all_state(stateid4 *sid, rfs4_state_t **spp,
3255 	rfs4_deleg_state_t **dspp, rfs4_lo_state_t **lospp)
3256 {
3257 	rfs4_state_t *sp = NULL;
3258 	rfs4_deleg_state_t *dsp = NULL;
3259 	rfs4_lo_state_t *losp = NULL;
3260 	stateid_t *id;
3261 	nfsstat4 status;
3262 
3263 	*spp = NULL; *dspp = NULL; *lospp = NULL;
3264 
3265 	id = (stateid_t *)sid;
3266 	switch (id->bits.type) {
3267 	case OPENID:
3268 		status = rfs4_get_state_lockit(sid, &sp, FALSE, FALSE);
3269 		break;
3270 	case DELEGID:
3271 		status = rfs4_get_deleg_state(sid, &dsp);
3272 		break;
3273 	case LOCKID:
3274 		status = rfs4_get_lo_state(sid, &losp, FALSE);
3275 		if (status == NFS4_OK) {
3276 			sp = losp->state;
3277 			rfs4_dbe_hold(sp->dbe);
3278 		}
3279 		break;
3280 	default:
3281 		status = NFS4ERR_BAD_STATEID;
3282 	}
3283 
3284 	if (status == NFS4_OK) {
3285 		*spp = sp;
3286 		*dspp = dsp;
3287 		*lospp = losp;
3288 	}
3289 
3290 	return (status);
3291 }
3292 
3293 /*
3294  * Given the I/O mode (FREAD or FWRITE), this checks whether the
3295  * rfs4_state_t struct has access to do this operation and if so
3296  * return NFS4_OK; otherwise the proper NFSv4 error is returned.
3297  */
3298 nfsstat4
3299 rfs4_state_has_access(rfs4_state_t *sp, int mode, vnode_t *vp)
3300 {
3301 	nfsstat4 stat = NFS4_OK;
3302 	rfs4_file_t *fp;
3303 	bool_t create = FALSE;
3304 
3305 	rfs4_dbe_lock(sp->dbe);
3306 	if (mode == FWRITE) {
3307 		if (!(sp->share_access & OPEN4_SHARE_ACCESS_WRITE)) {
3308 			stat = NFS4ERR_OPENMODE;
3309 		}
3310 	} else if (mode == FREAD) {
3311 		if (!(sp->share_access & OPEN4_SHARE_ACCESS_READ)) {
3312 			/*
3313 			 * If we have OPENed the file with DENYing access
3314 			 * to both READ and WRITE then no one else could
3315 			 * have OPENed the file, hence no conflicting READ
3316 			 * deny.  This check is merely an optimization.
3317 			 */
3318 			if (sp->share_deny == OPEN4_SHARE_DENY_BOTH)
3319 				goto out;
3320 
3321 			/* Check against file struct's DENY mode */
3322 			fp = rfs4_findfile(vp, NULL, &create);
3323 			if (fp != NULL) {
3324 				int deny_read = 0;
3325 				rfs4_dbe_lock(fp->dbe);
3326 				/*
3327 				 * Check if any other open owner has the file
3328 				 * OPENed with deny READ.
3329 				 */
3330 				if (sp->share_deny & OPEN4_SHARE_DENY_READ)
3331 					deny_read = 1;
3332 				ASSERT(fp->deny_read - deny_read >= 0);
3333 				if (fp->deny_read - deny_read > 0)
3334 					stat = NFS4ERR_OPENMODE;
3335 				rfs4_dbe_unlock(fp->dbe);
3336 				rfs4_file_rele(fp);
3337 			}
3338 		}
3339 	} else {
3340 		/* Illegal I/O mode */
3341 		stat = NFS4ERR_INVAL;
3342 	}
3343 out:
3344 	rfs4_dbe_unlock(sp->dbe);
3345 	return (stat);
3346 }
3347 
3348 /*
3349  * Given the I/O mode (FREAD or FWRITE), the vnode, the stateid and whether
3350  * the file is being truncated, return NFS4_OK if allowed or approriate
3351  * V4 error if not. Note NFS4ERR_DELAY will be returned and a recall on
3352  * the associated file will be done if the I/O is not consistent with any
3353  * delegation in effect on the file. Should be holding VOP_RWLOCK, either
3354  * as reader or writer as appropriate. rfs4_op_open will accquire the
3355  * VOP_RWLOCK as writer when setting up delegation. If the stateid is bad
3356  * this routine will return NFS4ERR_BAD_STATEID. In addition, through the
3357  * deleg parameter, we will return whether a write delegation is held by
3358  * the client associated with this stateid.
3359  * If the server instance associated with the relevant client is in its
3360  * grace period, return NFS4ERR_GRACE.
3361  */
3362 
3363 nfsstat4
3364 rfs4_check_stateid(int mode, vnode_t *vp,
3365 		stateid4 *stateid, bool_t trunc, bool_t *deleg,
3366 		bool_t do_access)
3367 {
3368 	rfs4_file_t *fp;
3369 	bool_t create = FALSE;
3370 	rfs4_state_t *sp;
3371 	rfs4_deleg_state_t *dsp;
3372 	rfs4_lo_state_t *lsp;
3373 	stateid_t *id = (stateid_t *)stateid;
3374 	nfsstat4 stat = NFS4_OK;
3375 
3376 	if (ISSPECIAL(stateid)) {
3377 		fp = rfs4_findfile(vp, NULL, &create);
3378 		if (fp == NULL)
3379 			return (NFS4_OK);
3380 		if (fp->dinfo->dtype == OPEN_DELEGATE_NONE) {
3381 			rfs4_file_rele(fp);
3382 			return (NFS4_OK);
3383 		}
3384 		if (mode == FWRITE ||
3385 			fp->dinfo->dtype == OPEN_DELEGATE_WRITE) {
3386 			rfs4_recall_deleg(fp, trunc, NULL);
3387 			rfs4_file_rele(fp);
3388 			return (NFS4ERR_DELAY);
3389 		}
3390 		rfs4_file_rele(fp);
3391 		return (NFS4_OK);
3392 	} else {
3393 		stat = rfs4_get_all_state(stateid, &sp, &dsp, &lsp);
3394 		if (stat != NFS4_OK)
3395 			return (stat);
3396 		if (lsp != NULL) {
3397 			/* Is associated server instance in its grace period? */
3398 			if (rfs4_clnt_in_grace(lsp->locker->client)) {
3399 				rfs4_lo_state_rele(lsp, FALSE);
3400 				if (sp != NULL)
3401 					rfs4_state_rele_nounlock(sp);
3402 				return (NFS4ERR_GRACE);
3403 			}
3404 			if (id->bits.type == LOCKID) {
3405 				/* Seqid in the future? - that's bad */
3406 				if (lsp->lockid.bits.chgseq <
3407 					id->bits.chgseq) {
3408 					rfs4_lo_state_rele(lsp, FALSE);
3409 					if (sp != NULL)
3410 						rfs4_state_rele_nounlock(sp);
3411 					return (NFS4ERR_BAD_STATEID);
3412 				}
3413 				/* Seqid in the past? - that's old */
3414 				if (lsp->lockid.bits.chgseq >
3415 					id->bits.chgseq) {
3416 					rfs4_lo_state_rele(lsp, FALSE);
3417 					if (sp != NULL)
3418 						rfs4_state_rele_nounlock(sp);
3419 					return (NFS4ERR_OLD_STATEID);
3420 				}
3421 				/* Ensure specified filehandle matches */
3422 				if (lsp->state->finfo->vp != vp) {
3423 					rfs4_lo_state_rele(lsp, FALSE);
3424 					if (sp != NULL)
3425 						rfs4_state_rele_nounlock(sp);
3426 					return (NFS4ERR_BAD_STATEID);
3427 				}
3428 			}
3429 			rfs4_lo_state_rele(lsp, FALSE);
3430 		}
3431 
3432 		/* Stateid provided was an "open" stateid */
3433 		if (sp != NULL) {
3434 			/* Is associated server instance in its grace period? */
3435 			if (rfs4_clnt_in_grace(sp->owner->client)) {
3436 				rfs4_state_rele_nounlock(sp);
3437 				return (NFS4ERR_GRACE);
3438 			}
3439 			if (id->bits.type == OPENID) {
3440 				/* Seqid in the future? - that's bad */
3441 				if (sp->stateid.bits.chgseq <
3442 					id->bits.chgseq) {
3443 					rfs4_state_rele_nounlock(sp);
3444 					return (NFS4ERR_BAD_STATEID);
3445 				}
3446 				/* Seqid in the past - that's old */
3447 				if (sp->stateid.bits.chgseq >
3448 					id->bits.chgseq) {
3449 					rfs4_state_rele_nounlock(sp);
3450 					return (NFS4ERR_OLD_STATEID);
3451 				}
3452 			}
3453 			/* Ensure specified filehandle matches */
3454 			if (sp->finfo->vp != vp) {
3455 				rfs4_state_rele_nounlock(sp);
3456 				return (NFS4ERR_BAD_STATEID);
3457 			}
3458 
3459 			if (sp->owner->need_confirm) {
3460 				rfs4_state_rele_nounlock(sp);
3461 				return (NFS4ERR_BAD_STATEID);
3462 			}
3463 
3464 			if (sp->closed == TRUE) {
3465 				rfs4_state_rele_nounlock(sp);
3466 				return (NFS4ERR_OLD_STATEID);
3467 			}
3468 
3469 			if (do_access)
3470 				stat = rfs4_state_has_access(sp, mode, vp);
3471 			else
3472 				stat = NFS4_OK;
3473 
3474 			/*
3475 			 * Return whether this state has write
3476 			 * delegation if desired
3477 			 */
3478 			if (deleg &&
3479 			    (sp->finfo->dinfo->dtype == OPEN_DELEGATE_WRITE))
3480 				*deleg = TRUE;
3481 
3482 			/*
3483 			 * We got a valid stateid, so we update the
3484 			 * lease on the client. Ideally we would like
3485 			 * to do this after the calling op succeeds,
3486 			 * but for now this will be good
3487 			 * enough. Callers of this routine are
3488 			 * currently insulated from the state stuff.
3489 			 */
3490 			rfs4_update_lease(sp->owner->client);
3491 
3492 			/*
3493 			 * If a delegation is present on this file and
3494 			 * this is a WRITE, then update the lastwrite
3495 			 * time to indicate that activity is present.
3496 			 */
3497 			if (sp->finfo->dinfo->dtype == OPEN_DELEGATE_WRITE &&
3498 				mode == FWRITE) {
3499 				sp->finfo->dinfo->time_lastwrite =
3500 					gethrestime_sec();
3501 			}
3502 
3503 			rfs4_state_rele_nounlock(sp);
3504 
3505 			return (stat);
3506 		}
3507 
3508 		if (dsp != NULL) {
3509 			/* Is associated server instance in its grace period? */
3510 			if (rfs4_clnt_in_grace(dsp->client)) {
3511 				rfs4_deleg_state_rele(dsp);
3512 				return (NFS4ERR_GRACE);
3513 			}
3514 			if (dsp->delegid.bits.chgseq !=	id->bits.chgseq) {
3515 				rfs4_deleg_state_rele(dsp);
3516 				return (NFS4ERR_BAD_STATEID);
3517 			}
3518 
3519 			/* Ensure specified filehandle matches */
3520 			if (dsp->finfo->vp != vp) {
3521 				rfs4_deleg_state_rele(dsp);
3522 				return (NFS4ERR_BAD_STATEID);
3523 			}
3524 			/*
3525 			 * Return whether this state has write
3526 			 * delegation if desired
3527 			 */
3528 			if (deleg &&
3529 			    (dsp->finfo->dinfo->dtype == OPEN_DELEGATE_WRITE))
3530 				*deleg = TRUE;
3531 
3532 			rfs4_update_lease(dsp->client);
3533 
3534 			/*
3535 			 * If a delegation is present on this file and
3536 			 * this is a WRITE, then update the lastwrite
3537 			 * time to indicate that activity is present.
3538 			 */
3539 			if (dsp->finfo->dinfo->dtype == OPEN_DELEGATE_WRITE &&
3540 				mode == FWRITE) {
3541 				dsp->finfo->dinfo->time_lastwrite =
3542 					gethrestime_sec();
3543 			}
3544 
3545 			/*
3546 			 * XXX - what happens if this is a WRITE and the
3547 			 * delegation type of for READ.
3548 			 */
3549 			rfs4_deleg_state_rele(dsp);
3550 
3551 			return (stat);
3552 		}
3553 		/*
3554 		 * If we got this far, something bad happened
3555 		 */
3556 		return (NFS4ERR_BAD_STATEID);
3557 	}
3558 }
3559 
3560 
3561 /*
3562  * This is a special function in that for the file struct provided the
3563  * server wants to remove/close all current state associated with the
3564  * file.  The prime use of this would be with OP_REMOVE to force the
3565  * release of state and particularly of file locks.
3566  *
3567  * There is an assumption that there is no delegations outstanding on
3568  * this file at this point.  The caller should have waited for those
3569  * to be returned or revoked.
3570  */
3571 void
3572 rfs4_close_all_state(rfs4_file_t *fp)
3573 {
3574 	rfs4_state_t *sp;
3575 
3576 	rfs4_dbe_lock(fp->dbe);
3577 
3578 #ifdef DEBUG
3579 	/* only applies when server is handing out delegations */
3580 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE)
3581 		ASSERT(fp->dinfo->hold_grant > 0);
3582 #endif
3583 
3584 	/* No delegations for this file */
3585 	ASSERT(fp->delegationlist.next == &fp->delegationlist);
3586 
3587 	/* Make sure that it can not be found */
3588 	rfs4_dbe_invalidate(fp->dbe);
3589 
3590 	if (fp->vp == NULL) {
3591 		rfs4_dbe_unlock(fp->dbe);
3592 		return;
3593 	}
3594 	rfs4_dbe_unlock(fp->dbe);
3595 
3596 	/*
3597 	 * Hold as writer to prevent other server threads from
3598 	 * processing requests related to the file while all state is
3599 	 * being removed.
3600 	 */
3601 	rw_enter(&fp->file_rwlock, RW_WRITER);
3602 
3603 	/* Remove ALL state from the file */
3604 	while (sp = rfs4_findstate_by_file(fp)) {
3605 		rfs4_state_close(sp, FALSE, FALSE, CRED());
3606 		rfs4_state_rele_nounlock(sp);
3607 	}
3608 
3609 	/*
3610 	 * This is only safe since there are no further references to
3611 	 * the file.
3612 	 */
3613 	rfs4_dbe_lock(fp->dbe);
3614 	if (fp->vp) {
3615 		VN_RELE(fp->vp);
3616 		fp->vp = NULL;
3617 	}
3618 	rfs4_dbe_unlock(fp->dbe);
3619 
3620 	/* Finally let other references to proceed */
3621 	rw_exit(&fp->file_rwlock);
3622 }
3623 
3624 /*
3625  * This function is used as a target for the rfs4_dbe_walk() call
3626  * below.  The purpose of this function is to see if the
3627  * lockowner_state refers to a file that resides within the exportinfo
3628  * export.  If so, then remove the lock_owner state (file locks and
3629  * share "locks") for this object since the intent is the server is
3630  * unexporting the specified directory.  Be sure to invalidate the
3631  * object after the state has been released
3632  */
3633 static void
3634 rfs4_lo_state_walk_callout(rfs4_entry_t u_entry, void *e)
3635 {
3636 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
3637 	struct exportinfo *exi = (struct exportinfo *)e;
3638 	nfs_fh4_fmt_t	*exi_fhp, *finfo_fhp;
3639 
3640 	exi_fhp = (nfs_fh4_fmt_t *)&exi->exi_fh;
3641 	finfo_fhp =
3642 		(nfs_fh4_fmt_t *)lsp->state->finfo->filehandle.nfs_fh4_val;
3643 
3644 	if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3645 	    bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3646 		exi_fhp->fh4_xlen) == 0) {
3647 		rfs4_state_close(lsp->state, FALSE, FALSE, CRED());
3648 		rfs4_dbe_invalidate(lsp->dbe);
3649 		rfs4_dbe_invalidate(lsp->state->dbe);
3650 	}
3651 }
3652 
3653 /*
3654  * This function is used as a target for the rfs4_dbe_walk() call
3655  * below.  The purpose of this function is to see if the state refers
3656  * to a file that resides within the exportinfo export.  If so, then
3657  * remove the open state for this object since the intent is the
3658  * server is unexporting the specified directory.  The main result for
3659  * this type of entry is to invalidate it such it will not be found in
3660  * the future.
3661  */
3662 static void
3663 rfs4_state_walk_callout(rfs4_entry_t u_entry, void *e)
3664 {
3665 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3666 	struct exportinfo *exi = (struct exportinfo *)e;
3667 	nfs_fh4_fmt_t	*exi_fhp, *finfo_fhp;
3668 
3669 	exi_fhp = (nfs_fh4_fmt_t *)&exi->exi_fh;
3670 	finfo_fhp =
3671 		(nfs_fh4_fmt_t *)sp->finfo->filehandle.nfs_fh4_val;
3672 
3673 	if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3674 	    bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3675 		exi_fhp->fh4_xlen) == 0) {
3676 		rfs4_state_close(sp, TRUE, FALSE, CRED());
3677 		rfs4_dbe_invalidate(sp->dbe);
3678 	}
3679 }
3680 
3681 /*
3682  * This function is used as a target for the rfs4_dbe_walk() call
3683  * below.  The purpose of this function is to see if the state refers
3684  * to a file that resides within the exportinfo export.  If so, then
3685  * remove the deleg state for this object since the intent is the
3686  * server is unexporting the specified directory.  The main result for
3687  * this type of entry is to invalidate it such it will not be found in
3688  * the future.
3689  */
3690 static void
3691 rfs4_deleg_state_walk_callout(rfs4_entry_t u_entry, void *e)
3692 {
3693 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
3694 	struct exportinfo *exi = (struct exportinfo *)e;
3695 	nfs_fh4_fmt_t	*exi_fhp, *finfo_fhp;
3696 
3697 	exi_fhp = (nfs_fh4_fmt_t *)&exi->exi_fh;
3698 	finfo_fhp =
3699 		(nfs_fh4_fmt_t *)dsp->finfo->filehandle.nfs_fh4_val;
3700 
3701 	if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3702 	    bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3703 		exi_fhp->fh4_xlen) == 0) {
3704 		rfs4_dbe_invalidate(dsp->dbe);
3705 	}
3706 }
3707 
3708 /*
3709  * This function is used as a target for the rfs4_dbe_walk() call
3710  * below.  The purpose of this function is to see if the state refers
3711  * to a file that resides within the exportinfo export.  If so, then
3712  * release vnode hold for this object since the intent is the server
3713  * is unexporting the specified directory.  Invalidation will prevent
3714  * this struct from being found in the future.
3715  */
3716 static void
3717 rfs4_file_walk_callout(rfs4_entry_t u_entry, void *e)
3718 {
3719 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
3720 	struct exportinfo *exi = (struct exportinfo *)e;
3721 	nfs_fh4_fmt_t *exi_fhp, *finfo_fhp;
3722 
3723 	exi_fhp = (nfs_fh4_fmt_t *)&exi->exi_fh;
3724 	finfo_fhp = (nfs_fh4_fmt_t *)fp->filehandle.nfs_fh4_val;
3725 
3726 	if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3727 	    bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3728 		exi_fhp->fh4_xlen) == 0) {
3729 		if (fp->vp) {
3730 			/* don't leak monitors */
3731 			if (fp->dinfo->dtype == OPEN_DELEGATE_READ)
3732 				(void) fem_uninstall(fp->vp, deleg_rdops,
3733 						(void *)fp);
3734 			else if (fp->dinfo->dtype == OPEN_DELEGATE_WRITE)
3735 				(void) fem_uninstall(fp->vp, deleg_wrops,
3736 						(void *)fp);
3737 			VN_RELE(fp->vp);
3738 			fp->vp = NULL;
3739 		}
3740 		rfs4_dbe_invalidate(fp->dbe);
3741 	}
3742 }
3743 
3744 /*
3745  * Given a directory that is being unexported, cleanup/release all
3746  * state in the server that refers to objects residing underneath this
3747  * particular export.  The ordering of the release is important.
3748  * Lock_owner, then state and then file.
3749  */
3750 void
3751 rfs4_clean_state_exi(struct exportinfo *exi)
3752 {
3753 	mutex_enter(&rfs4_state_lock);
3754 
3755 	if (rfs4_server_state == NULL) {
3756 		mutex_exit(&rfs4_state_lock);
3757 		return;
3758 	}
3759 
3760 	rfs4_dbe_walk(rfs4_lo_state_tab, rfs4_lo_state_walk_callout, exi);
3761 	rfs4_dbe_walk(rfs4_state_tab, rfs4_state_walk_callout, exi);
3762 	rfs4_dbe_walk(rfs4_deleg_state_tab, rfs4_deleg_state_walk_callout, exi);
3763 	rfs4_dbe_walk(rfs4_file_tab, rfs4_file_walk_callout, exi);
3764 
3765 	mutex_exit(&rfs4_state_lock);
3766 }
3767