xref: /titanic_44/usr/src/uts/common/fs/nfs/nfs4_state.c (revision a399b7655a1d835aa8606c2b29e4e777baac8635)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/systm.h>
29 #include <sys/kmem.h>
30 #include <sys/cmn_err.h>
31 #include <sys/atomic.h>
32 #include <sys/clconf.h>
33 #include <sys/cladm.h>
34 #include <sys/flock.h>
35 #include <nfs/export.h>
36 #include <nfs/nfs.h>
37 #include <nfs/nfs4.h>
38 #include <nfs/nfssys.h>
39 #include <nfs/lm.h>
40 #include <sys/pathname.h>
41 #include <sys/nvpair.h>
42 
43 
44 extern time_t rfs4_start_time;
45 
/*
 * First of the two constant stateids that ISSPECIAL() compares against:
 * a zero first member followed by twelve zero bytes.
 */
stateid4 special0 = {
	0,
	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
};

/*
 * Second constant stateid compared against by ISSPECIAL(): first member
 * is 0xffffffff and each of the twelve bytes is 0xff (cast to char).
 */
stateid4 special1 = {
	0xffffffff,
	{
		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
		(char)0xff, (char)0xff, (char)0xff, (char)0xff
	}
};


/*
 * True when stateid4_cmp() reports a non-zero result for "id" against
 * either special0 or special1.  stateid4_cmp() is defined elsewhere;
 * presumably it returns non-zero on a match -- TODO confirm.
 * Note that "id" is evaluated twice.
 */
#define	ISSPECIAL(id)  (stateid4_cmp(id, &special0) || \
			stateid4_cmp(id, &special1))
63 
/*
 * For embedding the cluster nodeid into our clientid: the shift applied
 * to the nodeid and the largest nodeid value accepted.
 */
#define	CLUSTER_NODEID_SHIFT	24
#define	CLUSTER_MAX_NODEID	255

/* Debug knob, only present in DEBUG builds */
#ifdef DEBUG
int rfs4_debug;
#endif

/* Debug flags for the state database; initialised to 0 (nothing enabled) */
static uint32_t rfs4_database_debug = 0x00;

/* Forward declarations for the stable-storage helpers defined below */
static void rfs4_ss_clid_write(rfs4_client_t *cp, char *leaf);
static void rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dir, char *leaf);
static void rfs4_dss_clear_oldstate(rfs4_servinst_t *sip);
static void rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip);
78 
79 /*
80  * Couple of simple init/destroy functions for a general waiter
81  */
82 void
83 rfs4_sw_init(rfs4_state_wait_t *swp)
84 {
85 	mutex_init(swp->sw_cv_lock, NULL, MUTEX_DEFAULT, NULL);
86 	cv_init(swp->sw_cv, NULL, CV_DEFAULT, NULL);
87 	swp->sw_active = FALSE;
88 	swp->sw_wait_count = 0;
89 }
90 
91 void
92 rfs4_sw_destroy(rfs4_state_wait_t *swp)
93 {
94 	mutex_destroy(swp->sw_cv_lock);
95 	cv_destroy(swp->sw_cv);
96 }
97 
98 void
99 rfs4_sw_enter(rfs4_state_wait_t *swp)
100 {
101 	mutex_enter(swp->sw_cv_lock);
102 	while (swp->sw_active) {
103 		swp->sw_wait_count++;
104 		cv_wait(swp->sw_cv, swp->sw_cv_lock);
105 		swp->sw_wait_count--;
106 	}
107 	ASSERT(swp->sw_active == FALSE);
108 	swp->sw_active = TRUE;
109 	mutex_exit(swp->sw_cv_lock);
110 }
111 
112 void
113 rfs4_sw_exit(rfs4_state_wait_t *swp)
114 {
115 	mutex_enter(swp->sw_cv_lock);
116 	ASSERT(swp->sw_active == TRUE);
117 	swp->sw_active = FALSE;
118 	if (swp->sw_wait_count != 0)
119 		cv_broadcast(swp->sw_cv);
120 	mutex_exit(swp->sw_cv_lock);
121 }
122 
/*
 * CPR callback id -- not related to v4 callbacks.
 * Starts out as 0; it is assigned elsewhere in this file (outside the
 * portion reviewed here).
 */
static callb_id_t cpr_id = 0;
127 
128 static void
129 deep_lock_copy(LOCK4res *dres, LOCK4res *sres)
130 {
131 	lock_owner4 *slo = &sres->LOCK4res_u.denied.owner;
132 	lock_owner4 *dlo = &dres->LOCK4res_u.denied.owner;
133 
134 	if (sres->status == NFS4ERR_DENIED) {
135 		dlo->owner_val = kmem_alloc(slo->owner_len, KM_SLEEP);
136 		bcopy(slo->owner_val, dlo->owner_val, slo->owner_len);
137 	}
138 }
139 
140 static void
141 deep_lock_free(LOCK4res *res)
142 {
143 	lock_owner4 *lo = &res->LOCK4res_u.denied.owner;
144 
145 	if (res->status == NFS4ERR_DENIED)
146 		kmem_free(lo->owner_val, lo->owner_len);
147 }
148 
149 static void
150 deep_open_copy(OPEN4res *dres, OPEN4res *sres)
151 {
152 	nfsace4 *sacep, *dacep;
153 
154 	if (sres->status != NFS4_OK) {
155 		return;
156 	}
157 
158 	dres->attrset = sres->attrset;
159 
160 	switch (sres->delegation.delegation_type) {
161 	case OPEN_DELEGATE_NONE:
162 		return;
163 	case OPEN_DELEGATE_READ:
164 		sacep = &sres->delegation.open_delegation4_u.read.permissions;
165 		dacep = &dres->delegation.open_delegation4_u.read.permissions;
166 		break;
167 	case OPEN_DELEGATE_WRITE:
168 		sacep = &sres->delegation.open_delegation4_u.write.permissions;
169 		dacep = &dres->delegation.open_delegation4_u.write.permissions;
170 		break;
171 	}
172 	dacep->who.utf8string_val =
173 		kmem_alloc(sacep->who.utf8string_len, KM_SLEEP);
174 	bcopy(sacep->who.utf8string_val, dacep->who.utf8string_val,
175 	    sacep->who.utf8string_len);
176 }
177 
178 static void
179 deep_open_free(OPEN4res *res)
180 {
181 	nfsace4 *acep;
182 	if (res->status != NFS4_OK)
183 		return;
184 
185 	switch (res->delegation.delegation_type) {
186 	case OPEN_DELEGATE_NONE:
187 		return;
188 	case OPEN_DELEGATE_READ:
189 		acep = &res->delegation.open_delegation4_u.read.permissions;
190 		break;
191 	case OPEN_DELEGATE_WRITE:
192 		acep = &res->delegation.open_delegation4_u.write.permissions;
193 		break;
194 	}
195 
196 	if (acep->who.utf8string_val) {
197 		kmem_free(acep->who.utf8string_val, acep->who.utf8string_len);
198 		acep->who.utf8string_val = NULL;
199 	}
200 }
201 
202 void
203 rfs4_free_reply(nfs_resop4 *rp)
204 {
205 	switch (rp->resop) {
206 	case OP_LOCK:
207 		deep_lock_free(&rp->nfs_resop4_u.oplock);
208 		break;
209 	case OP_OPEN:
210 		deep_open_free(&rp->nfs_resop4_u.opopen);
211 	default:
212 		break;
213 	}
214 }
215 
216 void
217 rfs4_copy_reply(nfs_resop4 *dst, nfs_resop4 *src)
218 {
219 	*dst = *src;
220 
221 	/* Handle responses that need deep copy */
222 	switch (src->resop) {
223 	case OP_LOCK:
224 		deep_lock_copy(&dst->nfs_resop4_u.oplock,
225 			    &src->nfs_resop4_u.oplock);
226 		break;
227 	case OP_OPEN:
228 		deep_open_copy(&dst->nfs_resop4_u.opopen,
229 			    &src->nfs_resop4_u.opopen);
230 		break;
231 	default:
232 		break;
233 	};
234 }
235 
236 /*
237  * This is the implementation of the underlying state engine. The
238  * public interface to this engine is described by
239  * nfs4_state.h. Callers to the engine should hold no state engine
240  * locks when they call in to it. If the protocol needs to lock data
241  * structures it should do so after acquiring all references to them
242  * first and then follow the following lock order:
243  *
244  *	client > openowner > state > lo_state > lockowner > file.
245  *
246  * Internally we only allow a thread to hold one hash bucket lock at a
247  * time and the lock is higher in the lock order (must be acquired
248  * first) than the data structure that is on that hash list.
249  *
250  * If a new reference was acquired by the caller, that reference needs
251  * to be released after releasing all acquired locks with the
252  * corresponding rfs4_*_rele routine.
253  */
254 
/*
 * This code is somewhat prototypical for now. Its purpose currently is to
 * implement the interfaces sufficiently to finish the higher protocol
 * elements. It will be replaced by dynamically resizeable tables
 * backed by the kmem_cache allocator. However, synchronization is handled
 * correctly (I hope) and will not change by much.  The mutexes for
 * the hash buckets that can be used to create new instances of data
 * structures might be good candidates to evolve into reader/writer
 * locks. If a thread has to do a creation, it would be holding the
 * mutex across a kmem_alloc with KM_SLEEP specified.
 */
266 
/*
 * Table size constant: deliberately tiny (17) in DEBUG builds and 2047
 * otherwise.  Its users are outside the portion of the file reviewed here.
 */
#ifdef DEBUG
#define	TABSIZE 17
#else
#define	TABSIZE 2047
#endif

/* Hash an address-like key by discarding its three low-order bits */
#define	ADDRHASH(key) ((unsigned long)(key) >> 3)
274 
/* Used to serialize create/destroy of rfs4_server_state database */
kmutex_t	rfs4_state_lock;
/* The state database itself; NULL until it has been created */
static rfs4_database_t *rfs4_server_state = NULL;

/* Used to serialize lookups of clientids */
static	krwlock_t	rfs4_findclient_lock;

/*
 * For now this "table" is exposed so that the CPR callback
 * function can tromp through it..
 */
rfs4_table_t *rfs4_client_tab;

/*
 * Table and index handles for the state database.  Each *_tab is a
 * table and the *_idx entries that follow it are indices; judging by
 * the names, one group per kind of state object (client, openowner,
 * state, lo_state, lockowner, file, deleg_state).
 */
static rfs4_index_t *rfs4_clientid_idx;
static rfs4_index_t *rfs4_nfsclnt_idx;
static rfs4_table_t *rfs4_openowner_tab;
static rfs4_index_t *rfs4_openowner_idx;
static rfs4_table_t *rfs4_state_tab;
static rfs4_index_t *rfs4_state_idx;
static rfs4_index_t *rfs4_state_owner_file_idx;
static rfs4_index_t *rfs4_state_file_idx;
static rfs4_table_t *rfs4_lo_state_tab;
static rfs4_index_t *rfs4_lo_state_idx;
static rfs4_index_t *rfs4_lo_state_owner_idx;
static rfs4_table_t *rfs4_lockowner_tab;
static rfs4_index_t *rfs4_lockowner_idx;
static rfs4_index_t *rfs4_lockowner_pid_idx;
static rfs4_table_t *rfs4_file_tab;
static rfs4_index_t *rfs4_file_idx;
static rfs4_table_t *rfs4_deleg_state_tab;
static rfs4_index_t *rfs4_deleg_idx;
static rfs4_index_t *rfs4_deleg_state_idx;
307 
/*
 * Upper bound on table size.  The expansion is parenthesized so the
 * macro evaluates as a single value inside larger expressions
 * (e.g. "x / MAXTABSZ" or "x % MAXTABSZ").
 */
#define	MAXTABSZ (1024*1024)
309 
/*
 * The values below are rfs4_lease_time units.
 * One cache-time constant per kind of state object; DEBUG builds use
 * much shorter times than non-DEBUG builds.
 */

#ifdef DEBUG
#define	CLIENT_CACHE_TIME 1
#define	OPENOWNER_CACHE_TIME 1
#define	STATE_CACHE_TIME 1
#define	LO_STATE_CACHE_TIME 1
#define	LOCKOWNER_CACHE_TIME 1
#define	FILE_CACHE_TIME 3
#define	DELEG_STATE_CACHE_TIME 1
#else
#define	CLIENT_CACHE_TIME 10
#define	OPENOWNER_CACHE_TIME 5
#define	STATE_CACHE_TIME 1
#define	LO_STATE_CACHE_TIME 1
#define	LOCKOWNER_CACHE_TIME 3
#define	FILE_CACHE_TIME 40
#define	DELEG_STATE_CACHE_TIME 1
#endif
329 
330 
/*
 * Per-object-type cache times, all zero until set.  Presumably they are
 * derived from the *_CACHE_TIME constants when the database is created
 * (that code is outside the portion reviewed here) -- TODO confirm.
 */
static time_t rfs4_client_cache_time = 0;
static time_t rfs4_openowner_cache_time = 0;
static time_t rfs4_state_cache_time = 0;
static time_t rfs4_lo_state_cache_time = 0;
static time_t rfs4_lockowner_cache_time = 0;
static time_t rfs4_file_cache_time = 0;
static time_t rfs4_deleg_state_cache_time = 0;
338 
/*
 * Forward declarations.  For each kind of state object there is a
 * create/destroy (and usually expiry) routine, plus a hash/compare/mkkey
 * triple for each index kept on it.
 */

/* client */
static bool_t rfs4_client_create(rfs4_entry_t, void *);
static void rfs4_dss_remove_cpleaf(rfs4_client_t *);
static void rfs4_dss_remove_leaf(rfs4_servinst_t *, char *, char *);
static void rfs4_client_destroy(rfs4_entry_t);
static bool_t rfs4_client_expiry(rfs4_entry_t);
static uint32_t clientid_hash(void *);
static bool_t clientid_compare(rfs4_entry_t, void *);
static void *clientid_mkkey(rfs4_entry_t);
static uint32_t nfsclnt_hash(void *);
static bool_t nfsclnt_compare(rfs4_entry_t, void *);
static void *nfsclnt_mkkey(rfs4_entry_t);
/* openowner */
static bool_t rfs4_openowner_create(rfs4_entry_t, void *);
static void rfs4_openowner_destroy(rfs4_entry_t);
static bool_t rfs4_openowner_expiry(rfs4_entry_t);
static uint32_t openowner_hash(void *);
static bool_t openowner_compare(rfs4_entry_t, void *);
static void *openowner_mkkey(rfs4_entry_t);
/* state */
static bool_t rfs4_state_create(rfs4_entry_t, void *);
static void rfs4_state_destroy(rfs4_entry_t);
static bool_t rfs4_state_expiry(rfs4_entry_t);
static uint32_t state_hash(void *);
static bool_t state_compare(rfs4_entry_t, void *);
static void *state_mkkey(rfs4_entry_t);
static uint32_t state_owner_file_hash(void *);
static bool_t state_owner_file_compare(rfs4_entry_t, void *);
static void *state_owner_file_mkkey(rfs4_entry_t);
static uint32_t state_file_hash(void *);
static bool_t state_file_compare(rfs4_entry_t, void *);
static void *state_file_mkkey(rfs4_entry_t);
/* lo_state */
static bool_t rfs4_lo_state_create(rfs4_entry_t, void *);
static void rfs4_lo_state_destroy(rfs4_entry_t);
static bool_t rfs4_lo_state_expiry(rfs4_entry_t);
static uint32_t lo_state_hash(void *);
static bool_t lo_state_compare(rfs4_entry_t, void *);
static void *lo_state_mkkey(rfs4_entry_t);
static uint32_t lo_state_lo_hash(void *);
static bool_t lo_state_lo_compare(rfs4_entry_t, void *);
static void *lo_state_lo_mkkey(rfs4_entry_t);
/* lockowner */
static bool_t rfs4_lockowner_create(rfs4_entry_t, void *);
static void rfs4_lockowner_destroy(rfs4_entry_t);
static bool_t rfs4_lockowner_expiry(rfs4_entry_t);
static uint32_t lockowner_hash(void *);
static bool_t lockowner_compare(rfs4_entry_t, void *);
static void *lockowner_mkkey(rfs4_entry_t);
static uint32_t pid_hash(void *);
static bool_t pid_compare(rfs4_entry_t, void *);
static void *pid_mkkey(rfs4_entry_t);
/* file (no expiry routine is declared for this type) */
static bool_t rfs4_file_create(rfs4_entry_t, void *);
static void rfs4_file_destroy(rfs4_entry_t);
static uint32_t file_hash(void *);
static bool_t file_compare(rfs4_entry_t, void *);
static void *file_mkkey(rfs4_entry_t);
/* deleg_state */
static bool_t rfs4_deleg_state_create(rfs4_entry_t, void *);
static void rfs4_deleg_state_destroy(rfs4_entry_t);
static bool_t rfs4_deleg_state_expiry(rfs4_entry_t);
static uint32_t deleg_hash(void *);
static bool_t deleg_compare(rfs4_entry_t, void *);
static void *deleg_mkkey(rfs4_entry_t);
static uint32_t deleg_state_hash(void *);
static bool_t deleg_state_compare(rfs4_entry_t, void *);
static void *deleg_state_mkkey(rfs4_entry_t);

static void rfs4_state_rele_nounlock(rfs4_state_t *);

/*
 * Non-zero once rfs4_ss_init() has read the stable storage state;
 * rfs4_ss_clid() does nothing while this is still 0.
 */
static int rfs4_ss_enabled = 0;

/* Hook taking NFS4_CLR_STATE arguments; defined elsewhere */
extern void (*rfs4_client_clrst)(struct nfs4clrst_args *);
406 
407 void
408 rfs4_ss_pnfree(rfs4_ss_pn_t *ss_pn)
409 {
410 	kmem_free(ss_pn, sizeof (rfs4_ss_pn_t));
411 }
412 
413 static rfs4_ss_pn_t *
414 rfs4_ss_pnalloc(char *dir, char *leaf)
415 {
416 	rfs4_ss_pn_t *ss_pn;
417 	int 	dir_len, leaf_len;
418 
419 	/*
420 	 * validate we have a resonable path
421 	 * (account for the '/' and trailing null)
422 	 */
423 	if ((dir_len = strlen(dir)) > MAXPATHLEN ||
424 		(leaf_len = strlen(leaf)) > MAXNAMELEN ||
425 		(dir_len + leaf_len + 2) > MAXPATHLEN) {
426 		return (NULL);
427 	}
428 
429 	ss_pn = kmem_alloc(sizeof (rfs4_ss_pn_t), KM_SLEEP);
430 
431 	(void) snprintf(ss_pn->pn, MAXPATHLEN, "%s/%s", dir, leaf);
432 	/* Handy pointer to just the leaf name */
433 	ss_pn->leaf = ss_pn->pn + dir_len + 1;
434 	return (ss_pn);
435 }
436 
437 
438 /*
439  * Move the "leaf" filename from "sdir" directory
440  * to the "ddir" directory. Return the pathname of
441  * the destination unless the rename fails in which
442  * case we need to return the source pathname.
443  */
444 static rfs4_ss_pn_t *
445 rfs4_ss_movestate(char *sdir, char *ddir, char *leaf)
446 {
447 	rfs4_ss_pn_t *src, *dst;
448 
449 	if ((src = rfs4_ss_pnalloc(sdir, leaf)) == NULL)
450 		return (NULL);
451 
452 	if ((dst = rfs4_ss_pnalloc(ddir, leaf)) == NULL) {
453 		rfs4_ss_pnfree(src);
454 		return (NULL);
455 	}
456 
457 	/*
458 	 * If the rename fails we shall return the src
459 	 * pathname and free the dst. Otherwise we need
460 	 * to free the src and return the dst pathanme.
461 	 */
462 	if (vn_rename(src->pn, dst->pn, UIO_SYSSPACE)) {
463 		rfs4_ss_pnfree(dst);
464 		return (src);
465 	}
466 	rfs4_ss_pnfree(src);
467 	return (dst);
468 }
469 
470 
471 static rfs4_oldstate_t *
472 rfs4_ss_getstate(vnode_t *dvp, rfs4_ss_pn_t *ss_pn)
473 {
474 	struct uio uio;
475 	struct iovec iov[3];
476 
477 	rfs4_oldstate_t *cl_ss = NULL;
478 	vnode_t *vp;
479 	vattr_t va;
480 	uint_t id_len;
481 	int err, kill_file, file_vers;
482 
483 	if (ss_pn == NULL)
484 		return (NULL);
485 
486 	/*
487 	 * open the state file.
488 	 */
489 	if (vn_open(ss_pn->pn, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0) != 0) {
490 		return (NULL);
491 	}
492 
493 	if (vp->v_type != VREG) {
494 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
495 		VN_RELE(vp);
496 		return (NULL);
497 	}
498 
499 	err = VOP_ACCESS(vp, VREAD, 0, CRED());
500 	if (err) {
501 		/*
502 		 * We don't have read access? better get the heck out.
503 		 */
504 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
505 		VN_RELE(vp);
506 		return (NULL);
507 	}
508 
509 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
510 	/*
511 	 * get the file size to do some basic validation
512 	 */
513 	va.va_mask = AT_SIZE;
514 	err = VOP_GETATTR(vp, &va, 0, CRED());
515 
516 	kill_file = (va.va_size == 0 || va.va_size <
517 		(NFS4_VERIFIER_SIZE + sizeof (uint_t)+1));
518 
519 	if (err || kill_file) {
520 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
521 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
522 		VN_RELE(vp);
523 		if (kill_file) {
524 			(void) VOP_REMOVE(dvp, ss_pn->leaf, CRED());
525 		}
526 		return (NULL);
527 	}
528 
529 	cl_ss = kmem_alloc(sizeof (rfs4_oldstate_t), KM_SLEEP);
530 
531 	/*
532 	 * build iovecs to read in the file_version, verifier and id_len
533 	 */
534 	iov[0].iov_base = (caddr_t)&file_vers;
535 	iov[0].iov_len = sizeof (int);
536 	iov[1].iov_base = (caddr_t)&cl_ss->cl_id4.verifier;
537 	iov[1].iov_len = NFS4_VERIFIER_SIZE;
538 	iov[2].iov_base = (caddr_t)&id_len;
539 	iov[2].iov_len = sizeof (uint_t);
540 
541 	uio.uio_iov = iov;
542 	uio.uio_iovcnt = 3;
543 	uio.uio_segflg = UIO_SYSSPACE;
544 	uio.uio_loffset = 0;
545 	uio.uio_resid = sizeof (int) + NFS4_VERIFIER_SIZE + sizeof (uint_t);
546 
547 	if (err = VOP_READ(vp, &uio, FREAD, CRED(), NULL)) {
548 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
549 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
550 		VN_RELE(vp);
551 		kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
552 		return (NULL);
553 	}
554 
555 	/*
556 	 * if the file_version doesn't match or if the
557 	 * id_len is zero or the combination of the verifier,
558 	 * id_len and id_val is bigger than the file we have
559 	 * a problem. If so ditch the file.
560 	 */
561 	kill_file = (file_vers != NFS4_SS_VERSION || id_len == 0 ||
562 	    (id_len + NFS4_VERIFIER_SIZE + sizeof (uint_t)) > va.va_size);
563 
564 	if (err || kill_file) {
565 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
566 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
567 		VN_RELE(vp);
568 		kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
569 		if (kill_file) {
570 			(void) VOP_REMOVE(dvp, ss_pn->leaf, CRED());
571 		}
572 		return (NULL);
573 	}
574 
575 	/*
576 	 * now get the client id value
577 	 */
578 	cl_ss->cl_id4.id_val = kmem_alloc(id_len, KM_SLEEP);
579 	iov[0].iov_base = cl_ss->cl_id4.id_val;
580 	iov[0].iov_len = id_len;
581 
582 	uio.uio_iov = iov;
583 	uio.uio_iovcnt = 1;
584 	uio.uio_segflg = UIO_SYSSPACE;
585 	uio.uio_resid = cl_ss->cl_id4.id_len = id_len;
586 
587 	if (err = VOP_READ(vp, &uio, FREAD, CRED(), NULL)) {
588 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
589 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
590 		VN_RELE(vp);
591 		kmem_free(cl_ss->cl_id4.id_val, id_len);
592 		kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
593 		return (NULL);
594 	}
595 
596 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
597 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
598 	VN_RELE(vp);
599 	return (cl_ss);
600 }
601 
/*
 * Step from one dirent64 record to the next one in a readdir buffer by
 * advancing d_reclen bytes.  Any earlier definition of nextdp is
 * dropped first.  Note that "dp" is evaluated twice.
 */
#ifdef	nextdp
#undef nextdp
#endif
#define	nextdp(dp)	((struct dirent64 *)((char *)(dp) + (dp)->d_reclen))
606 
607 /*
608  * Add entries from statedir to supplied oldstate list.
609  * Optionally, move all entries from statedir -> destdir.
610  */
611 void
612 rfs4_ss_oldstate(rfs4_oldstate_t *oldstate, char *statedir, char *destdir)
613 {
614 	rfs4_ss_pn_t *ss_pn;
615 	rfs4_oldstate_t *cl_ss = NULL;
616 	char	*dirt = NULL;
617 	int	err, dir_eof = 0, size = 0;
618 	vnode_t *dvp;
619 	struct iovec iov;
620 	struct uio uio;
621 	struct dirent64 *dep;
622 	offset_t dirchunk_offset = 0;
623 
624 	/*
625 	 * open the state directory
626 	 */
627 	if (vn_open(statedir, UIO_SYSSPACE, FREAD, 0, &dvp, 0, 0))
628 		return;
629 
630 	if (dvp->v_type != VDIR || VOP_ACCESS(dvp, VREAD, 0, CRED()))
631 		goto out;
632 
633 	dirt = kmem_alloc(RFS4_SS_DIRSIZE, KM_SLEEP);
634 
635 	/*
636 	 * Get and process the directory entries
637 	 */
638 	while (!dir_eof) {
639 		(void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL);
640 		iov.iov_base = dirt;
641 		iov.iov_len = RFS4_SS_DIRSIZE;
642 		uio.uio_iov = &iov;
643 		uio.uio_iovcnt = 1;
644 		uio.uio_segflg = UIO_SYSSPACE;
645 		uio.uio_loffset = dirchunk_offset;
646 		uio.uio_resid = RFS4_SS_DIRSIZE;
647 
648 		err = VOP_READDIR(dvp, &uio, CRED(), &dir_eof);
649 		VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL);
650 		if (err)
651 			goto out;
652 
653 		size = RFS4_SS_DIRSIZE - uio.uio_resid;
654 
655 		/*
656 		 * Process all the directory entries in this
657 		 * readdir chunk
658 		 */
659 		for (dep = (struct dirent64 *)dirt; size > 0;
660 			dep = nextdp(dep)) {
661 
662 			size -= dep->d_reclen;
663 			dirchunk_offset = dep->d_off;
664 
665 			/*
666 			 * Skip '.' and '..'
667 			 */
668 			if (NFS_IS_DOTNAME(dep->d_name))
669 				continue;
670 
671 			ss_pn = rfs4_ss_pnalloc(statedir, dep->d_name);
672 			if (ss_pn == NULL)
673 				continue;
674 
675 			if (cl_ss = rfs4_ss_getstate(dvp, ss_pn)) {
676 				if (destdir != NULL) {
677 					rfs4_ss_pnfree(ss_pn);
678 					cl_ss->ss_pn = rfs4_ss_movestate(
679 						statedir, destdir, dep->d_name);
680 				} else {
681 					cl_ss->ss_pn = ss_pn;
682 				}
683 				insque(cl_ss, oldstate);
684 			} else {
685 				rfs4_ss_pnfree(ss_pn);
686 			}
687 		}
688 	}
689 
690 out:
691 	(void) VOP_CLOSE(dvp, FREAD, 1, (offset_t)0, CRED());
692 	VN_RELE(dvp);
693 	if (dirt)
694 		kmem_free((caddr_t)dirt, RFS4_SS_DIRSIZE);
695 }
696 
697 static void
698 rfs4_ss_init(void)
699 {
700 	int npaths = 1;
701 	char *default_dss_path = NFS4_DSS_VAR_DIR;
702 
703 	/* read the default stable storage state */
704 	rfs4_dss_readstate(npaths, &default_dss_path);
705 
706 	rfs4_ss_enabled = 1;
707 }
708 
709 static void
710 rfs4_ss_fini(void)
711 {
712 	rfs4_servinst_t *sip;
713 
714 	mutex_enter(&rfs4_servinst_lock);
715 	sip = rfs4_cur_servinst;
716 	while (sip != NULL) {
717 		rfs4_dss_clear_oldstate(sip);
718 		sip = sip->next;
719 	}
720 	mutex_exit(&rfs4_servinst_lock);
721 }
722 
723 /*
724  * Remove all oldstate files referenced by this servinst.
725  */
726 static void
727 rfs4_dss_clear_oldstate(rfs4_servinst_t *sip)
728 {
729 	rfs4_oldstate_t *os_head, *osp;
730 
731 	rw_enter(&sip->oldstate_lock, RW_WRITER);
732 	os_head = sip->oldstate;
733 
734 	if (os_head == NULL)
735 		return;
736 
737 	/* skip dummy entry */
738 	osp = os_head->next;
739 	while (osp != os_head) {
740 		char *leaf = osp->ss_pn->leaf;
741 		rfs4_oldstate_t *os_next;
742 
743 		rfs4_dss_remove_leaf(sip, NFS4_DSS_OLDSTATE_LEAF, leaf);
744 
745 		if (osp->cl_id4.id_val)
746 			kmem_free(osp->cl_id4.id_val, osp->cl_id4.id_len);
747 		if (osp->ss_pn)
748 			kmem_free(osp->ss_pn, sizeof (rfs4_ss_pn_t));
749 
750 		os_next = osp->next;
751 		remque(osp);
752 		kmem_free(osp, sizeof (rfs4_oldstate_t));
753 		osp = os_next;
754 	}
755 
756 	/* free dummy entry */
757 	kmem_free(osp, sizeof (rfs4_oldstate_t));
758 
759 	sip->oldstate = NULL;
760 
761 	rw_exit(&sip->oldstate_lock);
762 }
763 
764 /*
765  * Form the state and oldstate paths, and read in the stable storage files.
766  */
767 void
768 rfs4_dss_readstate(int npaths, char **paths)
769 {
770 	int i;
771 	char *state, *oldstate;
772 
773 	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
774 	oldstate = kmem_alloc(MAXPATHLEN, KM_SLEEP);
775 
776 	for (i = 0; i < npaths; i++) {
777 		char *path = paths[i];
778 
779 		(void) sprintf(state, "%s/%s", path, NFS4_DSS_STATE_LEAF);
780 		(void) sprintf(oldstate, "%s/%s", path, NFS4_DSS_OLDSTATE_LEAF);
781 
782 		/*
783 		 * Populate the current server instance's oldstate list.
784 		 *
785 		 * 1. Read stable storage data from old state directory,
786 		 *    leaving its contents alone.
787 		 *
788 		 * 2. Read stable storage data from state directory,
789 		 *    and move the latter's contents to old state
790 		 *    directory.
791 		 */
792 		rfs4_ss_oldstate(rfs4_cur_servinst->oldstate, oldstate, NULL);
793 		rfs4_ss_oldstate(rfs4_cur_servinst->oldstate, state, oldstate);
794 	}
795 
796 	kmem_free(state, MAXPATHLEN);
797 	kmem_free(oldstate, MAXPATHLEN);
798 }
799 
800 
801 /*
802  * Check if we are still in grace and if the client can be
803  * granted permission to perform reclaims.
804  */
805 void
806 rfs4_ss_chkclid(rfs4_client_t *cp)
807 {
808 	rfs4_servinst_t *sip;
809 
810 	/*
811 	 * It should be sufficient to check the oldstate data for just
812 	 * this client's instance. However, since our per-instance
813 	 * client grouping is solely temporal, HA-NFSv4 RG failover
814 	 * might result in clients of the same RG being partitioned into
815 	 * separate instances.
816 	 *
817 	 * Until the client grouping is improved, we must check the
818 	 * oldstate data for all instances with an active grace period.
819 	 *
820 	 * This also serves as the mechanism to remove stale oldstate data.
821 	 * The first time we check an instance after its grace period has
822 	 * expired, the oldstate data should be cleared.
823 	 *
824 	 * Start at the current instance, and walk the list backwards
825 	 * to the first.
826 	 */
827 	mutex_enter(&rfs4_servinst_lock);
828 	for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) {
829 		rfs4_ss_chkclid_sip(cp, sip);
830 
831 		/* if the above check found this client, we're done */
832 		if (cp->can_reclaim)
833 			break;
834 	}
835 	mutex_exit(&rfs4_servinst_lock);
836 }
837 
838 static void
839 rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip)
840 {
841 	rfs4_oldstate_t *osp, *os_head;
842 
843 	/* short circuit everything if this server instance has no oldstate */
844 	rw_enter(&sip->oldstate_lock, RW_READER);
845 	os_head = sip->oldstate;
846 	rw_exit(&sip->oldstate_lock);
847 	if (os_head == NULL)
848 		return;
849 
850 	/*
851 	 * If this server instance is no longer in a grace period then
852 	 * the client won't be able to reclaim. No further need for this
853 	 * instance's oldstate data, so it can be cleared.
854 	 */
855 	if (!rfs4_servinst_in_grace(sip))
856 		return;
857 
858 	/* this instance is still in grace; search for the clientid */
859 
860 	rw_enter(&sip->oldstate_lock, RW_READER);
861 
862 	os_head = sip->oldstate;
863 	/* skip dummy entry */
864 	osp = os_head->next;
865 	while (osp != os_head) {
866 		if (osp->cl_id4.id_len == cp->nfs_client.id_len) {
867 			if (bcmp(osp->cl_id4.id_val, cp->nfs_client.id_val,
868 					osp->cl_id4.id_len) == 0) {
869 				cp->can_reclaim = 1;
870 				break;
871 			}
872 		}
873 		osp = osp->next;
874 	}
875 
876 	rw_exit(&sip->oldstate_lock);
877 }
878 
879 /*
880  * Place client information into stable storage: 1/3.
881  * First, generate the leaf filename, from the client's IP address and
882  * the server-generated short-hand clientid.
883  */
884 void
885 rfs4_ss_clid(rfs4_client_t *cp, struct svc_req *req)
886 {
887 	const char *kinet_ntop6(uchar_t *, char *, size_t);
888 	char leaf[MAXNAMELEN], buf[INET6_ADDRSTRLEN];
889 	struct sockaddr *ca;
890 	uchar_t *b;
891 
892 	if (rfs4_ss_enabled == 0) {
893 		return;
894 	}
895 
896 	buf[0] = 0;
897 
898 
899 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
900 	if (ca == NULL) {
901 		return;
902 	}
903 
904 	/*
905 	 * Convert the caller's IP address to a dotted string
906 	 */
907 	if (ca->sa_family == AF_INET) {
908 
909 		bcopy(svc_getrpccaller(req->rq_xprt)->buf, &cp->cl_addr,
910 			sizeof (struct sockaddr_in));
911 		b = (uchar_t *)&((struct sockaddr_in *)ca)->sin_addr;
912 		(void) sprintf(buf, "%03d.%03d.%03d.%03d", b[0] & 0xFF,
913 				b[1] & 0xFF, b[2] & 0xFF, b[3] & 0xFF);
914 	} else if (ca->sa_family == AF_INET6) {
915 		struct sockaddr_in6 *sin6;
916 
917 		sin6 = (struct sockaddr_in6 *)ca;
918 		bcopy(svc_getrpccaller(req->rq_xprt)->buf, &cp->cl_addr,
919 				sizeof (struct sockaddr_in6));
920 		(void) kinet_ntop6((uchar_t *)&sin6->sin6_addr,
921 				buf, INET6_ADDRSTRLEN);
922 	}
923 
924 	(void) snprintf(leaf, MAXNAMELEN, "%s-%llx", buf,
925 	    (longlong_t)cp->clientid);
926 	rfs4_ss_clid_write(cp, leaf);
927 }
928 
929 /*
930  * Place client information into stable storage: 2/3.
931  * DSS: distributed stable storage: the file may need to be written to
932  * multiple directories.
933  */
934 static void
935 rfs4_ss_clid_write(rfs4_client_t *cp, char *leaf)
936 {
937 	rfs4_servinst_t *sip;
938 
939 	/*
940 	 * It should be sufficient to write the leaf file to (all) DSS paths
941 	 * associated with just this client's instance. However, since our
942 	 * per-instance client grouping is solely temporal, HA-NFSv4 RG
943 	 * failover might result in us losing DSS data.
944 	 *
945 	 * Until the client grouping is improved, we must write the DSS data
946 	 * to all instances' paths. Start at the current instance, and
947 	 * walk the list backwards to the first.
948 	 */
949 	mutex_enter(&rfs4_servinst_lock);
950 	for (sip = rfs4_cur_servinst; sip != NULL; sip = sip->prev) {
951 		int i, npaths = sip->dss_npaths;
952 
953 		/* write the leaf file to all DSS paths */
954 		for (i = 0; i < npaths; i++) {
955 			rfs4_dss_path_t *dss_path = sip->dss_paths[i];
956 
957 			/* HA-NFSv4 path might have been failed-away from us */
958 			if (dss_path == NULL)
959 				continue;
960 
961 			rfs4_ss_clid_write_one(cp, dss_path->path, leaf);
962 		}
963 	}
964 	mutex_exit(&rfs4_servinst_lock);
965 }
966 
967 /*
968  * Place client information into stable storage: 3/3.
969  * Write the stable storage data to the requested file.
970  */
971 static void
972 rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dss_path, char *leaf)
973 {
974 	int ioflag;
975 	int file_vers = NFS4_SS_VERSION;
976 	size_t dirlen;
977 	struct uio uio;
978 	struct iovec iov[4];
979 	char *dir;
980 	rfs4_ss_pn_t *ss_pn;
981 	vnode_t *vp;
982 	nfs_client_id4 *cl_id4 = &(cp->nfs_client);
983 
984 	/* allow 2 extra bytes for '/' & NUL */
985 	dirlen = strlen(dss_path) + strlen(NFS4_DSS_STATE_LEAF) + 2;
986 	dir = kmem_alloc(dirlen, KM_SLEEP);
987 	(void) sprintf(dir, "%s/%s", dss_path, NFS4_DSS_STATE_LEAF);
988 
989 	ss_pn = rfs4_ss_pnalloc(dir, leaf);
990 	/* rfs4_ss_pnalloc takes its own copy */
991 	kmem_free(dir, dirlen);
992 	if (ss_pn == NULL)
993 		return;
994 
995 	if (vn_open(ss_pn->pn, UIO_SYSSPACE, FCREAT|FWRITE, 0600, &vp,
996 			    CRCREAT, 0)) {
997 		rfs4_ss_pnfree(ss_pn);
998 		return;
999 	}
1000 
1001 	/*
1002 	 * We need to record leaf - i.e. the filename - so that we know
1003 	 * what to remove, in the future. However, the dir part of cp->ss_pn
1004 	 * should never be referenced directly, since it's potentially only
1005 	 * one of several paths with this leaf in it.
1006 	 */
1007 	if (cp->ss_pn != NULL) {
1008 		if (strcmp(cp->ss_pn->leaf, leaf) == 0) {
1009 			/* we've already recorded *this* leaf */
1010 			rfs4_ss_pnfree(ss_pn);
1011 		} else {
1012 			/* replace with this leaf */
1013 			rfs4_ss_pnfree(cp->ss_pn);
1014 			cp->ss_pn = ss_pn;
1015 		}
1016 	} else {
1017 		cp->ss_pn = ss_pn;
1018 	}
1019 
1020 	/*
1021 	 * Build a scatter list that points to the nfs_client_id4
1022 	 */
1023 	iov[0].iov_base = (caddr_t)&file_vers;
1024 	iov[0].iov_len = sizeof (int);
1025 	iov[1].iov_base = (caddr_t)&(cl_id4->verifier);
1026 	iov[1].iov_len = NFS4_VERIFIER_SIZE;
1027 	iov[2].iov_base = (caddr_t)&(cl_id4->id_len);
1028 	iov[2].iov_len = sizeof (uint_t);
1029 	iov[3].iov_base = (caddr_t)cl_id4->id_val;
1030 	iov[3].iov_len = cl_id4->id_len;
1031 
1032 	uio.uio_iov = iov;
1033 	uio.uio_iovcnt = 4;
1034 	uio.uio_loffset = 0;
1035 	uio.uio_segflg = UIO_SYSSPACE;
1036 	uio.uio_llimit = (rlim64_t)MAXOFFSET_T;
1037 	uio.uio_resid = cl_id4->id_len + sizeof (int) +
1038 		NFS4_VERIFIER_SIZE + sizeof (uint_t);
1039 
1040 	ioflag = uio.uio_fmode = (FWRITE|FSYNC);
1041 	uio.uio_extflg = UIO_COPY_DEFAULT;
1042 
1043 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1044 	/* write the full client id to the file. */
1045 	(void) VOP_WRITE(vp, &uio, ioflag, CRED(), NULL);
1046 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1047 
1048 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED());
1049 	VN_RELE(vp);
1050 }
1051 
1052 /*
1053  * DSS: distributed stable storage.
1054  * Unpack the list of paths passed by nfsd.
1055  * Use nvlist_alloc(9F) to manage the data.
1056  * The caller is responsible for allocating and freeing the buffer.
1057  */
1058 int
1059 rfs4_dss_setpaths(char *buf, size_t buflen)
1060 {
1061 	int error;
1062 
1063 	/*
1064 	 * If this is a "warm start", i.e. we previously had DSS paths,
1065 	 * preserve the old paths.
1066 	 */
1067 	if (rfs4_dss_paths != NULL) {
1068 		/*
1069 		 * Before we lose the ptr, destroy the nvlist and pathnames
1070 		 * array from the warm start before this one.
1071 		 */
1072 		if (rfs4_dss_oldpaths)
1073 			nvlist_free(rfs4_dss_oldpaths);
1074 		rfs4_dss_oldpaths = rfs4_dss_paths;
1075 	}
1076 
1077 	/* unpack the buffer into a searchable nvlist */
1078 	error = nvlist_unpack(buf, buflen, &rfs4_dss_paths, KM_SLEEP);
1079 	if (error)
1080 		return (error);
1081 
1082 	/*
1083 	 * Search the nvlist for the pathnames nvpair (which is the only nvpair
1084 	 * in the list, and record its location.
1085 	 */
1086 	error = nvlist_lookup_string_array(rfs4_dss_paths, NFS4_DSS_NVPAIR_NAME,
1087 	    &rfs4_dss_newpaths, &rfs4_dss_numnewpaths);
1088 	return (error);
1089 }
1090 
1091 /*
1092  * Ultimately the nfssys() call NFS4_CLR_STATE endsup here
1093  * to find and mark the client for forced expire.
1094  */
1095 static void
1096 rfs4_client_scrub(rfs4_entry_t ent, void *arg)
1097 {
1098 	rfs4_client_t *cp = (rfs4_client_t *)ent;
1099 	struct nfs4clrst_args *clr = arg;
1100 	struct sockaddr_in6 *ent_sin6;
1101 	struct in6_addr  clr_in6;
1102 	struct sockaddr_in  *ent_sin;
1103 	struct in_addr   clr_in;
1104 
1105 	if (clr->addr_type != cp->cl_addr.ss_family) {
1106 		return;
1107 	}
1108 
1109 	switch (clr->addr_type) {
1110 
1111 	case AF_INET6:
1112 		/* copyin the address from user space */
1113 		if (copyin(clr->ap, &clr_in6, sizeof (clr_in6))) {
1114 			break;
1115 		}
1116 
1117 		ent_sin6 = (struct sockaddr_in6 *)&cp->cl_addr;
1118 
1119 		/*
1120 		 * now compare, and if equivalent mark entry
1121 		 * for forced expiration
1122 		 */
1123 		if (IN6_ARE_ADDR_EQUAL(&ent_sin6->sin6_addr, &clr_in6)) {
1124 			cp->forced_expire = 1;
1125 		}
1126 		break;
1127 
1128 	case AF_INET:
1129 		/* copyin the address from user space */
1130 		if (copyin(clr->ap, &clr_in, sizeof (clr_in))) {
1131 			break;
1132 		}
1133 
1134 		ent_sin = (struct sockaddr_in *)&cp->cl_addr;
1135 
1136 		/*
1137 		 * now compare, and if equivalent mark entry
1138 		 * for forced expiration
1139 		 */
1140 		if (ent_sin->sin_addr.s_addr == clr_in.s_addr) {
1141 			cp->forced_expire = 1;
1142 		}
1143 		break;
1144 
1145 	default:
1146 		/* force this assert to fail */
1147 		ASSERT(clr->addr_type != clr->addr_type);
1148 	}
1149 }
1150 
1151 /*
1152  * This is called from nfssys() in order to clear server state
1153  * for the specified client IP Address.
1154  */
1155 void
1156 rfs4_clear_client_state(struct nfs4clrst_args *clr)
1157 {
1158 	(void) rfs4_dbe_walk(rfs4_client_tab, rfs4_client_scrub, clr);
1159 }
1160 
1161 /*
1162  * Used to initialize the NFSv4 server's state or database.  All of
1163  * the tables are created and timers are set. Only called when NFSv4
1164  * service is provided.
1165  */
1166 void
1167 rfs4_state_init()
1168 {
1169 	int start_grace;
1170 	extern boolean_t rfs4_cpr_callb(void *, int);
1171 	char *dss_path = NFS4_DSS_VAR_DIR;
1172 
1173 	mutex_enter(&rfs4_state_lock);
1174 
1175 	/*
1176 	 * If the server state database has already been initialized,
1177 	 * skip it
1178 	 */
1179 	if (rfs4_server_state != NULL) {
1180 		mutex_exit(&rfs4_state_lock);
1181 		return;
1182 	}
1183 
1184 	rw_init(&rfs4_findclient_lock, NULL, RW_DEFAULT, NULL);
1185 
1186 	/*
1187 	 * Set the boot time.  If the server
1188 	 * has been restarted quickly and has had the opportunity to
1189 	 * service clients, then the start_time needs to be bumped
1190 	 * regardless.  A small window but it exists...
1191 	 */
1192 	if (rfs4_start_time != gethrestime_sec())
1193 		rfs4_start_time = gethrestime_sec();
1194 	else
1195 		rfs4_start_time++;
1196 
1197 	/* DSS: distributed stable storage: initialise served paths list */
1198 	rfs4_dss_pathlist = NULL;
1199 
1200 	/*
1201 	 * Create the first server instance, or a new one if the server has
1202 	 * been restarted; see above comments on rfs4_start_time. Don't
1203 	 * start its grace period; that will be done later, to maximise the
1204 	 * clients' recovery window.
1205 	 */
1206 	start_grace = 0;
1207 	rfs4_servinst_create(start_grace, 1, &dss_path);
1208 
1209 	/* reset the "first NFSv4 request" status */
1210 	rfs4_seen_first_compound = 0;
1211 
1212 	/*
1213 	 * Add a CPR callback so that we can update client
1214 	 * access times to extend the lease after a suspend
1215 	 * and resume (using the same class as rpcmod/connmgr)
1216 	 */
1217 	cpr_id = callb_add(rfs4_cpr_callb, 0, CB_CL_CPR_RPC, "rfs4");
1218 
1219 	/* set the various cache timers for table creation */
1220 	if (rfs4_client_cache_time == 0)
1221 		rfs4_client_cache_time = CLIENT_CACHE_TIME;
1222 	if (rfs4_openowner_cache_time == 0)
1223 		rfs4_openowner_cache_time = OPENOWNER_CACHE_TIME;
1224 	if (rfs4_state_cache_time == 0)
1225 		rfs4_state_cache_time = STATE_CACHE_TIME;
1226 	if (rfs4_lo_state_cache_time == 0)
1227 		rfs4_lo_state_cache_time = LO_STATE_CACHE_TIME;
1228 	if (rfs4_lockowner_cache_time == 0)
1229 		rfs4_lockowner_cache_time = LOCKOWNER_CACHE_TIME;
1230 	if (rfs4_file_cache_time == 0)
1231 		rfs4_file_cache_time = FILE_CACHE_TIME;
1232 	if (rfs4_deleg_state_cache_time == 0)
1233 		rfs4_deleg_state_cache_time = DELEG_STATE_CACHE_TIME;
1234 
1235 	/* Create the overall database to hold all server state */
1236 	rfs4_server_state = rfs4_database_create(rfs4_database_debug);
1237 
1238 	/* Now create the individual tables */
1239 	rfs4_client_cache_time *= rfs4_lease_time;
1240 	rfs4_client_tab = rfs4_table_create(rfs4_server_state,
1241 					    "Client",
1242 					    rfs4_client_cache_time,
1243 					    2,
1244 					    rfs4_client_create,
1245 					    rfs4_client_destroy,
1246 					    rfs4_client_expiry,
1247 					    sizeof (rfs4_client_t),
1248 					    TABSIZE,
1249 					    MAXTABSZ/8, 100);
1250 	rfs4_nfsclnt_idx = rfs4_index_create(rfs4_client_tab,
1251 					    "nfs_client_id4", nfsclnt_hash,
1252 					    nfsclnt_compare, nfsclnt_mkkey,
1253 					    TRUE);
1254 	rfs4_clientid_idx = rfs4_index_create(rfs4_client_tab,
1255 					    "client_id", clientid_hash,
1256 					    clientid_compare, clientid_mkkey,
1257 					    FALSE);
1258 
1259 	rfs4_openowner_cache_time *= rfs4_lease_time;
1260 	rfs4_openowner_tab = rfs4_table_create(rfs4_server_state,
1261 					    "OpenOwner",
1262 					    rfs4_openowner_cache_time,
1263 					    1,
1264 					    rfs4_openowner_create,
1265 					    rfs4_openowner_destroy,
1266 					    rfs4_openowner_expiry,
1267 					    sizeof (rfs4_openowner_t),
1268 					    TABSIZE,
1269 					    MAXTABSZ, 100);
1270 	rfs4_openowner_idx = rfs4_index_create(rfs4_openowner_tab,
1271 					    "open_owner4", openowner_hash,
1272 					    openowner_compare,
1273 					    openowner_mkkey, TRUE);
1274 
1275 	rfs4_state_cache_time *= rfs4_lease_time;
1276 	rfs4_state_tab = rfs4_table_create(rfs4_server_state,
1277 					"OpenStateID",
1278 					rfs4_state_cache_time,
1279 					3,
1280 					rfs4_state_create,
1281 					rfs4_state_destroy,
1282 					rfs4_state_expiry,
1283 					sizeof (rfs4_state_t),
1284 					TABSIZE,
1285 					MAXTABSZ, 100);
1286 	rfs4_state_owner_file_idx = rfs4_index_create(rfs4_state_tab,
1287 						"Openowner-File",
1288 						state_owner_file_hash,
1289 						state_owner_file_compare,
1290 						state_owner_file_mkkey, TRUE);
1291 	rfs4_state_idx = rfs4_index_create(rfs4_state_tab,
1292 					"State-id", state_hash,
1293 					state_compare, state_mkkey, FALSE);
1294 	rfs4_state_file_idx = rfs4_index_create(rfs4_state_tab,
1295 					"File", state_file_hash,
1296 					state_file_compare, state_file_mkkey,
1297 					FALSE);
1298 
1299 	rfs4_lo_state_cache_time *= rfs4_lease_time;
1300 	rfs4_lo_state_tab = rfs4_table_create(rfs4_server_state,
1301 					    "LockStateID",
1302 					    rfs4_lo_state_cache_time,
1303 					    2,
1304 					    rfs4_lo_state_create,
1305 					    rfs4_lo_state_destroy,
1306 					    rfs4_lo_state_expiry,
1307 					    sizeof (rfs4_lo_state_t),
1308 					    TABSIZE,
1309 					    MAXTABSZ, 100);
1310 	rfs4_lo_state_owner_idx = rfs4_index_create(rfs4_lo_state_tab,
1311 						    "lockownerxstate",
1312 						    lo_state_lo_hash,
1313 						    lo_state_lo_compare,
1314 						    lo_state_lo_mkkey, TRUE);
1315 	rfs4_lo_state_idx = rfs4_index_create(rfs4_lo_state_tab,
1316 					    "State-id",
1317 					    lo_state_hash, lo_state_compare,
1318 					    lo_state_mkkey, FALSE);
1319 
1320 	rfs4_lockowner_cache_time *= rfs4_lease_time;
1321 	rfs4_lockowner_tab = rfs4_table_create(rfs4_server_state,
1322 					    "Lockowner",
1323 					    rfs4_lockowner_cache_time,
1324 					    2,
1325 					    rfs4_lockowner_create,
1326 					    rfs4_lockowner_destroy,
1327 					    rfs4_lockowner_expiry,
1328 					    sizeof (rfs4_lockowner_t),
1329 					    TABSIZE,
1330 					    MAXTABSZ, 100);
1331 	rfs4_lockowner_idx = rfs4_index_create(rfs4_lockowner_tab,
1332 					    "lock_owner4", lockowner_hash,
1333 					    lockowner_compare,
1334 					    lockowner_mkkey, TRUE);
1335 	rfs4_lockowner_pid_idx = rfs4_index_create(rfs4_lockowner_tab,
1336 						"pid", pid_hash,
1337 						pid_compare, pid_mkkey,
1338 						FALSE);
1339 
1340 	rfs4_file_cache_time *= rfs4_lease_time;
1341 	rfs4_file_tab = rfs4_table_create(rfs4_server_state,
1342 					"File",
1343 					rfs4_file_cache_time,
1344 					1,
1345 					rfs4_file_create,
1346 					rfs4_file_destroy,
1347 					NULL,
1348 					sizeof (rfs4_file_t),
1349 					TABSIZE,
1350 					MAXTABSZ, -1);
1351 	rfs4_file_idx = rfs4_index_create(rfs4_file_tab,
1352 					"Filehandle", file_hash,
1353 					file_compare, file_mkkey, TRUE);
1354 
1355 	rfs4_deleg_state_cache_time *= rfs4_lease_time;
1356 	rfs4_deleg_state_tab = rfs4_table_create(rfs4_server_state,
1357 					"DelegStateID",
1358 					rfs4_deleg_state_cache_time,
1359 					2,
1360 					rfs4_deleg_state_create,
1361 					rfs4_deleg_state_destroy,
1362 					rfs4_deleg_state_expiry,
1363 					sizeof (rfs4_deleg_state_t),
1364 					TABSIZE,
1365 					MAXTABSZ, 100);
1366 	rfs4_deleg_idx = rfs4_index_create(rfs4_deleg_state_tab,
1367 						"DelegByFileClient",
1368 						deleg_hash,
1369 						deleg_compare,
1370 						deleg_mkkey, TRUE);
1371 	rfs4_deleg_state_idx = rfs4_index_create(rfs4_deleg_state_tab,
1372 						"DelegState",
1373 						deleg_state_hash,
1374 						deleg_state_compare,
1375 						deleg_state_mkkey, FALSE);
1376 
1377 	/*
1378 	 * Init the stable storage.
1379 	 */
1380 	rfs4_ss_init();
1381 
1382 	rfs4_client_clrst = rfs4_clear_client_state;
1383 
1384 	mutex_exit(&rfs4_state_lock);
1385 }
1386 
1387 
1388 /*
1389  * Used at server shutdown to cleanup all of the NFSv4 server's structures
1390  * and other state.
1391  */
1392 void
1393 rfs4_state_fini()
1394 {
1395 	rfs4_database_t *dbp;
1396 
1397 	mutex_enter(&rfs4_state_lock);
1398 
1399 	if (rfs4_server_state == NULL) {
1400 		mutex_exit(&rfs4_state_lock);
1401 		return;
1402 	}
1403 
1404 	rfs4_client_clrst = NULL;
1405 
1406 	rfs4_set_deleg_policy(SRV_NEVER_DELEGATE);
1407 	dbp = rfs4_server_state;
1408 	rfs4_server_state = NULL;
1409 
1410 	/*
1411 	 * Cleanup the CPR callback.
1412 	 */
1413 	if (cpr_id)
1414 		(void) callb_delete(cpr_id);
1415 
1416 	rw_destroy(&rfs4_findclient_lock);
1417 
1418 	/* First stop all of the reaper threads in the database */
1419 	rfs4_database_shutdown(dbp);
1420 	/* clean up any dangling stable storage structures */
1421 	rfs4_ss_fini();
1422 	/* Now actually destroy/release the database and its tables */
1423 	rfs4_database_destroy(dbp);
1424 
1425 	/* Reset the cache timers for next time */
1426 	rfs4_client_cache_time = 0;
1427 	rfs4_openowner_cache_time = 0;
1428 	rfs4_state_cache_time = 0;
1429 	rfs4_lo_state_cache_time = 0;
1430 	rfs4_lockowner_cache_time = 0;
1431 	rfs4_file_cache_time = 0;
1432 	rfs4_deleg_state_cache_time = 0;
1433 
1434 	mutex_exit(&rfs4_state_lock);
1435 
1436 	/* destroy server instances and current instance ptr */
1437 	rfs4_servinst_destroy_all();
1438 
1439 	/* reset the "first NFSv4 request" status */
1440 	rfs4_seen_first_compound = 0;
1441 
1442 	/* DSS: distributed stable storage */
1443 	if (rfs4_dss_oldpaths)
1444 		nvlist_free(rfs4_dss_oldpaths);
1445 	if (rfs4_dss_paths)
1446 		nvlist_free(rfs4_dss_paths);
1447 	rfs4_dss_paths = rfs4_dss_oldpaths = NULL;
1448 }
1449 
1450 typedef union {
1451 	struct {
1452 		uint32_t start_time;
1453 		uint32_t c_id;
1454 	} impl_id;
1455 	clientid4 id4;
1456 } cid;
1457 
1458 static int foreign_stateid(stateid_t *id);
1459 static int foreign_clientid(cid *cidp);
1460 static void embed_nodeid(cid *cidp);
1461 
1462 typedef union {
1463 	struct {
1464 		uint32_t c_id;
1465 		uint32_t gen_num;
1466 	} cv_impl;
1467 	verifier4	confirm_verf;
1468 } scid_confirm_verf;
1469 
1470 static uint32_t
1471 clientid_hash(void *key)
1472 {
1473 	cid *idp = key;
1474 
1475 	return (idp->impl_id.c_id);
1476 }
1477 
1478 static bool_t
1479 clientid_compare(rfs4_entry_t entry, void *key)
1480 {
1481 	rfs4_client_t *client = (rfs4_client_t *)entry;
1482 	clientid4 *idp = key;
1483 
1484 	return (*idp == client->clientid);
1485 }
1486 
1487 static void *
1488 clientid_mkkey(rfs4_entry_t entry)
1489 {
1490 	rfs4_client_t *client = (rfs4_client_t *)entry;
1491 
1492 	return (&client->clientid);
1493 }
1494 
1495 static uint32_t
1496 nfsclnt_hash(void *key)
1497 {
1498 	nfs_client_id4 *client = key;
1499 	int i;
1500 	uint32_t hash = 0;
1501 
1502 	for (i = 0; i < client->id_len; i++) {
1503 		hash <<= 1;
1504 		hash += (uint_t)client->id_val[i];
1505 	}
1506 	return (hash);
1507 }
1508 
1509 
1510 static bool_t
1511 nfsclnt_compare(rfs4_entry_t entry, void *key)
1512 {
1513 	rfs4_client_t *client = (rfs4_client_t *)entry;
1514 	nfs_client_id4 *nfs_client = key;
1515 
1516 	if (client->nfs_client.id_len != nfs_client->id_len)
1517 		return (FALSE);
1518 
1519 	return (bcmp(client->nfs_client.id_val, nfs_client->id_val,
1520 						nfs_client->id_len) == 0);
1521 }
1522 
1523 static void *
1524 nfsclnt_mkkey(rfs4_entry_t entry)
1525 {
1526 	rfs4_client_t *client = (rfs4_client_t *)entry;
1527 
1528 	return (&client->nfs_client);
1529 }
1530 
1531 static bool_t
1532 rfs4_client_expiry(rfs4_entry_t u_entry)
1533 {
1534 	rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1535 	bool_t cp_expired;
1536 
1537 	if (rfs4_dbe_is_invalid(cp->dbe))
1538 		return (TRUE);
1539 	/*
1540 	 * If the sysadmin has used clear_locks for this
1541 	 * entry then forced_expire will be set and we
1542 	 * want this entry to be reaped. Or the entry
1543 	 * has exceeded its lease period.
1544 	 */
1545 	cp_expired = (cp->forced_expire ||
1546 		(gethrestime_sec() - cp->last_access
1547 			> rfs4_lease_time));
1548 
1549 	if (!cp->ss_remove && cp_expired)
1550 		cp->ss_remove = 1;
1551 	return (cp_expired);
1552 }
1553 
1554 /*
1555  * Remove the leaf file from all distributed stable storage paths.
1556  */
1557 static void
1558 rfs4_dss_remove_cpleaf(rfs4_client_t *cp)
1559 {
1560 	char *leaf = cp->ss_pn->leaf;
1561 
1562 	rfs4_dss_remove_leaf(cp->server_instance, NFS4_DSS_STATE_LEAF, leaf);
1563 }
1564 
1565 static void
1566 rfs4_dss_remove_leaf(rfs4_servinst_t *sip, char *dir_leaf, char *leaf)
1567 {
1568 	int i, npaths = sip->dss_npaths;
1569 
1570 	for (i = 0; i < npaths; i++) {
1571 		rfs4_dss_path_t *dss_path = sip->dss_paths[i];
1572 		char *path, *dir;
1573 		size_t pathlen;
1574 
1575 		/* the HA-NFSv4 path might have been failed-over away from us */
1576 		if (dss_path == NULL)
1577 			continue;
1578 
1579 		dir = dss_path->path;
1580 
1581 		/* allow 3 extra bytes for two '/' & a NUL */
1582 		pathlen = strlen(dir) + strlen(dir_leaf) + strlen(leaf) + 3;
1583 		path = kmem_alloc(pathlen, KM_SLEEP);
1584 		(void) sprintf(path, "%s/%s/%s", dir, dir_leaf, leaf);
1585 
1586 		(void) vn_remove(path, UIO_SYSSPACE, RMFILE);
1587 
1588 		kmem_free(path, pathlen);
1589 	}
1590 }
1591 
1592 static void
1593 rfs4_client_destroy(rfs4_entry_t u_entry)
1594 {
1595 	rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1596 
1597 	mutex_destroy(cp->cbinfo.cb_lock);
1598 	cv_destroy(cp->cbinfo.cb_cv);
1599 	cv_destroy(cp->cbinfo.cb_cv_nullcaller);
1600 
1601 	/* free callback info */
1602 	rfs4_cbinfo_free(&cp->cbinfo);
1603 
1604 	if (cp->cp_confirmed)
1605 		rfs4_client_rele(cp->cp_confirmed);
1606 
1607 	if (cp->ss_pn) {
1608 		/* check if the stable storage files need to be removed */
1609 		if (cp->ss_remove)
1610 			rfs4_dss_remove_cpleaf(cp);
1611 		rfs4_ss_pnfree(cp->ss_pn);
1612 	}
1613 
1614 	/* Free the client supplied client id */
1615 	kmem_free(cp->nfs_client.id_val, cp->nfs_client.id_len);
1616 
1617 	if (cp->sysidt != LM_NOSYSID)
1618 		lm_free_sysidt(cp->sysidt);
1619 }
1620 
1621 static bool_t
1622 rfs4_client_create(rfs4_entry_t u_entry, void *arg)
1623 {
1624 	rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1625 	nfs_client_id4 *client = (nfs_client_id4 *)arg;
1626 	cid *cidp;
1627 	scid_confirm_verf *scvp;
1628 
1629 	/* Get a clientid to give to the client */
1630 	cidp = (cid *)&cp->clientid;
1631 	cidp->impl_id.start_time = rfs4_start_time;
1632 	cidp->impl_id.c_id = (uint32_t)rfs4_dbe_getid(cp->dbe);
1633 
1634 	/* If we are booted as a cluster node, embed our nodeid */
1635 	if (cluster_bootflags & CLUSTER_BOOTED)
1636 		embed_nodeid(cidp);
1637 
1638 	/* Allocate and copy client's client id value */
1639 	cp->nfs_client.id_val = kmem_alloc(client->id_len, KM_SLEEP);
1640 	cp->nfs_client.id_len = client->id_len;
1641 	bcopy(client->id_val, cp->nfs_client.id_val, client->id_len);
1642 	cp->nfs_client.verifier = client->verifier;
1643 
1644 	/* Init the value for the SETCLIENTID_CONFIRM verifier */
1645 	scvp = (scid_confirm_verf *)&cp->confirm_verf;
1646 	scvp->cv_impl.c_id = cidp->impl_id.c_id;
1647 	scvp->cv_impl.gen_num = 0;
1648 
1649 	/* An F_UNLKSYS has been done for this client */
1650 	cp->unlksys_completed = FALSE;
1651 
1652 	/* We need the client to ack us */
1653 	cp->need_confirm = TRUE;
1654 	cp->cp_confirmed = NULL;
1655 
1656 	/* TRUE all the time until the callback path actually fails */
1657 	cp->cbinfo.cb_notified_of_cb_path_down = TRUE;
1658 
1659 	/* Initialize the access time to now */
1660 	cp->last_access = gethrestime_sec();
1661 
1662 	cp->cr_set = NULL;
1663 	/* Initialize list for insque/remque */
1664 	cp->openownerlist.next = cp->openownerlist.prev = &cp->openownerlist;
1665 	cp->openownerlist.oop = NULL; /* This is not an openowner */
1666 
1667 	cp->sysidt = LM_NOSYSID;
1668 
1669 	cp->clientdeleglist.next = cp->clientdeleglist.prev =
1670 		&cp->clientdeleglist;
1671 	cp->clientdeleglist.dsp = NULL;
1672 
1673 	/* set up the callback control structure */
1674 	cp->cbinfo.cb_state = CB_UNINIT;
1675 	mutex_init(cp->cbinfo.cb_lock, NULL, MUTEX_DEFAULT, NULL);
1676 	cv_init(cp->cbinfo.cb_cv, NULL, CV_DEFAULT, NULL);
1677 	cv_init(cp->cbinfo.cb_cv_nullcaller, NULL, CV_DEFAULT, NULL);
1678 
1679 	/*
1680 	 * Associate the client_t with the current server instance.
1681 	 * The hold is solely to satisfy the calling requirement of
1682 	 * rfs4_servinst_assign(). In this case it's not strictly necessary.
1683 	 */
1684 	rfs4_dbe_hold(cp->dbe);
1685 	rfs4_servinst_assign(cp, rfs4_cur_servinst);
1686 	rfs4_dbe_rele(cp->dbe);
1687 
1688 	return (TRUE);
1689 }
1690 
1691 /*
1692  * Caller wants to generate/update the setclientid_confirm verifier
1693  * associated with a client.  This is done during the SETCLIENTID
1694  * processing.
1695  */
1696 void
1697 rfs4_client_scv_next(rfs4_client_t *cp)
1698 {
1699 	scid_confirm_verf *scvp;
1700 
1701 	/* Init the value for the SETCLIENTID_CONFIRM verifier */
1702 	scvp = (scid_confirm_verf *)&cp->confirm_verf;
1703 	scvp->cv_impl.gen_num++;
1704 }
1705 
1706 void
1707 rfs4_client_rele(rfs4_client_t *cp)
1708 {
1709 	rfs4_dbe_rele(cp->dbe);
1710 }
1711 
1712 rfs4_client_t *
1713 rfs4_findclient(nfs_client_id4 *client, bool_t *create,	rfs4_client_t *oldcp)
1714 {
1715 	rfs4_client_t *cp;
1716 
1717 
1718 	if (oldcp) {
1719 		rw_enter(&rfs4_findclient_lock, RW_WRITER);
1720 		rfs4_dbe_hide(oldcp->dbe);
1721 	} else {
1722 		rw_enter(&rfs4_findclient_lock, RW_READER);
1723 	}
1724 
1725 	cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_nfsclnt_idx, client,
1726 					create, (void *)client, RFS4_DBS_VALID);
1727 
1728 	if (oldcp)
1729 		rfs4_dbe_unhide(oldcp->dbe);
1730 
1731 	rw_exit(&rfs4_findclient_lock);
1732 
1733 	return (cp);
1734 }
1735 
1736 rfs4_client_t *
1737 rfs4_findclient_by_id(clientid4 clientid, bool_t find_unconfirmed)
1738 {
1739 	rfs4_client_t *cp;
1740 	bool_t create = FALSE;
1741 	cid *cidp = (cid *)&clientid;
1742 
1743 	/* If we're a cluster and the nodeid isn't right, short-circuit */
1744 	if (cluster_bootflags & CLUSTER_BOOTED && foreign_clientid(cidp))
1745 		return (NULL);
1746 
1747 	rw_enter(&rfs4_findclient_lock, RW_READER);
1748 
1749 	cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_clientid_idx, &clientid,
1750 					&create, NULL, RFS4_DBS_VALID);
1751 
1752 	rw_exit(&rfs4_findclient_lock);
1753 
1754 	if (cp && cp->need_confirm && find_unconfirmed == FALSE) {
1755 		rfs4_client_rele(cp);
1756 		return (NULL);
1757 	} else {
1758 		return (cp);
1759 	}
1760 }
1761 
1762 bool_t
1763 rfs4_lease_expired(rfs4_client_t *cp)
1764 {
1765 	bool_t rc;
1766 
1767 	rfs4_dbe_lock(cp->dbe);
1768 
1769 	/*
1770 	 * If the admin has executed clear_locks for this
1771 	 * client id, force expire will be set, so no need
1772 	 * to calculate anything because it's "outa here".
1773 	 */
1774 	if (cp->forced_expire) {
1775 		rc = TRUE;
1776 	} else {
1777 		rc = (gethrestime_sec() - cp->last_access > rfs4_lease_time);
1778 	}
1779 
1780 	/*
1781 	 * If the lease has expired we will also want
1782 	 * to remove any stable storage state data. So
1783 	 * mark the client id accordingly.
1784 	 */
1785 	if (!cp->ss_remove)
1786 		cp->ss_remove = (rc == TRUE);
1787 
1788 	rfs4_dbe_unlock(cp->dbe);
1789 
1790 	return (rc);
1791 }
1792 
1793 void
1794 rfs4_update_lease(rfs4_client_t *cp)
1795 {
1796 	rfs4_dbe_lock(cp->dbe);
1797 	if (!cp->forced_expire)
1798 		cp->last_access = gethrestime_sec();
1799 	rfs4_dbe_unlock(cp->dbe);
1800 }
1801 
1802 
1803 static bool_t
1804 EQOPENOWNER(open_owner4 *a, open_owner4 *b)
1805 {
1806 	bool_t rc;
1807 
1808 	if (a->clientid != b->clientid)
1809 		return (FALSE);
1810 
1811 	if (a->owner_len != b->owner_len)
1812 		return (FALSE);
1813 
1814 	rc = (bcmp(a->owner_val, b->owner_val, a->owner_len) == 0);
1815 
1816 	return (rc);
1817 }
1818 
1819 static uint_t
1820 openowner_hash(void *key)
1821 {
1822 	int i;
1823 	open_owner4 *openowner = key;
1824 	uint_t hash = 0;
1825 
1826 	for (i = 0; i < openowner->owner_len; i++) {
1827 		hash <<= 4;
1828 		hash += (uint_t)openowner->owner_val[i];
1829 	}
1830 	hash += (uint_t)openowner->clientid;
1831 	hash |= (openowner->clientid >> 32);
1832 
1833 	return (hash);
1834 }
1835 
1836 static bool_t
1837 openowner_compare(rfs4_entry_t u_entry, void *key)
1838 {
1839 	rfs4_openowner_t *op = (rfs4_openowner_t *)u_entry;
1840 	open_owner4 *arg = key;
1841 
1842 	return (EQOPENOWNER(&op->owner, arg));
1843 }
1844 
1845 void *
1846 openowner_mkkey(rfs4_entry_t u_entry)
1847 {
1848 	rfs4_openowner_t *op = (rfs4_openowner_t *)u_entry;
1849 
1850 	return (&op->owner);
1851 }
1852 
1853 static bool_t
1854 rfs4_openowner_expiry(rfs4_entry_t u_entry)
1855 {
1856 	rfs4_openowner_t *op = (rfs4_openowner_t *)u_entry;
1857 
1858 	if (rfs4_dbe_is_invalid(op->dbe))
1859 		return (TRUE);
1860 	return ((gethrestime_sec() - op->client->last_access
1861 		> rfs4_lease_time));
1862 }
1863 
1864 static void
1865 rfs4_openowner_destroy(rfs4_entry_t u_entry)
1866 {
1867 	rfs4_openowner_t *op = (rfs4_openowner_t *)u_entry;
1868 
1869 	rfs4_sw_destroy(&op->oo_sw);
1870 
1871 	/* Remove open owner from client's lists of open owners */
1872 	rfs4_dbe_lock(op->client->dbe);
1873 
1874 	remque(&op->openownerlist);
1875 	op->openownerlist.next = op->openownerlist.prev = &op->openownerlist;
1876 
1877 	rfs4_dbe_unlock(op->client->dbe);
1878 
1879 	/* One less reference to the client */
1880 	rfs4_client_rele(op->client);
1881 	op->client = NULL;
1882 
1883 	/* Free the last reply for this lock owner */
1884 	rfs4_free_reply(op->reply);
1885 
1886 	if (op->reply_fh.nfs_fh4_val) {
1887 		kmem_free(op->reply_fh.nfs_fh4_val, op->reply_fh.nfs_fh4_len);
1888 		op->reply_fh.nfs_fh4_val = NULL;
1889 		op->reply_fh.nfs_fh4_len = 0;
1890 	}
1891 
1892 	/* Free the lock owner id */
1893 	kmem_free(op->owner.owner_val, op->owner.owner_len);
1894 }
1895 
1896 void
1897 rfs4_openowner_rele(rfs4_openowner_t *op)
1898 {
1899 	rfs4_dbe_rele(op->dbe);
1900 }
1901 
1902 static bool_t
1903 rfs4_openowner_create(rfs4_entry_t u_entry, void *arg)
1904 {
1905 	rfs4_openowner_t *op = (rfs4_openowner_t *)u_entry;
1906 	rfs4_openowner_t *argp = (rfs4_openowner_t *)arg;
1907 	open_owner4 *openowner = &argp->owner;
1908 	seqid4 seqid = argp->open_seqid;
1909 	rfs4_client_t *cp;
1910 	bool_t create = FALSE;
1911 
1912 	rw_enter(&rfs4_findclient_lock, RW_READER);
1913 
1914 	cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_clientid_idx,
1915 					&openowner->clientid,
1916 					&create, NULL, RFS4_DBS_VALID);
1917 
1918 	rw_exit(&rfs4_findclient_lock);
1919 
1920 	if (cp == NULL)
1921 		return (FALSE);
1922 
1923 	op->reply_fh.nfs_fh4_len = 0;
1924 	op->reply_fh.nfs_fh4_val = NULL;
1925 
1926 	op->owner.clientid = openowner->clientid;
1927 	op->owner.owner_val =
1928 		kmem_alloc(openowner->owner_len, KM_SLEEP);
1929 	bcopy(openowner->owner_val,
1930 	    op->owner.owner_val, openowner->owner_len);
1931 	op->owner.owner_len = openowner->owner_len;
1932 
1933 	op->need_confirm = TRUE;
1934 
1935 	rfs4_sw_init(&op->oo_sw);
1936 
1937 	op->open_seqid = seqid;
1938 	bzero(op->reply, sizeof (nfs_resop4));
1939 	op->client = cp;
1940 	op->cr_set = NULL;
1941 	/* Init lists for remque/insque */
1942 	op->ownerstateids.next = op->ownerstateids.prev = &op->ownerstateids;
1943 	op->ownerstateids.sp = NULL; /* NULL since this is the state list */
1944 	op->openownerlist.next = op->openownerlist.prev = &op->openownerlist;
1945 	op->openownerlist.oop = op; /* ourselves */
1946 
1947 	/* Insert openowner into client's open owner list */
1948 	rfs4_dbe_lock(cp->dbe);
1949 
1950 	insque(&op->openownerlist, cp->openownerlist.prev);
1951 
1952 	rfs4_dbe_unlock(cp->dbe);
1953 
1954 	return (TRUE);
1955 }
1956 
1957 rfs4_openowner_t *
1958 rfs4_findopenowner(open_owner4 *openowner, bool_t *create, seqid4 seqid)
1959 {
1960 	rfs4_openowner_t *op;
1961 	rfs4_openowner_t arg;
1962 
1963 	arg.owner = *openowner;
1964 	arg.open_seqid = seqid;
1965 	op = (rfs4_openowner_t *)rfs4_dbsearch(rfs4_openowner_idx, openowner,
1966 					    create, &arg, RFS4_DBS_VALID);
1967 
1968 	return (op);
1969 }
1970 
1971 void
1972 rfs4_update_open_sequence(rfs4_openowner_t *op)
1973 {
1974 
1975 	rfs4_dbe_lock(op->dbe);
1976 
1977 	op->open_seqid++;
1978 
1979 	rfs4_dbe_unlock(op->dbe);
1980 }
1981 
1982 void
1983 rfs4_update_open_resp(rfs4_openowner_t *op, nfs_resop4 *resp, nfs_fh4 *fh)
1984 {
1985 
1986 	rfs4_dbe_lock(op->dbe);
1987 
1988 	rfs4_free_reply(op->reply);
1989 
1990 	rfs4_copy_reply(op->reply, resp);
1991 
1992 	/* Save the filehandle if provided and free if not used */
1993 	if (resp->nfs_resop4_u.opopen.status == NFS4_OK &&
1994 	    fh && fh->nfs_fh4_len) {
1995 		if (op->reply_fh.nfs_fh4_val == NULL)
1996 			op->reply_fh.nfs_fh4_val =
1997 				kmem_alloc(fh->nfs_fh4_len, KM_SLEEP);
1998 		nfs_fh4_copy(fh, &op->reply_fh);
1999 	} else {
2000 		if (op->reply_fh.nfs_fh4_val) {
2001 			kmem_free(op->reply_fh.nfs_fh4_val,
2002 				op->reply_fh.nfs_fh4_len);
2003 			op->reply_fh.nfs_fh4_val = NULL;
2004 			op->reply_fh.nfs_fh4_len = 0;
2005 		}
2006 	}
2007 
2008 	rfs4_dbe_unlock(op->dbe);
2009 }
2010 
2011 static bool_t
2012 lockowner_compare(rfs4_entry_t u_entry, void *key)
2013 {
2014 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2015 	lock_owner4 *b = (lock_owner4 *)key;
2016 
2017 	if (lo->owner.clientid != b->clientid)
2018 		return (FALSE);
2019 
2020 	if (lo->owner.owner_len != b->owner_len)
2021 		return (FALSE);
2022 
2023 	return (bcmp(lo->owner.owner_val, b->owner_val,
2024 					lo->owner.owner_len) == 0);
2025 }
2026 
2027 void *
2028 lockowner_mkkey(rfs4_entry_t u_entry)
2029 {
2030 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2031 
2032 	return (&lo->owner);
2033 }
2034 
2035 static uint32_t
2036 lockowner_hash(void *key)
2037 {
2038 	int i;
2039 	lock_owner4 *lockowner = key;
2040 	uint_t hash = 0;
2041 
2042 	for (i = 0; i < lockowner->owner_len; i++) {
2043 		hash <<= 4;
2044 		hash += (uint_t)lockowner->owner_val[i];
2045 	}
2046 	hash += (uint_t)lockowner->clientid;
2047 	hash |= (lockowner->clientid >> 32);
2048 
2049 	return (hash);
2050 }
2051 
/*
 * Hash for the "pid" index.  The key is the pid itself carried in the
 * pointer value (see pid_mkkey()); the hash is that value truncated to
 * 32 bits.
 */
static uint32_t
pid_hash(void *key)
{
	uintptr_t raw = (uintptr_t)key;

	return ((uint32_t)raw);
}
2057 
2058 static void *
2059 pid_mkkey(rfs4_entry_t u_entry)
2060 {
2061 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2062 
2063 	return ((void *)(uintptr_t)lo->pid);
2064 }
2065 
2066 static bool_t
2067 pid_compare(rfs4_entry_t u_entry, void *key)
2068 {
2069 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2070 
2071 	return (lo->pid == (pid_t)(uintptr_t)key);
2072 }
2073 
2074 static void
2075 rfs4_lockowner_destroy(rfs4_entry_t u_entry)
2076 {
2077 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2078 
2079 	/* Free the lock owner id */
2080 	kmem_free(lo->owner.owner_val, lo->owner.owner_len);
2081 	rfs4_client_rele(lo->client);
2082 }
2083 
2084 void
2085 rfs4_lockowner_rele(rfs4_lockowner_t *lo)
2086 {
2087 	rfs4_dbe_rele(lo->dbe);
2088 }
2089 
2090 /* ARGSUSED */
2091 static bool_t
2092 rfs4_lockowner_expiry(rfs4_entry_t u_entry)
2093 {
2094 	/*
2095 	 * Since expiry is called with no other references on
2096 	 * this struct, go ahead and have it removed.
2097 	 */
2098 	return (TRUE);
2099 }
2100 
2101 static bool_t
2102 rfs4_lockowner_create(rfs4_entry_t u_entry, void *arg)
2103 {
2104 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2105 	lock_owner4 *lockowner = (lock_owner4 *)arg;
2106 	rfs4_client_t *cp;
2107 	bool_t create = FALSE;
2108 
2109 	rw_enter(&rfs4_findclient_lock, RW_READER);
2110 
2111 	cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_clientid_idx,
2112 					&lockowner->clientid,
2113 					&create, NULL, RFS4_DBS_VALID);
2114 
2115 	rw_exit(&rfs4_findclient_lock);
2116 
2117 	if (cp == NULL)
2118 		return (FALSE);
2119 
2120 	/* Reference client */
2121 	lo->client = cp;
2122 	lo->owner.clientid = lockowner->clientid;
2123 	lo->owner.owner_val = kmem_alloc(lockowner->owner_len, KM_SLEEP);
2124 	bcopy(lockowner->owner_val, lo->owner.owner_val, lockowner->owner_len);
2125 	lo->owner.owner_len = lockowner->owner_len;
2126 	lo->pid = rfs4_dbe_getid(lo->dbe);
2127 
2128 	return (TRUE);
2129 }
2130 
2131 rfs4_lockowner_t *
2132 rfs4_findlockowner(lock_owner4 *lockowner, bool_t *create)
2133 {
2134 	rfs4_lockowner_t *lo;
2135 
2136 	lo = (rfs4_lockowner_t *)rfs4_dbsearch(rfs4_lockowner_idx, lockowner,
2137 					    create, lockowner, RFS4_DBS_VALID);
2138 
2139 	return (lo);
2140 }
2141 
2142 rfs4_lockowner_t *
2143 rfs4_findlockowner_by_pid(pid_t pid)
2144 {
2145 	rfs4_lockowner_t *lo;
2146 	bool_t create = FALSE;
2147 
2148 	lo = (rfs4_lockowner_t *)rfs4_dbsearch(rfs4_lockowner_pid_idx,
2149 		(void *)(uintptr_t)pid, &create, NULL, RFS4_DBS_VALID);
2150 
2151 	return (lo);
2152 }
2153 
2154 
2155 static uint32_t
2156 file_hash(void *key)
2157 {
2158 	return (ADDRHASH(key));
2159 }
2160 
2161 static void *
2162 file_mkkey(rfs4_entry_t u_entry)
2163 {
2164 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2165 
2166 	return (fp->vp);
2167 }
2168 
2169 static bool_t
2170 file_compare(rfs4_entry_t u_entry, void *key)
2171 {
2172 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2173 
2174 	return (fp->vp == (vnode_t *)key);
2175 }
2176 
2177 static void
2178 rfs4_file_destroy(rfs4_entry_t u_entry)
2179 {
2180 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2181 
2182 	ASSERT(fp->delegationlist.next == &fp->delegationlist);
2183 	if (fp->filehandle.nfs_fh4_val)
2184 		kmem_free(fp->filehandle.nfs_fh4_val,
2185 			fp->filehandle.nfs_fh4_len);
2186 	cv_destroy(fp->dinfo->recall_cv);
2187 	if (fp->vp) {
2188 		VN_RELE(fp->vp);
2189 		fp->vp = NULL;
2190 	}
2191 	rw_destroy(&fp->file_rwlock);
2192 }
2193 
2194 /*
2195  * Used to unlock the underlying dbe struct only
2196  */
2197 void
2198 rfs4_file_rele(rfs4_file_t *fp)
2199 {
2200 	rfs4_dbe_rele(fp->dbe);
2201 }
2202 
2203 /*
2204  * Used to unlock the file rw lock and the file's dbe entry
2205  * Only used to pair with rfs4_findfile_withlock()
2206  */
2207 void
2208 rfs4_file_rele_withunlock(rfs4_file_t *fp)
2209 {
2210 	rw_exit(&fp->file_rwlock);
2211 	rfs4_dbe_rele(fp->dbe);
2212 }
2213 
2214 typedef struct {
2215     vnode_t *vp;
2216     nfs_fh4 *fh;
2217 } rfs4_fcreate_arg;
2218 
2219 static bool_t
2220 rfs4_file_create(rfs4_entry_t u_entry, void *arg)
2221 {
2222 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2223 	rfs4_fcreate_arg *ap = (rfs4_fcreate_arg *)arg;
2224 	vnode_t *vp = ap->vp;
2225 	nfs_fh4 *fh = ap->fh;
2226 
2227 	VN_HOLD(vp);
2228 
2229 	fp->filehandle.nfs_fh4_len = 0;
2230 	fp->filehandle.nfs_fh4_val = NULL;
2231 	ASSERT(fh && fh->nfs_fh4_len);
2232 	if (fh && fh->nfs_fh4_len) {
2233 		fp->filehandle.nfs_fh4_val =
2234 			kmem_alloc(fh->nfs_fh4_len, KM_SLEEP);
2235 		nfs_fh4_copy(fh, &fp->filehandle);
2236 	}
2237 	fp->vp = vp;
2238 
2239 	/* Init list for remque/insque */
2240 	fp->delegationlist.next = fp->delegationlist.prev =
2241 		&fp->delegationlist;
2242 	fp->delegationlist.dsp = NULL; /* NULL since this is state list */
2243 
2244 	fp->share_deny = fp->share_access = fp->access_read = 0;
2245 	fp->access_write = fp->deny_read = fp->deny_write = 0;
2246 
2247 	mutex_init(fp->dinfo->recall_lock, NULL, MUTEX_DEFAULT, NULL);
2248 	cv_init(fp->dinfo->recall_cv, NULL, CV_DEFAULT, NULL);
2249 
2250 	fp->dinfo->dtype = OPEN_DELEGATE_NONE;
2251 
2252 	rw_init(&fp->file_rwlock, NULL, RW_DEFAULT, NULL);
2253 
2254 	return (TRUE);
2255 }
2256 
2257 rfs4_file_t *
2258 rfs4_findfile(vnode_t *vp, nfs_fh4 *fh, bool_t *create)
2259 {
2260 	rfs4_file_t *fp;
2261 	rfs4_fcreate_arg arg;
2262 
2263 	arg.vp = vp;
2264 	arg.fh = fh;
2265 
2266 	fp = (rfs4_file_t *)rfs4_dbsearch(rfs4_file_idx, vp, create,
2267 					&arg, RFS4_DBS_VALID);
2268 	return (fp);
2269 }
2270 
2271 /*
2272  * Find a file in the db and once it is located, take the rw lock.
2273  * Need to check the vnode pointer and if it does not exist (it was
2274  * removed between the db location and check) redo the find.  This
2275  * assumes that a file struct that has a NULL vnode pointer is marked
2276  * at 'invalid' and will not be found in the db the second time
2277  * around.
2278  */
2279 rfs4_file_t *
2280 rfs4_findfile_withlock(vnode_t *vp, nfs_fh4 *fh, bool_t *create)
2281 {
2282 	rfs4_file_t *fp;
2283 	rfs4_fcreate_arg arg;
2284 	bool_t screate = *create;
2285 
2286 retry:
2287 	arg.vp = vp;
2288 	arg.fh = fh;
2289 
2290 	fp = (rfs4_file_t *)rfs4_dbsearch(rfs4_file_idx, vp, create,
2291 					&arg, RFS4_DBS_VALID);
2292 	if (fp != NULL) {
2293 		rw_enter(&fp->file_rwlock, RW_WRITER);
2294 		if (fp->vp == NULL) {
2295 			rw_exit(&fp->file_rwlock);
2296 			rfs4_file_rele(fp);
2297 			*create = screate;
2298 			goto retry;
2299 		}
2300 	}
2301 
2302 	return (fp);
2303 }
2304 
2305 static uint32_t
2306 lo_state_hash(void *key)
2307 {
2308 	stateid_t *id = key;
2309 
2310 	return (id->bits.ident+id->bits.pid);
2311 }
2312 
2313 static bool_t
2314 lo_state_compare(rfs4_entry_t u_entry, void *key)
2315 {
2316 	rfs4_lo_state_t *lop = (rfs4_lo_state_t *)u_entry;
2317 	stateid_t *id = key;
2318 	bool_t rc;
2319 
2320 	rc = (lop->lockid.bits.boottime == id->bits.boottime &&
2321 	    lop->lockid.bits.type == id->bits.type &&
2322 	    lop->lockid.bits.ident == id->bits.ident &&
2323 	    lop->lockid.bits.pid == id->bits.pid);
2324 
2325 	return (rc);
2326 }
2327 
2328 static void *
2329 lo_state_mkkey(rfs4_entry_t u_entry)
2330 {
2331 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2332 
2333 	return (&lsp->lockid);
2334 }
2335 
2336 static bool_t
2337 rfs4_lo_state_expiry(rfs4_entry_t u_entry)
2338 {
2339 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2340 
2341 	if (rfs4_dbe_is_invalid(lsp->dbe))
2342 		return (TRUE);
2343 	if (lsp->state->closed)
2344 		return (TRUE);
2345 	return ((gethrestime_sec() - lsp->state->owner->client->last_access
2346 		> rfs4_lease_time));
2347 }
2348 
2349 static void
2350 rfs4_lo_state_destroy(rfs4_entry_t u_entry)
2351 {
2352 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2353 
2354 	rfs4_sw_destroy(&lsp->ls_sw);
2355 
2356 	/* Make sure to release the file locks */
2357 	if (lsp->locks_cleaned == FALSE) {
2358 		lsp->locks_cleaned = TRUE;
2359 		if (lsp->locker->client->sysidt != LM_NOSYSID) {
2360 			/* Is the PxFS kernel module loaded? */
2361 			if (lm_remove_file_locks != NULL) {
2362 				int new_sysid;
2363 
2364 				/* Encode the cluster nodeid in new sysid */
2365 				new_sysid = lsp->locker->client->sysidt;
2366 				lm_set_nlmid_flk(&new_sysid);
2367 
2368 				/*
2369 				 * This PxFS routine removes file locks for a
2370 				 * client over all nodes of a cluster.
2371 				 */
2372 				NFS4_DEBUG(rfs4_debug, (CE_NOTE,
2373 				    "lm_remove_file_locks(sysid=0x%x)\n",
2374 				    new_sysid));
2375 				(*lm_remove_file_locks)(new_sysid);
2376 			} else {
2377 				(void) cleanlocks(lsp->state->finfo->vp,
2378 				    lsp->locker->pid,
2379 				    lsp->locker->client->sysidt);
2380 			}
2381 		}
2382 	}
2383 
2384 	rfs4_dbe_lock(lsp->state->dbe);
2385 
2386 	remque(&lsp->lockownerlist);
2387 	lsp->lockownerlist.next = lsp->lockownerlist.prev =
2388 		&lsp->lockownerlist;
2389 
2390 	rfs4_dbe_unlock(lsp->state->dbe);
2391 
2392 	/* Free the last reply for this state */
2393 	rfs4_free_reply(lsp->reply);
2394 
2395 	rfs4_lockowner_rele(lsp->locker);
2396 	lsp->locker = NULL;
2397 
2398 	rfs4_state_rele_nounlock(lsp->state);
2399 	lsp->state = NULL;
2400 }
2401 
2402 static bool_t
2403 rfs4_lo_state_create(rfs4_entry_t u_entry, void *arg)
2404 {
2405 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2406 	rfs4_lo_state_t *argp = (rfs4_lo_state_t *)arg;
2407 	rfs4_lockowner_t *lo = argp->locker;
2408 	rfs4_state_t *sp = argp->state;
2409 
2410 	lsp->state = sp;
2411 
2412 	lsp->lockid = sp->stateid;
2413 	lsp->lockid.bits.type = LOCKID;
2414 	lsp->lockid.bits.chgseq = 0;
2415 	lsp->lockid.bits.pid = lo->pid;
2416 
2417 	lsp->locks_cleaned = FALSE;
2418 	lsp->lock_completed = FALSE;
2419 
2420 	rfs4_sw_init(&lsp->ls_sw);
2421 
2422 	/* Attached the supplied lock owner */
2423 	rfs4_dbe_hold(lo->dbe);
2424 	lsp->locker = lo;
2425 
2426 	lsp->lockownerlist.next = lsp->lockownerlist.prev =
2427 		&lsp->lockownerlist;
2428 	lsp->lockownerlist.lsp = lsp;
2429 
2430 	rfs4_dbe_lock(sp->dbe);
2431 
2432 	insque(&lsp->lockownerlist, sp->lockownerlist.prev);
2433 
2434 	rfs4_dbe_hold(sp->dbe);
2435 
2436 	rfs4_dbe_unlock(sp->dbe);
2437 
2438 	return (TRUE);
2439 }
2440 
2441 void
2442 rfs4_lo_state_rele(rfs4_lo_state_t *lsp, bool_t unlock_fp)
2443 {
2444 	if (unlock_fp == TRUE)
2445 		rw_exit(&lsp->state->finfo->file_rwlock);
2446 	rfs4_dbe_rele(lsp->dbe);
2447 }
2448 
2449 static rfs4_lo_state_t *
2450 rfs4_findlo_state(stateid_t *id, bool_t lock_fp)
2451 {
2452 	rfs4_lo_state_t *lsp;
2453 	bool_t create = FALSE;
2454 
2455 	lsp = (rfs4_lo_state_t *)rfs4_dbsearch(rfs4_lo_state_idx, id,
2456 					    &create, NULL, RFS4_DBS_VALID);
2457 	if (lock_fp == TRUE && lsp != NULL)
2458 		rw_enter(&lsp->state->finfo->file_rwlock, RW_READER);
2459 
2460 	return (lsp);
2461 }
2462 
2463 
2464 static uint32_t
2465 lo_state_lo_hash(void *key)
2466 {
2467 	rfs4_lo_state_t *lop = key;
2468 
2469 	return (ADDRHASH(lop->locker) ^ ADDRHASH(lop->state));
2470 }
2471 
2472 static bool_t
2473 lo_state_lo_compare(rfs4_entry_t u_entry, void *key)
2474 {
2475 	rfs4_lo_state_t *lop = (rfs4_lo_state_t *)u_entry;
2476 	rfs4_lo_state_t *keyp = key;
2477 
2478 	return (keyp->locker == lop->locker && keyp->state == lop->state);
2479 }
2480 
2481 static void *
2482 lo_state_lo_mkkey(rfs4_entry_t u_entry)
2483 {
2484 	return (u_entry);
2485 }
2486 
2487 rfs4_lo_state_t *
2488 rfs4_findlo_state_by_owner(rfs4_lockowner_t *lo,
2489 			rfs4_state_t *sp, bool_t *create)
2490 {
2491 	rfs4_lo_state_t *lsp;
2492 	rfs4_lo_state_t arg;
2493 
2494 	arg.locker = lo;
2495 	arg.state = sp;
2496 
2497 	lsp = (rfs4_lo_state_t *)rfs4_dbsearch(rfs4_lo_state_owner_idx, &arg,
2498 					    create, &arg, RFS4_DBS_VALID);
2499 
2500 	return (lsp);
2501 }
2502 
2503 static stateid_t
2504 get_stateid(id_t eid)
2505 {
2506 	stateid_t id;
2507 
2508 	id.bits.boottime = rfs4_start_time;
2509 	id.bits.ident = eid;
2510 	id.bits.chgseq = 0;
2511 	id.bits.type = 0;
2512 	id.bits.pid = 0;
2513 
2514 	/*
2515 	 * If we are booted as a cluster node, embed our nodeid.
2516 	 * We've already done sanity checks in rfs4_client_create() so no
2517 	 * need to repeat them here.
2518 	 */
2519 	id.bits.clnodeid = (cluster_bootflags & CLUSTER_BOOTED) ?
2520 	    clconf_get_nodeid() : 0;
2521 
2522 	return (id);
2523 }
2524 
2525 /*
2526  * For use only when booted as a cluster node.
2527  * Returns TRUE if the embedded nodeid indicates that this stateid was
2528  * generated on another node.
2529  */
2530 static int
2531 foreign_stateid(stateid_t *id)
2532 {
2533 	ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2534 	return (id->bits.clnodeid != (uint32_t)clconf_get_nodeid());
2535 }
2536 
2537 /*
2538  * For use only when booted as a cluster node.
2539  * Returns TRUE if the embedded nodeid indicates that this clientid was
2540  * generated on another node.
2541  */
2542 static int
2543 foreign_clientid(cid *cidp)
2544 {
2545 	ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2546 	return (cidp->impl_id.c_id >> CLUSTER_NODEID_SHIFT !=
2547 	    (uint32_t)clconf_get_nodeid());
2548 }
2549 
2550 /*
2551  * For use only when booted as a cluster node.
2552  * Embed our cluster nodeid into the clientid.
2553  */
2554 static void
2555 embed_nodeid(cid *cidp)
2556 {
2557 	int clnodeid;
2558 	/*
2559 	 * Currently, our state tables are small enough that their
2560 	 * ids will leave enough bits free for the nodeid. If the
2561 	 * tables become larger, we mustn't overwrite the id.
2562 	 * Equally, we only have room for so many bits of nodeid, so
2563 	 * must check that too.
2564 	 */
2565 	ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2566 	ASSERT(cidp->impl_id.c_id >> CLUSTER_NODEID_SHIFT == 0);
2567 	clnodeid = clconf_get_nodeid();
2568 	ASSERT(clnodeid <= CLUSTER_MAX_NODEID);
2569 	ASSERT(clnodeid != NODEID_UNKNOWN);
2570 	cidp->impl_id.c_id |= (clnodeid << CLUSTER_NODEID_SHIFT);
2571 }
2572 
2573 static uint32_t
2574 state_hash(void *key)
2575 {
2576 	stateid_t *ip = (stateid_t *)key;
2577 
2578 	return (ip->bits.ident);
2579 }
2580 
2581 static bool_t
2582 state_compare(rfs4_entry_t u_entry, void *key)
2583 {
2584 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2585 	stateid_t *id = (stateid_t *)key;
2586 	bool_t rc;
2587 
2588 	rc = (sp->stateid.bits.boottime == id->bits.boottime &&
2589 	    sp->stateid.bits.ident == id->bits.ident);
2590 
2591 	return (rc);
2592 }
2593 
2594 static void *
2595 state_mkkey(rfs4_entry_t u_entry)
2596 {
2597 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2598 
2599 	return (&sp->stateid);
2600 }
2601 
2602 static void
2603 rfs4_state_destroy(rfs4_entry_t u_entry)
2604 {
2605 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2606 
2607 	ASSERT(&sp->lockownerlist == sp->lockownerlist.next);
2608 
2609 	/* release any share locks for this stateid if it's still open */
2610 	if (!sp->closed)
2611 		rfs4_unshare(sp);
2612 
2613 	/* Were done with the file */
2614 	rfs4_file_rele(sp->finfo);
2615 	sp->finfo = NULL;
2616 
2617 	/* And now with the openowner */
2618 	rfs4_dbe_lock(sp->owner->dbe);
2619 
2620 	remque(&sp->ownerstateids);
2621 	sp->ownerstateids.next = sp->ownerstateids.prev = &sp->ownerstateids;
2622 
2623 	rfs4_dbe_unlock(sp->owner->dbe);
2624 
2625 	rfs4_openowner_rele(sp->owner);
2626 	sp->owner = NULL;
2627 }
2628 
2629 static void
2630 rfs4_state_rele_nounlock(rfs4_state_t *sp)
2631 {
2632 	rfs4_dbe_rele(sp->dbe);
2633 }
2634 
2635 void
2636 rfs4_state_rele(rfs4_state_t *sp)
2637 {
2638 	rw_exit(&sp->finfo->file_rwlock);
2639 	rfs4_dbe_rele(sp->dbe);
2640 }
2641 
2642 static uint32_t
2643 deleg_hash(void *key)
2644 {
2645 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)key;
2646 
2647 	return (ADDRHASH(dsp->client) ^ ADDRHASH(dsp->finfo));
2648 }
2649 
2650 static bool_t
2651 deleg_compare(rfs4_entry_t u_entry, void *key)
2652 {
2653 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2654 	rfs4_deleg_state_t *kdsp = (rfs4_deleg_state_t *)key;
2655 
2656 	return (dsp->client == kdsp->client && dsp->finfo == kdsp->finfo);
2657 }
2658 
2659 static void *
2660 deleg_mkkey(rfs4_entry_t u_entry)
2661 {
2662 	return (u_entry);
2663 }
2664 
2665 static uint32_t
2666 deleg_state_hash(void *key)
2667 {
2668 	stateid_t *ip = (stateid_t *)key;
2669 
2670 	return (ip->bits.ident);
2671 }
2672 
2673 static bool_t
2674 deleg_state_compare(rfs4_entry_t u_entry, void *key)
2675 {
2676 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2677 	stateid_t *id = (stateid_t *)key;
2678 	bool_t rc;
2679 
2680 	if (id->bits.type != DELEGID)
2681 		return (FALSE);
2682 
2683 	rc = (dsp->delegid.bits.boottime == id->bits.boottime &&
2684 	    dsp->delegid.bits.ident == id->bits.ident);
2685 
2686 	return (rc);
2687 }
2688 
2689 static void *
2690 deleg_state_mkkey(rfs4_entry_t u_entry)
2691 {
2692 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2693 
2694 	return (&dsp->delegid);
2695 }
2696 
2697 static bool_t
2698 rfs4_deleg_state_expiry(rfs4_entry_t u_entry)
2699 {
2700 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2701 
2702 	if (rfs4_dbe_is_invalid(dsp->dbe))
2703 		return (TRUE);
2704 	return ((gethrestime_sec() - dsp->client->last_access
2705 		> rfs4_lease_time));
2706 
2707 }
2708 
2709 static bool_t
2710 rfs4_deleg_state_create(rfs4_entry_t u_entry, void *argp)
2711 {
2712 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2713 	rfs4_file_t *fp = ((rfs4_deleg_state_t *)argp)->finfo;
2714 	rfs4_client_t *cp = ((rfs4_deleg_state_t *)argp)->client;
2715 
2716 	rfs4_dbe_hold(fp->dbe);
2717 	rfs4_dbe_hold(cp->dbe);
2718 
2719 	dsp->delegid = get_stateid(rfs4_dbe_getid(dsp->dbe));
2720 	dsp->delegid.bits.type = DELEGID;
2721 	dsp->finfo = fp;
2722 	dsp->client = cp;
2723 	dsp->dtype = OPEN_DELEGATE_NONE;
2724 
2725 	dsp->time_granted = gethrestime_sec();	/* observability */
2726 	dsp->time_revoked = 0;
2727 
2728 	/* Init lists for remque/insque */
2729 	dsp->delegationlist.next = dsp->delegationlist.prev =
2730 		&dsp->delegationlist;
2731 	dsp->delegationlist.dsp = dsp;
2732 
2733 	dsp->clientdeleglist.next = dsp->clientdeleglist.prev =
2734 		&dsp->clientdeleglist;
2735 	dsp->clientdeleglist.dsp = dsp;
2736 
2737 	/* Insert state on per open owner's list */
2738 	rfs4_dbe_lock(cp->dbe);
2739 
2740 	insque(&dsp->clientdeleglist, cp->clientdeleglist.prev);
2741 
2742 	rfs4_dbe_unlock(cp->dbe);
2743 
2744 	return (TRUE);
2745 }
2746 
2747 static void
2748 rfs4_deleg_state_destroy(rfs4_entry_t u_entry)
2749 {
2750 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2751 
2752 	if (&dsp->delegationlist != dsp->delegationlist.next)
2753 		rfs4_return_deleg(dsp, FALSE);
2754 
2755 	/* Were done with the file */
2756 	rfs4_file_rele(dsp->finfo);
2757 	dsp->finfo = NULL;
2758 
2759 	/* And now with the openowner */
2760 	rfs4_dbe_lock(dsp->client->dbe);
2761 
2762 	remque(&dsp->clientdeleglist);
2763 	dsp->clientdeleglist.next = dsp->clientdeleglist.prev =
2764 		&dsp->clientdeleglist;
2765 
2766 	rfs4_dbe_unlock(dsp->client->dbe);
2767 
2768 	rfs4_client_rele(dsp->client);
2769 	dsp->client = NULL;
2770 }
2771 
2772 rfs4_deleg_state_t *
2773 rfs4_finddeleg(rfs4_state_t *sp, bool_t *create)
2774 {
2775 	rfs4_deleg_state_t ds, *dsp;
2776 
2777 	ds.client = sp->owner->client;
2778 	ds.finfo = sp->finfo;
2779 
2780 	dsp = (rfs4_deleg_state_t *)rfs4_dbsearch(rfs4_deleg_idx, &ds,
2781 					create, &ds, RFS4_DBS_VALID);
2782 
2783 	return (dsp);
2784 }
2785 
2786 rfs4_deleg_state_t *
2787 rfs4_finddelegstate(stateid_t *id)
2788 {
2789 	rfs4_deleg_state_t *dsp;
2790 	bool_t create = FALSE;
2791 
2792 	dsp = (rfs4_deleg_state_t *)rfs4_dbsearch(rfs4_deleg_state_idx, id,
2793 					&create, NULL, RFS4_DBS_VALID);
2794 
2795 	return (dsp);
2796 }
2797 
2798 void
2799 rfs4_deleg_state_rele(rfs4_deleg_state_t *dsp)
2800 {
2801 	rfs4_dbe_rele(dsp->dbe);
2802 }
2803 
2804 void
2805 rfs4_update_lock_sequence(rfs4_lo_state_t *lsp)
2806 {
2807 
2808 	rfs4_dbe_lock(lsp->dbe);
2809 
2810 	/*
2811 	 * If we are skipping sequence id checking, this means that
2812 	 * this is the first lock request and therefore the sequence
2813 	 * id does not need to be updated.  This only happens on the
2814 	 * first lock request for a lockowner
2815 	 */
2816 	if (!lsp->skip_seqid_check)
2817 		lsp->seqid++;
2818 
2819 	rfs4_dbe_unlock(lsp->dbe);
2820 }
2821 
2822 void
2823 rfs4_update_lock_resp(rfs4_lo_state_t *lsp, nfs_resop4 *resp)
2824 {
2825 
2826 	rfs4_dbe_lock(lsp->dbe);
2827 
2828 	rfs4_free_reply(lsp->reply);
2829 
2830 	rfs4_copy_reply(lsp->reply, resp);
2831 
2832 	rfs4_dbe_unlock(lsp->dbe);
2833 }
2834 
2835 void
2836 rfs4_free_opens(rfs4_openowner_t *op, bool_t invalidate,
2837 	bool_t close_of_client)
2838 {
2839 	rfs4_state_t *sp;
2840 
2841 	rfs4_dbe_lock(op->dbe);
2842 
2843 	for (sp = op->ownerstateids.next->sp; sp != NULL;
2844 		sp = sp->ownerstateids.next->sp) {
2845 		rfs4_state_close(sp, FALSE, close_of_client, CRED());
2846 		if (invalidate == TRUE)
2847 			rfs4_dbe_invalidate(sp->dbe);
2848 	}
2849 
2850 	rfs4_dbe_unlock(op->dbe);
2851 	rfs4_dbe_invalidate(op->dbe);
2852 }
2853 
2854 static uint32_t
2855 state_owner_file_hash(void *key)
2856 {
2857 	rfs4_state_t *sp = key;
2858 
2859 	return (ADDRHASH(sp->owner) ^ ADDRHASH(sp->finfo));
2860 }
2861 
2862 static bool_t
2863 state_owner_file_compare(rfs4_entry_t u_entry, void *key)
2864 {
2865 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2866 	rfs4_state_t *arg = key;
2867 
2868 	if (sp->closed == TRUE)
2869 		return (FALSE);
2870 
2871 	return (arg->owner == sp->owner && arg->finfo == sp->finfo);
2872 }
2873 
2874 static void *
2875 state_owner_file_mkkey(rfs4_entry_t u_entry)
2876 {
2877 	return (u_entry);
2878 }
2879 
2880 static uint32_t
2881 state_file_hash(void *key)
2882 {
2883 	return (ADDRHASH(key));
2884 }
2885 
2886 static bool_t
2887 state_file_compare(rfs4_entry_t u_entry, void *key)
2888 {
2889 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2890 	rfs4_file_t *fp = key;
2891 
2892 	if (sp->closed == TRUE)
2893 		return (FALSE);
2894 
2895 	return (fp == sp->finfo);
2896 }
2897 
2898 static void *
2899 state_file_mkkey(rfs4_entry_t u_entry)
2900 {
2901 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2902 
2903 	return (sp->finfo);
2904 }
2905 
2906 rfs4_state_t *
2907 rfs4_findstate_by_owner_file(rfs4_openowner_t *op, rfs4_file_t *file,
2908 	bool_t *create)
2909 {
2910 	rfs4_state_t *sp;
2911 	rfs4_state_t key;
2912 
2913 	key.owner = op;
2914 	key.finfo = file;
2915 
2916 	sp = (rfs4_state_t *)rfs4_dbsearch(rfs4_state_owner_file_idx, &key,
2917 					create, &key, RFS4_DBS_VALID);
2918 
2919 	return (sp);
2920 }
2921 
2922 /* This returns ANY state struct that refers to this file */
2923 static rfs4_state_t *
2924 rfs4_findstate_by_file(rfs4_file_t *fp)
2925 {
2926 	bool_t create = FALSE;
2927 
2928 	return ((rfs4_state_t *)rfs4_dbsearch(rfs4_state_file_idx, fp,
2929 		&create, fp, RFS4_DBS_VALID));
2930 }
2931 
2932 static bool_t
2933 rfs4_state_expiry(rfs4_entry_t u_entry)
2934 {
2935 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2936 
2937 	if (rfs4_dbe_is_invalid(sp->dbe))
2938 		return (TRUE);
2939 
2940 	if (sp->closed == TRUE &&
2941 	    ((gethrestime_sec() - rfs4_dbe_get_timerele(sp->dbe))
2942 		> rfs4_lease_time))
2943 		return (TRUE);
2944 
2945 	return ((gethrestime_sec() - sp->owner->client->last_access
2946 		> rfs4_lease_time));
2947 }
2948 
2949 static bool_t
2950 rfs4_state_create(rfs4_entry_t u_entry, void *argp)
2951 {
2952 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2953 	rfs4_file_t *fp = ((rfs4_state_t *)argp)->finfo;
2954 	rfs4_openowner_t *op = ((rfs4_state_t *)argp)->owner;
2955 
2956 	rfs4_dbe_hold(fp->dbe);
2957 	rfs4_dbe_hold(op->dbe);
2958 	sp->stateid = get_stateid(rfs4_dbe_getid(sp->dbe));
2959 	sp->stateid.bits.type = OPENID;
2960 	sp->owner = op;
2961 	sp->finfo = fp;
2962 
2963 	/* Init lists for remque/insque */
2964 	sp->ownerstateids.next = sp->ownerstateids.prev = &sp->ownerstateids;
2965 	sp->ownerstateids.sp = sp;
2966 	sp->lockownerlist.next = sp->lockownerlist.prev = &sp->lockownerlist;
2967 	sp->lockownerlist.lsp = NULL;
2968 
2969 	/* Insert state on per open owner's list */
2970 	rfs4_dbe_lock(op->dbe);
2971 
2972 	insque(&sp->ownerstateids, op->ownerstateids.prev);
2973 
2974 	rfs4_dbe_unlock(op->dbe);
2975 
2976 	return (TRUE);
2977 }
2978 
2979 static rfs4_state_t *
2980 rfs4_findstate(stateid_t *id, rfs4_dbsearch_type_t find_invalid,
2981 		bool_t lock_fp)
2982 {
2983 	rfs4_state_t *sp;
2984 	bool_t create = FALSE;
2985 
2986 	sp = (rfs4_state_t *)rfs4_dbsearch(rfs4_state_idx, id,
2987 					&create, NULL, find_invalid);
2988 	if (lock_fp == TRUE && sp != NULL)
2989 		rw_enter(&sp->finfo->file_rwlock, RW_READER);
2990 
2991 	return (sp);
2992 }
2993 
2994 void
2995 rfs4_state_close(rfs4_state_t *sp, bool_t lock_held,
2996 			bool_t close_of_client, cred_t *cr)
2997 {
2998 	/* Remove the associated lo_state owners */
2999 	if (!lock_held)
3000 		rfs4_dbe_lock(sp->dbe);
3001 
3002 	/*
3003 	 * If refcnt == 0, the dbe is about to be destroyed.
3004 	 * lock state will be released by the reaper thread.
3005 	 */
3006 
3007 	if (rfs4_dbe_refcnt(sp->dbe) > 0) {
3008 		if (sp->closed == FALSE) {
3009 			sp->closed = TRUE;
3010 
3011 			rfs4_release_share_lock_state(sp, cr, close_of_client);
3012 		}
3013 	}
3014 
3015 	if (!lock_held)
3016 		rfs4_dbe_unlock(sp->dbe);
3017 }
3018 
3019 /*
3020  * Remove all state associated with the given client.
3021  */
3022 void
3023 rfs4_client_state_remove(rfs4_client_t *cp)
3024 {
3025 	rfs4_openowner_t *oop;
3026 
3027 	rfs4_dbe_lock(cp->dbe);
3028 
3029 	for (oop = cp->openownerlist.next->oop;  oop != NULL;
3030 		oop = oop->openownerlist.next->oop) {
3031 		rfs4_free_opens(oop, TRUE, TRUE);
3032 	}
3033 
3034 	rfs4_dbe_unlock(cp->dbe);
3035 }
3036 
3037 void
3038 rfs4_client_close(rfs4_client_t *cp)
3039 {
3040 	/* Mark client as going away. */
3041 	rfs4_dbe_lock(cp->dbe);
3042 	rfs4_dbe_invalidate(cp->dbe);
3043 	rfs4_dbe_unlock(cp->dbe);
3044 
3045 	rfs4_client_state_remove(cp);
3046 
3047 	/* Release the client */
3048 	rfs4_client_rele(cp);
3049 }
3050 
3051 nfsstat4
3052 rfs4_check_clientid(clientid4 *cp, int setclid_confirm)
3053 {
3054 	cid *cidp = (cid *) cp;
3055 
3056 	/*
3057 	 * If we are booted as a cluster node, check the embedded nodeid.
3058 	 * If it indicates that this clientid was generated on another node,
3059 	 * inform the client accordingly.
3060 	 */
3061 	if (cluster_bootflags & CLUSTER_BOOTED && foreign_clientid(cidp))
3062 		return (NFS4ERR_STALE_CLIENTID);
3063 
3064 	/*
3065 	 * If the server start time matches the time provided
3066 	 * by the client (via the clientid) and this is NOT a
3067 	 * setclientid_confirm then return EXPIRED.
3068 	 */
3069 	if (!setclid_confirm && cidp->impl_id.start_time == rfs4_start_time)
3070 		return (NFS4ERR_EXPIRED);
3071 
3072 	return (NFS4ERR_STALE_CLIENTID);
3073 }
3074 
3075 /*
3076  * This is used when a stateid has not been found amongst the
3077  * current server's state.  Check the stateid to see if it
3078  * was from this server instantiation or not.
3079  */
3080 static nfsstat4
3081 what_stateid_error(stateid_t *id, stateid_type_t type)
3082 {
3083 	/* If we are booted as a cluster node, was stateid locally generated? */
3084 	if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3085 		return (NFS4ERR_STALE_STATEID);
3086 
3087 	/* If types don't match then no use checking further */
3088 	if (type != id->bits.type)
3089 		return (NFS4ERR_BAD_STATEID);
3090 
3091 	/* From a previous server instantiation, return STALE */
3092 	if (id->bits.boottime < rfs4_start_time)
3093 		return (NFS4ERR_STALE_STATEID);
3094 
3095 	/*
3096 	 * From this server but the state is most likely beyond lease
3097 	 * timeout: return NFS4ERR_EXPIRED.  However, there is the
3098 	 * case of a delegation stateid.  For delegations, there is a
3099 	 * case where the state can be removed without the client's
3100 	 * knowledge/consent: revocation.  In the case of delegation
3101 	 * revocation, the delegation state will be removed and will
3102 	 * not be found.  If the client does something like a
3103 	 * DELEGRETURN or even a READ/WRITE with a delegatoin stateid
3104 	 * that has been revoked, the server should return BAD_STATEID
3105 	 * instead of the more common EXPIRED error.
3106 	 */
3107 	if (id->bits.boottime == rfs4_start_time) {
3108 		if (type == DELEGID)
3109 			return (NFS4ERR_BAD_STATEID);
3110 		else
3111 			return (NFS4ERR_EXPIRED);
3112 	}
3113 
3114 	return (NFS4ERR_BAD_STATEID);
3115 }
3116 
3117 /*
3118  * Used later on to find the various state structs.  When called from
3119  * rfs4_check_stateid()->rfs4_get_all_state(), no file struct lock is
3120  * taken (it is not needed) and helps on the read/write path with
3121  * respect to performance.
3122  */
3123 static nfsstat4
3124 rfs4_get_state_lockit(stateid4 *stateid, rfs4_state_t **spp,
3125 		rfs4_dbsearch_type_t find_invalid, bool_t lock_fp)
3126 {
3127 	stateid_t *id = (stateid_t *)stateid;
3128 	rfs4_state_t *sp;
3129 
3130 	*spp = NULL;
3131 
3132 	/* If we are booted as a cluster node, was stateid locally generated? */
3133 	if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3134 		return (NFS4ERR_STALE_STATEID);
3135 
3136 	sp = rfs4_findstate(id, find_invalid, lock_fp);
3137 	if (sp == NULL) {
3138 		return (what_stateid_error(id, OPENID));
3139 	}
3140 
3141 	if (rfs4_lease_expired(sp->owner->client)) {
3142 		if (lock_fp == TRUE)
3143 			rfs4_state_rele(sp);
3144 		else
3145 			rfs4_state_rele_nounlock(sp);
3146 		return (NFS4ERR_EXPIRED);
3147 	}
3148 
3149 	*spp = sp;
3150 
3151 	return (NFS4_OK);
3152 }
3153 
3154 nfsstat4
3155 rfs4_get_state(stateid4 *stateid, rfs4_state_t **spp,
3156 		rfs4_dbsearch_type_t find_invalid)
3157 {
3158 	return (rfs4_get_state_lockit(stateid, spp, find_invalid, TRUE));
3159 }
3160 
3161 int
3162 rfs4_check_stateid_seqid(rfs4_state_t *sp, stateid4 *stateid)
3163 {
3164 	stateid_t *id = (stateid_t *)stateid;
3165 
3166 	if (rfs4_lease_expired(sp->owner->client))
3167 		return (NFS4_CHECK_STATEID_EXPIRED);
3168 
3169 	/* Stateid is some time in the future - that's bad */
3170 	if (sp->stateid.bits.chgseq < id->bits.chgseq)
3171 		return (NFS4_CHECK_STATEID_BAD);
3172 
3173 	if (sp->stateid.bits.chgseq == id->bits.chgseq + 1)
3174 		return (NFS4_CHECK_STATEID_REPLAY);
3175 
3176 	/* Stateid is some time in the past - that's old */
3177 	if (sp->stateid.bits.chgseq > id->bits.chgseq)
3178 		return (NFS4_CHECK_STATEID_OLD);
3179 
3180 	/* Caller needs to know about confirmation before closure */
3181 	if (sp->owner->need_confirm)
3182 		return (NFS4_CHECK_STATEID_UNCONFIRMED);
3183 
3184 	if (sp->closed == TRUE)
3185 		return (NFS4_CHECK_STATEID_CLOSED);
3186 
3187 	return (NFS4_CHECK_STATEID_OKAY);
3188 }
3189 
3190 int
3191 rfs4_check_lo_stateid_seqid(rfs4_lo_state_t *lsp, stateid4 *stateid)
3192 {
3193 	stateid_t *id = (stateid_t *)stateid;
3194 
3195 	if (rfs4_lease_expired(lsp->state->owner->client))
3196 		return (NFS4_CHECK_STATEID_EXPIRED);
3197 
3198 	/* Stateid is some time in the future - that's bad */
3199 	if (lsp->lockid.bits.chgseq < id->bits.chgseq)
3200 		return (NFS4_CHECK_STATEID_BAD);
3201 
3202 	if (lsp->lockid.bits.chgseq == id->bits.chgseq + 1)
3203 		return (NFS4_CHECK_STATEID_REPLAY);
3204 
3205 	/* Stateid is some time in the past - that's old */
3206 	if (lsp->lockid.bits.chgseq > id->bits.chgseq)
3207 		return (NFS4_CHECK_STATEID_OLD);
3208 
3209 	return (NFS4_CHECK_STATEID_OKAY);
3210 }
3211 
3212 nfsstat4
3213 rfs4_get_deleg_state(stateid4 *stateid, rfs4_deleg_state_t **dspp)
3214 {
3215 	stateid_t *id = (stateid_t *)stateid;
3216 	rfs4_deleg_state_t *dsp;
3217 
3218 	*dspp = NULL;
3219 
3220 	/* If we are booted as a cluster node, was stateid locally generated? */
3221 	if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3222 		return (NFS4ERR_STALE_STATEID);
3223 
3224 	dsp = rfs4_finddelegstate(id);
3225 	if (dsp == NULL) {
3226 		return (what_stateid_error(id, DELEGID));
3227 	}
3228 
3229 	if (rfs4_lease_expired(dsp->client)) {
3230 		rfs4_deleg_state_rele(dsp);
3231 		return (NFS4ERR_EXPIRED);
3232 	}
3233 
3234 	*dspp = dsp;
3235 
3236 	return (NFS4_OK);
3237 }
3238 
3239 nfsstat4
3240 rfs4_get_lo_state(stateid4 *stateid, rfs4_lo_state_t **lspp, bool_t lock_fp)
3241 {
3242 	stateid_t *id = (stateid_t *)stateid;
3243 	rfs4_lo_state_t *lsp;
3244 
3245 	*lspp = NULL;
3246 
3247 	/* If we are booted as a cluster node, was stateid locally generated? */
3248 	if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3249 		return (NFS4ERR_STALE_STATEID);
3250 
3251 	lsp = rfs4_findlo_state(id, lock_fp);
3252 	if (lsp == NULL) {
3253 		return (what_stateid_error(id, LOCKID));
3254 	}
3255 
3256 	if (rfs4_lease_expired(lsp->state->owner->client)) {
3257 		rfs4_lo_state_rele(lsp, lock_fp);
3258 		return (NFS4ERR_EXPIRED);
3259 	}
3260 
3261 	*lspp = lsp;
3262 
3263 	return (NFS4_OK);
3264 }
3265 
3266 static nfsstat4
3267 rfs4_get_all_state(stateid4 *sid, rfs4_state_t **spp,
3268 	rfs4_deleg_state_t **dspp, rfs4_lo_state_t **lospp)
3269 {
3270 	rfs4_state_t *sp = NULL;
3271 	rfs4_deleg_state_t *dsp = NULL;
3272 	rfs4_lo_state_t *losp = NULL;
3273 	stateid_t *id;
3274 	nfsstat4 status;
3275 
3276 	*spp = NULL; *dspp = NULL; *lospp = NULL;
3277 
3278 	id = (stateid_t *)sid;
3279 	switch (id->bits.type) {
3280 	case OPENID:
3281 		status = rfs4_get_state_lockit(sid, &sp, FALSE, FALSE);
3282 		break;
3283 	case DELEGID:
3284 		status = rfs4_get_deleg_state(sid, &dsp);
3285 		break;
3286 	case LOCKID:
3287 		status = rfs4_get_lo_state(sid, &losp, FALSE);
3288 		if (status == NFS4_OK) {
3289 			sp = losp->state;
3290 			rfs4_dbe_hold(sp->dbe);
3291 		}
3292 		break;
3293 	default:
3294 		status = NFS4ERR_BAD_STATEID;
3295 	}
3296 
3297 	if (status == NFS4_OK) {
3298 		*spp = sp;
3299 		*dspp = dsp;
3300 		*lospp = losp;
3301 	}
3302 
3303 	return (status);
3304 }
3305 
3306 /*
3307  * Given the I/O mode (FREAD or FWRITE), this checks whether the
3308  * rfs4_state_t struct has access to do this operation and if so
3309  * return NFS4_OK; otherwise the proper NFSv4 error is returned.
3310  */
3311 nfsstat4
3312 rfs4_state_has_access(rfs4_state_t *sp, int mode, vnode_t *vp)
3313 {
3314 	nfsstat4 stat = NFS4_OK;
3315 	rfs4_file_t *fp;
3316 	bool_t create = FALSE;
3317 
3318 	rfs4_dbe_lock(sp->dbe);
3319 	if (mode == FWRITE) {
3320 		if (!(sp->share_access & OPEN4_SHARE_ACCESS_WRITE)) {
3321 			stat = NFS4ERR_OPENMODE;
3322 		}
3323 	} else if (mode == FREAD) {
3324 		if (!(sp->share_access & OPEN4_SHARE_ACCESS_READ)) {
3325 			/*
3326 			 * If we have OPENed the file with DENYing access
3327 			 * to both READ and WRITE then no one else could
3328 			 * have OPENed the file, hence no conflicting READ
3329 			 * deny.  This check is merely an optimization.
3330 			 */
3331 			if (sp->share_deny == OPEN4_SHARE_DENY_BOTH)
3332 				goto out;
3333 
3334 			/* Check against file struct's DENY mode */
3335 			fp = rfs4_findfile(vp, NULL, &create);
3336 			if (fp != NULL) {
3337 				int deny_read = 0;
3338 				rfs4_dbe_lock(fp->dbe);
3339 				/*
3340 				 * Check if any other open owner has the file
3341 				 * OPENed with deny READ.
3342 				 */
3343 				if (sp->share_deny & OPEN4_SHARE_DENY_READ)
3344 					deny_read = 1;
3345 				ASSERT(fp->deny_read - deny_read >= 0);
3346 				if (fp->deny_read - deny_read > 0)
3347 					stat = NFS4ERR_OPENMODE;
3348 				rfs4_dbe_unlock(fp->dbe);
3349 				rfs4_file_rele(fp);
3350 			}
3351 		}
3352 	} else {
3353 		/* Illegal I/O mode */
3354 		stat = NFS4ERR_INVAL;
3355 	}
3356 out:
3357 	rfs4_dbe_unlock(sp->dbe);
3358 	return (stat);
3359 }
3360 
3361 /*
3362  * Given the I/O mode (FREAD or FWRITE), the vnode, the stateid and whether
3363  * the file is being truncated, return NFS4_OK if allowed or approriate
3364  * V4 error if not. Note NFS4ERR_DELAY will be returned and a recall on
3365  * the associated file will be done if the I/O is not consistent with any
3366  * delegation in effect on the file. Should be holding VOP_RWLOCK, either
3367  * as reader or writer as appropriate. rfs4_op_open will accquire the
3368  * VOP_RWLOCK as writer when setting up delegation. If the stateid is bad
3369  * this routine will return NFS4ERR_BAD_STATEID. In addition, through the
3370  * deleg parameter, we will return whether a write delegation is held by
3371  * the client associated with this stateid.
3372  * If the server instance associated with the relevant client is in its
3373  * grace period, return NFS4ERR_GRACE.
3374  */
3375 
3376 nfsstat4
3377 rfs4_check_stateid(int mode, vnode_t *vp,
3378 		stateid4 *stateid, bool_t trunc, bool_t *deleg,
3379 		bool_t do_access)
3380 {
3381 	rfs4_file_t *fp;
3382 	bool_t create = FALSE;
3383 	rfs4_state_t *sp;
3384 	rfs4_deleg_state_t *dsp;
3385 	rfs4_lo_state_t *lsp;
3386 	stateid_t *id = (stateid_t *)stateid;
3387 	nfsstat4 stat = NFS4_OK;
3388 
3389 	if (ISSPECIAL(stateid)) {
3390 		fp = rfs4_findfile(vp, NULL, &create);
3391 		if (fp == NULL)
3392 			return (NFS4_OK);
3393 		if (fp->dinfo->dtype == OPEN_DELEGATE_NONE) {
3394 			rfs4_file_rele(fp);
3395 			return (NFS4_OK);
3396 		}
3397 		if (mode == FWRITE ||
3398 			fp->dinfo->dtype == OPEN_DELEGATE_WRITE) {
3399 			rfs4_recall_deleg(fp, trunc, NULL);
3400 			rfs4_file_rele(fp);
3401 			return (NFS4ERR_DELAY);
3402 		}
3403 		rfs4_file_rele(fp);
3404 		return (NFS4_OK);
3405 	} else {
3406 		stat = rfs4_get_all_state(stateid, &sp, &dsp, &lsp);
3407 		if (stat != NFS4_OK)
3408 			return (stat);
3409 		if (lsp != NULL) {
3410 			/* Is associated server instance in its grace period? */
3411 			if (rfs4_clnt_in_grace(lsp->locker->client)) {
3412 				rfs4_lo_state_rele(lsp, FALSE);
3413 				if (sp != NULL)
3414 					rfs4_state_rele_nounlock(sp);
3415 				return (NFS4ERR_GRACE);
3416 			}
3417 			if (id->bits.type == LOCKID) {
3418 				/* Seqid in the future? - that's bad */
3419 				if (lsp->lockid.bits.chgseq <
3420 					id->bits.chgseq) {
3421 					rfs4_lo_state_rele(lsp, FALSE);
3422 					if (sp != NULL)
3423 						rfs4_state_rele_nounlock(sp);
3424 					return (NFS4ERR_BAD_STATEID);
3425 				}
3426 				/* Seqid in the past? - that's old */
3427 				if (lsp->lockid.bits.chgseq >
3428 					id->bits.chgseq) {
3429 					rfs4_lo_state_rele(lsp, FALSE);
3430 					if (sp != NULL)
3431 						rfs4_state_rele_nounlock(sp);
3432 					return (NFS4ERR_OLD_STATEID);
3433 				}
3434 				/* Ensure specified filehandle matches */
3435 				if (lsp->state->finfo->vp != vp) {
3436 					rfs4_lo_state_rele(lsp, FALSE);
3437 					if (sp != NULL)
3438 						rfs4_state_rele_nounlock(sp);
3439 					return (NFS4ERR_BAD_STATEID);
3440 				}
3441 			}
3442 			rfs4_lo_state_rele(lsp, FALSE);
3443 		}
3444 
3445 		/* Stateid provided was an "open" stateid */
3446 		if (sp != NULL) {
3447 			/* Is associated server instance in its grace period? */
3448 			if (rfs4_clnt_in_grace(sp->owner->client)) {
3449 				rfs4_state_rele_nounlock(sp);
3450 				return (NFS4ERR_GRACE);
3451 			}
3452 			if (id->bits.type == OPENID) {
3453 				/* Seqid in the future? - that's bad */
3454 				if (sp->stateid.bits.chgseq <
3455 					id->bits.chgseq) {
3456 					rfs4_state_rele_nounlock(sp);
3457 					return (NFS4ERR_BAD_STATEID);
3458 				}
3459 				/* Seqid in the past - that's old */
3460 				if (sp->stateid.bits.chgseq >
3461 					id->bits.chgseq) {
3462 					rfs4_state_rele_nounlock(sp);
3463 					return (NFS4ERR_OLD_STATEID);
3464 				}
3465 			}
3466 			/* Ensure specified filehandle matches */
3467 			if (sp->finfo->vp != vp) {
3468 				rfs4_state_rele_nounlock(sp);
3469 				return (NFS4ERR_BAD_STATEID);
3470 			}
3471 
3472 			if (sp->owner->need_confirm) {
3473 				rfs4_state_rele_nounlock(sp);
3474 				return (NFS4ERR_BAD_STATEID);
3475 			}
3476 
3477 			if (sp->closed == TRUE) {
3478 				rfs4_state_rele_nounlock(sp);
3479 				return (NFS4ERR_OLD_STATEID);
3480 			}
3481 
3482 			if (do_access)
3483 				stat = rfs4_state_has_access(sp, mode, vp);
3484 			else
3485 				stat = NFS4_OK;
3486 
3487 			/*
3488 			 * Return whether this state has write
3489 			 * delegation if desired
3490 			 */
3491 			if (deleg &&
3492 			    (sp->finfo->dinfo->dtype == OPEN_DELEGATE_WRITE))
3493 				*deleg = TRUE;
3494 
3495 			/*
3496 			 * We got a valid stateid, so we update the
3497 			 * lease on the client. Ideally we would like
3498 			 * to do this after the calling op succeeds,
3499 			 * but for now this will be good
3500 			 * enough. Callers of this routine are
3501 			 * currently insulated from the state stuff.
3502 			 */
3503 			rfs4_update_lease(sp->owner->client);
3504 
3505 			/*
3506 			 * If a delegation is present on this file and
3507 			 * this is a WRITE, then update the lastwrite
3508 			 * time to indicate that activity is present.
3509 			 */
3510 			if (sp->finfo->dinfo->dtype == OPEN_DELEGATE_WRITE &&
3511 				mode == FWRITE) {
3512 				sp->finfo->dinfo->time_lastwrite =
3513 					gethrestime_sec();
3514 			}
3515 
3516 			rfs4_state_rele_nounlock(sp);
3517 
3518 			return (stat);
3519 		}
3520 
3521 		if (dsp != NULL) {
3522 			/* Is associated server instance in its grace period? */
3523 			if (rfs4_clnt_in_grace(dsp->client)) {
3524 				rfs4_deleg_state_rele(dsp);
3525 				return (NFS4ERR_GRACE);
3526 			}
3527 			if (dsp->delegid.bits.chgseq !=	id->bits.chgseq) {
3528 				rfs4_deleg_state_rele(dsp);
3529 				return (NFS4ERR_BAD_STATEID);
3530 			}
3531 
3532 			/* Ensure specified filehandle matches */
3533 			if (dsp->finfo->vp != vp) {
3534 				rfs4_deleg_state_rele(dsp);
3535 				return (NFS4ERR_BAD_STATEID);
3536 			}
3537 			/*
3538 			 * Return whether this state has write
3539 			 * delegation if desired
3540 			 */
3541 			if (deleg &&
3542 			    (dsp->finfo->dinfo->dtype == OPEN_DELEGATE_WRITE))
3543 				*deleg = TRUE;
3544 
3545 			rfs4_update_lease(dsp->client);
3546 
3547 			/*
3548 			 * If a delegation is present on this file and
3549 			 * this is a WRITE, then update the lastwrite
3550 			 * time to indicate that activity is present.
3551 			 */
3552 			if (dsp->finfo->dinfo->dtype == OPEN_DELEGATE_WRITE &&
3553 				mode == FWRITE) {
3554 				dsp->finfo->dinfo->time_lastwrite =
3555 					gethrestime_sec();
3556 			}
3557 
3558 			/*
3559 			 * XXX - what happens if this is a WRITE and the
3560 			 * delegation type of for READ.
3561 			 */
3562 			rfs4_deleg_state_rele(dsp);
3563 
3564 			return (stat);
3565 		}
3566 		/*
3567 		 * If we got this far, something bad happened
3568 		 */
3569 		return (NFS4ERR_BAD_STATEID);
3570 	}
3571 }
3572 
3573 
3574 /*
3575  * This is a special function in that for the file struct provided the
3576  * server wants to remove/close all current state associated with the
3577  * file.  The prime use of this would be with OP_REMOVE to force the
3578  * release of state and particularly of file locks.
3579  *
3580  * There is an assumption that there is no delegations outstanding on
3581  * this file at this point.  The caller should have waited for those
3582  * to be returned or revoked.
3583  */
3584 void
3585 rfs4_close_all_state(rfs4_file_t *fp)
3586 {
3587 	rfs4_state_t *sp;
3588 
3589 	rfs4_dbe_lock(fp->dbe);
3590 
3591 #ifdef DEBUG
3592 	/* only applies when server is handing out delegations */
3593 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE)
3594 		ASSERT(fp->dinfo->hold_grant > 0);
3595 #endif
3596 
3597 	/* No delegations for this file */
3598 	ASSERT(fp->delegationlist.next == &fp->delegationlist);
3599 
3600 	/* Make sure that it can not be found */
3601 	rfs4_dbe_invalidate(fp->dbe);
3602 
3603 	if (fp->vp == NULL) {
3604 		rfs4_dbe_unlock(fp->dbe);
3605 		return;
3606 	}
3607 	rfs4_dbe_unlock(fp->dbe);
3608 
3609 	/*
3610 	 * Hold as writer to prevent other server threads from
3611 	 * processing requests related to the file while all state is
3612 	 * being removed.
3613 	 */
3614 	rw_enter(&fp->file_rwlock, RW_WRITER);
3615 
3616 	/* Remove ALL state from the file */
3617 	while (sp = rfs4_findstate_by_file(fp)) {
3618 		rfs4_state_close(sp, FALSE, FALSE, CRED());
3619 		rfs4_state_rele_nounlock(sp);
3620 	}
3621 
3622 	/*
3623 	 * This is only safe since there are no further references to
3624 	 * the file.
3625 	 */
3626 	rfs4_dbe_lock(fp->dbe);
3627 	if (fp->vp) {
3628 		VN_RELE(fp->vp);
3629 		fp->vp = NULL;
3630 	}
3631 	rfs4_dbe_unlock(fp->dbe);
3632 
3633 	/* Finally let other references to proceed */
3634 	rw_exit(&fp->file_rwlock);
3635 }
3636 
3637 /*
3638  * This function is used as a target for the rfs4_dbe_walk() call
3639  * below.  The purpose of this function is to see if the
3640  * lockowner_state refers to a file that resides within the exportinfo
3641  * export.  If so, then remove the lock_owner state (file locks and
3642  * share "locks") for this object since the intent is the server is
3643  * unexporting the specified directory.  Be sure to invalidate the
3644  * object after the state has been released
3645  */
3646 static void
3647 rfs4_lo_state_walk_callout(rfs4_entry_t u_entry, void *e)
3648 {
3649 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
3650 	struct exportinfo *exi = (struct exportinfo *)e;
3651 	nfs_fh4_fmt_t   fhfmt4, *exi_fhp, *finfo_fhp;
3652 	fhandle_t *efhp;
3653 
3654 	efhp = (fhandle_t *)&exi->exi_fh;
3655 	exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
3656 
3657 	FH_TO_FMT4(efhp, exi_fhp);
3658 
3659 	finfo_fhp =
3660 		(nfs_fh4_fmt_t *)lsp->state->finfo->filehandle.nfs_fh4_val;
3661 
3662 	if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3663 	    bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3664 		exi_fhp->fh4_xlen) == 0) {
3665 		rfs4_state_close(lsp->state, FALSE, FALSE, CRED());
3666 		rfs4_dbe_invalidate(lsp->dbe);
3667 		rfs4_dbe_invalidate(lsp->state->dbe);
3668 	}
3669 }
3670 
3671 /*
3672  * This function is used as a target for the rfs4_dbe_walk() call
3673  * below.  The purpose of this function is to see if the state refers
3674  * to a file that resides within the exportinfo export.  If so, then
3675  * remove the open state for this object since the intent is the
3676  * server is unexporting the specified directory.  The main result for
3677  * this type of entry is to invalidate it such it will not be found in
3678  * the future.
3679  */
3680 static void
3681 rfs4_state_walk_callout(rfs4_entry_t u_entry, void *e)
3682 {
3683 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3684 	struct exportinfo *exi = (struct exportinfo *)e;
3685 	nfs_fh4_fmt_t   fhfmt4, *exi_fhp, *finfo_fhp;
3686 	fhandle_t *efhp;
3687 
3688 	efhp = (fhandle_t *)&exi->exi_fh;
3689 	exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
3690 
3691 	FH_TO_FMT4(efhp, exi_fhp);
3692 
3693 	finfo_fhp =
3694 		(nfs_fh4_fmt_t *)sp->finfo->filehandle.nfs_fh4_val;
3695 
3696 	if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3697 	    bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3698 		exi_fhp->fh4_xlen) == 0) {
3699 		rfs4_state_close(sp, TRUE, FALSE, CRED());
3700 		rfs4_dbe_invalidate(sp->dbe);
3701 	}
3702 }
3703 
3704 /*
3705  * This function is used as a target for the rfs4_dbe_walk() call
3706  * below.  The purpose of this function is to see if the state refers
3707  * to a file that resides within the exportinfo export.  If so, then
3708  * remove the deleg state for this object since the intent is the
3709  * server is unexporting the specified directory.  The main result for
3710  * this type of entry is to invalidate it such it will not be found in
3711  * the future.
3712  */
3713 static void
3714 rfs4_deleg_state_walk_callout(rfs4_entry_t u_entry, void *e)
3715 {
3716 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
3717 	struct exportinfo *exi = (struct exportinfo *)e;
3718 	nfs_fh4_fmt_t   fhfmt4, *exi_fhp, *finfo_fhp;
3719 	fhandle_t *efhp;
3720 
3721 	efhp = (fhandle_t *)&exi->exi_fh;
3722 	exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
3723 
3724 	FH_TO_FMT4(efhp, exi_fhp);
3725 
3726 	finfo_fhp =
3727 		(nfs_fh4_fmt_t *)dsp->finfo->filehandle.nfs_fh4_val;
3728 
3729 	if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3730 	    bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3731 		exi_fhp->fh4_xlen) == 0) {
3732 		rfs4_dbe_invalidate(dsp->dbe);
3733 	}
3734 }
3735 
3736 /*
3737  * This function is used as a target for the rfs4_dbe_walk() call
3738  * below.  The purpose of this function is to see if the state refers
3739  * to a file that resides within the exportinfo export.  If so, then
3740  * release vnode hold for this object since the intent is the server
3741  * is unexporting the specified directory.  Invalidation will prevent
3742  * this struct from being found in the future.
3743  */
3744 static void
3745 rfs4_file_walk_callout(rfs4_entry_t u_entry, void *e)
3746 {
3747 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
3748 	struct exportinfo *exi = (struct exportinfo *)e;
3749 	nfs_fh4_fmt_t   fhfmt4, *exi_fhp, *finfo_fhp;
3750 	fhandle_t *efhp;
3751 
3752 	efhp = (fhandle_t *)&exi->exi_fh;
3753 	exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
3754 
3755 	FH_TO_FMT4(efhp, exi_fhp);
3756 
3757 	finfo_fhp = (nfs_fh4_fmt_t *)fp->filehandle.nfs_fh4_val;
3758 
3759 	if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3760 	    bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3761 		exi_fhp->fh4_xlen) == 0) {
3762 		if (fp->vp) {
3763 			/* don't leak monitors */
3764 			if (fp->dinfo->dtype == OPEN_DELEGATE_READ)
3765 				(void) fem_uninstall(fp->vp, deleg_rdops,
3766 						(void *)fp);
3767 			else if (fp->dinfo->dtype == OPEN_DELEGATE_WRITE)
3768 				(void) fem_uninstall(fp->vp, deleg_wrops,
3769 						(void *)fp);
3770 			VN_RELE(fp->vp);
3771 			fp->vp = NULL;
3772 		}
3773 		rfs4_dbe_invalidate(fp->dbe);
3774 	}
3775 }
3776 
3777 /*
3778  * Given a directory that is being unexported, cleanup/release all
3779  * state in the server that refers to objects residing underneath this
3780  * particular export.  The ordering of the release is important.
3781  * Lock_owner, then state and then file.
3782  */
3783 void
3784 rfs4_clean_state_exi(struct exportinfo *exi)
3785 {
3786 	mutex_enter(&rfs4_state_lock);
3787 
3788 	if (rfs4_server_state == NULL) {
3789 		mutex_exit(&rfs4_state_lock);
3790 		return;
3791 	}
3792 
3793 	rfs4_dbe_walk(rfs4_lo_state_tab, rfs4_lo_state_walk_callout, exi);
3794 	rfs4_dbe_walk(rfs4_state_tab, rfs4_state_walk_callout, exi);
3795 	rfs4_dbe_walk(rfs4_deleg_state_tab, rfs4_deleg_state_walk_callout, exi);
3796 	rfs4_dbe_walk(rfs4_file_tab, rfs4_file_walk_callout, exi);
3797 
3798 	mutex_exit(&rfs4_state_lock);
3799 }
3800