xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_state.c (revision bdfc6d18da790deeec2e0eb09c625902defe2498)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/systm.h>
30 #include <sys/kmem.h>
31 #include <sys/cmn_err.h>
32 #include <sys/atomic.h>
33 #include <sys/clconf.h>
34 #include <sys/cladm.h>
35 #include <sys/flock.h>
36 #include <nfs/export.h>
37 #include <nfs/nfs.h>
38 #include <nfs/nfs4.h>
39 #include <nfs/nfssys.h>
40 #include <nfs/lm.h>
41 #include <sys/pathname.h>
42 
43 
44 
45 extern time_t rfs4_start_time;
46 
/*
 * special0 and special1 are the two constant stateids that ISSPECIAL()
 * compares against: special0 is initialized to all zeroes, special1 to
 * 0xffffffff followed by twelve 0xff bytes.
 */
47 stateid4 special0 = {
48 	0,
49 	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
50 };
51 
52 stateid4 special1 = {
53 	0xffffffff,
54 	{
55 		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
56 		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
57 		(char)0xff, (char)0xff, (char)0xff, (char)0xff
58 	}
59 };
60 
61 
/*
 * Non-zero when stateid4_cmp() yields a non-zero result for id against
 * either special0 or special1.
 * NOTE(review): presumably stateid4_cmp() returns non-zero on a match;
 * it is defined outside this section -- confirm.
 */
62 #define	ISSPECIAL(id)  (stateid4_cmp(id, &special0) || \
63 			stateid4_cmp(id, &special1))
64 
65 /* For embedding the cluster nodeid into our clientid */
66 #define	CLUSTER_NODEID_SHIFT	24
67 #define	CLUSTER_MAX_NODEID	255
68 
/* Debug knob; only compiled into DEBUG kernels */
69 #ifdef DEBUG
70 int rfs4_debug;
71 #endif
72 
/* Handed to rfs4_database_create() when the state database is created */
73 static uint32_t rfs4_database_debug = 0x00;
74 
75 /*
76  * Couple of simple init/destroy functions for a general waiter
77  */
78 void
79 rfs4_sw_init(rfs4_state_wait_t *swp)
80 {
81 	mutex_init(swp->sw_cv_lock, NULL, MUTEX_DEFAULT, NULL);
82 	cv_init(swp->sw_cv, NULL, CV_DEFAULT, NULL);
83 	swp->sw_active = FALSE;
84 	swp->sw_wait_count = 0;
85 }
86 
87 void
88 rfs4_sw_destroy(rfs4_state_wait_t *swp)
89 {
90 	mutex_destroy(swp->sw_cv_lock);
91 	cv_destroy(swp->sw_cv);
92 }
93 
94 void
95 rfs4_sw_enter(rfs4_state_wait_t *swp)
96 {
97 	mutex_enter(swp->sw_cv_lock);
98 	while (swp->sw_active) {
99 		swp->sw_wait_count++;
100 		cv_wait(swp->sw_cv, swp->sw_cv_lock);
101 		swp->sw_wait_count--;
102 	}
103 	ASSERT(swp->sw_active == FALSE);
104 	swp->sw_active = TRUE;
105 	mutex_exit(swp->sw_cv_lock);
106 }
107 
108 void
109 rfs4_sw_exit(rfs4_state_wait_t *swp)
110 {
111 	mutex_enter(swp->sw_cv_lock);
112 	ASSERT(swp->sw_active == TRUE);
113 	swp->sw_active = FALSE;
114 	if (swp->sw_wait_count != 0)
115 		cv_broadcast(swp->sw_cv);
116 	mutex_exit(swp->sw_cv_lock);
117 }
118 
119 /*
120  * CPR callback id -- not related to v4 callbacks.
 * Assigned from callb_add() in rfs4_state_init(); 0 until then.
121  */
122 static callb_id_t cpr_id = 0;
123 
124 static void
125 deep_lock_copy(LOCK4res *dres, LOCK4res *sres)
126 {
127 	lock_owner4 *slo = &sres->LOCK4res_u.denied.owner;
128 	lock_owner4 *dlo = &dres->LOCK4res_u.denied.owner;
129 
130 	if (sres->status == NFS4ERR_DENIED) {
131 		dlo->owner_val = kmem_alloc(slo->owner_len, KM_SLEEP);
132 		bcopy(slo->owner_val, dlo->owner_val, slo->owner_len);
133 	}
134 }
135 
136 static void
137 deep_lock_free(LOCK4res *res)
138 {
139 	lock_owner4 *lo = &res->LOCK4res_u.denied.owner;
140 
141 	if (res->status == NFS4ERR_DENIED)
142 		kmem_free(lo->owner_val, lo->owner_len);
143 }
144 
145 static void
146 deep_open_copy(OPEN4res *dres, OPEN4res *sres)
147 {
148 	nfsace4 *sacep, *dacep;
149 
150 	if (sres->status != NFS4_OK) {
151 		return;
152 	}
153 
154 	dres->attrset = sres->attrset;
155 
156 	switch (sres->delegation.delegation_type) {
157 	case OPEN_DELEGATE_NONE:
158 		return;
159 	case OPEN_DELEGATE_READ:
160 		sacep = &sres->delegation.open_delegation4_u.read.permissions;
161 		dacep = &dres->delegation.open_delegation4_u.read.permissions;
162 		break;
163 	case OPEN_DELEGATE_WRITE:
164 		sacep = &sres->delegation.open_delegation4_u.write.permissions;
165 		dacep = &dres->delegation.open_delegation4_u.write.permissions;
166 		break;
167 	}
168 	dacep->who.utf8string_val =
169 		kmem_alloc(sacep->who.utf8string_len, KM_SLEEP);
170 	bcopy(sacep->who.utf8string_val, dacep->who.utf8string_val,
171 	    sacep->who.utf8string_len);
172 }
173 
174 static void
175 deep_open_free(OPEN4res *res)
176 {
177 	nfsace4 *acep;
178 	if (res->status != NFS4_OK)
179 		return;
180 
181 	switch (res->delegation.delegation_type) {
182 	case OPEN_DELEGATE_NONE:
183 		return;
184 	case OPEN_DELEGATE_READ:
185 		acep = &res->delegation.open_delegation4_u.read.permissions;
186 		break;
187 	case OPEN_DELEGATE_WRITE:
188 		acep = &res->delegation.open_delegation4_u.write.permissions;
189 		break;
190 	}
191 
192 	if (acep->who.utf8string_val) {
193 		kmem_free(acep->who.utf8string_val, acep->who.utf8string_len);
194 		acep->who.utf8string_val = NULL;
195 	}
196 }
197 
198 void
199 rfs4_free_reply(nfs_resop4 *rp)
200 {
201 	switch (rp->resop) {
202 	case OP_LOCK:
203 		deep_lock_free(&rp->nfs_resop4_u.oplock);
204 		break;
205 	case OP_OPEN:
206 		deep_open_free(&rp->nfs_resop4_u.opopen);
207 	default:
208 		break;
209 	}
210 }
211 
212 void
213 rfs4_copy_reply(nfs_resop4 *dst, nfs_resop4 *src)
214 {
215 	*dst = *src;
216 
217 	/* Handle responses that need deep copy */
218 	switch (src->resop) {
219 	case OP_LOCK:
220 		deep_lock_copy(&dst->nfs_resop4_u.oplock,
221 			    &src->nfs_resop4_u.oplock);
222 		break;
223 	case OP_OPEN:
224 		deep_open_copy(&dst->nfs_resop4_u.opopen,
225 			    &src->nfs_resop4_u.opopen);
226 		break;
227 	default:
228 		break;
229 	};
230 }
231 
232 /*
233  * This is the implementation of the underlying state engine. The
234  * public interface to this engine is described by
235  * nfs4_state.h. Callers to the engine should hold no state engine
236  * locks when they call in to it. If the protocol needs to lock data
237  * structures it should do so after acquiring all references to them
238  * first and then follow the following lock order:
239  *
240  *	client > openowner > state > lo_state > lockowner > file.
241  *
242  * Internally we only allow a thread to hold one hash bucket lock at a
243  * time and the lock is higher in the lock order (must be acquired
244  * first) than the data structure that is on that hash list.
245  *
246  * If a new reference was acquired by the caller, that reference needs
247  * to be released after releasing all acquired locks with the
248  * corresponding rfs4_*_rele routine.
249  */
250 
251 /*
252  * This code is somewhat prototypical for now. Its purpose currently is to
253  * implement the interfaces sufficiently to finish the higher protocol
254  * elements. This will be replaced by dynamically resizable tables
255  * backed by the kmem_cache allocator. However, synchronization is handled
256  * correctly (I hope) and will not change by much.  The mutexes for
257  * the hash buckets that can be used to create new instances of data
258  * structures  might be good candidates to evolve into reader writer
259  * locks. If it has to do a creation, it would be holding the
260  * mutex across a kmem_alloc with KM_SLEEP specified.
261  */
262 
/*
 * TABSIZE is passed to rfs4_table_create() when the tables are built in
 * rfs4_state_init(); DEBUG kernels use a much smaller value.
 */
263 #ifdef DEBUG
264 #define	TABSIZE 17
265 #else
266 #define	TABSIZE 2047
267 #endif
268 
/* Turn a pointer-like key into an integer, discarding its low 3 bits */
269 #define	ADDRHASH(key) ((unsigned long)(key) >> 3)
270 
271 /* Used to serialize create/destroy of rfs4_server_state database */
272 kmutex_t	rfs4_state_lock;
273 static rfs4_database_t *rfs4_server_state = NULL;
274 
275 /* Used to serialize lookups of clientids */
276 static	krwlock_t	rfs4_findclient_lock;
277 
278 /*
279  * For now this "table" is exposed so that the CPR callback
280  * function can tromp through it..
281  */
282 rfs4_table_t *rfs4_client_tab;
283 
/*
 * Tables of the state database and the indices built over them.  The
 * client table and its two indices are created in rfs4_state_init();
 * the others are presumably created there as well, past the end of
 * this section -- confirm.
 */
284 static rfs4_index_t *rfs4_clientid_idx;
285 static rfs4_index_t *rfs4_nfsclnt_idx;
286 static rfs4_table_t *rfs4_openowner_tab;
287 static rfs4_index_t *rfs4_openowner_idx;
288 static rfs4_table_t *rfs4_state_tab;
289 static rfs4_index_t *rfs4_state_idx;
290 static rfs4_index_t *rfs4_state_owner_file_idx;
291 static rfs4_index_t *rfs4_state_file_idx;
292 static rfs4_table_t *rfs4_lo_state_tab;
293 static rfs4_index_t *rfs4_lo_state_idx;
294 static rfs4_index_t *rfs4_lo_state_owner_idx;
295 static rfs4_table_t *rfs4_lockowner_tab;
296 static rfs4_index_t *rfs4_lockowner_idx;
297 static rfs4_index_t *rfs4_lockowner_pid_idx;
298 static rfs4_table_t *rfs4_file_tab;
299 static rfs4_index_t *rfs4_file_idx;
300 static rfs4_table_t *rfs4_deleg_state_tab;
301 static rfs4_index_t *rfs4_deleg_idx;
302 static rfs4_index_t *rfs4_deleg_state_idx;
303 
/*
 * Upper bound used when sizing the tables (e.g. MAXTABSZ/8 for the
 * client table).  Parenthesized so the macro expands as a single value
 * inside any surrounding expression.
 */
#define	MAXTABSZ	(1024 * 1024)
305 
306 /* The values below are rfs4_lease_time units */
307 
308 #ifdef DEBUG
309 #define	CLIENT_CACHE_TIME 1
310 #define	OPENOWNER_CACHE_TIME 1
311 #define	STATE_CACHE_TIME 1
312 #define	LO_STATE_CACHE_TIME 1
313 #define	LOCKOWNER_CACHE_TIME 1
314 #define	FILE_CACHE_TIME 3
315 #define	DELEG_STATE_CACHE_TIME 1
316 #else
317 #define	CLIENT_CACHE_TIME 10
318 #define	OPENOWNER_CACHE_TIME 5
319 #define	STATE_CACHE_TIME 1
320 #define	LO_STATE_CACHE_TIME 1
321 #define	LOCKOWNER_CACHE_TIME 3
322 #define	FILE_CACHE_TIME 40
323 #define	DELEG_STATE_CACHE_TIME 1
324 #endif
325 
326 
/*
 * Per-table cache times.  A value still 0 when rfs4_state_init() runs
 * is replaced there by the matching *_CACHE_TIME default above; the
 * client value is then multiplied by rfs4_lease_time before being
 * passed to rfs4_table_create().
 */
327 static time_t rfs4_client_cache_time = 0;
328 static time_t rfs4_openowner_cache_time = 0;
329 static time_t rfs4_state_cache_time = 0;
330 static time_t rfs4_lo_state_cache_time = 0;
331 static time_t rfs4_lockowner_cache_time = 0;
332 static time_t rfs4_file_cache_time = 0;
333 static time_t rfs4_deleg_state_cache_time = 0;
334 
/*
 * Forward declarations of the per-table callbacks.  For the client
 * table, rfs4_state_init() hands the create/destroy/expiry routines to
 * rfs4_table_create() and the hash/compare/mkkey routines to
 * rfs4_index_create().  The remaining groups follow the same naming
 * pattern and are presumably wired up the same way -- confirm against
 * the rest of rfs4_state_init().
 */
/* client table */
335 static bool_t rfs4_client_create(rfs4_entry_t, void *);
336 static void rfs4_client_destroy(rfs4_entry_t);
337 static bool_t rfs4_client_expiry(rfs4_entry_t);
338 static uint32_t clientid_hash(void *);
339 static bool_t clientid_compare(rfs4_entry_t, void *);
340 static void *clientid_mkkey(rfs4_entry_t);
341 static uint32_t nfsclnt_hash(void *);
342 static bool_t nfsclnt_compare(rfs4_entry_t, void *);
343 static void *nfsclnt_mkkey(rfs4_entry_t);
/* openowner table */
344 static bool_t rfs4_openowner_create(rfs4_entry_t, void *);
345 static void rfs4_openowner_destroy(rfs4_entry_t);
346 static bool_t rfs4_openowner_expiry(rfs4_entry_t);
347 static uint32_t openowner_hash(void *);
348 static bool_t openowner_compare(rfs4_entry_t, void *);
349 static void *openowner_mkkey(rfs4_entry_t);
/* state table */
350 static bool_t rfs4_state_create(rfs4_entry_t, void *);
351 static void rfs4_state_destroy(rfs4_entry_t);
352 static bool_t rfs4_state_expiry(rfs4_entry_t);
353 static uint32_t state_hash(void *);
354 static bool_t state_compare(rfs4_entry_t, void *);
355 static void *state_mkkey(rfs4_entry_t);
356 static uint32_t state_owner_file_hash(void *);
357 static bool_t state_owner_file_compare(rfs4_entry_t, void *);
358 static void *state_owner_file_mkkey(rfs4_entry_t);
359 static uint32_t state_file_hash(void *);
360 static bool_t state_file_compare(rfs4_entry_t, void *);
361 static void *state_file_mkkey(rfs4_entry_t);
/* lo_state table */
362 static bool_t rfs4_lo_state_create(rfs4_entry_t, void *);
363 static void rfs4_lo_state_destroy(rfs4_entry_t);
364 static bool_t rfs4_lo_state_expiry(rfs4_entry_t);
365 static uint32_t lo_state_hash(void *);
366 static bool_t lo_state_compare(rfs4_entry_t, void *);
367 static void *lo_state_mkkey(rfs4_entry_t);
368 static uint32_t lo_state_lo_hash(void *);
369 static bool_t lo_state_lo_compare(rfs4_entry_t, void *);
370 static void *lo_state_lo_mkkey(rfs4_entry_t);
/* lockowner table */
371 static bool_t rfs4_lockowner_create(rfs4_entry_t, void *);
372 static void rfs4_lockowner_destroy(rfs4_entry_t);
373 static bool_t rfs4_lockowner_expiry(rfs4_entry_t);
374 static uint32_t lockowner_hash(void *);
375 static bool_t lockowner_compare(rfs4_entry_t, void *);
376 static void *lockowner_mkkey(rfs4_entry_t);
377 static uint32_t pid_hash(void *);
378 static bool_t pid_compare(rfs4_entry_t, void *);
379 static void *pid_mkkey(rfs4_entry_t);
/* file table (no expiry routine is declared for it) */
380 static bool_t rfs4_file_create(rfs4_entry_t, void *);
381 static void rfs4_file_destroy(rfs4_entry_t);
382 static uint32_t file_hash(void *);
383 static bool_t file_compare(rfs4_entry_t, void *);
384 static void *file_mkkey(rfs4_entry_t);
/* delegation state table */
385 static bool_t rfs4_deleg_state_create(rfs4_entry_t, void *);
386 static void rfs4_deleg_state_destroy(rfs4_entry_t);
387 static bool_t rfs4_deleg_state_expiry(rfs4_entry_t);
388 static uint32_t deleg_hash(void *);
389 static bool_t deleg_compare(rfs4_entry_t, void *);
390 static void *deleg_mkkey(rfs4_entry_t);
391 static uint32_t deleg_state_hash(void *);
392 static bool_t deleg_state_compare(rfs4_entry_t, void *);
393 static void *deleg_state_mkkey(rfs4_entry_t);
394 
395 static void rfs4_state_rele_nounlock(rfs4_state_t *);
396 
/*
 * Stable storage ("ss") bookkeeping.
 *
 * rfs4_oldstate is the head of a circular queue of client records read
 * back from disk by rfs4_ss_oldstate(); it is NULL when no such state
 * exists.  rfs4_oldstate_lock guards it in rfs4_ss_chkclid().
 * rfs4_ss_enabled is set to 1 by rfs4_ss_init() once the directories
 * below have been validated; rfs4_ss_clid() writes nothing while it
 * is 0.
 */
397 static rfs4_oldstate_t *rfs4_oldstate = NULL;
398 static krwlock_t rfs4_oldstate_lock;
399 static int rfs4_ss_enabled = 0;
400 
/* Directories holding the per-client state files, and their mode */
401 #define	NFS4_VAR_DIR		"/var/nfs"
402 #define	NFS4_STATE_DIR 		NFS4_VAR_DIR"/v4_state"
403 #define	NFS4_OLDSTATE_DIR 	NFS4_VAR_DIR"/v4_oldstate"
404 #define	NFS4_SS_DIR_MODE	0755
405 
406 extern void (*rfs4_client_clrst)(struct nfs4clrst_args *);
407 
408 void
409 rfs4_ss_pnfree(rfs4_ss_pn_t *ss_pn)
410 {
411 	kmem_free(ss_pn, sizeof (rfs4_ss_pn_t));
412 }
413 
414 /*
415  * Free all malloced rsf4_oldstate_t memory
416  */
417 void
418 rfs4_oldstate_free(rfs4_oldstate_t *ros)
419 {
420 	if (ros == NULL)
421 		return;
422 
423 	if (ros->cl_id4.id_val)
424 		kmem_free(ros->cl_id4.id_val, ros->cl_id4.id_len);
425 
426 	if (ros->ss_pn)
427 		kmem_free(ros->ss_pn, sizeof (rfs4_ss_pn_t));
428 
429 	kmem_free(ros, sizeof (rfs4_oldstate_t));
430 }
431 
432 static rfs4_ss_pn_t *
433 rfs4_ss_pnalloc(char *dir, char *leaf)
434 {
435 	rfs4_ss_pn_t *ss_pn;
436 	int 	dir_len, leaf_len;
437 
438 	/*
439 	 * validate we have a resonable path
440 	 * (account for the '/' and trailing null)
441 	 */
442 	if ((dir_len = strlen(dir)) > MAXPATHLEN ||
443 		(leaf_len = strlen(leaf)) > MAXNAMELEN ||
444 		(dir_len + leaf_len + 2) > MAXPATHLEN) {
445 		return (NULL);
446 	}
447 
448 	ss_pn = kmem_alloc(sizeof (rfs4_ss_pn_t), KM_SLEEP);
449 
450 	(void) snprintf(ss_pn->pn, MAXPATHLEN, "%s/%s", dir, leaf);
451 	/* Handy pointer to just the leaf name */
452 	ss_pn->leaf = ss_pn->pn + dir_len + 1;
453 	return (ss_pn);
454 }
455 
456 
457 /*
458  * Move the "leaf" filename from "sdir" directory
459  * to the "ddir" directory. Return the pathname of
460  * the destination unless the rename fails in which
461  * case we need to return the source pathname.
462  */
463 static rfs4_ss_pn_t *
464 rfs4_ss_movestate(char *sdir, char *ddir, char *leaf)
465 {
466 	rfs4_ss_pn_t *src, *dst;
467 
468 	if ((src = rfs4_ss_pnalloc(sdir, leaf)) == NULL) {
469 		return (NULL);
470 	}
471 
472 	if ((dst = rfs4_ss_pnalloc(ddir, leaf)) == NULL) {
473 		rfs4_ss_pnfree(src);
474 		return (NULL);
475 	}
476 
477 	/*
478 	 * If the rename fails we shall return the src
479 	 * pathname and free the dst. Otherwise we need
480 	 * to free the src and return the dst pathanme.
481 	 */
482 	if (vn_rename(src->pn, dst->pn, UIO_SYSSPACE)) {
483 		rfs4_ss_pnfree(dst);
484 		return (src);
485 	}
486 	rfs4_ss_pnfree(src);
487 	return (dst);
488 }
489 
490 
491 static rfs4_oldstate_t *
492 rfs4_ss_getstate(vnode_t *dvp, rfs4_ss_pn_t *ss_pn)
493 {
494 	struct uio uio;
495 	struct iovec iov[3];
496 
497 	rfs4_oldstate_t *cl_ss = NULL;
498 	vnode_t *vp;
499 	vattr_t va;
500 	uint_t id_len;
501 	int err, kill_file, file_vers;
502 
503 	if (ss_pn == NULL) {
504 		return (NULL);
505 	}
506 
507 	/*
508 	 * open the state file.
509 	 */
510 	if (vn_open(ss_pn->pn, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0) != 0) {
511 		return (NULL);
512 	}
513 
514 	if (vp->v_type != VREG) {
515 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
516 		VN_RELE(vp);
517 		return (NULL);
518 	}
519 
520 	err = VOP_ACCESS(vp, VREAD, 0, CRED());
521 	if (err) {
522 		/*
523 		 * We don't have read access? better get the heck out.
524 		 */
525 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
526 		VN_RELE(vp);
527 		return (NULL);
528 	}
529 
530 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
531 	/*
532 	 * get the file size to do some basic validation
533 	 */
534 	va.va_mask = AT_SIZE;
535 	err = VOP_GETATTR(vp, &va, 0, CRED());
536 
537 	kill_file = (va.va_size == 0 || va.va_size <
538 		(NFS4_VERIFIER_SIZE + sizeof (uint_t)+1));
539 
540 	if (err || kill_file) {
541 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
542 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
543 		VN_RELE(vp);
544 		if (kill_file) {
545 			(void) VOP_REMOVE(dvp, ss_pn->leaf, CRED());
546 		}
547 		return (NULL);
548 	}
549 
550 	cl_ss = kmem_alloc(sizeof (rfs4_oldstate_t), KM_SLEEP);
551 
552 	/*
553 	 * build iovecs to read in the file_version, verifier and id_len
554 	 */
555 	iov[0].iov_base = (caddr_t)&file_vers;
556 	iov[0].iov_len = sizeof (int);
557 	iov[1].iov_base = (caddr_t)cl_ss;
558 	iov[1].iov_len = NFS4_VERIFIER_SIZE;
559 	iov[2].iov_base = (caddr_t)&id_len;
560 	iov[2].iov_len = sizeof (uint_t);
561 
562 	uio.uio_iov = iov;
563 	uio.uio_iovcnt = 3;
564 	uio.uio_segflg = UIO_SYSSPACE;
565 	uio.uio_loffset = 0;
566 	uio.uio_resid = sizeof (int) + NFS4_VERIFIER_SIZE + sizeof (uint_t);
567 
568 	if (err = VOP_READ(vp, &uio, FREAD, CRED(), NULL)) {
569 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
570 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
571 		VN_RELE(vp);
572 		kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
573 		return (NULL);
574 	}
575 
576 	/*
577 	 * if the file_version doesn't match or if the
578 	 * id_len is zero or the combination of the verifier,
579 	 * id_len and id_val is bigger than the file we have
580 	 * a problem. If so ditch the file.
581 	 */
582 	kill_file = (file_vers != NFS4_SS_VERSION || id_len == 0 ||
583 	    (id_len + NFS4_VERIFIER_SIZE + sizeof (uint_t)) > va.va_size);
584 
585 	if (err || kill_file) {
586 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
587 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
588 		VN_RELE(vp);
589 		kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
590 		if (kill_file) {
591 			(void) VOP_REMOVE(dvp, ss_pn->leaf, CRED());
592 		}
593 		return (NULL);
594 	}
595 
596 	/*
597 	 * now get the client id value
598 	 */
599 	cl_ss->cl_id4.id_val = kmem_alloc(id_len, KM_SLEEP);
600 	iov[0].iov_base = cl_ss->cl_id4.id_val;
601 	iov[0].iov_len = id_len;
602 
603 	uio.uio_iov = iov;
604 	uio.uio_iovcnt = 1;
605 	uio.uio_segflg = UIO_SYSSPACE;
606 	uio.uio_resid = cl_ss->cl_id4.id_len = id_len;
607 
608 	if (err = VOP_READ(vp, &uio, FREAD, CRED(), NULL)) {
609 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
610 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
611 		VN_RELE(vp);
612 		kmem_free(cl_ss->cl_id4.id_val, id_len);
613 		kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
614 		return (NULL);
615 	}
616 
617 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
618 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED());
619 	VN_RELE(vp);
620 	return (cl_ss);
621 }
622 
/*
 * Step from one dirent64 to the one following it in a readdir buffer by
 * advancing d_reclen bytes.  Any earlier definition of nextdp is
 * replaced.
 */
623 #ifdef	nextdp
624 #undef nextdp
625 #endif
626 #define	nextdp(dp)	((struct dirent64 *)((char *)(dp) + (dp)->d_reclen))
627 
628 /*
629  */
630 void
631 rfs4_ss_oldstate(char *dir, int do_move)
632 {
633 	rfs4_ss_pn_t *ss_pn;
634 	rfs4_oldstate_t *cl_ss = NULL;
635 	char	*dirt = NULL;
636 	int	err, dir_eof = 0, size = 0;
637 	vnode_t *dvp;
638 	struct iovec iov;
639 	struct uio uio;
640 	struct dirent64 *dep;
641 	offset_t dirchunk_offset = 0;
642 
643 	/*
644 	 * open the state directory
645 	 */
646 	if (err = vn_open(dir, UIO_SYSSPACE, FREAD, 0, &dvp, 0, 0)) {
647 		return;
648 	}
649 
650 	/*
651 	 * if this is not a directory return
652 	 */
653 	if (dvp->v_type != VDIR) {
654 		(void) VOP_CLOSE(dvp, FREAD, 1, (offset_t)0, CRED());
655 		VN_RELE(dvp);
656 		return;
657 	}
658 
659 	err = VOP_ACCESS(dvp, VREAD, 0, CRED());
660 	if (err) {
661 		/* Can't read the directory. So get the heck out. */
662 		goto out;
663 	}
664 
665 	dirt = kmem_alloc(RFS4_SS_DIRSIZE, KM_SLEEP);
666 
667 	/*
668 	 * Get and process the directory entries
669 	 */
670 	while (!dir_eof) {
671 		(void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL);
672 		iov.iov_base = dirt;
673 		iov.iov_len = RFS4_SS_DIRSIZE;
674 		uio.uio_iov = &iov;
675 		uio.uio_iovcnt = 1;
676 		uio.uio_segflg = UIO_SYSSPACE;
677 		uio.uio_loffset = dirchunk_offset;
678 		uio.uio_resid = RFS4_SS_DIRSIZE;
679 
680 		err = VOP_READDIR(dvp, &uio, CRED(), &dir_eof);
681 
682 		VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL);
683 
684 		if (err) {
685 			goto out;
686 		}
687 
688 		size = RFS4_SS_DIRSIZE - uio.uio_resid;
689 
690 		/*
691 		 * Process all the directory entries in this
692 		 * readdir chunk
693 		 */
694 		for (dep = (struct dirent64 *)dirt; size > 0;
695 			dep = nextdp(dep)) {
696 
697 			size -= dep->d_reclen;
698 			dirchunk_offset = dep->d_off;
699 
700 			/*
701 			 * Skip '.' and '..'
702 			 */
703 			if (NFS_IS_DOTNAME(dep->d_name)) {
704 				continue;
705 			}
706 
707 			if ((ss_pn = rfs4_ss_pnalloc(dir, dep->d_name))
708 							== NULL) {
709 				continue;
710 			}
711 
712 			if (cl_ss = rfs4_ss_getstate(dvp, ss_pn)) {
713 				if (do_move) {
714 					rfs4_ss_pnfree(ss_pn);
715 					cl_ss->ss_pn = rfs4_ss_movestate(
716 						NFS4_STATE_DIR,
717 						NFS4_OLDSTATE_DIR,
718 						dep->d_name);
719 				} else {
720 					cl_ss->ss_pn = ss_pn;
721 				}
722 				insque(cl_ss, rfs4_oldstate);
723 			} else {
724 				rfs4_ss_pnfree(ss_pn);
725 			}
726 		}
727 	}
728 out:
729 
730 	(void) VOP_CLOSE(dvp, FREAD, 1, (offset_t)0, CRED());
731 	VN_RELE(dvp);
732 	if (dirt)
733 		kmem_free((caddr_t)dirt, RFS4_SS_DIRSIZE);
734 }
735 
736 /*
737  * Validates that the needed directories exist
738  */
739 bool_t
740 rfs4_validate_var(void)
741 {
742 	vnode_t *vp;
743 	int i;
744 	char *dnp;
745 	bool_t ret_val = TRUE;
746 	char *dir_names[] = {
747 			NFS4_VAR_DIR,
748 			NFS4_STATE_DIR,
749 			NFS4_OLDSTATE_DIR,
750 			NULL
751 	};
752 
753 	for (i = 0, dnp = dir_names[i]; dnp; i++) {
754 		if (lookupname(dnp, UIO_SYSSPACE,
755 					NO_FOLLOW, NULLVPP, &vp) != 0) {
756 			cmn_err(CE_WARN, "!NFS4 stable storage directory "
757 				"missing!: %s", dnp);
758 			ret_val = FALSE;
759 		} else {
760 			VN_RELE(vp);
761 		}
762 		dnp = dir_names[i];
763 	}
764 	return (ret_val);
765 }
766 
767 /*
768  *
769  */
770 static void
771 rfs4_ss_init(void)
772 {
773 	rw_init(&rfs4_oldstate_lock, NULL, RW_DEFAULT, NULL);
774 
775 	if (rfs4_validate_var() == FALSE) {
776 		rfs4_oldstate = NULL;
777 		return;
778 	}
779 
780 	rfs4_oldstate = kmem_alloc(sizeof (rfs4_oldstate_t), KM_SLEEP);
781 	rfs4_oldstate->next = rfs4_oldstate;
782 	rfs4_oldstate->prev = rfs4_oldstate;
783 
784 	/*
785 	 * load info from the OLD directory
786 	 */
787 	rfs4_ss_oldstate(NFS4_OLDSTATE_DIR, 0);
788 
789 	/*
790 	 * Gather and move NFS4_STATE_DIR to NFS4_OLDSTATE_DIR
791 	 */
792 	rfs4_ss_oldstate(NFS4_STATE_DIR, 1);
793 
794 	rfs4_ss_enabled = 1;
795 }
796 
797 static void
798 rfs4_ss_fini(void)
799 {
800 
801 	rfs4_oldstate_t *ost, *osp, *os_head;
802 
803 	rw_destroy(&rfs4_oldstate_lock);
804 
805 	/*
806 	 * short circuit everything if we have no
807 	 * remaining oldstate!
808 	 */
809 	if (rfs4_oldstate == NULL) {
810 		return;
811 	}
812 
813 	/*
814 	 * It is possible to start and immediately stop the server
815 	 * in which case we would not have cleaned up the oldstate
816 	 * circular queue so we may do it here.
817 	 */
818 	os_head = rfs4_oldstate;
819 	osp = os_head->next;
820 
821 	while (osp != os_head) {
822 		ost = osp->next;
823 		remque(osp);
824 		rfs4_oldstate_free(osp);
825 		osp = ost;
826 	}
827 	kmem_free(os_head, sizeof (rfs4_oldstate_t));
828 }
829 
830 
831 /*
832  * Check if we are still in grace and if the client can be
833  * granted permission to perform reclaims.
834  */
835 void
836 rfs4_ss_chkclid(rfs4_client_t *cp)
837 {
838 	rfs4_oldstate_t *ost, *osp, *os_head;
839 
840 	/*
841 	 * short circuit everything if we have no
842 	 * oldstate!
843 	 */
844 	if (rfs4_oldstate == NULL) {
845 		return;
846 	}
847 
848 	/*
849 	 * if we are not in the grace_period then
850 	 * we can destroy and mutilate all the old state.
851 	 */
852 	if (!rfs4_clnt_in_grace(cp)) {
853 		rw_enter(&rfs4_oldstate_lock, RW_WRITER);
854 		if (rfs4_oldstate == NULL) {
855 			/*
856 			 * some other thread is killing
857 			 * the state so we get to just return.
858 			 */
859 			rw_exit(&rfs4_oldstate_lock);
860 			return;
861 		}
862 
863 		os_head = rfs4_oldstate;
864 		rfs4_oldstate = NULL;
865 		rw_exit(&rfs4_oldstate_lock);
866 
867 		/*
868 		 * Now ditch the state files and structures
869 		 * we've malloc()'d
870 		 */
871 		osp = os_head->next;
872 
873 		while (osp != os_head) {
874 			if (osp->ss_pn != NULL) {
875 				(void) vn_remove(osp->ss_pn->pn,
876 						UIO_SYSSPACE, RMFILE);
877 			}
878 			ost = osp->next;
879 			remque(osp);
880 			rfs4_oldstate_free(osp);
881 			osp = ost;
882 		}
883 		kmem_free(os_head, sizeof (rfs4_oldstate_t));
884 		return;
885 	}
886 
887 	/*
888 	 * we're still in grace, search for the clientid
889 	 */
890 	rw_enter(&rfs4_oldstate_lock, RW_READER);
891 
892 	os_head = rfs4_oldstate;
893 	osp = os_head->next;
894 
895 	while (osp != os_head) {
896 		if (osp->cl_id4.id_len == cp->nfs_client.id_len) {
897 			if (bcmp(osp->cl_id4.id_val, cp->nfs_client.id_val,
898 					osp->cl_id4.id_len) == 0) {
899 				cp->can_reclaim = 1;
900 				break;
901 			}
902 		}
903 		osp = osp->next;
904 	}
905 
906 	rw_exit(&rfs4_oldstate_lock);
907 }
908 
909 /*
910  * Place client information into stable storage.
911  */
912 void
913 rfs4_ss_clid(rfs4_client_t *cp, struct svc_req *req)
914 {
915 	const char *kinet_ntop6(uchar_t *, char *, size_t);
916 
917 	nfs_client_id4		*cl_id4;
918 	rfs4_ss_pn_t *ss_pn;
919 	char leaf[MAXNAMELEN], buf[INET6_ADDRSTRLEN];
920 	vnode_t *vp;
921 	struct uio uio;
922 	struct iovec iov[4];
923 	int file_vers = NFS4_SS_VERSION;
924 	int ioflag;
925 	struct sockaddr *ca;
926 	uchar_t *b;
927 
928 	if (rfs4_ss_enabled == 0) {
929 		return;
930 	}
931 
932 	buf[0] = 0;
933 
934 
935 	ca = (struct sockaddr *)svc_getrpccaller(req->rq_xprt)->buf;
936 	if (ca == NULL) {
937 		return;
938 	}
939 
940 	/*
941 	 * Convert the caller's IP address to a dotted string
942 	 */
943 	if (ca->sa_family == AF_INET) {
944 
945 		bcopy(svc_getrpccaller(req->rq_xprt)->buf, &cp->cl_addr,
946 			sizeof (struct sockaddr_in));
947 		b = (uchar_t *)&((struct sockaddr_in *)ca)->sin_addr;
948 		(void) sprintf(buf, "%03d.%03d.%03d.%03d", b[0] & 0xFF,
949 				b[1] & 0xFF, b[2] & 0xFF, b[3] & 0xFF);
950 	} else if (ca->sa_family == AF_INET6) {
951 		struct sockaddr_in6 *sin6;
952 
953 		sin6 = (struct sockaddr_in6 *)ca;
954 		bcopy(svc_getrpccaller(req->rq_xprt)->buf, &cp->cl_addr,
955 				sizeof (struct sockaddr_in6));
956 		(void) kinet_ntop6((uchar_t *)&sin6->sin6_addr,
957 				buf, INET6_ADDRSTRLEN);
958 	}
959 
960 	(void) snprintf(leaf, MAXNAMELEN, "%s-%llx", buf,
961 	    (longlong_t)cp->clientid);
962 
963 	if ((ss_pn = rfs4_ss_pnalloc(NFS4_STATE_DIR, leaf)) == NULL) {
964 		return;
965 	}
966 
967 	if (vn_open(ss_pn->pn, UIO_SYSSPACE, FCREAT|FWRITE, 0600, &vp,
968 			    CRCREAT, 0)) {
969 		rfs4_ss_pnfree(ss_pn);
970 		return;
971 	}
972 
973 	if (cp->ss_pn)
974 		rfs4_ss_pnfree(cp->ss_pn);
975 
976 	cp->ss_pn = ss_pn;
977 
978 	cl_id4 = &(cp->nfs_client);
979 
980 	/*
981 	 * Build a scatter list that points to the nfs_client_id4
982 	 */
983 	iov[0].iov_base = (caddr_t)&file_vers;
984 	iov[0].iov_len = sizeof (int);
985 	iov[1].iov_base = (caddr_t)cl_id4;
986 	iov[1].iov_len = NFS4_VERIFIER_SIZE;
987 	iov[2].iov_base = (caddr_t)&(cl_id4->id_len);
988 	iov[2].iov_len = sizeof (uint_t);
989 	iov[3].iov_base = (caddr_t)cl_id4->id_val;
990 	iov[3].iov_len = cl_id4->id_len;
991 
992 	uio.uio_iov = iov;
993 	uio.uio_iovcnt = 4;
994 	uio.uio_loffset = 0;
995 	uio.uio_segflg = UIO_SYSSPACE;
996 	uio.uio_llimit = (rlim64_t)MAXOFFSET_T;
997 	uio.uio_resid = cl_id4->id_len + sizeof (int) +
998 		NFS4_VERIFIER_SIZE + sizeof (uint_t);
999 
1000 	ioflag = uio.uio_fmode = (FWRITE|FSYNC);
1001 	uio.uio_extflg = UIO_COPY_DEFAULT;
1002 
1003 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1004 	/* write the full client id to the file. */
1005 	(void) VOP_WRITE(vp, &uio, ioflag, CRED(), NULL);
1006 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1007 
1008 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED());
1009 	VN_RELE(vp);
1010 }
1011 
1012 /*
1013  * Ultimately the nfssys() call NFS4_CLR_STATE endsup here
1014  * to find and mark the client for forced expire.
1015  */
1016 static void
1017 rfs4_client_scrub(rfs4_entry_t ent, void *arg)
1018 {
1019 	rfs4_client_t *cp = (rfs4_client_t *)ent;
1020 	struct nfs4clrst_args *clr = arg;
1021 	struct sockaddr_in6 *ent_sin6;
1022 	struct in6_addr  clr_in6;
1023 	struct sockaddr_in  *ent_sin;
1024 	struct in_addr   clr_in;
1025 
1026 	if (clr->addr_type != cp->cl_addr.ss_family) {
1027 		return;
1028 	}
1029 
1030 	switch (clr->addr_type) {
1031 
1032 	case AF_INET6:
1033 		/* copyin the address from user space */
1034 		if (copyin(clr->ap, &clr_in6, sizeof (clr_in6))) {
1035 			break;
1036 		}
1037 
1038 		ent_sin6 = (struct sockaddr_in6 *)&cp->cl_addr;
1039 
1040 		/*
1041 		 * now compare, and if equivalent mark entry
1042 		 * for forced expiration
1043 		 */
1044 		if (IN6_ARE_ADDR_EQUAL(&ent_sin6->sin6_addr, &clr_in6)) {
1045 			cp->forced_expire = 1;
1046 		}
1047 		break;
1048 
1049 	case AF_INET:
1050 		/* copyin the address from user space */
1051 		if (copyin(clr->ap, &clr_in, sizeof (clr_in))) {
1052 			break;
1053 		}
1054 
1055 		ent_sin = (struct sockaddr_in *)&cp->cl_addr;
1056 
1057 		/*
1058 		 * now compare, and if equivalent mark entry
1059 		 * for forced expiration
1060 		 */
1061 		if (ent_sin->sin_addr.s_addr == clr_in.s_addr) {
1062 			cp->forced_expire = 1;
1063 		}
1064 		break;
1065 
1066 	default:
1067 		/* force this assert to fail */
1068 		ASSERT(clr->addr_type != clr->addr_type);
1069 	}
1070 }
1071 
1072 /*
1073  * This is called from nfssys() in order to clear server state
1074  * for the specified client IP Address.
1075  */
1076 void
1077 rfs4_clear_client_state(struct nfs4clrst_args *clr)
1078 {
1079 	(void) rfs4_dbe_walk(rfs4_client_tab, rfs4_client_scrub, clr);
1080 }
1081 
/*
 * Used to initialize the NFSv4 server's state or database.  All of
 * the tables are created and timers are set. Only called when NFSv4
 * service is provided.
 *
 * The whole routine runs under rfs4_state_lock and is a no-op when
 * rfs4_server_state is already non-NULL.  In order it:
 *   - initializes rfs4_findclient_lock,
 *   - picks a fresh rfs4_start_time,
 *   - creates a server instance without starting its grace period,
 *   - registers the CPR callback,
 *   - applies default cache times, creates the database, its tables
 *     and their indices,
 *   - initializes stable storage, and
 *   - publishes rfs4_clear_client_state through rfs4_client_clrst.
 */
void
rfs4_state_init()
{
	int start_grace;
	extern boolean_t rfs4_cpr_callb(void *, int);

	mutex_enter(&rfs4_state_lock);

	/*
	 * If the server state database has already been initialized,
	 * skip it
	 */
	if (rfs4_server_state != NULL) {
		mutex_exit(&rfs4_state_lock);
		return;
	}

	rw_init(&rfs4_findclient_lock, NULL, RW_DEFAULT, NULL);

	/*
	 * Set the boot time.  If the server
	 * has been restarted quickly and has had the opportunity to
	 * service clients, then the start_time needs to be bumped
	 * regardless.  A small window but it exists...
	 */
	if (rfs4_start_time != gethrestime_sec())
		rfs4_start_time = gethrestime_sec();
	else
		rfs4_start_time++;

	/*
	 * Create the first server instance, or a new one if the server has
	 * been restarted; see above comments on rfs4_start_time. Don't
	 * start its grace period; that will be done later, to maximise the
	 * clients' recovery window.
	 */
	start_grace = 0;
	rfs4_servinst_create(start_grace);

	/* reset the "first NFSv4 request" status */
	rfs4_seen_first_compound = 0;

	/*
	 * Add a CPR callback so that we can update client
	 * access times to extend the lease after a suspend
	 * and resume (using the same class as rpcmod/connmgr)
	 */
	cpr_id = callb_add(rfs4_cpr_callb, 0, CB_CL_CPR_RPC, "rfs4");

	/*
	 * set the various cache timers for table creation
	 *
	 * A value of zero means "not tuned"; only then is the compiled-in
	 * default applied.  rfs4_state_fini() zeroes these again.
	 */
	if (rfs4_client_cache_time == 0)
		rfs4_client_cache_time = CLIENT_CACHE_TIME;
	if (rfs4_openowner_cache_time == 0)
		rfs4_openowner_cache_time = OPENOWNER_CACHE_TIME;
	if (rfs4_state_cache_time == 0)
		rfs4_state_cache_time = STATE_CACHE_TIME;
	if (rfs4_lo_state_cache_time == 0)
		rfs4_lo_state_cache_time = LO_STATE_CACHE_TIME;
	if (rfs4_lockowner_cache_time == 0)
		rfs4_lockowner_cache_time = LOCKOWNER_CACHE_TIME;
	if (rfs4_file_cache_time == 0)
		rfs4_file_cache_time = FILE_CACHE_TIME;
	if (rfs4_deleg_state_cache_time == 0)
		rfs4_deleg_state_cache_time = DELEG_STATE_CACHE_TIME;

	/* Create the overall database to hold all server state */
	rfs4_server_state = rfs4_database_create(rfs4_database_debug);

	/*
	 * Now create the individual tables
	 *
	 * Each cache time is scaled by rfs4_lease_time in place before
	 * being passed to rfs4_table_create().
	 */

	/* Client table: indexed by nfs_client_id4 and by clientid */
	rfs4_client_cache_time *= rfs4_lease_time;
	rfs4_client_tab = rfs4_table_create(rfs4_server_state,
					    "Client",
					    rfs4_client_cache_time,
					    2,
					    rfs4_client_create,
					    rfs4_client_destroy,
					    rfs4_client_expiry,
					    sizeof (rfs4_client_t),
					    TABSIZE,
					    MAXTABSZ/8, 100);
	rfs4_nfsclnt_idx = rfs4_index_create(rfs4_client_tab,
					    "nfs_client_id4", nfsclnt_hash,
					    nfsclnt_compare, nfsclnt_mkkey,
					    TRUE);
	rfs4_clientid_idx = rfs4_index_create(rfs4_client_tab,
					    "client_id", clientid_hash,
					    clientid_compare, clientid_mkkey,
					    FALSE);

	/* Open owner table: indexed by open_owner4 */
	rfs4_openowner_cache_time *= rfs4_lease_time;
	rfs4_openowner_tab = rfs4_table_create(rfs4_server_state,
					    "OpenOwner",
					    rfs4_openowner_cache_time,
					    1,
					    rfs4_openowner_create,
					    rfs4_openowner_destroy,
					    rfs4_openowner_expiry,
					    sizeof (rfs4_openowner_t),
					    TABSIZE,
					    MAXTABSZ, 100);
	rfs4_openowner_idx = rfs4_index_create(rfs4_openowner_tab,
					    "open_owner4", openowner_hash,
					    openowner_compare,
					    openowner_mkkey, TRUE);

	/* Open state table: indexed by owner+file, by stateid and by file */
	rfs4_state_cache_time *= rfs4_lease_time;
	rfs4_state_tab = rfs4_table_create(rfs4_server_state,
					"OpenStateID",
					rfs4_state_cache_time,
					3,
					rfs4_state_create,
					rfs4_state_destroy,
					rfs4_state_expiry,
					sizeof (rfs4_state_t),
					TABSIZE,
					MAXTABSZ, 100);
	rfs4_state_owner_file_idx = rfs4_index_create(rfs4_state_tab,
						"Openowner-File",
						state_owner_file_hash,
						state_owner_file_compare,
						state_owner_file_mkkey, TRUE);
	rfs4_state_idx = rfs4_index_create(rfs4_state_tab,
					"State-id", state_hash,
					state_compare, state_mkkey, FALSE);
	rfs4_state_file_idx = rfs4_index_create(rfs4_state_tab,
					"File", state_file_hash,
					state_file_compare, state_file_mkkey,
					FALSE);

	/* Lock state table: indexed by lockowner x state and by stateid */
	rfs4_lo_state_cache_time *= rfs4_lease_time;
	rfs4_lo_state_tab = rfs4_table_create(rfs4_server_state,
					    "LockStateID",
					    rfs4_lo_state_cache_time,
					    2,
					    rfs4_lo_state_create,
					    rfs4_lo_state_destroy,
					    rfs4_lo_state_expiry,
					    sizeof (rfs4_lo_state_t),
					    TABSIZE,
					    MAXTABSZ, 100);
	rfs4_lo_state_owner_idx = rfs4_index_create(rfs4_lo_state_tab,
						    "lockownerxstate",
						    lo_state_lo_hash,
						    lo_state_lo_compare,
						    lo_state_lo_mkkey, TRUE);
	rfs4_lo_state_idx = rfs4_index_create(rfs4_lo_state_tab,
					    "State-id",
					    lo_state_hash, lo_state_compare,
					    lo_state_mkkey, FALSE);

	/* Lock owner table: indexed by lock_owner4 and by pid */
	rfs4_lockowner_cache_time *= rfs4_lease_time;
	rfs4_lockowner_tab = rfs4_table_create(rfs4_server_state,
					    "Lockowner",
					    rfs4_lockowner_cache_time,
					    2,
					    rfs4_lockowner_create,
					    rfs4_lockowner_destroy,
					    rfs4_lockowner_expiry,
					    sizeof (rfs4_lockowner_t),
					    TABSIZE,
					    MAXTABSZ, 100);
	rfs4_lockowner_idx = rfs4_index_create(rfs4_lockowner_tab,
					    "lock_owner4", lockowner_hash,
					    lockowner_compare,
					    lockowner_mkkey, TRUE);
	rfs4_lockowner_pid_idx = rfs4_index_create(rfs4_lockowner_tab,
						"pid", pid_hash,
						pid_compare, pid_mkkey,
						FALSE);

	/*
	 * File table: indexed by vnode pointer (see file_mkkey()).  Note
	 * that no expiry callback is supplied for this table.
	 */
	rfs4_file_cache_time *= rfs4_lease_time;
	rfs4_file_tab = rfs4_table_create(rfs4_server_state,
					"File",
					rfs4_file_cache_time,
					1,
					rfs4_file_create,
					rfs4_file_destroy,
					NULL,
					sizeof (rfs4_file_t),
					TABSIZE,
					MAXTABSZ, -1);
	rfs4_file_idx = rfs4_index_create(rfs4_file_tab,
					"Filehandle", file_hash,
					file_compare, file_mkkey, TRUE);

	/* Delegation state table: indexed by file+client and by stateid */
	rfs4_deleg_state_cache_time *= rfs4_lease_time;
	rfs4_deleg_state_tab = rfs4_table_create(rfs4_server_state,
					"DelegStateID",
					rfs4_deleg_state_cache_time,
					2,
					rfs4_deleg_state_create,
					rfs4_deleg_state_destroy,
					rfs4_deleg_state_expiry,
					sizeof (rfs4_deleg_state_t),
					TABSIZE,
					MAXTABSZ, 100);
	rfs4_deleg_idx = rfs4_index_create(rfs4_deleg_state_tab,
						"DelegByFileClient",
						deleg_hash,
						deleg_compare,
						deleg_mkkey, TRUE);
	rfs4_deleg_state_idx = rfs4_index_create(rfs4_deleg_state_tab,
						"DelegState",
						deleg_state_hash,
						deleg_state_compare,
						deleg_state_mkkey, FALSE);

	/*
	 * Init the stable storage.
	 */
	rfs4_ss_init();

	/* Publish the "clear client state" entry point (used via nfssys()) */
	rfs4_client_clrst = rfs4_clear_client_state;

	mutex_exit(&rfs4_state_lock);
}
1303 
1304 
/*
 * Used at server shutdown to cleanup all of the NFSv4 server's structures
 * and other state.
 *
 * Runs under rfs4_state_lock and returns immediately when there is no
 * database (rfs4_server_state == NULL).  Otherwise the global database
 * pointer is cleared first, then the CPR callback, the findclient lock,
 * the reaper threads, stable storage and finally the database itself are
 * torn down.  Server instances are destroyed after the lock is dropped.
 */
void
rfs4_state_fini()
{
	rfs4_database_t *dbp;

	mutex_enter(&rfs4_state_lock);

	if (rfs4_server_state == NULL) {
		mutex_exit(&rfs4_state_lock);
		return;
	}

	/* Withdraw the "clear client state" entry point set by init */
	rfs4_client_clrst = NULL;

	rfs4_set_deleg_policy(SRV_NEVER_DELEGATE);
	/* Keep a private handle; the global is cleared before teardown */
	dbp = rfs4_server_state;
	rfs4_server_state = NULL;

	/*
	 * Cleanup the CPR callback.
	 */
	if (cpr_id)
		(void) callb_delete(cpr_id);

	rw_destroy(&rfs4_findclient_lock);

	/* First stop all of the reaper threads in the database */
	rfs4_database_shutdown(dbp);
	/* clean up any dangling stable storage structures */
	rfs4_ss_fini();
	/* Now actually destroy/release the database and its tables */
	rfs4_database_destroy(dbp);

	/*
	 * Reset the cache timers for next time
	 *
	 * rfs4_state_init() treats zero as "use the default" and then
	 * multiplies by rfs4_lease_time, so these must not keep their
	 * already-scaled values.
	 */
	rfs4_client_cache_time = 0;
	rfs4_openowner_cache_time = 0;
	rfs4_state_cache_time = 0;
	rfs4_lo_state_cache_time = 0;
	rfs4_lockowner_cache_time = 0;
	rfs4_file_cache_time = 0;
	rfs4_deleg_state_cache_time = 0;

	mutex_exit(&rfs4_state_lock);

	/* destroy server instances and current instance ptr */
	rfs4_servinst_destroy_all();

	/* reset the "first NFSv4 request" status */
	rfs4_seen_first_compound = 0;
}
1359 
/*
 * Server-side view of a clientid4.  The same storage can be read either
 * as the opaque clientid4 handed to clients (id4) or as the two 32-bit
 * halves the server fills in: rfs4_client_create() stores rfs4_start_time
 * in start_time and the entry's database id in c_id.
 */
typedef union {
	struct {
		uint32_t start_time;
		uint32_t c_id;
	} impl_id;
	clientid4 id4;
} cid;
1367 
/*
 * Forward declarations of the cluster-related helpers.  The client code
 * below calls foreign_clientid() and embed_nodeid() only when
 * cluster_bootflags has CLUSTER_BOOTED set.
 */
static int foreign_stateid(stateid_t *id);
static int foreign_clientid(cid *cidp);
static void embed_nodeid(cid *cidp);
1371 
/*
 * Server-side view of the SETCLIENTID_CONFIRM verifier.  The storage can
 * be read as the opaque verifier4 or as two 32-bit halves: c_id is copied
 * from the clientid's c_id in rfs4_client_create(), and gen_num starts at
 * zero there and is incremented by rfs4_client_scv_next().
 */
typedef union {
	struct {
		uint32_t c_id;
		uint32_t gen_num;
	} cv_impl;
	verifier4	confirm_verf;
} scid_confirm_verf;
1379 
1380 static uint32_t
1381 clientid_hash(void *key)
1382 {
1383 	cid *idp = key;
1384 
1385 	return (idp->impl_id.c_id);
1386 }
1387 
1388 static bool_t
1389 clientid_compare(rfs4_entry_t entry, void *key)
1390 {
1391 	rfs4_client_t *client = (rfs4_client_t *)entry;
1392 	clientid4 *idp = key;
1393 
1394 	return (*idp == client->clientid);
1395 }
1396 
1397 static void *
1398 clientid_mkkey(rfs4_entry_t entry)
1399 {
1400 	rfs4_client_t *client = (rfs4_client_t *)entry;
1401 
1402 	return (&client->clientid);
1403 }
1404 
1405 static uint32_t
1406 nfsclnt_hash(void *key)
1407 {
1408 	nfs_client_id4 *client = key;
1409 	int i;
1410 	uint32_t hash = 0;
1411 
1412 	for (i = 0; i < client->id_len; i++) {
1413 		hash <<= 1;
1414 		hash += (uint_t)client->id_val[i];
1415 	}
1416 	return (hash);
1417 }
1418 
1419 
1420 static bool_t
1421 nfsclnt_compare(rfs4_entry_t entry, void *key)
1422 {
1423 	rfs4_client_t *client = (rfs4_client_t *)entry;
1424 	nfs_client_id4 *nfs_client = key;
1425 
1426 	if (client->nfs_client.id_len != nfs_client->id_len)
1427 		return (FALSE);
1428 
1429 	return (bcmp(client->nfs_client.id_val, nfs_client->id_val,
1430 						nfs_client->id_len) == 0);
1431 }
1432 
1433 static void *
1434 nfsclnt_mkkey(rfs4_entry_t entry)
1435 {
1436 	rfs4_client_t *client = (rfs4_client_t *)entry;
1437 
1438 	return (&client->nfs_client);
1439 }
1440 
1441 static bool_t
1442 rfs4_client_expiry(rfs4_entry_t u_entry)
1443 {
1444 	rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1445 	bool_t cp_expired;
1446 
1447 	if (rfs4_dbe_is_invalid(cp->dbe))
1448 		return (TRUE);
1449 	/*
1450 	 * If the sysadmin has used clear_locks for this
1451 	 * entry then forced_expire will be set and we
1452 	 * want this entry to be reaped. Or the entry
1453 	 * has exceeded its lease period.
1454 	 */
1455 	cp_expired = (cp->forced_expire ||
1456 		(gethrestime_sec() - cp->last_access
1457 			> rfs4_lease_time));
1458 	if (!cp->ss_remove && cp_expired)
1459 		cp->ss_remove = 1;
1460 	return (cp_expired);
1461 }
1462 
1463 static void
1464 rfs4_client_destroy(rfs4_entry_t u_entry)
1465 {
1466 	rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1467 
1468 	mutex_destroy(cp->cbinfo.cb_lock);
1469 	cv_destroy(cp->cbinfo.cb_cv);
1470 	cv_destroy(cp->cbinfo.cb_cv_nullcaller);
1471 
1472 	/* free callback info */
1473 	rfs4_cbinfo_free(&cp->cbinfo);
1474 
1475 	if (cp->cp_confirmed)
1476 		rfs4_client_rele(cp->cp_confirmed);
1477 
1478 	if (cp->ss_pn) {
1479 		/*
1480 		 * check if the stable storage file needs
1481 		 * to be removed
1482 		 */
1483 		if (cp->ss_remove)
1484 			(void) vn_remove(cp->ss_pn->pn, UIO_SYSSPACE, RMFILE);
1485 		rfs4_ss_pnfree(cp->ss_pn);
1486 	}
1487 
1488 	/* Free the client supplied client id */
1489 	kmem_free(cp->nfs_client.id_val, cp->nfs_client.id_len);
1490 
1491 	if (cp->sysidt != LM_NOSYSID)
1492 		lm_free_sysidt(cp->sysidt);
1493 }
1494 
/*
 * Create callback for the client table.
 *
 * u_entry is the new rfs4_client_t to initialize and arg is the
 * nfs_client_id4 supplied by the client.  The routine builds the
 * clientid, takes a private copy of the client's id and verifier,
 * seeds the SETCLIENTID_CONFIRM verifier, initializes the lists and
 * callback synchronization objects, and associates the client with the
 * current server instance.  Always returns TRUE.
 */
static bool_t
rfs4_client_create(rfs4_entry_t u_entry, void *arg)
{
	rfs4_client_t *cp = (rfs4_client_t *)u_entry;
	nfs_client_id4 *client = (nfs_client_id4 *)arg;
	cid *cidp;
	scid_confirm_verf *scvp;

	/* Get a clientid to give to the client */
	cidp = (cid *)&cp->clientid;
	cidp->impl_id.start_time = rfs4_start_time;
	cidp->impl_id.c_id = (uint32_t)rfs4_dbe_getid(cp->dbe);

	/* If we are booted as a cluster node, embed our nodeid */
	if (cluster_bootflags & CLUSTER_BOOTED)
		embed_nodeid(cidp);

	/* Allocate and copy client's client id value */
	cp->nfs_client.id_val = kmem_alloc(client->id_len, KM_SLEEP);
	cp->nfs_client.id_len = client->id_len;
	bcopy(client->id_val, cp->nfs_client.id_val, client->id_len);
	cp->nfs_client.verifier = client->verifier;

	/* Init the value for the SETCLIENTID_CONFIRM verifier */
	scvp = (scid_confirm_verf *)&cp->confirm_verf;
	scvp->cv_impl.c_id = cidp->impl_id.c_id;
	scvp->cv_impl.gen_num = 0;

	/* No F_UNLKSYS has been done for this client yet */
	cp->unlksys_completed = FALSE;

	/* We need the client to ack us */
	cp->need_confirm = TRUE;
	cp->cp_confirmed = NULL;

	/* TRUE all the time until the callback path actually fails */
	cp->cbinfo.cb_notified_of_cb_path_down = TRUE;

	/* Initialize the access time to now */
	cp->last_access = gethrestime_sec();

	cp->cr_set = NULL;
	/* Initialize list for insque/remque */
	cp->openownerlist.next = cp->openownerlist.prev = &cp->openownerlist;
	cp->openownerlist.oop = NULL; /* This is not an openowner */

	/* No lock manager sysid yet; rfs4_client_destroy() checks this */
	cp->sysidt = LM_NOSYSID;

	/* Empty, self-referencing delegation list head */
	cp->clientdeleglist.next = cp->clientdeleglist.prev =
		&cp->clientdeleglist;
	cp->clientdeleglist.dsp = NULL;

	/* set up the callback control structure */
	cp->cbinfo.cb_state = CB_UNINIT;
	mutex_init(cp->cbinfo.cb_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(cp->cbinfo.cb_cv, NULL, CV_DEFAULT, NULL);
	cv_init(cp->cbinfo.cb_cv_nullcaller, NULL, CV_DEFAULT, NULL);

	/*
	 * Associate the client_t with the current server instance.
	 * The hold is solely to satisfy the calling requirement of
	 * rfs4_servinst_assign(). In this case it's not strictly necessary.
	 */
	rfs4_dbe_hold(cp->dbe);
	rfs4_servinst_assign(cp, rfs4_cur_servinst);
	rfs4_dbe_rele(cp->dbe);

	return (TRUE);
}
1564 
1565 /*
1566  * Caller wants to generate/update the setclientid_confirm verifier
1567  * associated with a client.  This is done during the SETCLIENTID
1568  * processing.
1569  */
1570 void
1571 rfs4_client_scv_next(rfs4_client_t *cp)
1572 {
1573 	scid_confirm_verf *scvp;
1574 
1575 	/* Init the value for the SETCLIENTID_CONFIRM verifier */
1576 	scvp = (scid_confirm_verf *)&cp->confirm_verf;
1577 	scvp->cv_impl.gen_num++;
1578 }
1579 
/*
 * Release a client entry by calling rfs4_dbe_rele() on its database
 * entry.  Used to undo the hold returned by the rfs4_findclient*()
 * lookups.
 */
void
rfs4_client_rele(rfs4_client_t *cp)
{
	rfs4_dbe_rele(cp->dbe);
}
1585 
1586 rfs4_client_t *
1587 rfs4_findclient(nfs_client_id4 *client, bool_t *create,	rfs4_client_t *oldcp)
1588 {
1589 	rfs4_client_t *cp;
1590 
1591 
1592 	if (oldcp) {
1593 		rw_enter(&rfs4_findclient_lock, RW_WRITER);
1594 		rfs4_dbe_hide(oldcp->dbe);
1595 	} else {
1596 		rw_enter(&rfs4_findclient_lock, RW_READER);
1597 	}
1598 
1599 	cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_nfsclnt_idx, client,
1600 					create, (void *)client, RFS4_DBS_VALID);
1601 
1602 	if (oldcp)
1603 		rfs4_dbe_unhide(oldcp->dbe);
1604 
1605 	rw_exit(&rfs4_findclient_lock);
1606 
1607 	return (cp);
1608 }
1609 
1610 rfs4_client_t *
1611 rfs4_findclient_by_id(clientid4 clientid, bool_t find_unconfirmed)
1612 {
1613 	rfs4_client_t *cp;
1614 	bool_t create = FALSE;
1615 	cid *cidp = (cid *)&clientid;
1616 
1617 	/* If we're a cluster and the nodeid isn't right, short-circuit */
1618 	if (cluster_bootflags & CLUSTER_BOOTED && foreign_clientid(cidp))
1619 		return (NULL);
1620 
1621 	rw_enter(&rfs4_findclient_lock, RW_READER);
1622 
1623 	cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_clientid_idx, &clientid,
1624 					&create, NULL, RFS4_DBS_VALID);
1625 
1626 	rw_exit(&rfs4_findclient_lock);
1627 
1628 	if (cp && cp->need_confirm && find_unconfirmed == FALSE) {
1629 		rfs4_client_rele(cp);
1630 		return (NULL);
1631 	} else {
1632 		return (cp);
1633 	}
1634 }
1635 
1636 bool_t
1637 rfs4_lease_expired(rfs4_client_t *cp)
1638 {
1639 	bool_t rc;
1640 
1641 	rfs4_dbe_lock(cp->dbe);
1642 
1643 	/*
1644 	 * If the admin has executed clear_locks for this
1645 	 * client id, force expire will be set, so no need
1646 	 * to calculate anything because it's "outa here".
1647 	 */
1648 	if (cp->forced_expire) {
1649 		rc = TRUE;
1650 	} else {
1651 		rc = (gethrestime_sec() - cp->last_access > rfs4_lease_time);
1652 	}
1653 
1654 	/*
1655 	 * If the lease has expired we will also want
1656 	 * to remove any stable storage state data. So
1657 	 * mark the client id accordingly.
1658 	 */
1659 	if (!cp->ss_remove)
1660 		cp->ss_remove = (rc == TRUE);
1661 
1662 	rfs4_dbe_unlock(cp->dbe);
1663 
1664 	return (rc);
1665 }
1666 
/*
 * Renew a client's lease by setting last_access to the current time,
 * under the entry's dbe lock.  A client that has been marked for forced
 * expiration is deliberately not renewed.
 */
void
rfs4_update_lease(rfs4_client_t *cp)
{
	rfs4_dbe_lock(cp->dbe);
	if (!cp->forced_expire)
		cp->last_access = gethrestime_sec();
	rfs4_dbe_unlock(cp->dbe);
}
1675 
1676 
1677 static bool_t
1678 EQOPENOWNER(open_owner4 *a, open_owner4 *b)
1679 {
1680 	bool_t rc;
1681 
1682 	if (a->clientid != b->clientid)
1683 		return (FALSE);
1684 
1685 	if (a->owner_len != b->owner_len)
1686 		return (FALSE);
1687 
1688 	rc = (bcmp(a->owner_val, b->owner_val, a->owner_len) == 0);
1689 
1690 	return (rc);
1691 }
1692 
1693 static uint_t
1694 openowner_hash(void *key)
1695 {
1696 	int i;
1697 	open_owner4 *openowner = key;
1698 	uint_t hash = 0;
1699 
1700 	for (i = 0; i < openowner->owner_len; i++) {
1701 		hash <<= 4;
1702 		hash += (uint_t)openowner->owner_val[i];
1703 	}
1704 	hash += (uint_t)openowner->clientid;
1705 	hash |= (openowner->clientid >> 32);
1706 
1707 	return (hash);
1708 }
1709 
1710 static bool_t
1711 openowner_compare(rfs4_entry_t u_entry, void *key)
1712 {
1713 	rfs4_openowner_t *op = (rfs4_openowner_t *)u_entry;
1714 	open_owner4 *arg = key;
1715 
1716 	return (EQOPENOWNER(&op->owner, arg));
1717 }
1718 
1719 void *
1720 openowner_mkkey(rfs4_entry_t u_entry)
1721 {
1722 	rfs4_openowner_t *op = (rfs4_openowner_t *)u_entry;
1723 
1724 	return (&op->owner);
1725 }
1726 
1727 static bool_t
1728 rfs4_openowner_expiry(rfs4_entry_t u_entry)
1729 {
1730 	rfs4_openowner_t *op = (rfs4_openowner_t *)u_entry;
1731 
1732 	if (rfs4_dbe_is_invalid(op->dbe))
1733 		return (TRUE);
1734 	return ((gethrestime_sec() - op->client->last_access
1735 		> rfs4_lease_time));
1736 }
1737 
1738 static void
1739 rfs4_openowner_destroy(rfs4_entry_t u_entry)
1740 {
1741 	rfs4_openowner_t *op = (rfs4_openowner_t *)u_entry;
1742 
1743 	rfs4_sw_destroy(&op->oo_sw);
1744 
1745 	/* Remove open owner from client's lists of open owners */
1746 	rfs4_dbe_lock(op->client->dbe);
1747 
1748 	remque(&op->openownerlist);
1749 	op->openownerlist.next = op->openownerlist.prev = &op->openownerlist;
1750 
1751 	rfs4_dbe_unlock(op->client->dbe);
1752 
1753 	/* One less reference to the client */
1754 	rfs4_client_rele(op->client);
1755 	op->client = NULL;
1756 
1757 	/* Free the last reply for this lock owner */
1758 	rfs4_free_reply(op->reply);
1759 
1760 	if (op->reply_fh.nfs_fh4_val) {
1761 		kmem_free(op->reply_fh.nfs_fh4_val, op->reply_fh.nfs_fh4_len);
1762 		op->reply_fh.nfs_fh4_val = NULL;
1763 		op->reply_fh.nfs_fh4_len = 0;
1764 	}
1765 
1766 	/* Free the lock owner id */
1767 	kmem_free(op->owner.owner_val, op->owner.owner_len);
1768 }
1769 
/*
 * Release an open owner entry by calling rfs4_dbe_rele() on its
 * database entry.
 */
void
rfs4_openowner_rele(rfs4_openowner_t *op)
{
	rfs4_dbe_rele(op->dbe);
}
1775 
1776 static bool_t
1777 rfs4_openowner_create(rfs4_entry_t u_entry, void *arg)
1778 {
1779 	rfs4_openowner_t *op = (rfs4_openowner_t *)u_entry;
1780 	rfs4_openowner_t *argp = (rfs4_openowner_t *)arg;
1781 	open_owner4 *openowner = &argp->owner;
1782 	seqid4 seqid = argp->open_seqid;
1783 	rfs4_client_t *cp;
1784 	bool_t create = FALSE;
1785 
1786 	rw_enter(&rfs4_findclient_lock, RW_READER);
1787 
1788 	cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_clientid_idx,
1789 					&openowner->clientid,
1790 					&create, NULL, RFS4_DBS_VALID);
1791 
1792 	rw_exit(&rfs4_findclient_lock);
1793 
1794 	if (cp == NULL)
1795 		return (FALSE);
1796 
1797 	op->reply_fh.nfs_fh4_len = 0;
1798 	op->reply_fh.nfs_fh4_val = NULL;
1799 
1800 	op->owner.clientid = openowner->clientid;
1801 	op->owner.owner_val =
1802 		kmem_alloc(openowner->owner_len, KM_SLEEP);
1803 	bcopy(openowner->owner_val,
1804 	    op->owner.owner_val, openowner->owner_len);
1805 	op->owner.owner_len = openowner->owner_len;
1806 
1807 	op->need_confirm = TRUE;
1808 
1809 	rfs4_sw_init(&op->oo_sw);
1810 
1811 	op->open_seqid = seqid;
1812 	bzero(op->reply, sizeof (nfs_resop4));
1813 	op->client = cp;
1814 	op->cr_set = NULL;
1815 	/* Init lists for remque/insque */
1816 	op->ownerstateids.next = op->ownerstateids.prev = &op->ownerstateids;
1817 	op->ownerstateids.sp = NULL; /* NULL since this is the state list */
1818 	op->openownerlist.next = op->openownerlist.prev = &op->openownerlist;
1819 	op->openownerlist.oop = op; /* ourselves */
1820 
1821 	/* Insert openowner into client's open owner list */
1822 	rfs4_dbe_lock(cp->dbe);
1823 
1824 	insque(&op->openownerlist, cp->openownerlist.prev);
1825 
1826 	rfs4_dbe_unlock(cp->dbe);
1827 
1828 	return (TRUE);
1829 }
1830 
1831 rfs4_openowner_t *
1832 rfs4_findopenowner(open_owner4 *openowner, bool_t *create, seqid4 seqid)
1833 {
1834 	rfs4_openowner_t *op;
1835 	rfs4_openowner_t arg;
1836 
1837 	arg.owner = *openowner;
1838 	arg.open_seqid = seqid;
1839 	op = (rfs4_openowner_t *)rfs4_dbsearch(rfs4_openowner_idx, openowner,
1840 					    create, &arg, RFS4_DBS_VALID);
1841 
1842 	return (op);
1843 }
1844 
/*
 * Advance the open owner's sequence id by one, under the entry's dbe
 * lock.
 */
void
rfs4_update_open_sequence(rfs4_openowner_t *op)
{

	rfs4_dbe_lock(op->dbe);

	op->open_seqid++;

	rfs4_dbe_unlock(op->dbe);
}
1855 
1856 void
1857 rfs4_update_open_resp(rfs4_openowner_t *op, nfs_resop4 *resp, nfs_fh4 *fh)
1858 {
1859 
1860 	rfs4_dbe_lock(op->dbe);
1861 
1862 	rfs4_free_reply(op->reply);
1863 
1864 	rfs4_copy_reply(op->reply, resp);
1865 
1866 	/* Save the filehandle if provided and free if not used */
1867 	if (resp->nfs_resop4_u.opopen.status == NFS4_OK &&
1868 	    fh && fh->nfs_fh4_len) {
1869 		if (op->reply_fh.nfs_fh4_val == NULL)
1870 			op->reply_fh.nfs_fh4_val =
1871 				kmem_alloc(fh->nfs_fh4_len, KM_SLEEP);
1872 		nfs_fh4_copy(fh, &op->reply_fh);
1873 	} else {
1874 		if (op->reply_fh.nfs_fh4_val) {
1875 			kmem_free(op->reply_fh.nfs_fh4_val,
1876 				op->reply_fh.nfs_fh4_len);
1877 			op->reply_fh.nfs_fh4_val = NULL;
1878 			op->reply_fh.nfs_fh4_len = 0;
1879 		}
1880 	}
1881 
1882 	rfs4_dbe_unlock(op->dbe);
1883 }
1884 
1885 static bool_t
1886 lockowner_compare(rfs4_entry_t u_entry, void *key)
1887 {
1888 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
1889 	lock_owner4 *b = (lock_owner4 *)key;
1890 
1891 	if (lo->owner.clientid != b->clientid)
1892 		return (FALSE);
1893 
1894 	if (lo->owner.owner_len != b->owner_len)
1895 		return (FALSE);
1896 
1897 	return (bcmp(lo->owner.owner_val, b->owner_val,
1898 					lo->owner.owner_len) == 0);
1899 }
1900 
1901 void *
1902 lockowner_mkkey(rfs4_entry_t u_entry)
1903 {
1904 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
1905 
1906 	return (&lo->owner);
1907 }
1908 
1909 static uint32_t
1910 lockowner_hash(void *key)
1911 {
1912 	int i;
1913 	lock_owner4 *lockowner = key;
1914 	uint_t hash = 0;
1915 
1916 	for (i = 0; i < lockowner->owner_len; i++) {
1917 		hash <<= 4;
1918 		hash += (uint_t)lockowner->owner_val[i];
1919 	}
1920 	hash += (uint_t)lockowner->clientid;
1921 	hash |= (lockowner->clientid >> 32);
1922 
1923 	return (hash);
1924 }
1925 
/*
 * Hash routine for the "pid" index.  The key is not a pointer to data:
 * the pid value itself is carried in the pointer (see pid_mkkey()), so
 * the hash is simply that value narrowed to 32 bits.
 */
static uint32_t
pid_hash(void *key)
{
	return ((uint32_t)(uintptr_t)key);
}
1931 
1932 static void *
1933 pid_mkkey(rfs4_entry_t u_entry)
1934 {
1935 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
1936 
1937 	return ((void *)(uintptr_t)lo->pid);
1938 }
1939 
1940 static bool_t
1941 pid_compare(rfs4_entry_t u_entry, void *key)
1942 {
1943 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
1944 
1945 	return (lo->pid == (pid_t)(uintptr_t)key);
1946 }
1947 
1948 static void
1949 rfs4_lockowner_destroy(rfs4_entry_t u_entry)
1950 {
1951 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
1952 
1953 	/* Free the lock owner id */
1954 	kmem_free(lo->owner.owner_val, lo->owner.owner_len);
1955 	rfs4_client_rele(lo->client);
1956 }
1957 
/*
 * Release a lock owner entry by calling rfs4_dbe_rele() on its
 * database entry.
 */
void
rfs4_lockowner_rele(rfs4_lockowner_t *lo)
{
	rfs4_dbe_rele(lo->dbe);
}
1963 
/*
 * Expiry callback for the lock owner table.  The entry itself is never
 * inspected; the answer is unconditionally TRUE.
 */
/* ARGSUSED */
static bool_t
rfs4_lockowner_expiry(rfs4_entry_t u_entry)
{
	/*
	 * Since expiry is called with no other references on
	 * this struct, go ahead and have it removed.
	 */
	return (TRUE);
}
1974 
1975 static bool_t
1976 rfs4_lockowner_create(rfs4_entry_t u_entry, void *arg)
1977 {
1978 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
1979 	lock_owner4 *lockowner = (lock_owner4 *)arg;
1980 	rfs4_client_t *cp;
1981 	bool_t create = FALSE;
1982 
1983 	rw_enter(&rfs4_findclient_lock, RW_READER);
1984 
1985 	cp = (rfs4_client_t *)rfs4_dbsearch(rfs4_clientid_idx,
1986 					&lockowner->clientid,
1987 					&create, NULL, RFS4_DBS_VALID);
1988 
1989 	rw_exit(&rfs4_findclient_lock);
1990 
1991 	if (cp == NULL)
1992 		return (FALSE);
1993 
1994 	/* Reference client */
1995 	lo->client = cp;
1996 	lo->owner.clientid = lockowner->clientid;
1997 	lo->owner.owner_val = kmem_alloc(lockowner->owner_len, KM_SLEEP);
1998 	bcopy(lockowner->owner_val, lo->owner.owner_val, lockowner->owner_len);
1999 	lo->owner.owner_len = lockowner->owner_len;
2000 	lo->pid = rfs4_dbe_getid(lo->dbe);
2001 
2002 	return (TRUE);
2003 }
2004 
2005 rfs4_lockowner_t *
2006 rfs4_findlockowner(lock_owner4 *lockowner, bool_t *create)
2007 {
2008 	rfs4_lockowner_t *lo;
2009 
2010 	lo = (rfs4_lockowner_t *)rfs4_dbsearch(rfs4_lockowner_idx, lockowner,
2011 					    create, lockowner, RFS4_DBS_VALID);
2012 
2013 	return (lo);
2014 }
2015 
2016 rfs4_lockowner_t *
2017 rfs4_findlockowner_by_pid(pid_t pid)
2018 {
2019 	rfs4_lockowner_t *lo;
2020 	bool_t create = FALSE;
2021 
2022 	lo = (rfs4_lockowner_t *)rfs4_dbsearch(rfs4_lockowner_pid_idx,
2023 		(void *)(uintptr_t)pid, &create, NULL, RFS4_DBS_VALID);
2024 
2025 	return (lo);
2026 }
2027 
2028 
/*
 * Hash routine for the "Filehandle" index.  The key is the vnode
 * pointer itself (see file_mkkey()); ADDRHASH() turns that address
 * into the hash value.
 */
static uint32_t
file_hash(void *key)
{
	return (ADDRHASH(key));
}
2034 
2035 static void *
2036 file_mkkey(rfs4_entry_t u_entry)
2037 {
2038 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2039 
2040 	return (fp->vp);
2041 }
2042 
2043 static bool_t
2044 file_compare(rfs4_entry_t u_entry, void *key)
2045 {
2046 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2047 
2048 	return (fp->vp == (vnode_t *)key);
2049 }
2050 
2051 static void
2052 rfs4_file_destroy(rfs4_entry_t u_entry)
2053 {
2054 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2055 
2056 	ASSERT(fp->delegationlist.next == &fp->delegationlist);
2057 	if (fp->filehandle.nfs_fh4_val)
2058 		kmem_free(fp->filehandle.nfs_fh4_val,
2059 			fp->filehandle.nfs_fh4_len);
2060 	cv_destroy(fp->dinfo->recall_cv);
2061 	if (fp->vp) {
2062 		VN_RELE(fp->vp);
2063 		fp->vp = NULL;
2064 	}
2065 	rw_destroy(&fp->file_rwlock);
2066 }
2067 
/*
 * Release the file's underlying dbe struct only (rfs4_dbe_rele()).
 * The file's rw lock is not touched; use rfs4_file_rele_withunlock()
 * when that lock is held as well.
 */
void
rfs4_file_rele(rfs4_file_t *fp)
{
	rfs4_dbe_rele(fp->dbe);
}
2076 
/*
 * Used to unlock the file rw lock and release the file's dbe entry.
 * Only used to pair with rfs4_findfile_withlock(), which returns the
 * entry with file_rwlock held as writer.
 */
void
rfs4_file_rele_withunlock(rfs4_file_t *fp)
{
	/* Drop the rw lock first, then release the dbe entry */
	rw_exit(&fp->file_rwlock);
	rfs4_dbe_rele(fp->dbe);
}
2087 
/*
 * Argument bundle handed to rfs4_file_create() through rfs4_dbsearch():
 * the vnode to hold and the filehandle to copy into the new entry.
 */
typedef struct {
    vnode_t *vp;
    nfs_fh4 *fh;
} rfs4_fcreate_arg;
2092 
2093 static bool_t
2094 rfs4_file_create(rfs4_entry_t u_entry, void *arg)
2095 {
2096 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2097 	rfs4_fcreate_arg *ap = (rfs4_fcreate_arg *)arg;
2098 	vnode_t *vp = ap->vp;
2099 	nfs_fh4 *fh = ap->fh;
2100 
2101 	VN_HOLD(vp);
2102 
2103 	fp->filehandle.nfs_fh4_len = 0;
2104 	fp->filehandle.nfs_fh4_val = NULL;
2105 	ASSERT(fh && fh->nfs_fh4_len);
2106 	if (fh && fh->nfs_fh4_len) {
2107 		fp->filehandle.nfs_fh4_val =
2108 			kmem_alloc(fh->nfs_fh4_len, KM_SLEEP);
2109 		nfs_fh4_copy(fh, &fp->filehandle);
2110 	}
2111 	fp->vp = vp;
2112 
2113 	/* Init list for remque/insque */
2114 	fp->delegationlist.next = fp->delegationlist.prev =
2115 		&fp->delegationlist;
2116 	fp->delegationlist.dsp = NULL; /* NULL since this is state list */
2117 
2118 	fp->share_deny = fp->share_access = fp->access_read = 0;
2119 	fp->access_write = fp->deny_read = fp->deny_write = 0;
2120 
2121 	mutex_init(fp->dinfo->recall_lock, NULL, MUTEX_DEFAULT, NULL);
2122 	cv_init(fp->dinfo->recall_cv, NULL, CV_DEFAULT, NULL);
2123 
2124 	fp->dinfo->dtype = OPEN_DELEGATE_NONE;
2125 
2126 	rw_init(&fp->file_rwlock, NULL, RW_DEFAULT, NULL);
2127 
2128 	return (TRUE);
2129 }
2130 
2131 rfs4_file_t *
2132 rfs4_findfile(vnode_t *vp, nfs_fh4 *fh, bool_t *create)
2133 {
2134 	rfs4_file_t *fp;
2135 	rfs4_fcreate_arg arg;
2136 
2137 	arg.vp = vp;
2138 	arg.fh = fh;
2139 
2140 	fp = (rfs4_file_t *)rfs4_dbsearch(rfs4_file_idx, vp, create,
2141 					&arg, RFS4_DBS_VALID);
2142 	return (fp);
2143 }
2144 
2145 /*
2146  * Find a file in the db and once it is located, take the rw lock.
2147  * Need to check the vnode pointer and if it does not exist (it was
2148  * removed between the db location and check) redo the find.  This
2149  * assumes that a file struct that has a NULL vnode pointer is marked
2150  * at 'invalid' and will not be found in the db the second time
2151  * around.
2152  */
2153 rfs4_file_t *
2154 rfs4_findfile_withlock(vnode_t *vp, nfs_fh4 *fh, bool_t *create)
2155 {
2156 	rfs4_file_t *fp;
2157 	rfs4_fcreate_arg arg;
2158 	bool_t screate = *create;
2159 
2160 retry:
2161 	arg.vp = vp;
2162 	arg.fh = fh;
2163 
2164 	fp = (rfs4_file_t *)rfs4_dbsearch(rfs4_file_idx, vp, create,
2165 					&arg, RFS4_DBS_VALID);
2166 	if (fp != NULL) {
2167 		rw_enter(&fp->file_rwlock, RW_WRITER);
2168 		if (fp->vp == NULL) {
2169 			rw_exit(&fp->file_rwlock);
2170 			rfs4_file_rele(fp);
2171 			*create = screate;
2172 			goto retry;
2173 		}
2174 	}
2175 
2176 	return (fp);
2177 }
2178 
2179 static uint32_t
2180 lo_state_hash(void *key)
2181 {
2182 	stateid_t *id = key;
2183 
2184 	return (id->bits.ident+id->bits.pid);
2185 }
2186 
2187 static bool_t
2188 lo_state_compare(rfs4_entry_t u_entry, void *key)
2189 {
2190 	rfs4_lo_state_t *lop = (rfs4_lo_state_t *)u_entry;
2191 	stateid_t *id = key;
2192 	bool_t rc;
2193 
2194 	rc = (lop->lockid.bits.boottime == id->bits.boottime &&
2195 	    lop->lockid.bits.type == id->bits.type &&
2196 	    lop->lockid.bits.ident == id->bits.ident &&
2197 	    lop->lockid.bits.pid == id->bits.pid);
2198 
2199 	return (rc);
2200 }
2201 
2202 static void *
2203 lo_state_mkkey(rfs4_entry_t u_entry)
2204 {
2205 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2206 
2207 	return (&lsp->lockid);
2208 }
2209 
2210 static bool_t
2211 rfs4_lo_state_expiry(rfs4_entry_t u_entry)
2212 {
2213 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2214 
2215 	if (rfs4_dbe_is_invalid(lsp->dbe))
2216 		return (TRUE);
2217 	if (lsp->state->closed)
2218 		return (TRUE);
2219 	return ((gethrestime_sec() - lsp->state->owner->client->last_access
2220 		> rfs4_lease_time));
2221 }
2222 
/*
 * Destroy callback for the lock state table.
 *
 * Releases any file locks still held on behalf of this lock state
 * (unless that was already done), unlinks the entry from the open
 * state's list of lock states, frees the cached reply and finally
 * releases the lock owner and the open state that
 * rfs4_lo_state_create() took holds on.
 */
static void
rfs4_lo_state_destroy(rfs4_entry_t u_entry)
{
	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;

	rfs4_sw_destroy(&lsp->ls_sw);

	/* Make sure to release the file locks */
	if (lsp->locks_cleaned == FALSE) {
		lsp->locks_cleaned = TRUE;
		/* Without a sysid there is nothing to clean up */
		if (lsp->locker->client->sysidt != LM_NOSYSID) {
			/* Is the PxFS kernel module loaded? */
			if (lm_remove_file_locks != NULL) {
				int new_sysid;

				/* Encode the cluster nodeid in new sysid */
				new_sysid = lsp->locker->client->sysidt;
				lm_set_nlmid_flk(&new_sysid);

				/*
				 * This PxFS routine removes file locks for a
				 * client over all nodes of a cluster.
				 */
				NFS4_DEBUG(rfs4_debug, (CE_NOTE,
				    "lm_remove_file_locks(sysid=0x%x)\n",
				    new_sysid));
				(*lm_remove_file_locks)(new_sysid);
			} else {
				/* Local case: clean locks on this vnode */
				(void) cleanlocks(lsp->state->finfo->vp,
				    lsp->locker->pid,
				    lsp->locker->client->sysidt);
			}
		}
	}

	/* Unlink from the open state's list under the state's dbe lock */
	rfs4_dbe_lock(lsp->state->dbe);

	remque(&lsp->lockownerlist);
	lsp->lockownerlist.next = lsp->lockownerlist.prev =
		&lsp->lockownerlist;

	rfs4_dbe_unlock(lsp->state->dbe);

	/* Free the last reply for this state */
	rfs4_free_reply(lsp->reply);

	/* Drop the hold on the lock owner taken at create time */
	rfs4_lockowner_rele(lsp->locker);
	lsp->locker = NULL;

	/* Drop the hold on the open state taken at create time */
	rfs4_state_rele_nounlock(lsp->state);
	lsp->state = NULL;
}
2275 
/*
 * Create callback for a lock-owner state entry.
 *
 * 'arg' is a template rfs4_lo_state_t whose locker and state fields
 * identify the lock owner and open state to bind to.  The new entry's
 * lockid is derived from the open state's stateid with the type set to
 * LOCKID, chgseq reset to 0 and pid taken from the lock owner.  A hold
 * is taken on both the lock owner and the open state, and the entry is
 * appended to the open state's lockownerlist.  Always returns TRUE.
 */
static bool_t
rfs4_lo_state_create(rfs4_entry_t u_entry, void *arg)
{
	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
	rfs4_lo_state_t *argp = (rfs4_lo_state_t *)arg;
	rfs4_lockowner_t *lo = argp->locker;
	rfs4_state_t *sp = argp->state;

	lsp->state = sp;

	/* Start from the open stateid, then specialize it as a lock id */
	lsp->lockid = sp->stateid;
	lsp->lockid.bits.type = LOCKID;
	lsp->lockid.bits.chgseq = 0;
	lsp->lockid.bits.pid = lo->pid;

	lsp->locks_cleaned = FALSE;
	lsp->lock_completed = FALSE;

	rfs4_sw_init(&lsp->ls_sw);

	/* Attached the supplied lock owner */
	rfs4_dbe_hold(lo->dbe);
	lsp->locker = lo;

	/* Init list node for remque/insque */
	lsp->lockownerlist.next = lsp->lockownerlist.prev =
		&lsp->lockownerlist;
	lsp->lockownerlist.lsp = lsp;

	rfs4_dbe_lock(sp->dbe);

	/* Append to the tail of the open state's lock owner list */
	insque(&lsp->lockownerlist, sp->lockownerlist.prev);

	/* Hold on the open state for the lsp->state reference */
	rfs4_dbe_hold(sp->dbe);

	rfs4_dbe_unlock(sp->dbe);

	return (TRUE);
}
2314 
2315 void
2316 rfs4_lo_state_rele(rfs4_lo_state_t *lsp, bool_t unlock_fp)
2317 {
2318 	if (unlock_fp == TRUE)
2319 		rw_exit(&lsp->state->finfo->file_rwlock);
2320 	rfs4_dbe_rele(lsp->dbe);
2321 }
2322 
2323 static rfs4_lo_state_t *
2324 rfs4_findlo_state(stateid_t *id, bool_t lock_fp)
2325 {
2326 	rfs4_lo_state_t *lsp;
2327 	bool_t create = FALSE;
2328 
2329 	lsp = (rfs4_lo_state_t *)rfs4_dbsearch(rfs4_lo_state_idx, id,
2330 					    &create, NULL, RFS4_DBS_VALID);
2331 	if (lock_fp == TRUE && lsp != NULL)
2332 		rw_enter(&lsp->state->finfo->file_rwlock, RW_READER);
2333 
2334 	return (lsp);
2335 }
2336 
2337 
2338 static uint32_t
2339 lo_state_lo_hash(void *key)
2340 {
2341 	rfs4_lo_state_t *lop = key;
2342 
2343 	return (ADDRHASH(lop->locker) ^ ADDRHASH(lop->state));
2344 }
2345 
2346 static bool_t
2347 lo_state_lo_compare(rfs4_entry_t u_entry, void *key)
2348 {
2349 	rfs4_lo_state_t *lop = (rfs4_lo_state_t *)u_entry;
2350 	rfs4_lo_state_t *keyp = key;
2351 
2352 	return (keyp->locker == lop->locker && keyp->state == lop->state);
2353 }
2354 
/*
 * Key constructor for the by-owner lock-owner state lookup: the entry
 * itself serves as the key (lo_state_lo_hash/lo_state_lo_compare read
 * its locker and state fields).
 */
static void *
lo_state_lo_mkkey(rfs4_entry_t u_entry)
{
	return (u_entry);
}
2360 
2361 rfs4_lo_state_t *
2362 rfs4_findlo_state_by_owner(rfs4_lockowner_t *lo,
2363 			rfs4_state_t *sp, bool_t *create)
2364 {
2365 	rfs4_lo_state_t *lsp;
2366 	rfs4_lo_state_t arg;
2367 
2368 	arg.locker = lo;
2369 	arg.state = sp;
2370 
2371 	lsp = (rfs4_lo_state_t *)rfs4_dbsearch(rfs4_lo_state_owner_idx, &arg,
2372 					    create, &arg, RFS4_DBS_VALID);
2373 
2374 	return (lsp);
2375 }
2376 
2377 static stateid_t
2378 get_stateid(id_t eid)
2379 {
2380 	stateid_t id;
2381 
2382 	id.bits.boottime = rfs4_start_time;
2383 	id.bits.ident = eid;
2384 	id.bits.chgseq = 0;
2385 	id.bits.type = 0;
2386 	id.bits.pid = 0;
2387 
2388 	/*
2389 	 * If we are booted as a cluster node, embed our nodeid.
2390 	 * We've already done sanity checks in rfs4_client_create() so no
2391 	 * need to repeat them here.
2392 	 */
2393 	id.bits.clnodeid = (cluster_bootflags & CLUSTER_BOOTED) ?
2394 	    clconf_get_nodeid() : 0;
2395 
2396 	return (id);
2397 }
2398 
2399 /*
2400  * For use only when booted as a cluster node.
2401  * Returns TRUE if the embedded nodeid indicates that this stateid was
2402  * generated on another node.
2403  */
2404 static int
2405 foreign_stateid(stateid_t *id)
2406 {
2407 	ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2408 	return (id->bits.clnodeid != (uint32_t)clconf_get_nodeid());
2409 }
2410 
2411 /*
2412  * For use only when booted as a cluster node.
2413  * Returns TRUE if the embedded nodeid indicates that this clientid was
2414  * generated on another node.
2415  */
2416 static int
2417 foreign_clientid(cid *cidp)
2418 {
2419 	ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2420 	return (cidp->impl_id.c_id >> CLUSTER_NODEID_SHIFT !=
2421 	    (uint32_t)clconf_get_nodeid());
2422 }
2423 
2424 /*
2425  * For use only when booted as a cluster node.
2426  * Embed our cluster nodeid into the clientid.
2427  */
2428 static void
2429 embed_nodeid(cid *cidp)
2430 {
2431 	int clnodeid;
2432 	/*
2433 	 * Currently, our state tables are small enough that their
2434 	 * ids will leave enough bits free for the nodeid. If the
2435 	 * tables become larger, we mustn't overwrite the id.
2436 	 * Equally, we only have room for so many bits of nodeid, so
2437 	 * must check that too.
2438 	 */
2439 	ASSERT(cluster_bootflags & CLUSTER_BOOTED);
2440 	ASSERT(cidp->impl_id.c_id >> CLUSTER_NODEID_SHIFT == 0);
2441 	clnodeid = clconf_get_nodeid();
2442 	ASSERT(clnodeid <= CLUSTER_MAX_NODEID);
2443 	ASSERT(clnodeid != NODEID_UNKNOWN);
2444 	cidp->impl_id.c_id |= (clnodeid << CLUSTER_NODEID_SHIFT);
2445 }
2446 
2447 static uint32_t
2448 state_hash(void *key)
2449 {
2450 	stateid_t *ip = (stateid_t *)key;
2451 
2452 	return (ip->bits.ident);
2453 }
2454 
2455 static bool_t
2456 state_compare(rfs4_entry_t u_entry, void *key)
2457 {
2458 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2459 	stateid_t *id = (stateid_t *)key;
2460 	bool_t rc;
2461 
2462 	rc = (sp->stateid.bits.boottime == id->bits.boottime &&
2463 	    sp->stateid.bits.ident == id->bits.ident);
2464 
2465 	return (rc);
2466 }
2467 
2468 static void *
2469 state_mkkey(rfs4_entry_t u_entry)
2470 {
2471 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2472 
2473 	return (&sp->stateid);
2474 }
2475 
/*
 * Destroy callback for an open state entry.
 *
 * Expects the state's lockownerlist to be empty (asserted).  If the
 * state was never closed, rfs4_unshare() is called for it.  The file
 * reference is then released, the state is unlinked from its open
 * owner's ownerstateids list under the owner's dbe lock, and the open
 * owner reference is released.
 */
static void
rfs4_state_destroy(rfs4_entry_t u_entry)
{
	rfs4_state_t *sp = (rfs4_state_t *)u_entry;

	/* No lock-owner states may still be attached */
	ASSERT(&sp->lockownerlist == sp->lockownerlist.next);

	/* release any share locks for this stateid if it's still open */
	if (!sp->closed)
		rfs4_unshare(sp);

	/* We're done with the file */
	rfs4_file_rele(sp->finfo);
	sp->finfo = NULL;

	/* And now with the openowner */
	rfs4_dbe_lock(sp->owner->dbe);

	remque(&sp->ownerstateids);
	/* Leave the node self-linked after removal */
	sp->ownerstateids.next = sp->ownerstateids.prev = &sp->ownerstateids;

	rfs4_dbe_unlock(sp->owner->dbe);

	rfs4_openowner_rele(sp->owner);
	sp->owner = NULL;
}
2502 
/*
 * Release a reference on an open state entry without touching the
 * file's file_rwlock.  Counterpart of rfs4_state_rele() for callers
 * that obtained the state without locking the file.
 */
static void
rfs4_state_rele_nounlock(rfs4_state_t *sp)
{
	rfs4_dbe_rele(sp->dbe);
}
2508 
/*
 * Release an open state entry that was obtained with the file's
 * file_rwlock held: exit the rwlock first, then release the reference
 * on the state's dbe.
 */
void
rfs4_state_rele(rfs4_state_t *sp)
{
	/* Exit the rwlock while sp is still referenced */
	rw_exit(&sp->finfo->file_rwlock);
	rfs4_dbe_rele(sp->dbe);
}
2515 
2516 static uint32_t
2517 deleg_hash(void *key)
2518 {
2519 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)key;
2520 
2521 	return (ADDRHASH(dsp->client) ^ ADDRHASH(dsp->finfo));
2522 }
2523 
2524 static bool_t
2525 deleg_compare(rfs4_entry_t u_entry, void *key)
2526 {
2527 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2528 	rfs4_deleg_state_t *kdsp = (rfs4_deleg_state_t *)key;
2529 
2530 	return (dsp->client == kdsp->client && dsp->finfo == kdsp->finfo);
2531 }
2532 
/*
 * Key constructor for the client/file delegation lookup: the entry
 * itself serves as the key (deleg_hash/deleg_compare read its client
 * and finfo fields).
 */
static void *
deleg_mkkey(rfs4_entry_t u_entry)
{
	return (u_entry);
}
2538 
2539 static uint32_t
2540 deleg_state_hash(void *key)
2541 {
2542 	stateid_t *ip = (stateid_t *)key;
2543 
2544 	return (ip->bits.ident);
2545 }
2546 
2547 static bool_t
2548 deleg_state_compare(rfs4_entry_t u_entry, void *key)
2549 {
2550 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2551 	stateid_t *id = (stateid_t *)key;
2552 	bool_t rc;
2553 
2554 	if (id->bits.type != DELEGID)
2555 		return (FALSE);
2556 
2557 	rc = (dsp->delegid.bits.boottime == id->bits.boottime &&
2558 	    dsp->delegid.bits.ident == id->bits.ident);
2559 
2560 	return (rc);
2561 }
2562 
2563 static void *
2564 deleg_state_mkkey(rfs4_entry_t u_entry)
2565 {
2566 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2567 
2568 	return (&dsp->delegid);
2569 }
2570 
2571 static bool_t
2572 rfs4_deleg_state_expiry(rfs4_entry_t u_entry)
2573 {
2574 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2575 
2576 	if (rfs4_dbe_is_invalid(dsp->dbe))
2577 		return (TRUE);
2578 	return ((gethrestime_sec() - dsp->client->last_access
2579 		> rfs4_lease_time));
2580 
2581 }
2582 
/*
 * Create callback for a delegation state entry.
 *
 * 'argp' is a template rfs4_deleg_state_t whose finfo and client fields
 * identify the file and client.  A hold is taken on both, a new stateid
 * of type DELEGID is generated from the entry's dbe id, the delegation
 * type starts out as OPEN_DELEGATE_NONE, and the entry is appended to
 * the client's clientdeleglist.  The delegationlist node is only
 * initialized here, not inserted.  Always returns TRUE.
 */
static bool_t
rfs4_deleg_state_create(rfs4_entry_t u_entry, void *argp)
{
	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
	rfs4_file_t *fp = ((rfs4_deleg_state_t *)argp)->finfo;
	rfs4_client_t *cp = ((rfs4_deleg_state_t *)argp)->client;

	/* References for dsp->finfo and dsp->client */
	rfs4_dbe_hold(fp->dbe);
	rfs4_dbe_hold(cp->dbe);

	dsp->delegid = get_stateid(rfs4_dbe_getid(dsp->dbe));
	dsp->delegid.bits.type = DELEGID;
	dsp->finfo = fp;
	dsp->client = cp;
	dsp->dtype = OPEN_DELEGATE_NONE;

	dsp->time_granted = gethrestime_sec();	/* observability */
	dsp->time_revoked = 0;

	/* Init lists for remque/insque */
	dsp->delegationlist.next = dsp->delegationlist.prev =
		&dsp->delegationlist;
	dsp->delegationlist.dsp = dsp;

	dsp->clientdeleglist.next = dsp->clientdeleglist.prev =
		&dsp->clientdeleglist;
	dsp->clientdeleglist.dsp = dsp;

	/* Insert state at the tail of the client's delegation list */
	rfs4_dbe_lock(cp->dbe);

	insque(&dsp->clientdeleglist, cp->clientdeleglist.prev);

	rfs4_dbe_unlock(cp->dbe);

	return (TRUE);
}
2620 
/*
 * Destroy callback for a delegation state entry.
 *
 * If the entry is still linked on a delegationlist, the delegation is
 * returned via rfs4_return_deleg().  The file reference is released,
 * the entry is unlinked from the client's clientdeleglist under the
 * client's dbe lock, and the client reference is released.
 */
static void
rfs4_deleg_state_destroy(rfs4_entry_t u_entry)
{
	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;

	/* A self-linked node means it is not on any delegation list */
	if (&dsp->delegationlist != dsp->delegationlist.next)
		rfs4_return_deleg(dsp, FALSE);

	/* We're done with the file */
	rfs4_file_rele(dsp->finfo);
	dsp->finfo = NULL;

	/* And now with the client */
	rfs4_dbe_lock(dsp->client->dbe);

	remque(&dsp->clientdeleglist);
	/* Leave the node self-linked after removal */
	dsp->clientdeleglist.next = dsp->clientdeleglist.prev =
		&dsp->clientdeleglist;

	rfs4_dbe_unlock(dsp->client->dbe);

	rfs4_client_rele(dsp->client);
	dsp->client = NULL;
}
2645 
2646 rfs4_deleg_state_t *
2647 rfs4_finddeleg(rfs4_state_t *sp, bool_t *create)
2648 {
2649 	rfs4_deleg_state_t ds, *dsp;
2650 
2651 	ds.client = sp->owner->client;
2652 	ds.finfo = sp->finfo;
2653 
2654 	dsp = (rfs4_deleg_state_t *)rfs4_dbsearch(rfs4_deleg_idx, &ds,
2655 					create, &ds, RFS4_DBS_VALID);
2656 
2657 	return (dsp);
2658 }
2659 
2660 rfs4_deleg_state_t *
2661 rfs4_finddelegstate(stateid_t *id)
2662 {
2663 	rfs4_deleg_state_t *dsp;
2664 	bool_t create = FALSE;
2665 
2666 	dsp = (rfs4_deleg_state_t *)rfs4_dbsearch(rfs4_deleg_state_idx, id,
2667 					&create, NULL, RFS4_DBS_VALID);
2668 
2669 	return (dsp);
2670 }
2671 
/*
 * Release a reference on a delegation state entry by calling
 * rfs4_dbe_rele() on its dbe.
 */
void
rfs4_deleg_state_rele(rfs4_deleg_state_t *dsp)
{
	rfs4_dbe_rele(dsp->dbe);
}
2677 
2678 void
2679 rfs4_update_lock_sequence(rfs4_lo_state_t *lsp)
2680 {
2681 
2682 	rfs4_dbe_lock(lsp->dbe);
2683 
2684 	/*
2685 	 * If we are skipping sequence id checking, this means that
2686 	 * this is the first lock request and therefore the sequence
2687 	 * id does not need to be updated.  This only happens on the
2688 	 * first lock request for a lockowner
2689 	 */
2690 	if (!lsp->skip_seqid_check)
2691 		lsp->seqid++;
2692 
2693 	rfs4_dbe_unlock(lsp->dbe);
2694 }
2695 
/*
 * Replace the cached reply of a lock-owner state with 'resp'.
 * Under the entry's dbe lock the previously cached reply is freed with
 * rfs4_free_reply() and the new one is stored with rfs4_copy_reply().
 */
void
rfs4_update_lock_resp(rfs4_lo_state_t *lsp, nfs_resop4 *resp)
{

	rfs4_dbe_lock(lsp->dbe);

	/* Discard the old cached reply before copying in the new one */
	rfs4_free_reply(lsp->reply);

	rfs4_copy_reply(lsp->reply, resp);

	rfs4_dbe_unlock(lsp->dbe);
}
2708 
/*
 * Close every open state belonging to open owner 'op' and invalidate
 * the open owner itself.
 *
 * op               - open owner whose ownerstateids list is walked
 * invalidate       - when TRUE, each state's dbe is also invalidated
 * close_of_client  - passed through to rfs4_state_close()
 *
 * The list walk is done while holding the open owner's dbe lock; the
 * owner's own dbe is invalidated after that lock is dropped.
 */
void
rfs4_free_opens(rfs4_openowner_t *op, bool_t invalidate,
	bool_t close_of_client)
{
	rfs4_state_t *sp;

	rfs4_dbe_lock(op->dbe);

	/*
	 * The walk stops at a list node whose sp is NULL -- presumably
	 * the list head embedded in the open owner (TODO confirm).
	 */
	for (sp = op->ownerstateids.next->sp; sp != NULL;
		sp = sp->ownerstateids.next->sp) {
		/* FALSE: the state's dbe lock is not held by us */
		rfs4_state_close(sp, FALSE, close_of_client, CRED());
		if (invalidate == TRUE)
			rfs4_dbe_invalidate(sp->dbe);
	}

	rfs4_dbe_unlock(op->dbe);
	rfs4_dbe_invalidate(op->dbe);
}
2727 
2728 static uint32_t
2729 state_owner_file_hash(void *key)
2730 {
2731 	rfs4_state_t *sp = key;
2732 
2733 	return (ADDRHASH(sp->owner) ^ ADDRHASH(sp->finfo));
2734 }
2735 
2736 static bool_t
2737 state_owner_file_compare(rfs4_entry_t u_entry, void *key)
2738 {
2739 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2740 	rfs4_state_t *arg = key;
2741 
2742 	if (sp->closed == TRUE)
2743 		return (FALSE);
2744 
2745 	return (arg->owner == sp->owner && arg->finfo == sp->finfo);
2746 }
2747 
/*
 * Key constructor for the owner/file open state lookup: the entry
 * itself serves as the key (state_owner_file_hash and
 * state_owner_file_compare read its owner and finfo fields).
 */
static void *
state_owner_file_mkkey(rfs4_entry_t u_entry)
{
	return (u_entry);
}
2753 
/*
 * Hash for the by-file open state lookup.  The key is the file pointer
 * itself (see state_file_mkkey), so the hash is its address hash.
 */
static uint32_t
state_file_hash(void *key)
{
	return (ADDRHASH(key));
}
2759 
2760 static bool_t
2761 state_file_compare(rfs4_entry_t u_entry, void *key)
2762 {
2763 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2764 	rfs4_file_t *fp = key;
2765 
2766 	if (sp->closed == TRUE)
2767 		return (FALSE);
2768 
2769 	return (fp == sp->finfo);
2770 }
2771 
2772 static void *
2773 state_file_mkkey(rfs4_entry_t u_entry)
2774 {
2775 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2776 
2777 	return (sp->finfo);
2778 }
2779 
2780 rfs4_state_t *
2781 rfs4_findstate_by_owner_file(rfs4_openowner_t *op, rfs4_file_t *file,
2782 	bool_t *create)
2783 {
2784 	rfs4_state_t *sp;
2785 	rfs4_state_t key;
2786 
2787 	key.owner = op;
2788 	key.finfo = file;
2789 
2790 	sp = (rfs4_state_t *)rfs4_dbsearch(rfs4_state_owner_file_idx, &key,
2791 					create, &key, RFS4_DBS_VALID);
2792 
2793 	return (sp);
2794 }
2795 
2796 /* This returns ANY state struct that refers to this file */
2797 static rfs4_state_t *
2798 rfs4_findstate_by_file(rfs4_file_t *fp)
2799 {
2800 	bool_t create = FALSE;
2801 
2802 	return ((rfs4_state_t *)rfs4_dbsearch(rfs4_state_file_idx, fp,
2803 		&create, fp, RFS4_DBS_VALID));
2804 }
2805 
2806 static bool_t
2807 rfs4_state_expiry(rfs4_entry_t u_entry)
2808 {
2809 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2810 
2811 	if (rfs4_dbe_is_invalid(sp->dbe))
2812 		return (TRUE);
2813 
2814 	if (sp->closed == TRUE &&
2815 	    ((gethrestime_sec() - rfs4_dbe_get_timerele(sp->dbe))
2816 		> rfs4_lease_time))
2817 		return (TRUE);
2818 
2819 	return ((gethrestime_sec() - sp->owner->client->last_access
2820 		> rfs4_lease_time));
2821 }
2822 
/*
 * Create callback for an open state entry.
 *
 * 'argp' is a template rfs4_state_t whose finfo and owner fields
 * identify the file and open owner.  A hold is taken on both, a new
 * stateid of type OPENID is generated from the entry's dbe id, both
 * list nodes are initialized, and the state is appended to the open
 * owner's ownerstateids list.  Always returns TRUE.
 */
static bool_t
rfs4_state_create(rfs4_entry_t u_entry, void *argp)
{
	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
	rfs4_file_t *fp = ((rfs4_state_t *)argp)->finfo;
	rfs4_openowner_t *op = ((rfs4_state_t *)argp)->owner;

	/* References for sp->finfo and sp->owner */
	rfs4_dbe_hold(fp->dbe);
	rfs4_dbe_hold(op->dbe);
	sp->stateid = get_stateid(rfs4_dbe_getid(sp->dbe));
	sp->stateid.bits.type = OPENID;
	sp->owner = op;
	sp->finfo = fp;

	/* Init lists for remque/insque */
	sp->ownerstateids.next = sp->ownerstateids.prev = &sp->ownerstateids;
	sp->ownerstateids.sp = sp;
	sp->lockownerlist.next = sp->lockownerlist.prev = &sp->lockownerlist;
	/* This node is the head of the lock owner list, so it has no lsp */
	sp->lockownerlist.lsp = NULL;

	/* Insert state on per open owner's list */
	rfs4_dbe_lock(op->dbe);

	insque(&sp->ownerstateids, op->ownerstateids.prev);

	rfs4_dbe_unlock(op->dbe);

	return (TRUE);
}
2852 
2853 static rfs4_state_t *
2854 rfs4_findstate(stateid_t *id, rfs4_dbsearch_type_t find_invalid,
2855 		bool_t lock_fp)
2856 {
2857 	rfs4_state_t *sp;
2858 	bool_t create = FALSE;
2859 
2860 	sp = (rfs4_state_t *)rfs4_dbsearch(rfs4_state_idx, id,
2861 					&create, NULL, find_invalid);
2862 	if (lock_fp == TRUE && sp != NULL)
2863 		rw_enter(&sp->finfo->file_rwlock, RW_READER);
2864 
2865 	return (sp);
2866 }
2867 
/*
 * Mark an open state as closed and release its share/lock state.
 *
 * sp               - the open state to close
 * lock_held        - TRUE if the caller already holds sp's dbe lock; if
 *                    FALSE the lock is taken and dropped here
 * close_of_client  - passed through to rfs4_release_share_lock_state()
 * cr               - credentials passed through to the same routine
 *
 * Closing is idempotent: if the state is already closed nothing is
 * done.
 */
void
rfs4_state_close(rfs4_state_t *sp, bool_t lock_held,
			bool_t close_of_client, cred_t *cr)
{
	/* Remove the associated lo_state owners */
	if (!lock_held)
		rfs4_dbe_lock(sp->dbe);
	if (sp->closed == FALSE) {
		/* Set the flag first so the release is only done once */
		sp->closed = TRUE;

		rfs4_release_share_lock_state(sp, cr, close_of_client);
	}
	if (!lock_held)
		rfs4_dbe_unlock(sp->dbe);
}
2883 
/*
 * Remove all state associated with the given client.
 *
 * Walks the client's openownerlist while holding the client's dbe lock
 * and calls rfs4_free_opens() on each open owner with both invalidate
 * and close_of_client set to TRUE.
 */
void
rfs4_client_state_remove(rfs4_client_t *cp)
{
	rfs4_openowner_t *oop;

	rfs4_dbe_lock(cp->dbe);

	/*
	 * The walk stops at a list node whose oop is NULL -- presumably
	 * the list head embedded in the client (TODO confirm).
	 */
	for (oop = cp->openownerlist.next->oop;  oop != NULL;
		oop = oop->openownerlist.next->oop) {
		rfs4_free_opens(oop, TRUE, TRUE);
	}

	rfs4_dbe_unlock(cp->dbe);
}
2901 
/*
 * Close down a client: invalidate its dbe (under the dbe lock), remove
 * all of its open state via rfs4_client_state_remove(), and then
 * release the caller's reference on the client.
 */
void
rfs4_client_close(rfs4_client_t *cp)
{
	/* Mark client as going away. */
	rfs4_dbe_lock(cp->dbe);
	rfs4_dbe_invalidate(cp->dbe);
	rfs4_dbe_unlock(cp->dbe);

	rfs4_client_state_remove(cp);

	/* Release the client */
	rfs4_client_rele(cp);
}
2915 
2916 nfsstat4
2917 rfs4_check_clientid(clientid4 *cp, int setclid_confirm)
2918 {
2919 	cid *cidp = (cid *) cp;
2920 
2921 	/*
2922 	 * If we are booted as a cluster node, check the embedded nodeid.
2923 	 * If it indicates that this clientid was generated on another node,
2924 	 * inform the client accordingly.
2925 	 */
2926 	if (cluster_bootflags & CLUSTER_BOOTED && foreign_clientid(cidp))
2927 		return (NFS4ERR_STALE_CLIENTID);
2928 
2929 	/*
2930 	 * If the server start time matches the time provided
2931 	 * by the client (via the clientid) and this is NOT a
2932 	 * setclientid_confirm then return EXPIRED.
2933 	 */
2934 	if (!setclid_confirm && cidp->impl_id.start_time == rfs4_start_time)
2935 		return (NFS4ERR_EXPIRED);
2936 
2937 	return (NFS4ERR_STALE_CLIENTID);
2938 }
2939 
2940 /*
2941  * This is used when a stateid has not been found amongst the
2942  * current server's state.  Check the stateid to see if it
2943  * was from this server instantiation or not.
2944  */
2945 static nfsstat4
2946 what_stateid_error(stateid_t *id, stateid_type_t type)
2947 {
2948 	/* If we are booted as a cluster node, was stateid locally generated? */
2949 	if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
2950 		return (NFS4ERR_STALE_STATEID);
2951 
2952 	/* If types don't match then no use checking further */
2953 	if (type != id->bits.type)
2954 		return (NFS4ERR_BAD_STATEID);
2955 
2956 	/* From a previous server instantiation, return STALE */
2957 	if (id->bits.boottime < rfs4_start_time)
2958 		return (NFS4ERR_STALE_STATEID);
2959 
2960 	/*
2961 	 * From this server but the state is most likely beyond lease
2962 	 * timeout: return NFS4ERR_EXPIRED.  However, there is the
2963 	 * case of a delegation stateid.  For delegations, there is a
2964 	 * case where the state can be removed without the client's
2965 	 * knowledge/consent: revocation.  In the case of delegation
2966 	 * revocation, the delegation state will be removed and will
2967 	 * not be found.  If the client does something like a
2968 	 * DELEGRETURN or even a READ/WRITE with a delegatoin stateid
2969 	 * that has been revoked, the server should return BAD_STATEID
2970 	 * instead of the more common EXPIRED error.
2971 	 */
2972 	if (id->bits.boottime == rfs4_start_time) {
2973 		if (type == DELEGID)
2974 			return (NFS4ERR_BAD_STATEID);
2975 		else
2976 			return (NFS4ERR_EXPIRED);
2977 	}
2978 
2979 	return (NFS4ERR_BAD_STATEID);
2980 }
2981 
2982 /*
2983  * Used later on to find the various state structs.  When called from
2984  * rfs4_check_stateid()->rfs4_get_all_state(), no file struct lock is
2985  * taken (it is not needed) and helps on the read/write path with
2986  * respect to performance.
2987  */
2988 static nfsstat4
2989 rfs4_get_state_lockit(stateid4 *stateid, rfs4_state_t **spp,
2990 		rfs4_dbsearch_type_t find_invalid, bool_t lock_fp)
2991 {
2992 	stateid_t *id = (stateid_t *)stateid;
2993 	rfs4_state_t *sp;
2994 
2995 	*spp = NULL;
2996 
2997 	/* If we are booted as a cluster node, was stateid locally generated? */
2998 	if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
2999 		return (NFS4ERR_STALE_STATEID);
3000 
3001 	sp = rfs4_findstate(id, find_invalid, lock_fp);
3002 	if (sp == NULL) {
3003 		return (what_stateid_error(id, OPENID));
3004 	}
3005 
3006 	if (rfs4_lease_expired(sp->owner->client)) {
3007 		if (lock_fp == TRUE)
3008 			rfs4_state_rele(sp);
3009 		else
3010 			rfs4_state_rele_nounlock(sp);
3011 		return (NFS4ERR_EXPIRED);
3012 	}
3013 
3014 	*spp = sp;
3015 
3016 	return (NFS4_OK);
3017 }
3018 
/*
 * Find the open state for 'stateid' with the file's file_rwlock taken
 * (lock_fp is TRUE).  A state returned through *spp should therefore be
 * released with rfs4_state_rele().  See rfs4_get_state_lockit() for the
 * possible return values.
 */
nfsstat4
rfs4_get_state(stateid4 *stateid, rfs4_state_t **spp,
		rfs4_dbsearch_type_t find_invalid)
{
	return (rfs4_get_state_lockit(stateid, spp, find_invalid, TRUE));
}
3025 
3026 int
3027 rfs4_check_stateid_seqid(rfs4_state_t *sp, stateid4 *stateid)
3028 {
3029 	stateid_t *id = (stateid_t *)stateid;
3030 
3031 	if (rfs4_lease_expired(sp->owner->client))
3032 		return (NFS4_CHECK_STATEID_EXPIRED);
3033 
3034 	/* Stateid is some time in the future - that's bad */
3035 	if (sp->stateid.bits.chgseq < id->bits.chgseq)
3036 		return (NFS4_CHECK_STATEID_BAD);
3037 
3038 	if (sp->stateid.bits.chgseq == id->bits.chgseq + 1)
3039 		return (NFS4_CHECK_STATEID_REPLAY);
3040 
3041 	/* Stateid is some time in the past - that's old */
3042 	if (sp->stateid.bits.chgseq > id->bits.chgseq)
3043 		return (NFS4_CHECK_STATEID_OLD);
3044 
3045 	/* Caller needs to know about confirmation before closure */
3046 	if (sp->owner->need_confirm)
3047 		return (NFS4_CHECK_STATEID_UNCONFIRMED);
3048 
3049 	if (sp->closed == TRUE)
3050 		return (NFS4_CHECK_STATEID_CLOSED);
3051 
3052 	return (NFS4_CHECK_STATEID_OKAY);
3053 }
3054 
3055 int
3056 rfs4_check_lo_stateid_seqid(rfs4_lo_state_t *lsp, stateid4 *stateid)
3057 {
3058 	stateid_t *id = (stateid_t *)stateid;
3059 
3060 	if (rfs4_lease_expired(lsp->state->owner->client))
3061 		return (NFS4_CHECK_STATEID_EXPIRED);
3062 
3063 	/* Stateid is some time in the future - that's bad */
3064 	if (lsp->lockid.bits.chgseq < id->bits.chgseq)
3065 		return (NFS4_CHECK_STATEID_BAD);
3066 
3067 	if (lsp->lockid.bits.chgseq == id->bits.chgseq + 1)
3068 		return (NFS4_CHECK_STATEID_REPLAY);
3069 
3070 	/* Stateid is some time in the past - that's old */
3071 	if (lsp->lockid.bits.chgseq > id->bits.chgseq)
3072 		return (NFS4_CHECK_STATEID_OLD);
3073 
3074 	return (NFS4_CHECK_STATEID_OKAY);
3075 }
3076 
3077 nfsstat4
3078 rfs4_get_deleg_state(stateid4 *stateid, rfs4_deleg_state_t **dspp)
3079 {
3080 	stateid_t *id = (stateid_t *)stateid;
3081 	rfs4_deleg_state_t *dsp;
3082 
3083 	*dspp = NULL;
3084 
3085 	/* If we are booted as a cluster node, was stateid locally generated? */
3086 	if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3087 		return (NFS4ERR_STALE_STATEID);
3088 
3089 	dsp = rfs4_finddelegstate(id);
3090 	if (dsp == NULL) {
3091 		return (what_stateid_error(id, DELEGID));
3092 	}
3093 
3094 	if (rfs4_lease_expired(dsp->client)) {
3095 		rfs4_deleg_state_rele(dsp);
3096 		return (NFS4ERR_EXPIRED);
3097 	}
3098 
3099 	*dspp = dsp;
3100 
3101 	return (NFS4_OK);
3102 }
3103 
3104 nfsstat4
3105 rfs4_get_lo_state(stateid4 *stateid, rfs4_lo_state_t **lspp, bool_t lock_fp)
3106 {
3107 	stateid_t *id = (stateid_t *)stateid;
3108 	rfs4_lo_state_t *lsp;
3109 
3110 	*lspp = NULL;
3111 
3112 	/* If we are booted as a cluster node, was stateid locally generated? */
3113 	if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3114 		return (NFS4ERR_STALE_STATEID);
3115 
3116 	lsp = rfs4_findlo_state(id, lock_fp);
3117 	if (lsp == NULL) {
3118 		return (what_stateid_error(id, LOCKID));
3119 	}
3120 
3121 	if (rfs4_lease_expired(lsp->state->owner->client)) {
3122 		rfs4_lo_state_rele(lsp, lock_fp);
3123 		return (NFS4ERR_EXPIRED);
3124 	}
3125 
3126 	*lspp = lsp;
3127 
3128 	return (NFS4_OK);
3129 }
3130 
3131 static nfsstat4
3132 rfs4_get_all_state(stateid4 *sid, rfs4_state_t **spp,
3133 	rfs4_deleg_state_t **dspp, rfs4_lo_state_t **lospp)
3134 {
3135 	rfs4_state_t *sp = NULL;
3136 	rfs4_deleg_state_t *dsp = NULL;
3137 	rfs4_lo_state_t *losp = NULL;
3138 	stateid_t *id;
3139 	nfsstat4 status;
3140 
3141 	*spp = NULL; *dspp = NULL; *lospp = NULL;
3142 
3143 	id = (stateid_t *)sid;
3144 	switch (id->bits.type) {
3145 	case OPENID:
3146 		status = rfs4_get_state_lockit(sid, &sp, FALSE, FALSE);
3147 		break;
3148 	case DELEGID:
3149 		status = rfs4_get_deleg_state(sid, &dsp);
3150 		break;
3151 	case LOCKID:
3152 		status = rfs4_get_lo_state(sid, &losp, FALSE);
3153 		if (status == NFS4_OK) {
3154 			sp = losp->state;
3155 			rfs4_dbe_hold(sp->dbe);
3156 		}
3157 		break;
3158 	default:
3159 		status = NFS4ERR_BAD_STATEID;
3160 	}
3161 
3162 	if (status == NFS4_OK) {
3163 		*spp = sp;
3164 		*dspp = dsp;
3165 		*lospp = losp;
3166 	}
3167 
3168 	return (status);
3169 }
3170 
/*
 * Given the I/O mode (FREAD or FWRITE), this checks whether the
 * rfs4_state_t struct has access to do this operation and if so
 * return NFS4_OK; otherwise the proper NFSv4 error is returned.
 *
 * sp    - open state being checked; its dbe lock is held for the
 *         duration of the check
 * mode  - FREAD or FWRITE; any other value yields NFS4ERR_INVAL
 * vp    - vnode used to look up the file struct for the deny check
 *
 * For FWRITE the state must have been opened with write access.  For
 * FREAD a state opened without read access is still allowed unless
 * another open has the file open with deny READ; in either failure
 * case NFS4ERR_OPENMODE is returned.
 */
nfsstat4
rfs4_state_has_access(rfs4_state_t *sp, int mode, vnode_t *vp)
{
	nfsstat4 stat = NFS4_OK;
	rfs4_file_t *fp;
	bool_t create = FALSE;

	rfs4_dbe_lock(sp->dbe);
	if (mode == FWRITE) {
		if (!(sp->share_access & OPEN4_SHARE_ACCESS_WRITE)) {
			stat = NFS4ERR_OPENMODE;
		}
	} else if (mode == FREAD) {
		if (!(sp->share_access & OPEN4_SHARE_ACCESS_READ)) {
			/*
			 * If we have OPENed the file with DENYing access
			 * to both READ and WRITE then no one else could
			 * have OPENed the file, hence no conflicting READ
			 * deny.  This check is merely an optimization.
			 */
			if (sp->share_deny == OPEN4_SHARE_DENY_BOTH)
				goto out;

			/* Check against file struct's DENY mode */
			fp = rfs4_findfile(vp, NULL, &create);
			if (fp != NULL) {
				int deny_read = 0;
				rfs4_dbe_lock(fp->dbe);
				/*
				 * Check if any other open owner has the file
				 * OPENed with deny READ.
				 */
				/* Discount this state's own deny READ, if any */
				if (sp->share_deny & OPEN4_SHARE_DENY_READ)
					deny_read = 1;
				ASSERT(fp->deny_read - deny_read >= 0);
				if (fp->deny_read - deny_read > 0)
					stat = NFS4ERR_OPENMODE;
				rfs4_dbe_unlock(fp->dbe);
				rfs4_file_rele(fp);
			}
		}
	} else {
		/* Illegal I/O mode */
		stat = NFS4ERR_INVAL;
	}
out:
	rfs4_dbe_unlock(sp->dbe);
	return (stat);
}
3225 
/*
 * Given the I/O mode (FREAD or FWRITE), the vnode, the stateid and whether
 * the file is being truncated, return NFS4_OK if allowed or appropriate
 * V4 error if not. Note NFS4ERR_DELAY will be returned and a recall on
 * the associated file will be done if the I/O is not consistent with any
 * delegation in effect on the file. Should be holding VOP_RWLOCK, either
 * as reader or writer as appropriate. rfs4_op_open will acquire the
 * VOP_RWLOCK as writer when setting up delegation. If the stateid is bad
 * this routine will return NFS4ERR_BAD_STATEID. In addition, through the
 * deleg parameter, we will return whether a write delegation is held by
 * the client associated with this stateid.
 * If the server instance associated with the relevant client is in its
 * grace period, return NFS4ERR_GRACE.
 *
 * Note that *deleg is only ever set to TRUE here; it is never cleared,
 * so the caller is expected to initialize it.  do_access selects whether
 * rfs4_state_has_access() is consulted for an open/lock stateid.
 */

nfsstat4
rfs4_check_stateid(int mode, vnode_t *vp,
		stateid4 *stateid, bool_t trunc, bool_t *deleg,
		bool_t do_access)
{
	rfs4_file_t *fp;
	bool_t create = FALSE;
	rfs4_state_t *sp;
	rfs4_deleg_state_t *dsp;
	rfs4_lo_state_t *lsp;
	stateid_t *id = (stateid_t *)stateid;
	nfsstat4 stat = NFS4_OK;

	if (ISSPECIAL(stateid)) {
		/*
		 * Special (all-zeros / all-ones) stateid: there is no state
		 * to validate, only a possible delegation conflict.
		 */
		fp = rfs4_findfile(vp, NULL, &create);
		if (fp == NULL)
			return (NFS4_OK);
		if (fp->dinfo->dtype == OPEN_DELEGATE_NONE) {
			rfs4_file_rele(fp);
			return (NFS4_OK);
		}
		/* A write, or any I/O against a write delegation, conflicts */
		if (mode == FWRITE ||
			fp->dinfo->dtype == OPEN_DELEGATE_WRITE) {
			rfs4_recall_deleg(fp, trunc, NULL);
			rfs4_file_rele(fp);
			return (NFS4ERR_DELAY);
		}
		rfs4_file_rele(fp);
		return (NFS4_OK);
	} else {
		stat = rfs4_get_all_state(stateid, &sp, &dsp, &lsp);
		if (stat != NFS4_OK)
			return (stat);
		/*
		 * For a lock stateid both lsp and sp are set (sp carries its
		 * own hold), so every error return below releases both.
		 */
		if (lsp != NULL) {
			/* Is associated server instance in its grace period? */
			if (rfs4_clnt_in_grace(lsp->locker->client)) {
				rfs4_lo_state_rele(lsp, FALSE);
				if (sp != NULL)
					rfs4_state_rele_nounlock(sp);
				return (NFS4ERR_GRACE);
			}
			if (id->bits.type == LOCKID) {
				/* Seqid in the future? - that's bad */
				if (lsp->lockid.bits.chgseq <
					id->bits.chgseq) {
					rfs4_lo_state_rele(lsp, FALSE);
					if (sp != NULL)
						rfs4_state_rele_nounlock(sp);
					return (NFS4ERR_BAD_STATEID);
				}
				/* Seqid in the past? - that's old */
				if (lsp->lockid.bits.chgseq >
					id->bits.chgseq) {
					rfs4_lo_state_rele(lsp, FALSE);
					if (sp != NULL)
						rfs4_state_rele_nounlock(sp);
					return (NFS4ERR_OLD_STATEID);
				}
				/* Ensure specified filehandle matches */
				if (lsp->state->finfo->vp != vp) {
					rfs4_lo_state_rele(lsp, FALSE);
					if (sp != NULL)
						rfs4_state_rele_nounlock(sp);
					return (NFS4ERR_BAD_STATEID);
				}
			}
			/* Lock state checks passed; continue with sp below */
			rfs4_lo_state_rele(lsp, FALSE);
		}

		/* Stateid provided was an "open" stateid */
		if (sp != NULL) {
			/* Is associated server instance in its grace period? */
			if (rfs4_clnt_in_grace(sp->owner->client)) {
				rfs4_state_rele_nounlock(sp);
				return (NFS4ERR_GRACE);
			}
			if (id->bits.type == OPENID) {
				/* Seqid in the future? - that's bad */
				if (sp->stateid.bits.chgseq <
					id->bits.chgseq) {
					rfs4_state_rele_nounlock(sp);
					return (NFS4ERR_BAD_STATEID);
				}
				/* Seqid in the past - that's old */
				if (sp->stateid.bits.chgseq >
					id->bits.chgseq) {
					rfs4_state_rele_nounlock(sp);
					return (NFS4ERR_OLD_STATEID);
				}
			}
			/* Ensure specified filehandle matches */
			if (sp->finfo->vp != vp) {
				rfs4_state_rele_nounlock(sp);
				return (NFS4ERR_BAD_STATEID);
			}

			/* An unconfirmed open owner may not do I/O */
			if (sp->owner->need_confirm) {
				rfs4_state_rele_nounlock(sp);
				return (NFS4ERR_BAD_STATEID);
			}

			if (sp->closed == TRUE) {
				rfs4_state_rele_nounlock(sp);
				return (NFS4ERR_OLD_STATEID);
			}

			if (do_access)
				stat = rfs4_state_has_access(sp, mode, vp);
			else
				stat = NFS4_OK;

			/*
			 * Return whether this state has write
			 * delegation if desired
			 */
			if (deleg &&
			    (sp->finfo->dinfo->dtype == OPEN_DELEGATE_WRITE))
				*deleg = TRUE;

			/*
			 * We got a valid stateid, so we update the
			 * lease on the client. Ideally we would like
			 * to do this after the calling op succeeds,
			 * but for now this will be good
			 * enough. Callers of this routine are
			 * currently insulated from the state stuff.
			 */
			rfs4_update_lease(sp->owner->client);

			/*
			 * If a delegation is present on this file and
			 * this is a WRITE, then update the lastwrite
			 * time to indicate that activity is present.
			 */
			if (sp->finfo->dinfo->dtype == OPEN_DELEGATE_WRITE &&
				mode == FWRITE) {
				sp->finfo->dinfo->time_lastwrite =
					gethrestime_sec();
			}

			rfs4_state_rele_nounlock(sp);

			return (stat);
		}

		/* Stateid provided was a delegation stateid */
		if (dsp != NULL) {
			/* Is associated server instance in its grace period? */
			if (rfs4_clnt_in_grace(dsp->client)) {
				rfs4_deleg_state_rele(dsp);
				return (NFS4ERR_GRACE);
			}
			/* Delegation stateids must match the seqid exactly */
			if (dsp->delegid.bits.chgseq !=	id->bits.chgseq) {
				rfs4_deleg_state_rele(dsp);
				return (NFS4ERR_BAD_STATEID);
			}

			/* Ensure specified filehandle matches */
			if (dsp->finfo->vp != vp) {
				rfs4_deleg_state_rele(dsp);
				return (NFS4ERR_BAD_STATEID);
			}
			/*
			 * Return whether this state has write
			 * delegation if desired
			 */
			if (deleg &&
			    (dsp->finfo->dinfo->dtype == OPEN_DELEGATE_WRITE))
				*deleg = TRUE;

			rfs4_update_lease(dsp->client);

			/*
			 * If a delegation is present on this file and
			 * this is a WRITE, then update the lastwrite
			 * time to indicate that activity is present.
			 */
			if (dsp->finfo->dinfo->dtype == OPEN_DELEGATE_WRITE &&
				mode == FWRITE) {
				dsp->finfo->dinfo->time_lastwrite =
					gethrestime_sec();
			}

			/*
			 * XXX - what happens if this is a WRITE and the
			 * delegation type of for READ.
			 */
			rfs4_deleg_state_rele(dsp);

			/* stat is still NFS4_OK on this path */
			return (stat);
		}
		/*
		 * If we got this far, something bad happened
		 */
		return (NFS4ERR_BAD_STATEID);
	}
}
3437 
3438 
3439 /*
3440  * This is a special function in that for the file struct provided the
3441  * server wants to remove/close all current state associated with the
3442  * file.  The prime use of this would be with OP_REMOVE to force the
3443  * release of state and particularly of file locks.
3444  *
3445  * There is an assumption that there is no delegations outstanding on
3446  * this file at this point.  The caller should have waited for those
3447  * to be returned or revoked.
3448  */
3449 void
3450 rfs4_close_all_state(rfs4_file_t *fp)
3451 {
3452 	rfs4_state_t *sp;
3453 
3454 	rfs4_dbe_lock(fp->dbe);
3455 
3456 #ifdef DEBUG
3457 	/* only applies when server is handing out delegations */
3458 	if (rfs4_deleg_policy != SRV_NEVER_DELEGATE)
3459 		ASSERT(fp->dinfo->hold_grant > 0);
3460 #endif
3461 
3462 	/* No delegations for this file */
3463 	ASSERT(fp->delegationlist.next == &fp->delegationlist);
3464 
3465 	/* Make sure that it can not be found */
3466 	rfs4_dbe_invalidate(fp->dbe);
3467 
3468 	if (fp->vp == NULL) {
3469 		rfs4_dbe_unlock(fp->dbe);
3470 		return;
3471 	}
3472 	rfs4_dbe_unlock(fp->dbe);
3473 
3474 	/*
3475 	 * Hold as writer to prevent other server threads from
3476 	 * processing requests related to the file while all state is
3477 	 * being removed.
3478 	 */
3479 	rw_enter(&fp->file_rwlock, RW_WRITER);
3480 
3481 	/* Remove ALL state from the file */
3482 	while (sp = rfs4_findstate_by_file(fp)) {
3483 		rfs4_state_close(sp, FALSE, FALSE, CRED());
3484 		rfs4_state_rele_nounlock(sp);
3485 	}
3486 
3487 	/*
3488 	 * This is only safe since there are no further references to
3489 	 * the file.
3490 	 */
3491 	rfs4_dbe_lock(fp->dbe);
3492 	if (fp->vp) {
3493 		VN_RELE(fp->vp);
3494 		fp->vp = NULL;
3495 	}
3496 	rfs4_dbe_unlock(fp->dbe);
3497 
3498 	/* Finally let other references to proceed */
3499 	rw_exit(&fp->file_rwlock);
3500 }
3501 
3502 /*
3503  * This function is used as a target for the rfs4_dbe_walk() call
3504  * below.  The purpose of this function is to see if the
3505  * lockowner_state refers to a file that resides within the exportinfo
3506  * export.  If so, then remove the lock_owner state (file locks and
3507  * share "locks") for this object since the intent is the server is
3508  * unexporting the specified directory.  Be sure to invalidate the
3509  * object after the state has been released
3510  */
3511 static void
3512 rfs4_lo_state_walk_callout(rfs4_entry_t u_entry, void *e)
3513 {
3514 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
3515 	struct exportinfo *exi = (struct exportinfo *)e;
3516 	nfs_fh4_fmt_t	*exi_fhp, *finfo_fhp;
3517 
3518 	exi_fhp = (nfs_fh4_fmt_t *)&exi->exi_fh;
3519 	finfo_fhp =
3520 		(nfs_fh4_fmt_t *)lsp->state->finfo->filehandle.nfs_fh4_val;
3521 
3522 	if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3523 	    bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3524 		exi_fhp->fh4_xlen) == 0) {
3525 		rfs4_state_close(lsp->state, FALSE, FALSE, CRED());
3526 		rfs4_dbe_invalidate(lsp->dbe);
3527 		rfs4_dbe_invalidate(lsp->state->dbe);
3528 	}
3529 }
3530 
3531 /*
3532  * This function is used as a target for the rfs4_dbe_walk() call
3533  * below.  The purpose of this function is to see if the state refers
3534  * to a file that resides within the exportinfo export.  If so, then
3535  * remove the open state for this object since the intent is the
3536  * server is unexporting the specified directory.  The main result for
3537  * this type of entry is to invalidate it such it will not be found in
3538  * the future.
3539  */
3540 static void
3541 rfs4_state_walk_callout(rfs4_entry_t u_entry, void *e)
3542 {
3543 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3544 	struct exportinfo *exi = (struct exportinfo *)e;
3545 	nfs_fh4_fmt_t	*exi_fhp, *finfo_fhp;
3546 
3547 	exi_fhp = (nfs_fh4_fmt_t *)&exi->exi_fh;
3548 	finfo_fhp =
3549 		(nfs_fh4_fmt_t *)sp->finfo->filehandle.nfs_fh4_val;
3550 
3551 	if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3552 	    bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3553 		exi_fhp->fh4_xlen) == 0) {
3554 		rfs4_state_close(sp, TRUE, FALSE, CRED());
3555 		rfs4_dbe_invalidate(sp->dbe);
3556 	}
3557 }
3558 
3559 /*
3560  * This function is used as a target for the rfs4_dbe_walk() call
3561  * below.  The purpose of this function is to see if the state refers
3562  * to a file that resides within the exportinfo export.  If so, then
3563  * remove the deleg state for this object since the intent is the
3564  * server is unexporting the specified directory.  The main result for
3565  * this type of entry is to invalidate it such it will not be found in
3566  * the future.
3567  */
3568 static void
3569 rfs4_deleg_state_walk_callout(rfs4_entry_t u_entry, void *e)
3570 {
3571 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
3572 	struct exportinfo *exi = (struct exportinfo *)e;
3573 	nfs_fh4_fmt_t	*exi_fhp, *finfo_fhp;
3574 
3575 	exi_fhp = (nfs_fh4_fmt_t *)&exi->exi_fh;
3576 	finfo_fhp =
3577 		(nfs_fh4_fmt_t *)dsp->finfo->filehandle.nfs_fh4_val;
3578 
3579 	if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3580 	    bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3581 		exi_fhp->fh4_xlen) == 0) {
3582 		rfs4_dbe_invalidate(dsp->dbe);
3583 	}
3584 }
3585 
3586 /*
3587  * This function is used as a target for the rfs4_dbe_walk() call
3588  * below.  The purpose of this function is to see if the state refers
3589  * to a file that resides within the exportinfo export.  If so, then
3590  * release vnode hold for this object since the intent is the server
3591  * is unexporting the specified directory.  Invalidation will prevent
3592  * this struct from being found in the future.
3593  */
3594 static void
3595 rfs4_file_walk_callout(rfs4_entry_t u_entry, void *e)
3596 {
3597 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
3598 	struct exportinfo *exi = (struct exportinfo *)e;
3599 	nfs_fh4_fmt_t *exi_fhp, *finfo_fhp;
3600 
3601 	exi_fhp = (nfs_fh4_fmt_t *)&exi->exi_fh;
3602 	finfo_fhp = (nfs_fh4_fmt_t *)fp->filehandle.nfs_fh4_val;
3603 
3604 	if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3605 	    bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3606 		exi_fhp->fh4_xlen) == 0) {
3607 		if (fp->vp) {
3608 			/* don't leak monitors */
3609 			if (fp->dinfo->dtype == OPEN_DELEGATE_READ)
3610 				(void) fem_uninstall(fp->vp, deleg_rdops,
3611 						(void *)fp);
3612 			else if (fp->dinfo->dtype == OPEN_DELEGATE_WRITE)
3613 				(void) fem_uninstall(fp->vp, deleg_wrops,
3614 						(void *)fp);
3615 			VN_RELE(fp->vp);
3616 			fp->vp = NULL;
3617 		}
3618 		rfs4_dbe_invalidate(fp->dbe);
3619 	}
3620 }
3621 
3622 /*
3623  * Given a directory that is being unexported, cleanup/release all
3624  * state in the server that refers to objects residing underneath this
3625  * particular export.  The ordering of the release is important.
3626  * Lock_owner, then state and then file.
3627  */
3628 void
3629 rfs4_clean_state_exi(struct exportinfo *exi)
3630 {
3631 	mutex_enter(&rfs4_state_lock);
3632 
3633 	if (rfs4_server_state == NULL) {
3634 		mutex_exit(&rfs4_state_lock);
3635 		return;
3636 	}
3637 
3638 	rfs4_dbe_walk(rfs4_lo_state_tab, rfs4_lo_state_walk_callout, exi);
3639 	rfs4_dbe_walk(rfs4_state_tab, rfs4_state_walk_callout, exi);
3640 	rfs4_dbe_walk(rfs4_deleg_state_tab, rfs4_deleg_state_walk_callout, exi);
3641 	rfs4_dbe_walk(rfs4_file_tab, rfs4_file_walk_callout, exi);
3642 
3643 	mutex_exit(&rfs4_state_lock);
3644 }
3645