xref: /illumos-gate/usr/src/uts/common/fs/nfs/nfs4_state.c (revision 12042ab213b3af68474f48555504db816a449211)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 /*
27  * Copyright 2018 Nexenta Systems, Inc.
28  * Copyright 2019 Nexenta by DDN, Inc.
29  */
30 
31 #include <sys/systm.h>
32 #include <sys/kmem.h>
33 #include <sys/cmn_err.h>
34 #include <sys/atomic.h>
35 #include <sys/clconf.h>
36 #include <sys/cladm.h>
37 #include <sys/flock.h>
38 #include <nfs/export.h>
39 #include <nfs/nfs.h>
40 #include <nfs/nfs4.h>
41 #include <nfs/nfssys.h>
42 #include <nfs/lm.h>
43 #include <sys/pathname.h>
44 #include <sys/sdt.h>
45 #include <sys/nvpair.h>
46 
47 extern u_longlong_t nfs4_srv_caller_id;
48 
49 extern uint_t nfs4_srv_vkey;
50 
51 stateid4 special0 = {
52 	0,
53 	{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }
54 };
55 
56 stateid4 special1 = {
57 	0xffffffff,
58 	{
59 		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
60 		(char)0xff, (char)0xff, (char)0xff, (char)0xff,
61 		(char)0xff, (char)0xff, (char)0xff, (char)0xff
62 	}
63 };
64 
65 
66 #define	ISSPECIAL(id)  (stateid4_cmp(id, &special0) || \
67 			stateid4_cmp(id, &special1))
68 
69 /* For embedding the cluster nodeid into our clientid */
70 #define	CLUSTER_NODEID_SHIFT	24
71 #define	CLUSTER_MAX_NODEID	255
72 
73 #ifdef DEBUG
74 int rfs4_debug;
75 #endif
76 
77 static uint32_t rfs4_database_debug = 0x00;
78 
79 /* CSTYLED */
80 static void rfs4_ss_clid_write(nfs4_srv_t *nsrv4, rfs4_client_t *cp, char *leaf);
81 static void rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dir, char *leaf);
82 static void rfs4_dss_clear_oldstate(rfs4_servinst_t *sip);
83 static void rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip);
84 
85 /*
86  * Couple of simple init/destroy functions for a general waiter
87  */
88 void
89 rfs4_sw_init(rfs4_state_wait_t *swp)
90 {
91 	mutex_init(swp->sw_cv_lock, NULL, MUTEX_DEFAULT, NULL);
92 	cv_init(swp->sw_cv, NULL, CV_DEFAULT, NULL);
93 	swp->sw_active = FALSE;
94 	swp->sw_wait_count = 0;
95 }
96 
97 void
98 rfs4_sw_destroy(rfs4_state_wait_t *swp)
99 {
100 	mutex_destroy(swp->sw_cv_lock);
101 	cv_destroy(swp->sw_cv);
102 }
103 
104 void
105 rfs4_sw_enter(rfs4_state_wait_t *swp)
106 {
107 	mutex_enter(swp->sw_cv_lock);
108 	while (swp->sw_active) {
109 		swp->sw_wait_count++;
110 		cv_wait(swp->sw_cv, swp->sw_cv_lock);
111 		swp->sw_wait_count--;
112 	}
113 	ASSERT(swp->sw_active == FALSE);
114 	swp->sw_active = TRUE;
115 	mutex_exit(swp->sw_cv_lock);
116 }
117 
118 void
119 rfs4_sw_exit(rfs4_state_wait_t *swp)
120 {
121 	mutex_enter(swp->sw_cv_lock);
122 	ASSERT(swp->sw_active == TRUE);
123 	swp->sw_active = FALSE;
124 	if (swp->sw_wait_count != 0)
125 		cv_broadcast(swp->sw_cv);
126 	mutex_exit(swp->sw_cv_lock);
127 }
128 
129 static void
130 deep_lock_copy(LOCK4res *dres, LOCK4res *sres)
131 {
132 	lock_owner4 *slo = &sres->LOCK4res_u.denied.owner;
133 	lock_owner4 *dlo = &dres->LOCK4res_u.denied.owner;
134 
135 	if (sres->status == NFS4ERR_DENIED) {
136 		dlo->owner_val = kmem_alloc(slo->owner_len, KM_SLEEP);
137 		bcopy(slo->owner_val, dlo->owner_val, slo->owner_len);
138 	}
139 }
140 
141 /*
142  * CPR callback id -- not related to v4 callbacks
143  */
144 static callb_id_t cpr_id = 0;
145 
146 static void
147 deep_lock_free(LOCK4res *res)
148 {
149 	lock_owner4 *lo = &res->LOCK4res_u.denied.owner;
150 
151 	if (res->status == NFS4ERR_DENIED)
152 		kmem_free(lo->owner_val, lo->owner_len);
153 }
154 
155 static void
156 deep_open_copy(OPEN4res *dres, OPEN4res *sres)
157 {
158 	nfsace4 *sacep, *dacep;
159 
160 	if (sres->status != NFS4_OK) {
161 		return;
162 	}
163 
164 	dres->attrset = sres->attrset;
165 
166 	switch (sres->delegation.delegation_type) {
167 	case OPEN_DELEGATE_NONE:
168 		return;
169 	case OPEN_DELEGATE_READ:
170 		sacep = &sres->delegation.open_delegation4_u.read.permissions;
171 		dacep = &dres->delegation.open_delegation4_u.read.permissions;
172 		break;
173 	case OPEN_DELEGATE_WRITE:
174 		sacep = &sres->delegation.open_delegation4_u.write.permissions;
175 		dacep = &dres->delegation.open_delegation4_u.write.permissions;
176 		break;
177 	}
178 	dacep->who.utf8string_val =
179 	    kmem_alloc(sacep->who.utf8string_len, KM_SLEEP);
180 	bcopy(sacep->who.utf8string_val, dacep->who.utf8string_val,
181 	    sacep->who.utf8string_len);
182 }
183 
184 static void
185 deep_open_free(OPEN4res *res)
186 {
187 	nfsace4 *acep;
188 	if (res->status != NFS4_OK)
189 		return;
190 
191 	switch (res->delegation.delegation_type) {
192 	case OPEN_DELEGATE_NONE:
193 		return;
194 	case OPEN_DELEGATE_READ:
195 		acep = &res->delegation.open_delegation4_u.read.permissions;
196 		break;
197 	case OPEN_DELEGATE_WRITE:
198 		acep = &res->delegation.open_delegation4_u.write.permissions;
199 		break;
200 	}
201 
202 	if (acep->who.utf8string_val) {
203 		kmem_free(acep->who.utf8string_val, acep->who.utf8string_len);
204 		acep->who.utf8string_val = NULL;
205 	}
206 }
207 
208 void
209 rfs4_free_reply(nfs_resop4 *rp)
210 {
211 	switch (rp->resop) {
212 	case OP_LOCK:
213 		deep_lock_free(&rp->nfs_resop4_u.oplock);
214 		break;
215 	case OP_OPEN:
216 		deep_open_free(&rp->nfs_resop4_u.opopen);
217 	default:
218 		break;
219 	}
220 }
221 
222 void
223 rfs4_copy_reply(nfs_resop4 *dst, nfs_resop4 *src)
224 {
225 	*dst = *src;
226 
227 	/* Handle responses that need deep copy */
228 	switch (src->resop) {
229 	case OP_LOCK:
230 		deep_lock_copy(&dst->nfs_resop4_u.oplock,
231 		    &src->nfs_resop4_u.oplock);
232 		break;
233 	case OP_OPEN:
234 		deep_open_copy(&dst->nfs_resop4_u.opopen,
235 		    &src->nfs_resop4_u.opopen);
236 		break;
237 	default:
238 		break;
239 	};
240 }
241 
242 /*
243  * This is the implementation of the underlying state engine. The
244  * public interface to this engine is described by
245  * nfs4_state.h. Callers to the engine should hold no state engine
246  * locks when they call in to it. If the protocol needs to lock data
247  * structures it should do so after acquiring all references to them
248  * first and then follow the following lock order:
249  *
250  *	client > openowner > state > lo_state > lockowner > file.
251  *
252  * Internally we only allow a thread to hold one hash bucket lock at a
253  * time and the lock is higher in the lock order (must be acquired
254  * first) than the data structure that is on that hash list.
255  *
256  * If a new reference was acquired by the caller, that reference needs
257  * to be released after releasing all acquired locks with the
258  * corresponding rfs4_*_rele routine.
259  */
260 
261 /*
262  * This code is somewhat prototypical for now. Its purpose currently is to
263  * implement the interfaces sufficiently to finish the higher protocol
264  * elements. This will be replaced by dynamically resizable tables
265  * backed by the kmem_cache allocator. However, synchronization is handled
266  * correctly (I hope) and will not change by much. The mutexes for
267  * the hash buckets that can be used to create new instances of data
268  * structures might be good candidates to evolve into reader/writer
269  * locks. If it has to do a creation, it would be holding the
270  * mutex across a kmem_alloc with KM_SLEEP specified.
271  */
272 
273 #ifdef DEBUG
274 #define	TABSIZE 17
275 #else
276 #define	TABSIZE 2047
277 #endif
278 
279 #define	ADDRHASH(key) ((unsigned long)(key) >> 3)
280 
/* Parenthesized so the product stays intact inside larger expressions. */
#define	MAXTABSZ (1024 * 1024)
282 
283 /* The values below are rfs4_lease_time units */
284 
285 #ifdef DEBUG
286 #define	CLIENT_CACHE_TIME 1
287 #define	OPENOWNER_CACHE_TIME 1
288 #define	STATE_CACHE_TIME 1
289 #define	LO_STATE_CACHE_TIME 1
290 #define	LOCKOWNER_CACHE_TIME 1
291 #define	FILE_CACHE_TIME 3
292 #define	DELEG_STATE_CACHE_TIME 1
293 #else
294 #define	CLIENT_CACHE_TIME 10
295 #define	OPENOWNER_CACHE_TIME 5
296 #define	STATE_CACHE_TIME 1
297 #define	LO_STATE_CACHE_TIME 1
298 #define	LOCKOWNER_CACHE_TIME 3
299 #define	FILE_CACHE_TIME 40
300 #define	DELEG_STATE_CACHE_TIME 1
301 #endif
302 
303 /*
304  * NFSv4 server state databases
305  *
306  * Initialized when the module is loaded and used by NFSv4 state tables.
307  * These kmem_cache databases are global, the tables that make use of these
308  * are per zone.
309  */
310 kmem_cache_t *rfs4_client_mem_cache;
311 kmem_cache_t *rfs4_clntIP_mem_cache;
312 kmem_cache_t *rfs4_openown_mem_cache;
313 kmem_cache_t *rfs4_openstID_mem_cache;
314 kmem_cache_t *rfs4_lockstID_mem_cache;
315 kmem_cache_t *rfs4_lockown_mem_cache;
316 kmem_cache_t *rfs4_file_mem_cache;
317 kmem_cache_t *rfs4_delegstID_mem_cache;
318 
319 /*
320  * NFSv4 state table functions
321  */
322 static bool_t rfs4_client_create(rfs4_entry_t, void *);
323 static void rfs4_dss_remove_cpleaf(rfs4_client_t *);
324 static void rfs4_dss_remove_leaf(rfs4_servinst_t *, char *, char *);
325 static void rfs4_client_destroy(rfs4_entry_t);
326 static bool_t rfs4_client_expiry(rfs4_entry_t);
327 static uint32_t clientid_hash(void *);
328 static bool_t clientid_compare(rfs4_entry_t, void *);
329 static void *clientid_mkkey(rfs4_entry_t);
330 static uint32_t nfsclnt_hash(void *);
331 static bool_t nfsclnt_compare(rfs4_entry_t, void *);
332 static void *nfsclnt_mkkey(rfs4_entry_t);
333 static bool_t rfs4_clntip_expiry(rfs4_entry_t);
334 static void rfs4_clntip_destroy(rfs4_entry_t);
335 static bool_t rfs4_clntip_create(rfs4_entry_t, void *);
336 static uint32_t clntip_hash(void *);
337 static bool_t clntip_compare(rfs4_entry_t, void *);
338 static void *clntip_mkkey(rfs4_entry_t);
339 static bool_t rfs4_openowner_create(rfs4_entry_t, void *);
340 static void rfs4_openowner_destroy(rfs4_entry_t);
341 static bool_t rfs4_openowner_expiry(rfs4_entry_t);
342 static uint32_t openowner_hash(void *);
343 static bool_t openowner_compare(rfs4_entry_t, void *);
344 static void *openowner_mkkey(rfs4_entry_t);
345 static bool_t rfs4_state_create(rfs4_entry_t, void *);
346 static void rfs4_state_destroy(rfs4_entry_t);
347 static bool_t rfs4_state_expiry(rfs4_entry_t);
348 static uint32_t state_hash(void *);
349 static bool_t state_compare(rfs4_entry_t, void *);
350 static void *state_mkkey(rfs4_entry_t);
351 static uint32_t state_owner_file_hash(void *);
352 static bool_t state_owner_file_compare(rfs4_entry_t, void *);
353 static void *state_owner_file_mkkey(rfs4_entry_t);
354 static uint32_t state_file_hash(void *);
355 static bool_t state_file_compare(rfs4_entry_t, void *);
356 static void *state_file_mkkey(rfs4_entry_t);
357 static bool_t rfs4_lo_state_create(rfs4_entry_t, void *);
358 static void rfs4_lo_state_destroy(rfs4_entry_t);
359 static bool_t rfs4_lo_state_expiry(rfs4_entry_t);
360 static uint32_t lo_state_hash(void *);
361 static bool_t lo_state_compare(rfs4_entry_t, void *);
362 static void *lo_state_mkkey(rfs4_entry_t);
363 static uint32_t lo_state_lo_hash(void *);
364 static bool_t lo_state_lo_compare(rfs4_entry_t, void *);
365 static void *lo_state_lo_mkkey(rfs4_entry_t);
366 static bool_t rfs4_lockowner_create(rfs4_entry_t, void *);
367 static void rfs4_lockowner_destroy(rfs4_entry_t);
368 static bool_t rfs4_lockowner_expiry(rfs4_entry_t);
369 static uint32_t lockowner_hash(void *);
370 static bool_t lockowner_compare(rfs4_entry_t, void *);
371 static void *lockowner_mkkey(rfs4_entry_t);
372 static uint32_t pid_hash(void *);
373 static bool_t pid_compare(rfs4_entry_t, void *);
374 static void *pid_mkkey(rfs4_entry_t);
375 static bool_t rfs4_file_create(rfs4_entry_t, void *);
376 static void rfs4_file_destroy(rfs4_entry_t);
377 static uint32_t file_hash(void *);
378 static bool_t file_compare(rfs4_entry_t, void *);
379 static void *file_mkkey(rfs4_entry_t);
380 static bool_t rfs4_deleg_state_create(rfs4_entry_t, void *);
381 static void rfs4_deleg_state_destroy(rfs4_entry_t);
382 static bool_t rfs4_deleg_state_expiry(rfs4_entry_t);
383 static uint32_t deleg_hash(void *);
384 static bool_t deleg_compare(rfs4_entry_t, void *);
385 static void *deleg_mkkey(rfs4_entry_t);
386 static uint32_t deleg_state_hash(void *);
387 static bool_t deleg_state_compare(rfs4_entry_t, void *);
388 static void *deleg_state_mkkey(rfs4_entry_t);
389 
390 static void rfs4_state_rele_nounlock(rfs4_state_t *);
391 
392 static int rfs4_ss_enabled = 0;
393 
394 extern void (*rfs4_client_clrst)(struct nfs4clrst_args *);
395 
396 void
397 rfs4_ss_pnfree(rfs4_ss_pn_t *ss_pn)
398 {
399 	kmem_free(ss_pn, sizeof (rfs4_ss_pn_t));
400 }
401 
402 static rfs4_ss_pn_t *
403 rfs4_ss_pnalloc(char *dir, char *leaf)
404 {
405 	rfs4_ss_pn_t *ss_pn;
406 	int dir_len, leaf_len;
407 
408 	/*
409 	 * validate we have a resonable path
410 	 * (account for the '/' and trailing null)
411 	 */
412 	if ((dir_len = strlen(dir)) > MAXPATHLEN ||
413 	    (leaf_len = strlen(leaf)) > MAXNAMELEN ||
414 	    (dir_len + leaf_len + 2) > MAXPATHLEN) {
415 		return (NULL);
416 	}
417 
418 	ss_pn = kmem_alloc(sizeof (rfs4_ss_pn_t), KM_SLEEP);
419 
420 	(void) snprintf(ss_pn->pn, MAXPATHLEN, "%s/%s", dir, leaf);
421 	/* Handy pointer to just the leaf name */
422 	ss_pn->leaf = ss_pn->pn + dir_len + 1;
423 	return (ss_pn);
424 }
425 
426 
427 /*
428  * Move the "leaf" filename from "sdir" directory
429  * to the "ddir" directory. Return the pathname of
430  * the destination unless the rename fails in which
431  * case we need to return the source pathname.
432  */
433 static rfs4_ss_pn_t *
434 rfs4_ss_movestate(char *sdir, char *ddir, char *leaf)
435 {
436 	rfs4_ss_pn_t *src, *dst;
437 
438 	if ((src = rfs4_ss_pnalloc(sdir, leaf)) == NULL)
439 		return (NULL);
440 
441 	if ((dst = rfs4_ss_pnalloc(ddir, leaf)) == NULL) {
442 		rfs4_ss_pnfree(src);
443 		return (NULL);
444 	}
445 
446 	/*
447 	 * If the rename fails we shall return the src
448 	 * pathname and free the dst. Otherwise we need
449 	 * to free the src and return the dst pathanme.
450 	 */
451 	if (vn_rename(src->pn, dst->pn, UIO_SYSSPACE)) {
452 		rfs4_ss_pnfree(dst);
453 		return (src);
454 	}
455 	rfs4_ss_pnfree(src);
456 	return (dst);
457 }
458 
459 
460 static rfs4_oldstate_t *
461 rfs4_ss_getstate(vnode_t *dvp, rfs4_ss_pn_t *ss_pn)
462 {
463 	struct uio uio;
464 	struct iovec iov[3];
465 
466 	rfs4_oldstate_t *cl_ss = NULL;
467 	vnode_t *vp;
468 	vattr_t va;
469 	uint_t id_len;
470 	int err, kill_file, file_vers;
471 
472 	if (ss_pn == NULL)
473 		return (NULL);
474 
475 	/*
476 	 * open the state file.
477 	 */
478 	if (vn_open(ss_pn->pn, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0) != 0) {
479 		return (NULL);
480 	}
481 
482 	if (vp->v_type != VREG) {
483 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
484 		VN_RELE(vp);
485 		return (NULL);
486 	}
487 
488 	err = VOP_ACCESS(vp, VREAD, 0, CRED(), NULL);
489 	if (err) {
490 		/*
491 		 * We don't have read access? better get the heck out.
492 		 */
493 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
494 		VN_RELE(vp);
495 		return (NULL);
496 	}
497 
498 	(void) VOP_RWLOCK(vp, V_WRITELOCK_FALSE, NULL);
499 	/*
500 	 * get the file size to do some basic validation
501 	 */
502 	va.va_mask = AT_SIZE;
503 	err = VOP_GETATTR(vp, &va, 0, CRED(), NULL);
504 
505 	kill_file = (va.va_size == 0 || va.va_size <
506 	    (NFS4_VERIFIER_SIZE + sizeof (uint_t)+1));
507 
508 	if (err || kill_file) {
509 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
510 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
511 		VN_RELE(vp);
512 		if (kill_file) {
513 			(void) VOP_REMOVE(dvp, ss_pn->leaf, CRED(), NULL, 0);
514 		}
515 		return (NULL);
516 	}
517 
518 	cl_ss = kmem_alloc(sizeof (rfs4_oldstate_t), KM_SLEEP);
519 
520 	/*
521 	 * build iovecs to read in the file_version, verifier and id_len
522 	 */
523 	iov[0].iov_base = (caddr_t)&file_vers;
524 	iov[0].iov_len = sizeof (int);
525 	iov[1].iov_base = (caddr_t)&cl_ss->cl_id4.verifier;
526 	iov[1].iov_len = NFS4_VERIFIER_SIZE;
527 	iov[2].iov_base = (caddr_t)&id_len;
528 	iov[2].iov_len = sizeof (uint_t);
529 
530 	uio.uio_iov = iov;
531 	uio.uio_iovcnt = 3;
532 	uio.uio_segflg = UIO_SYSSPACE;
533 	uio.uio_loffset = 0;
534 	uio.uio_resid = sizeof (int) + NFS4_VERIFIER_SIZE + sizeof (uint_t);
535 
536 	if (err = VOP_READ(vp, &uio, FREAD, CRED(), NULL)) {
537 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
538 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
539 		VN_RELE(vp);
540 		kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
541 		return (NULL);
542 	}
543 
544 	/*
545 	 * if the file_version doesn't match or if the
546 	 * id_len is zero or the combination of the verifier,
547 	 * id_len and id_val is bigger than the file we have
548 	 * a problem. If so ditch the file.
549 	 */
550 	kill_file = (file_vers != NFS4_SS_VERSION || id_len == 0 ||
551 	    (id_len + NFS4_VERIFIER_SIZE + sizeof (uint_t)) > va.va_size);
552 
553 	if (err || kill_file) {
554 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
555 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
556 		VN_RELE(vp);
557 		kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
558 		if (kill_file) {
559 			(void) VOP_REMOVE(dvp, ss_pn->leaf, CRED(), NULL, 0);
560 		}
561 		return (NULL);
562 	}
563 
564 	/*
565 	 * now get the client id value
566 	 */
567 	cl_ss->cl_id4.id_val = kmem_alloc(id_len, KM_SLEEP);
568 	iov[0].iov_base = cl_ss->cl_id4.id_val;
569 	iov[0].iov_len = id_len;
570 
571 	uio.uio_iov = iov;
572 	uio.uio_iovcnt = 1;
573 	uio.uio_segflg = UIO_SYSSPACE;
574 	uio.uio_resid = cl_ss->cl_id4.id_len = id_len;
575 
576 	if (err = VOP_READ(vp, &uio, FREAD, CRED(), NULL)) {
577 		VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
578 		(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
579 		VN_RELE(vp);
580 		kmem_free(cl_ss->cl_id4.id_val, id_len);
581 		kmem_free(cl_ss, sizeof (rfs4_oldstate_t));
582 		return (NULL);
583 	}
584 
585 	VOP_RWUNLOCK(vp, V_WRITELOCK_FALSE, NULL);
586 	(void) VOP_CLOSE(vp, FREAD, 1, (offset_t)0, CRED(), NULL);
587 	VN_RELE(vp);
588 	return (cl_ss);
589 }
590 
/*
 * Step from one directory entry to the entry that follows it in a
 * readdir buffer, using the record length stored in the entry.  Any
 * earlier definition of the macro is discarded first.
 */
#undef	nextdp
#define	nextdp(dp)	((struct dirent64 *)((char *)(dp) + (dp)->d_reclen))
595 
596 /*
597  * Add entries from statedir to supplied oldstate list.
598  * Optionally, move all entries from statedir -> destdir.
599  */
600 void
601 rfs4_ss_oldstate(rfs4_oldstate_t *oldstate, char *statedir, char *destdir)
602 {
603 	rfs4_ss_pn_t *ss_pn;
604 	rfs4_oldstate_t *cl_ss = NULL;
605 	char	*dirt = NULL;
606 	int	err, dir_eof = 0, size = 0;
607 	vnode_t *dvp;
608 	struct iovec iov;
609 	struct uio uio;
610 	struct dirent64 *dep;
611 	offset_t dirchunk_offset = 0;
612 
613 	/*
614 	 * open the state directory
615 	 */
616 	if (vn_open(statedir, UIO_SYSSPACE, FREAD, 0, &dvp, 0, 0))
617 		return;
618 
619 	if (dvp->v_type != VDIR || VOP_ACCESS(dvp, VREAD, 0, CRED(), NULL))
620 		goto out;
621 
622 	dirt = kmem_alloc(RFS4_SS_DIRSIZE, KM_SLEEP);
623 
624 	/*
625 	 * Get and process the directory entries
626 	 */
627 	while (!dir_eof) {
628 		(void) VOP_RWLOCK(dvp, V_WRITELOCK_FALSE, NULL);
629 		iov.iov_base = dirt;
630 		iov.iov_len = RFS4_SS_DIRSIZE;
631 		uio.uio_iov = &iov;
632 		uio.uio_iovcnt = 1;
633 		uio.uio_segflg = UIO_SYSSPACE;
634 		uio.uio_loffset = dirchunk_offset;
635 		uio.uio_resid = RFS4_SS_DIRSIZE;
636 
637 		err = VOP_READDIR(dvp, &uio, CRED(), &dir_eof, NULL, 0);
638 		VOP_RWUNLOCK(dvp, V_WRITELOCK_FALSE, NULL);
639 		if (err)
640 			goto out;
641 
642 		size = RFS4_SS_DIRSIZE - uio.uio_resid;
643 
644 		/*
645 		 * Process all the directory entries in this
646 		 * readdir chunk
647 		 */
648 		for (dep = (struct dirent64 *)dirt; size > 0;
649 		    dep = nextdp(dep)) {
650 
651 			size -= dep->d_reclen;
652 			dirchunk_offset = dep->d_off;
653 
654 			/*
655 			 * Skip '.' and '..'
656 			 */
657 			if (NFS_IS_DOTNAME(dep->d_name))
658 				continue;
659 
660 			ss_pn = rfs4_ss_pnalloc(statedir, dep->d_name);
661 			if (ss_pn == NULL)
662 				continue;
663 
664 			if (cl_ss = rfs4_ss_getstate(dvp, ss_pn)) {
665 				if (destdir != NULL) {
666 					rfs4_ss_pnfree(ss_pn);
667 					cl_ss->ss_pn = rfs4_ss_movestate(
668 					    statedir, destdir, dep->d_name);
669 				} else {
670 					cl_ss->ss_pn = ss_pn;
671 				}
672 				insque(cl_ss, oldstate);
673 			} else {
674 				rfs4_ss_pnfree(ss_pn);
675 			}
676 		}
677 	}
678 
679 out:
680 	(void) VOP_CLOSE(dvp, FREAD, 1, (offset_t)0, CRED(), NULL);
681 	VN_RELE(dvp);
682 	if (dirt)
683 		kmem_free((caddr_t)dirt, RFS4_SS_DIRSIZE);
684 }
685 
686 static void
687 rfs4_ss_init(nfs4_srv_t *nsrv4)
688 {
689 	int npaths = 1;
690 	char *default_dss_path = NFS4_DSS_VAR_DIR;
691 
692 	/* read the default stable storage state */
693 	rfs4_dss_readstate(nsrv4, npaths, &default_dss_path);
694 
695 	rfs4_ss_enabled = 1;
696 }
697 
698 static void
699 rfs4_ss_fini(nfs4_srv_t *nsrv4)
700 {
701 	rfs4_servinst_t *sip;
702 
703 	mutex_enter(&nsrv4->servinst_lock);
704 	sip = nsrv4->nfs4_cur_servinst;
705 	while (sip != NULL) {
706 		rfs4_dss_clear_oldstate(sip);
707 		sip = sip->next;
708 	}
709 	mutex_exit(&nsrv4->servinst_lock);
710 }
711 
712 /*
713  * Remove all oldstate files referenced by this servinst.
714  */
715 static void
716 rfs4_dss_clear_oldstate(rfs4_servinst_t *sip)
717 {
718 	rfs4_oldstate_t *os_head, *osp;
719 
720 	rw_enter(&sip->oldstate_lock, RW_WRITER);
721 	os_head = sip->oldstate;
722 
723 	if (os_head == NULL) {
724 		rw_exit(&sip->oldstate_lock);
725 		return;
726 	}
727 
728 	/* skip dummy entry */
729 	osp = os_head->next;
730 	while (osp != os_head) {
731 		char *leaf = osp->ss_pn->leaf;
732 		rfs4_oldstate_t *os_next;
733 
734 		rfs4_dss_remove_leaf(sip, NFS4_DSS_OLDSTATE_LEAF, leaf);
735 
736 		if (osp->cl_id4.id_val)
737 			kmem_free(osp->cl_id4.id_val, osp->cl_id4.id_len);
738 		rfs4_ss_pnfree(osp->ss_pn);
739 
740 		os_next = osp->next;
741 		remque(osp);
742 		kmem_free(osp, sizeof (rfs4_oldstate_t));
743 		osp = os_next;
744 	}
745 
746 	rw_exit(&sip->oldstate_lock);
747 }
748 
749 /*
750  * Form the state and oldstate paths, and read in the stable storage files.
751  */
752 void
753 rfs4_dss_readstate(nfs4_srv_t *nsrv4, int npaths, char **paths)
754 {
755 	int i;
756 	char *state, *oldstate;
757 
758 	state = kmem_alloc(MAXPATHLEN, KM_SLEEP);
759 	oldstate = kmem_alloc(MAXPATHLEN, KM_SLEEP);
760 
761 	for (i = 0; i < npaths; i++) {
762 		char *path = paths[i];
763 
764 		(void) sprintf(state, "%s/%s", path, NFS4_DSS_STATE_LEAF);
765 		(void) sprintf(oldstate, "%s/%s", path, NFS4_DSS_OLDSTATE_LEAF);
766 
767 		/*
768 		 * Populate the current server instance's oldstate list.
769 		 *
770 		 * 1. Read stable storage data from old state directory,
771 		 *    leaving its contents alone.
772 		 *
773 		 * 2. Read stable storage data from state directory,
774 		 *    and move the latter's contents to old state
775 		 *    directory.
776 		 */
777 		/* CSTYLED */
778 		rfs4_ss_oldstate(nsrv4->nfs4_cur_servinst->oldstate, oldstate, NULL);
779 		/* CSTYLED */
780 		rfs4_ss_oldstate(nsrv4->nfs4_cur_servinst->oldstate, state, oldstate);
781 	}
782 
783 	kmem_free(state, MAXPATHLEN);
784 	kmem_free(oldstate, MAXPATHLEN);
785 }
786 
787 
788 /*
789  * Check if we are still in grace and if the client can be
790  * granted permission to perform reclaims.
791  */
792 void
793 rfs4_ss_chkclid(nfs4_srv_t *nsrv4, rfs4_client_t *cp)
794 {
795 	rfs4_servinst_t *sip;
796 
797 	/*
798 	 * It should be sufficient to check the oldstate data for just
799 	 * this client's instance. However, since our per-instance
800 	 * client grouping is solely temporal, HA-NFSv4 RG failover
801 	 * might result in clients of the same RG being partitioned into
802 	 * separate instances.
803 	 *
804 	 * Until the client grouping is improved, we must check the
805 	 * oldstate data for all instances with an active grace period.
806 	 *
807 	 * This also serves as the mechanism to remove stale oldstate data.
808 	 * The first time we check an instance after its grace period has
809 	 * expired, the oldstate data should be cleared.
810 	 *
811 	 * Start at the current instance, and walk the list backwards
812 	 * to the first.
813 	 */
814 	mutex_enter(&nsrv4->servinst_lock);
815 	for (sip = nsrv4->nfs4_cur_servinst; sip != NULL; sip = sip->prev) {
816 		rfs4_ss_chkclid_sip(cp, sip);
817 
818 		/* if the above check found this client, we're done */
819 		if (cp->rc_can_reclaim)
820 			break;
821 	}
822 	mutex_exit(&nsrv4->servinst_lock);
823 }
824 
825 static void
826 rfs4_ss_chkclid_sip(rfs4_client_t *cp, rfs4_servinst_t *sip)
827 {
828 	rfs4_oldstate_t *osp, *os_head;
829 
830 	/* short circuit everything if this server instance has no oldstate */
831 	rw_enter(&sip->oldstate_lock, RW_READER);
832 	os_head = sip->oldstate;
833 	rw_exit(&sip->oldstate_lock);
834 	if (os_head == NULL)
835 		return;
836 
837 	/*
838 	 * If this server instance is no longer in a grace period then
839 	 * the client won't be able to reclaim. No further need for this
840 	 * instance's oldstate data, so it can be cleared.
841 	 */
842 	if (!rfs4_servinst_in_grace(sip))
843 		return;
844 
845 	/* this instance is still in grace; search for the clientid */
846 
847 	rw_enter(&sip->oldstate_lock, RW_READER);
848 
849 	os_head = sip->oldstate;
850 	/* skip dummy entry */
851 	osp = os_head->next;
852 	while (osp != os_head) {
853 		if (osp->cl_id4.id_len == cp->rc_nfs_client.id_len) {
854 			if (bcmp(osp->cl_id4.id_val, cp->rc_nfs_client.id_val,
855 			    osp->cl_id4.id_len) == 0) {
856 				cp->rc_can_reclaim = 1;
857 				break;
858 			}
859 		}
860 		osp = osp->next;
861 	}
862 
863 	rw_exit(&sip->oldstate_lock);
864 }
865 
866 /*
867  * Place client information into stable storage: 1/3.
868  * First, generate the leaf filename, from the client's IP address and
869  * the server-generated short-hand clientid.
870  */
871 void
872 rfs4_ss_clid(nfs4_srv_t *nsrv4, rfs4_client_t *cp)
873 {
874 	const char *kinet_ntop6(uchar_t *, char *, size_t);
875 	char leaf[MAXNAMELEN], buf[INET6_ADDRSTRLEN];
876 	struct sockaddr *ca;
877 	uchar_t *b;
878 
879 	if (rfs4_ss_enabled == 0) {
880 		return;
881 	}
882 
883 	buf[0] = 0;
884 
885 	ca = (struct sockaddr *)&cp->rc_addr;
886 
887 	/*
888 	 * Convert the caller's IP address to a dotted string
889 	 */
890 	if (ca->sa_family == AF_INET) {
891 		b = (uchar_t *)&((struct sockaddr_in *)ca)->sin_addr;
892 		(void) sprintf(buf, "%03d.%03d.%03d.%03d", b[0] & 0xFF,
893 		    b[1] & 0xFF, b[2] & 0xFF, b[3] & 0xFF);
894 	} else if (ca->sa_family == AF_INET6) {
895 		struct sockaddr_in6 *sin6;
896 
897 		sin6 = (struct sockaddr_in6 *)ca;
898 		(void) kinet_ntop6((uchar_t *)&sin6->sin6_addr,
899 		    buf, INET6_ADDRSTRLEN);
900 	}
901 
902 	(void) snprintf(leaf, MAXNAMELEN, "%s-%llx", buf,
903 	    (longlong_t)cp->rc_clientid);
904 	rfs4_ss_clid_write(nsrv4, cp, leaf);
905 }
906 
907 /*
908  * Place client information into stable storage: 2/3.
909  * DSS: distributed stable storage: the file may need to be written to
910  * multiple directories.
911  */
912 static void
913 rfs4_ss_clid_write(nfs4_srv_t *nsrv4, rfs4_client_t *cp, char *leaf)
914 {
915 	rfs4_servinst_t *sip;
916 
917 	/*
918 	 * It should be sufficient to write the leaf file to (all) DSS paths
919 	 * associated with just this client's instance. However, since our
920 	 * per-instance client grouping is solely temporal, HA-NFSv4 RG
921 	 * failover might result in us losing DSS data.
922 	 *
923 	 * Until the client grouping is improved, we must write the DSS data
924 	 * to all instances' paths. Start at the current instance, and
925 	 * walk the list backwards to the first.
926 	 */
927 	mutex_enter(&nsrv4->servinst_lock);
928 	for (sip = nsrv4->nfs4_cur_servinst; sip != NULL; sip = sip->prev) {
929 		int i, npaths = sip->dss_npaths;
930 
931 		/* write the leaf file to all DSS paths */
932 		for (i = 0; i < npaths; i++) {
933 			rfs4_dss_path_t *dss_path = sip->dss_paths[i];
934 
935 			/* HA-NFSv4 path might have been failed-away from us */
936 			if (dss_path == NULL)
937 				continue;
938 
939 			rfs4_ss_clid_write_one(cp, dss_path->path, leaf);
940 		}
941 	}
942 	mutex_exit(&nsrv4->servinst_lock);
943 }
944 
945 /*
946  * Place client information into stable storage: 3/3.
947  * Write the stable storage data to the requested file.
948  */
949 static void
950 rfs4_ss_clid_write_one(rfs4_client_t *cp, char *dss_path, char *leaf)
951 {
952 	int ioflag;
953 	int file_vers = NFS4_SS_VERSION;
954 	size_t dirlen;
955 	struct uio uio;
956 	struct iovec iov[4];
957 	char *dir;
958 	rfs4_ss_pn_t *ss_pn;
959 	vnode_t *vp;
960 	nfs_client_id4 *cl_id4 = &(cp->rc_nfs_client);
961 
962 	/* allow 2 extra bytes for '/' & NUL */
963 	dirlen = strlen(dss_path) + strlen(NFS4_DSS_STATE_LEAF) + 2;
964 	dir = kmem_alloc(dirlen, KM_SLEEP);
965 	(void) sprintf(dir, "%s/%s", dss_path, NFS4_DSS_STATE_LEAF);
966 
967 	ss_pn = rfs4_ss_pnalloc(dir, leaf);
968 	/* rfs4_ss_pnalloc takes its own copy */
969 	kmem_free(dir, dirlen);
970 	if (ss_pn == NULL)
971 		return;
972 
973 	if (vn_open(ss_pn->pn, UIO_SYSSPACE, FCREAT|FWRITE, 0600, &vp,
974 	    CRCREAT, 0)) {
975 		rfs4_ss_pnfree(ss_pn);
976 		return;
977 	}
978 
979 	/*
980 	 * We need to record leaf - i.e. the filename - so that we know
981 	 * what to remove, in the future. However, the dir part of cp->ss_pn
982 	 * should never be referenced directly, since it's potentially only
983 	 * one of several paths with this leaf in it.
984 	 */
985 	if (cp->rc_ss_pn != NULL) {
986 		if (strcmp(cp->rc_ss_pn->leaf, leaf) == 0) {
987 			/* we've already recorded *this* leaf */
988 			rfs4_ss_pnfree(ss_pn);
989 		} else {
990 			/* replace with this leaf */
991 			rfs4_ss_pnfree(cp->rc_ss_pn);
992 			cp->rc_ss_pn = ss_pn;
993 		}
994 	} else {
995 		cp->rc_ss_pn = ss_pn;
996 	}
997 
998 	/*
999 	 * Build a scatter list that points to the nfs_client_id4
1000 	 */
1001 	iov[0].iov_base = (caddr_t)&file_vers;
1002 	iov[0].iov_len = sizeof (int);
1003 	iov[1].iov_base = (caddr_t)&(cl_id4->verifier);
1004 	iov[1].iov_len = NFS4_VERIFIER_SIZE;
1005 	iov[2].iov_base = (caddr_t)&(cl_id4->id_len);
1006 	iov[2].iov_len = sizeof (uint_t);
1007 	iov[3].iov_base = (caddr_t)cl_id4->id_val;
1008 	iov[3].iov_len = cl_id4->id_len;
1009 
1010 	uio.uio_iov = iov;
1011 	uio.uio_iovcnt = 4;
1012 	uio.uio_loffset = 0;
1013 	uio.uio_segflg = UIO_SYSSPACE;
1014 	uio.uio_llimit = (rlim64_t)MAXOFFSET_T;
1015 	uio.uio_resid = cl_id4->id_len + sizeof (int) +
1016 	    NFS4_VERIFIER_SIZE + sizeof (uint_t);
1017 
1018 	ioflag = uio.uio_fmode = (FWRITE|FSYNC);
1019 	uio.uio_extflg = UIO_COPY_DEFAULT;
1020 
1021 	(void) VOP_RWLOCK(vp, V_WRITELOCK_TRUE, NULL);
1022 	/* write the full client id to the file. */
1023 	(void) VOP_WRITE(vp, &uio, ioflag, CRED(), NULL);
1024 	VOP_RWUNLOCK(vp, V_WRITELOCK_TRUE, NULL);
1025 
1026 	(void) VOP_CLOSE(vp, FWRITE, 1, (offset_t)0, CRED(), NULL);
1027 	VN_RELE(vp);
1028 }
1029 
1030 /*
1031  * DSS: distributed stable storage.
1032  * Unpack the list of paths passed by nfsd.
1033  * Use nvlist_alloc(9F) to manage the data.
1034  * The caller is responsible for allocating and freeing the buffer.
1035  */
1036 int
1037 rfs4_dss_setpaths(char *buf, size_t buflen)
1038 {
1039 	int error;
1040 
1041 	/*
1042 	 * If this is a "warm start", i.e. we previously had DSS paths,
1043 	 * preserve the old paths.
1044 	 */
1045 	if (rfs4_dss_paths != NULL) {
1046 		/*
1047 		 * Before we lose the ptr, destroy the nvlist and pathnames
1048 		 * array from the warm start before this one.
1049 		 */
1050 		nvlist_free(rfs4_dss_oldpaths);
1051 		rfs4_dss_oldpaths = rfs4_dss_paths;
1052 	}
1053 
1054 	/* unpack the buffer into a searchable nvlist */
1055 	error = nvlist_unpack(buf, buflen, &rfs4_dss_paths, KM_SLEEP);
1056 	if (error)
1057 		return (error);
1058 
1059 	/*
1060 	 * Search the nvlist for the pathnames nvpair (which is the only nvpair
1061 	 * in the list, and record its location.
1062 	 */
1063 	error = nvlist_lookup_string_array(rfs4_dss_paths, NFS4_DSS_NVPAIR_NAME,
1064 	    &rfs4_dss_newpaths, &rfs4_dss_numnewpaths);
1065 	return (error);
1066 }
1067 
1068 /*
1069  * Ultimately the nfssys() call NFS4_CLR_STATE endsup here
1070  * to find and mark the client for forced expire.
1071  */
1072 static void
1073 rfs4_client_scrub(rfs4_entry_t ent, void *arg)
1074 {
1075 	rfs4_client_t *cp = (rfs4_client_t *)ent;
1076 	struct nfs4clrst_args *clr = arg;
1077 	struct sockaddr_in6 *ent_sin6;
1078 	struct in6_addr  clr_in6;
1079 	struct sockaddr_in  *ent_sin;
1080 	struct in_addr   clr_in;
1081 
1082 	if (clr->addr_type != cp->rc_addr.ss_family) {
1083 		return;
1084 	}
1085 
1086 	switch (clr->addr_type) {
1087 
1088 	case AF_INET6:
1089 		/* copyin the address from user space */
1090 		if (copyin(clr->ap, &clr_in6, sizeof (clr_in6))) {
1091 			break;
1092 		}
1093 
1094 		ent_sin6 = (struct sockaddr_in6 *)&cp->rc_addr;
1095 
1096 		/*
1097 		 * now compare, and if equivalent mark entry
1098 		 * for forced expiration
1099 		 */
1100 		if (IN6_ARE_ADDR_EQUAL(&ent_sin6->sin6_addr, &clr_in6)) {
1101 			cp->rc_forced_expire = 1;
1102 		}
1103 		break;
1104 
1105 	case AF_INET:
1106 		/* copyin the address from user space */
1107 		if (copyin(clr->ap, &clr_in, sizeof (clr_in))) {
1108 			break;
1109 		}
1110 
1111 		ent_sin = (struct sockaddr_in *)&cp->rc_addr;
1112 
1113 		/*
1114 		 * now compare, and if equivalent mark entry
1115 		 * for forced expiration
1116 		 */
1117 		if (ent_sin->sin_addr.s_addr == clr_in.s_addr) {
1118 			cp->rc_forced_expire = 1;
1119 		}
1120 		break;
1121 
1122 	default:
1123 		/* force this assert to fail */
1124 		ASSERT(clr->addr_type != clr->addr_type);
1125 	}
1126 }
1127 
1128 /*
1129  * This is called from nfssys() in order to clear server state
1130  * for the specified client IP Address.
1131  */
1132 void
1133 rfs4_clear_client_state(struct nfs4clrst_args *clr)
1134 {
1135 	nfs4_srv_t *nsrv4;
1136 	nsrv4 = nfs4_get_srv();
1137 	(void) rfs4_dbe_walk(nsrv4->rfs4_client_tab, rfs4_client_scrub, clr);
1138 }
1139 
1140 /*
1141  * Used to initialize the NFSv4 server's state or database.  All of
1142  * the tables are created and timers are set.
1143  */
1144 void
1145 rfs4_state_g_init()
1146 {
1147 	extern boolean_t rfs4_cpr_callb(void *, int);
1148 	/*
1149 	 * Add a CPR callback so that we can update client
1150 	 * access times to extend the lease after a suspend
1151 	 * and resume (using the same class as rpcmod/connmgr)
1152 	 */
1153 	cpr_id = callb_add(rfs4_cpr_callb, 0, CB_CL_CPR_RPC, "rfs4");
1154 
1155 	/*
1156 	 * NFSv4 server state databases
1157 	 *
1158 	 * Initialized when the module is loaded and used by NFSv4 state
1159 	 * tables.  These kmem_cache free pools are used globally, the NFSv4
1160 	 * state tables which make use of these kmem_cache free pools are per
1161 	 * zone.
1162 	 *
1163 	 * initialize the global kmem_cache free pools which will be used by
1164 	 * the NFSv4 state tables.
1165 	 */
1166 	/* CSTYLED */
1167 	rfs4_client_mem_cache = nfs4_init_mem_cache("Client_entry_cache", 2, sizeof (rfs4_client_t), 0);
1168 	/* CSTYLED */
1169 	rfs4_clntIP_mem_cache = nfs4_init_mem_cache("ClntIP_entry_cache", 1, sizeof (rfs4_clntip_t), 1);
1170 	/* CSTYLED */
1171 	rfs4_openown_mem_cache = nfs4_init_mem_cache("OpenOwner_entry_cache", 1, sizeof (rfs4_openowner_t), 2);
1172 	/* CSTYLED */
1173 	rfs4_openstID_mem_cache = nfs4_init_mem_cache("OpenStateID_entry_cache", 3, sizeof (rfs4_state_t), 3);
1174 	/* CSTYLED */
1175 	rfs4_lockstID_mem_cache = nfs4_init_mem_cache("LockStateID_entry_cache", 3, sizeof (rfs4_lo_state_t), 4);
1176 	/* CSTYLED */
1177 	rfs4_lockown_mem_cache = nfs4_init_mem_cache("Lockowner_entry_cache", 2, sizeof (rfs4_lockowner_t), 5);
1178 	/* CSTYLED */
1179 	rfs4_file_mem_cache = nfs4_init_mem_cache("File_entry_cache", 1, sizeof (rfs4_file_t), 6);
1180 	/* CSTYLED */
1181 	rfs4_delegstID_mem_cache = nfs4_init_mem_cache("DelegStateID_entry_cache", 2, sizeof (rfs4_deleg_state_t), 7);
1182 
1183 	rfs4_client_clrst = rfs4_clear_client_state;
1184 }
1185 
1186 
1187 /*
1188  * Used at server shutdown to cleanup all of the NFSv4 server's structures
1189  * and other state.
1190  */
1191 void
1192 rfs4_state_g_fini()
1193 {
1194 	int i;
1195 	/*
1196 	 * Cleanup the CPR callback.
1197 	 */
1198 	if (cpr_id)
1199 		(void) callb_delete(cpr_id);
1200 
1201 	rfs4_client_clrst = NULL;
1202 
1203 	/* free the NFSv4 state databases */
1204 	for (i = 0; i < RFS4_DB_MEM_CACHE_NUM; i++) {
1205 		kmem_cache_destroy(rfs4_db_mem_cache_table[i].r_db_mem_cache);
1206 		rfs4_db_mem_cache_table[i].r_db_mem_cache = NULL;
1207 	}
1208 
1209 	rfs4_client_mem_cache = NULL;
1210 	rfs4_clntIP_mem_cache = NULL;
1211 	rfs4_openown_mem_cache = NULL;
1212 	rfs4_openstID_mem_cache = NULL;
1213 	rfs4_lockstID_mem_cache = NULL;
1214 	rfs4_lockown_mem_cache = NULL;
1215 	rfs4_file_mem_cache = NULL;
1216 	rfs4_delegstID_mem_cache = NULL;
1217 
1218 	/* DSS: distributed stable storage */
1219 	nvlist_free(rfs4_dss_oldpaths);
1220 	nvlist_free(rfs4_dss_paths);
1221 	rfs4_dss_paths = rfs4_dss_oldpaths = NULL;
1222 }
1223 
/*
 * Used to initialize the per zone NFSv4 server's state.
 *
 * nsrv4 is the per-zone server structure to set up.  The function:
 *   - clears the DSS path list and advances rfs4_start_time,
 *   - creates a server instance (grace period not started here),
 *   - and, unless the state database already exists, creates the
 *     database, its tables and their indexes while holding state_lock,
 *     then initializes stable storage via rfs4_ss_init().
 *
 * Note that when the database already exists the function returns
 * before rfs4_ss_init() is reached.
 */
void
rfs4_state_zone_init(nfs4_srv_t *nsrv4)
{
	time_t start_time;
	int start_grace;
	char *dss_path = NFS4_DSS_VAR_DIR;

	/* DSS: distributed stable storage: initialise served paths list */
	nsrv4->dss_pathlist = NULL;

	/*
	 * Set the boot time.  If the server
	 * has been restarted quickly and has had the opportunity to
	 * service clients, then the start_time needs to be bumped
	 * regardless.  A small window but it exists...
	 */
	start_time = gethrestime_sec();
	if (nsrv4->rfs4_start_time < start_time)
		nsrv4->rfs4_start_time = start_time;
	else
		nsrv4->rfs4_start_time++;

	/*
	 * Create the first server instance, or a new one if the server has
	 * been restarted; see above comments on rfs4_start_time. Don't
	 * start its grace period; that will be done later, to maximise the
	 * clients' recovery window.
	 */
	start_grace = 0;
	if (curzone == global_zone && rfs4_dss_numnewpaths > 0) {
		int i;
		char **dss_allpaths = NULL;
		/* temporary array: default path plus every DSS path */
		dss_allpaths = kmem_alloc(sizeof (char *) *
		    (rfs4_dss_numnewpaths + 1), KM_SLEEP);
		/*
		 * Add the default path into the list of paths for saving
		 * state information.
		 */
		dss_allpaths[0] = dss_path;
		for (i = 0; i < rfs4_dss_numnewpaths; i++) {
			dss_allpaths[i + 1] = rfs4_dss_newpaths[i];
		}
		rfs4_servinst_create(nsrv4, start_grace,
		    (rfs4_dss_numnewpaths + 1), dss_allpaths);
		/* only the pointer array is freed; the strings are not ours */
		kmem_free(dss_allpaths,
		    (sizeof (char *) * (rfs4_dss_numnewpaths + 1)));
	} else {
		/* no DSS paths (or not the global zone): default path only */
		rfs4_servinst_create(nsrv4, start_grace, 1, &dss_path);
	}

	/* reset the "first NFSv4 request" status */
	nsrv4->seen_first_compound = 0;

	mutex_enter(&nsrv4->state_lock);

	/*
	 * If the server state database has already been initialized,
	 * skip it
	 */
	if (nsrv4->nfs4_server_state != NULL) {
		mutex_exit(&nsrv4->state_lock);
		return;
	}

	rw_init(&nsrv4->rfs4_findclient_lock, NULL, RW_DEFAULT, NULL);

	/*
	 * set the various cache timers for table creation; a value of 0
	 * means "not set" and is replaced by the compiled-in default
	 */
	if (nsrv4->rfs4_client_cache_time == 0)
		nsrv4->rfs4_client_cache_time = CLIENT_CACHE_TIME;
	if (nsrv4->rfs4_openowner_cache_time == 0)
		nsrv4->rfs4_openowner_cache_time = OPENOWNER_CACHE_TIME;
	if (nsrv4->rfs4_state_cache_time == 0)
		nsrv4->rfs4_state_cache_time = STATE_CACHE_TIME;
	if (nsrv4->rfs4_lo_state_cache_time == 0)
		nsrv4->rfs4_lo_state_cache_time = LO_STATE_CACHE_TIME;
	if (nsrv4->rfs4_lockowner_cache_time == 0)
		nsrv4->rfs4_lockowner_cache_time = LOCKOWNER_CACHE_TIME;
	if (nsrv4->rfs4_file_cache_time == 0)
		nsrv4->rfs4_file_cache_time = FILE_CACHE_TIME;
	if (nsrv4->rfs4_deleg_state_cache_time == 0)
		nsrv4->rfs4_deleg_state_cache_time = DELEG_STATE_CACHE_TIME;

	/* Create the overall database to hold all server state */
	nsrv4->nfs4_server_state = rfs4_database_create(rfs4_database_debug);

	/*
	 * Now create the individual tables.  Except for the ClntIP table,
	 * each cache time is scaled by rfs4_lease_time before it is passed
	 * to rfs4_table_create().
	 */

	/* Client table: indexed by nfs_client_id4 and by clientid */
	nsrv4->rfs4_client_cache_time *= rfs4_lease_time;
	nsrv4->rfs4_client_tab = rfs4_table_create(nsrv4->nfs4_server_state,
	    "Client",
	    nsrv4->rfs4_client_cache_time,
	    2,
	    rfs4_client_create,
	    rfs4_client_destroy,
	    rfs4_client_expiry,
	    sizeof (rfs4_client_t),
	    TABSIZE,
	    MAXTABSZ/8, 100);
	nsrv4->rfs4_nfsclnt_idx = rfs4_index_create(nsrv4->rfs4_client_tab,
	    "nfs_client_id4", nfsclnt_hash,
	    nfsclnt_compare, nfsclnt_mkkey,
	    TRUE);
	nsrv4->rfs4_clientid_idx = rfs4_index_create(nsrv4->rfs4_client_tab,
	    "client_id", clientid_hash,
	    clientid_compare, clientid_mkkey,
	    FALSE);

	/* ClntIP table: indexed by client IP address, fixed cache time */
	nsrv4->rfs4_clntip_cache_time = 86400 * 365;	/* about a year */
	nsrv4->rfs4_clntip_tab = rfs4_table_create(nsrv4->nfs4_server_state,
	    "ClntIP",
	    nsrv4->rfs4_clntip_cache_time,
	    1,
	    rfs4_clntip_create,
	    rfs4_clntip_destroy,
	    rfs4_clntip_expiry,
	    sizeof (rfs4_clntip_t),
	    TABSIZE,
	    MAXTABSZ, 100);
	nsrv4->rfs4_clntip_idx = rfs4_index_create(nsrv4->rfs4_clntip_tab,
	    "client_ip", clntip_hash,
	    clntip_compare, clntip_mkkey,
	    TRUE);

	/* OpenOwner table: indexed by open_owner4 */
	nsrv4->rfs4_openowner_cache_time *= rfs4_lease_time;
	nsrv4->rfs4_openowner_tab = rfs4_table_create(nsrv4->nfs4_server_state,
	    "OpenOwner",
	    nsrv4->rfs4_openowner_cache_time,
	    1,
	    rfs4_openowner_create,
	    rfs4_openowner_destroy,
	    rfs4_openowner_expiry,
	    sizeof (rfs4_openowner_t),
	    TABSIZE,
	    MAXTABSZ, 100);
	nsrv4->rfs4_openowner_idx = rfs4_index_create(nsrv4->rfs4_openowner_tab,
	    "open_owner4", openowner_hash,
	    openowner_compare,
	    openowner_mkkey, TRUE);

	/* OpenStateID table: three indexes (owner+file, state id, file) */
	nsrv4->rfs4_state_cache_time *= rfs4_lease_time;
	nsrv4->rfs4_state_tab = rfs4_table_create(nsrv4->nfs4_server_state,
	    "OpenStateID",
	    nsrv4->rfs4_state_cache_time,
	    3,
	    rfs4_state_create,
	    rfs4_state_destroy,
	    rfs4_state_expiry,
	    sizeof (rfs4_state_t),
	    TABSIZE,
	    MAXTABSZ, 100);

	/* CSTYLED */
	nsrv4->rfs4_state_owner_file_idx = rfs4_index_create(nsrv4->rfs4_state_tab,
	    "Openowner-File",
	    state_owner_file_hash,
	    state_owner_file_compare,
	    state_owner_file_mkkey, TRUE);

	nsrv4->rfs4_state_idx = rfs4_index_create(nsrv4->rfs4_state_tab,
	    "State-id", state_hash,
	    state_compare, state_mkkey, FALSE);

	nsrv4->rfs4_state_file_idx = rfs4_index_create(nsrv4->rfs4_state_tab,
	    "File", state_file_hash,
	    state_file_compare, state_file_mkkey,
	    FALSE);

	/* LockStateID table: indexed by lock owner x state and by state id */
	nsrv4->rfs4_lo_state_cache_time *= rfs4_lease_time;
	nsrv4->rfs4_lo_state_tab = rfs4_table_create(nsrv4->nfs4_server_state,
	    "LockStateID",
	    nsrv4->rfs4_lo_state_cache_time,
	    2,
	    rfs4_lo_state_create,
	    rfs4_lo_state_destroy,
	    rfs4_lo_state_expiry,
	    sizeof (rfs4_lo_state_t),
	    TABSIZE,
	    MAXTABSZ, 100);

	/* CSTYLED */
	nsrv4->rfs4_lo_state_owner_idx = rfs4_index_create(nsrv4->rfs4_lo_state_tab,
	    "lockownerxstate",
	    lo_state_lo_hash,
	    lo_state_lo_compare,
	    lo_state_lo_mkkey, TRUE);

	nsrv4->rfs4_lo_state_idx = rfs4_index_create(nsrv4->rfs4_lo_state_tab,
	    "State-id",
	    lo_state_hash, lo_state_compare,
	    lo_state_mkkey, FALSE);

	/* Lockowner table: indexed by lock_owner4 and by pid */
	nsrv4->rfs4_lockowner_cache_time *= rfs4_lease_time;

	nsrv4->rfs4_lockowner_tab = rfs4_table_create(nsrv4->nfs4_server_state,
	    "Lockowner",
	    nsrv4->rfs4_lockowner_cache_time,
	    2,
	    rfs4_lockowner_create,
	    rfs4_lockowner_destroy,
	    rfs4_lockowner_expiry,
	    sizeof (rfs4_lockowner_t),
	    TABSIZE,
	    MAXTABSZ, 100);

	nsrv4->rfs4_lockowner_idx = rfs4_index_create(nsrv4->rfs4_lockowner_tab,
	    "lock_owner4", lockowner_hash,
	    lockowner_compare,
	    lockowner_mkkey, TRUE);

	/* CSTYLED */
	nsrv4->rfs4_lockowner_pid_idx = rfs4_index_create(nsrv4->rfs4_lockowner_tab,
	    "pid", pid_hash,
	    pid_compare, pid_mkkey,
	    FALSE);

	/*
	 * File table: indexed by filehandle.  Unlike the other tables it
	 * is created with no expiry callback (NULL) and -1 as the final
	 * argument.
	 */
	nsrv4->rfs4_file_cache_time *= rfs4_lease_time;
	nsrv4->rfs4_file_tab = rfs4_table_create(nsrv4->nfs4_server_state,
	    "File",
	    nsrv4->rfs4_file_cache_time,
	    1,
	    rfs4_file_create,
	    rfs4_file_destroy,
	    NULL,
	    sizeof (rfs4_file_t),
	    TABSIZE,
	    MAXTABSZ, -1);

	nsrv4->rfs4_file_idx = rfs4_index_create(nsrv4->rfs4_file_tab,
	    "Filehandle", file_hash,
	    file_compare, file_mkkey, TRUE);

	/* DelegStateID table: indexed by file+client and by state */
	nsrv4->rfs4_deleg_state_cache_time *= rfs4_lease_time;
	/* CSTYLED */
	nsrv4->rfs4_deleg_state_tab = rfs4_table_create(nsrv4->nfs4_server_state,
	    "DelegStateID",
	    nsrv4->rfs4_deleg_state_cache_time,
	    2,
	    rfs4_deleg_state_create,
	    rfs4_deleg_state_destroy,
	    rfs4_deleg_state_expiry,
	    sizeof (rfs4_deleg_state_t),
	    TABSIZE,
	    MAXTABSZ, 100);
	nsrv4->rfs4_deleg_idx = rfs4_index_create(nsrv4->rfs4_deleg_state_tab,
	    "DelegByFileClient",
	    deleg_hash,
	    deleg_compare,
	    deleg_mkkey, TRUE);

	/* CSTYLED */
	nsrv4->rfs4_deleg_state_idx = rfs4_index_create(nsrv4->rfs4_deleg_state_tab,
	    "DelegState",
	    deleg_state_hash,
	    deleg_state_compare,
	    deleg_state_mkkey, FALSE);

	mutex_exit(&nsrv4->state_lock);

	/*
	 * Init the stable storage.  This is done after state_lock has
	 * been dropped.
	 */
	rfs4_ss_init(nsrv4);
}
1489 
1490 /*
1491  * Used at server shutdown to cleanup all of NFSv4 server's zone structures
1492  * and state.
1493  */
1494 void
1495 rfs4_state_zone_fini()
1496 {
1497 	rfs4_database_t *dbp;
1498 	nfs4_srv_t *nsrv4;
1499 	nsrv4 = nfs4_get_srv();
1500 
1501 	rfs4_set_deleg_policy(nsrv4, SRV_NEVER_DELEGATE);
1502 
1503 	/*
1504 	 * Clean up any dangling stable storage structures BEFORE calling
1505 	 * rfs4_servinst_destroy_all() so there are no dangling structures
1506 	 * (i.e. the srvinsts are all cleared of danglers BEFORE they get
1507 	 * freed).
1508 	 */
1509 	rfs4_ss_fini(nsrv4);
1510 
1511 	mutex_enter(&nsrv4->state_lock);
1512 
1513 	if (nsrv4->nfs4_server_state == NULL) {
1514 		mutex_exit(&nsrv4->state_lock);
1515 		return;
1516 	}
1517 
1518 	/* destroy server instances and current instance ptr */
1519 	rfs4_servinst_destroy_all(nsrv4);
1520 
1521 	/* reset the "first NFSv4 request" status */
1522 	nsrv4->seen_first_compound = 0;
1523 
1524 	dbp = nsrv4->nfs4_server_state;
1525 	nsrv4->nfs4_server_state = NULL;
1526 
1527 	rw_destroy(&nsrv4->rfs4_findclient_lock);
1528 
1529 	/* First stop all of the reaper threads in the database */
1530 	rfs4_database_shutdown(dbp);
1531 
1532 	/*
1533 	 * WARNING: There may be consumers of the rfs4 database still
1534 	 * active as we destroy these.  IF that's the case, consider putting
1535 	 * some of their _zone_fini()-like functions into the zsd key as
1536 	 * ~~SHUTDOWN~~ functions instead of ~~DESTROY~~ functions.  We can
1537 	 * maintain some ordering guarantees better that way.
1538 	 */
1539 	/* Now destroy/release the database tables */
1540 	rfs4_database_destroy(dbp);
1541 
1542 	/* Reset the cache timers for next time */
1543 	nsrv4->rfs4_client_cache_time = 0;
1544 	nsrv4->rfs4_openowner_cache_time = 0;
1545 	nsrv4->rfs4_state_cache_time = 0;
1546 	nsrv4->rfs4_lo_state_cache_time = 0;
1547 	nsrv4->rfs4_lockowner_cache_time = 0;
1548 	nsrv4->rfs4_file_cache_time = 0;
1549 	nsrv4->rfs4_deleg_state_cache_time = 0;
1550 
1551 	mutex_exit(&nsrv4->state_lock);
1552 }
1553 
/*
 * Server-internal overlay of a clientid4.  rfs4_client_create() fills
 * impl_id when it hands out a new clientid; clientid_hash() hashes on
 * impl_id.c_id.
 */
typedef union {
	struct {
		uint32_t start_time;	/* set from nsrv4->rfs4_start_time */
		uint32_t c_id;		/* set from rfs4_dbe_getid() */
	} impl_id;
	clientid4 id4;			/* the same bits as a clientid4 */
} cid;
1561 
/*
 * Forward declarations of helpers used by the client code below.
 * foreign_clientid() and embed_nodeid() are only called when
 * cluster_bootflags has CLUSTER_BOOTED set; foreign_stateid() is
 * presumably the stateid counterpart (not used in this part of the
 * file -- confirm at its definition).
 */
static int foreign_stateid(stateid_t *id);
static int foreign_clientid(cid *cidp);
static void embed_nodeid(cid *cidp);
1565 
/*
 * Server-internal overlay of the SETCLIENTID_CONFIRM verifier.
 * rfs4_client_create() initializes cv_impl and rfs4_client_scv_next()
 * bumps gen_num.
 */
typedef union {
	struct {
		uint32_t c_id;		/* copied from the clientid's c_id */
		uint32_t gen_num;	/* starts at 0, incremented per use */
	} cv_impl;
	verifier4	confirm_verf;	/* the same bits as a verifier4 */
} scid_confirm_verf;
1573 
1574 static uint32_t
1575 clientid_hash(void *key)
1576 {
1577 	cid *idp = key;
1578 
1579 	return (idp->impl_id.c_id);
1580 }
1581 
1582 static bool_t
1583 clientid_compare(rfs4_entry_t entry, void *key)
1584 {
1585 	rfs4_client_t *cp = (rfs4_client_t *)entry;
1586 	clientid4 *idp = key;
1587 
1588 	return (*idp == cp->rc_clientid);
1589 }
1590 
1591 static void *
1592 clientid_mkkey(rfs4_entry_t entry)
1593 {
1594 	rfs4_client_t *cp = (rfs4_client_t *)entry;
1595 
1596 	return (&cp->rc_clientid);
1597 }
1598 
1599 static uint32_t
1600 nfsclnt_hash(void *key)
1601 {
1602 	nfs_client_id4 *client = key;
1603 	int i;
1604 	uint32_t hash = 0;
1605 
1606 	for (i = 0; i < client->id_len; i++) {
1607 		hash <<= 1;
1608 		hash += (uint_t)client->id_val[i];
1609 	}
1610 	return (hash);
1611 }
1612 
1613 
1614 static bool_t
1615 nfsclnt_compare(rfs4_entry_t entry, void *key)
1616 {
1617 	rfs4_client_t *cp = (rfs4_client_t *)entry;
1618 	nfs_client_id4 *nfs_client = key;
1619 
1620 	if (cp->rc_nfs_client.id_len != nfs_client->id_len)
1621 		return (FALSE);
1622 
1623 	return (bcmp(cp->rc_nfs_client.id_val, nfs_client->id_val,
1624 	    nfs_client->id_len) == 0);
1625 }
1626 
1627 static void *
1628 nfsclnt_mkkey(rfs4_entry_t entry)
1629 {
1630 	rfs4_client_t *cp = (rfs4_client_t *)entry;
1631 
1632 	return (&cp->rc_nfs_client);
1633 }
1634 
1635 static bool_t
1636 rfs4_client_expiry(rfs4_entry_t u_entry)
1637 {
1638 	rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1639 	bool_t cp_expired;
1640 
1641 	if (rfs4_dbe_is_invalid(cp->rc_dbe)) {
1642 		cp->rc_ss_remove = 1;
1643 		return (TRUE);
1644 	}
1645 	/*
1646 	 * If the sysadmin has used clear_locks for this
1647 	 * entry then forced_expire will be set and we
1648 	 * want this entry to be reaped. Or the entry
1649 	 * has exceeded its lease period.
1650 	 */
1651 	cp_expired = (cp->rc_forced_expire ||
1652 	    (gethrestime_sec() - cp->rc_last_access
1653 	    > rfs4_lease_time));
1654 
1655 	if (!cp->rc_ss_remove && cp_expired)
1656 		cp->rc_ss_remove = 1;
1657 	return (cp_expired);
1658 }
1659 
1660 /*
1661  * Remove the leaf file from all distributed stable storage paths.
1662  */
1663 static void
1664 rfs4_dss_remove_cpleaf(rfs4_client_t *cp)
1665 {
1666 	nfs4_srv_t *nsrv4;
1667 	rfs4_servinst_t *sip;
1668 	char *leaf = cp->rc_ss_pn->leaf;
1669 
1670 	/*
1671 	 * since the state files are written to all DSS
1672 	 * paths we must remove this leaf file instance
1673 	 * from all server instances.
1674 	 */
1675 
1676 	nsrv4 = nfs4_get_srv();
1677 	mutex_enter(&nsrv4->servinst_lock);
1678 	for (sip = nsrv4->nfs4_cur_servinst; sip != NULL; sip = sip->prev) {
1679 		/* remove the leaf file associated with this server instance */
1680 		rfs4_dss_remove_leaf(sip, NFS4_DSS_STATE_LEAF, leaf);
1681 	}
1682 	mutex_exit(&nsrv4->servinst_lock);
1683 }
1684 
1685 static void
1686 rfs4_dss_remove_leaf(rfs4_servinst_t *sip, char *dir_leaf, char *leaf)
1687 {
1688 	int i, npaths = sip->dss_npaths;
1689 
1690 	for (i = 0; i < npaths; i++) {
1691 		rfs4_dss_path_t *dss_path = sip->dss_paths[i];
1692 		char *path, *dir;
1693 		size_t pathlen;
1694 
1695 		/* the HA-NFSv4 path might have been failed-over away from us */
1696 		if (dss_path == NULL)
1697 			continue;
1698 
1699 		dir = dss_path->path;
1700 
1701 		/* allow 3 extra bytes for two '/' & a NUL */
1702 		pathlen = strlen(dir) + strlen(dir_leaf) + strlen(leaf) + 3;
1703 		path = kmem_alloc(pathlen, KM_SLEEP);
1704 		(void) sprintf(path, "%s/%s/%s", dir, dir_leaf, leaf);
1705 
1706 		(void) vn_remove(path, UIO_SYSSPACE, RMFILE);
1707 
1708 		kmem_free(path, pathlen);
1709 	}
1710 }
1711 
1712 static void
1713 rfs4_client_destroy(rfs4_entry_t u_entry)
1714 {
1715 	rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1716 
1717 	mutex_destroy(cp->rc_cbinfo.cb_lock);
1718 	cv_destroy(cp->rc_cbinfo.cb_cv);
1719 	cv_destroy(cp->rc_cbinfo.cb_cv_nullcaller);
1720 	list_destroy(&cp->rc_openownerlist);
1721 
1722 	/* free callback info */
1723 	rfs4_cbinfo_free(&cp->rc_cbinfo);
1724 
1725 	if (cp->rc_cp_confirmed)
1726 		rfs4_client_rele(cp->rc_cp_confirmed);
1727 
1728 	if (cp->rc_ss_pn) {
1729 		/* check if the stable storage files need to be removed */
1730 		if (cp->rc_ss_remove)
1731 			rfs4_dss_remove_cpleaf(cp);
1732 		rfs4_ss_pnfree(cp->rc_ss_pn);
1733 	}
1734 
1735 	/* Free the client supplied client id */
1736 	kmem_free(cp->rc_nfs_client.id_val, cp->rc_nfs_client.id_len);
1737 
1738 	if (cp->rc_sysidt != LM_NOSYSID)
1739 		lm_free_sysidt(cp->rc_sysidt);
1740 }
1741 
1742 static bool_t
1743 rfs4_client_create(rfs4_entry_t u_entry, void *arg)
1744 {
1745 	rfs4_client_t *cp = (rfs4_client_t *)u_entry;
1746 	nfs_client_id4 *client = (nfs_client_id4 *)arg;
1747 	struct sockaddr *ca;
1748 	cid *cidp;
1749 	scid_confirm_verf *scvp;
1750 	nfs4_srv_t *nsrv4;
1751 
1752 	nsrv4 = nfs4_get_srv();
1753 
1754 	/* Get a clientid to give to the client */
1755 	cidp = (cid *)&cp->rc_clientid;
1756 	cidp->impl_id.start_time = nsrv4->rfs4_start_time;
1757 	cidp->impl_id.c_id = (uint32_t)rfs4_dbe_getid(cp->rc_dbe);
1758 
1759 	/* If we are booted as a cluster node, embed our nodeid */
1760 	if (cluster_bootflags & CLUSTER_BOOTED)
1761 		embed_nodeid(cidp);
1762 
1763 	/* Allocate and copy client's client id value */
1764 	cp->rc_nfs_client.id_val = kmem_alloc(client->id_len, KM_SLEEP);
1765 	cp->rc_nfs_client.id_len = client->id_len;
1766 	bcopy(client->id_val, cp->rc_nfs_client.id_val, client->id_len);
1767 	cp->rc_nfs_client.verifier = client->verifier;
1768 
1769 	/* Copy client's IP address */
1770 	ca = client->cl_addr;
1771 	if (ca->sa_family == AF_INET)
1772 		bcopy(ca, &cp->rc_addr, sizeof (struct sockaddr_in));
1773 	else if (ca->sa_family == AF_INET6)
1774 		bcopy(ca, &cp->rc_addr, sizeof (struct sockaddr_in6));
1775 	cp->rc_nfs_client.cl_addr = (struct sockaddr *)&cp->rc_addr;
1776 
1777 	/* Init the value for the SETCLIENTID_CONFIRM verifier */
1778 	scvp = (scid_confirm_verf *)&cp->rc_confirm_verf;
1779 	scvp->cv_impl.c_id = cidp->impl_id.c_id;
1780 	scvp->cv_impl.gen_num = 0;
1781 
1782 	/* An F_UNLKSYS has been done for this client */
1783 	cp->rc_unlksys_completed = FALSE;
1784 
1785 	/* We need the client to ack us */
1786 	cp->rc_need_confirm = TRUE;
1787 	cp->rc_cp_confirmed = NULL;
1788 
1789 	/* TRUE all the time until the callback path actually fails */
1790 	cp->rc_cbinfo.cb_notified_of_cb_path_down = TRUE;
1791 
1792 	/* Initialize the access time to now */
1793 	cp->rc_last_access = gethrestime_sec();
1794 
1795 	cp->rc_cr_set = NULL;
1796 
1797 	cp->rc_sysidt = LM_NOSYSID;
1798 
1799 	list_create(&cp->rc_openownerlist, sizeof (rfs4_openowner_t),
1800 	    offsetof(rfs4_openowner_t, ro_node));
1801 
1802 	/* set up the callback control structure */
1803 	cp->rc_cbinfo.cb_state = CB_UNINIT;
1804 	mutex_init(cp->rc_cbinfo.cb_lock, NULL, MUTEX_DEFAULT, NULL);
1805 	cv_init(cp->rc_cbinfo.cb_cv, NULL, CV_DEFAULT, NULL);
1806 	cv_init(cp->rc_cbinfo.cb_cv_nullcaller, NULL, CV_DEFAULT, NULL);
1807 
1808 	/*
1809 	 * Associate the client_t with the current server instance.
1810 	 * The hold is solely to satisfy the calling requirement of
1811 	 * rfs4_servinst_assign(). In this case it's not strictly necessary.
1812 	 */
1813 	rfs4_dbe_hold(cp->rc_dbe);
1814 	rfs4_servinst_assign(nsrv4, cp, nsrv4->nfs4_cur_servinst);
1815 	rfs4_dbe_rele(cp->rc_dbe);
1816 
1817 	return (TRUE);
1818 }
1819 
1820 /*
1821  * Caller wants to generate/update the setclientid_confirm verifier
1822  * associated with a client.  This is done during the SETCLIENTID
1823  * processing.
1824  */
1825 void
1826 rfs4_client_scv_next(rfs4_client_t *cp)
1827 {
1828 	scid_confirm_verf *scvp;
1829 
1830 	/* Init the value for the SETCLIENTID_CONFIRM verifier */
1831 	scvp = (scid_confirm_verf *)&cp->rc_confirm_verf;
1832 	scvp->cv_impl.gen_num++;
1833 }
1834 
/*
 * Release a reference on a client entry by releasing its database
 * entry (rc_dbe).
 */
void
rfs4_client_rele(rfs4_client_t *cp)
{
	rfs4_dbe_rele(cp->rc_dbe);
}
1840 
1841 rfs4_client_t *
1842 rfs4_findclient(nfs_client_id4 *client, bool_t *create,	rfs4_client_t *oldcp)
1843 {
1844 	rfs4_client_t *cp;
1845 	nfs4_srv_t *nsrv4;
1846 	nsrv4 = nfs4_get_srv();
1847 
1848 
1849 	if (oldcp) {
1850 		rw_enter(&nsrv4->rfs4_findclient_lock, RW_WRITER);
1851 		rfs4_dbe_hide(oldcp->rc_dbe);
1852 	} else {
1853 		rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
1854 	}
1855 
1856 	cp = (rfs4_client_t *)rfs4_dbsearch(nsrv4->rfs4_nfsclnt_idx, client,
1857 	    create, (void *)client, RFS4_DBS_VALID);
1858 
1859 	if (oldcp)
1860 		rfs4_dbe_unhide(oldcp->rc_dbe);
1861 
1862 	rw_exit(&nsrv4->rfs4_findclient_lock);
1863 
1864 	return (cp);
1865 }
1866 
1867 rfs4_client_t *
1868 rfs4_findclient_by_id(clientid4 clientid, bool_t find_unconfirmed)
1869 {
1870 	rfs4_client_t *cp;
1871 	bool_t create = FALSE;
1872 	cid *cidp = (cid *)&clientid;
1873 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
1874 
1875 	/* If we're a cluster and the nodeid isn't right, short-circuit */
1876 	if (cluster_bootflags & CLUSTER_BOOTED && foreign_clientid(cidp))
1877 		return (NULL);
1878 
1879 	rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
1880 
1881 	cp = (rfs4_client_t *)rfs4_dbsearch(nsrv4->rfs4_clientid_idx, &clientid,
1882 	    &create, NULL, RFS4_DBS_VALID);
1883 
1884 	rw_exit(&nsrv4->rfs4_findclient_lock);
1885 
1886 	if (cp && cp->rc_need_confirm && find_unconfirmed == FALSE) {
1887 		rfs4_client_rele(cp);
1888 		return (NULL);
1889 	} else {
1890 		return (cp);
1891 	}
1892 }
1893 
1894 static uint32_t
1895 clntip_hash(void *key)
1896 {
1897 	struct sockaddr *addr = key;
1898 	int i, len = 0;
1899 	uint32_t hash = 0;
1900 	char *ptr;
1901 
1902 	if (addr->sa_family == AF_INET) {
1903 		struct sockaddr_in *a = (struct sockaddr_in *)addr;
1904 		len = sizeof (struct in_addr);
1905 		ptr = (char *)&a->sin_addr;
1906 	} else if (addr->sa_family == AF_INET6) {
1907 		struct sockaddr_in6 *a = (struct sockaddr_in6 *)addr;
1908 		len = sizeof (struct in6_addr);
1909 		ptr = (char *)&a->sin6_addr;
1910 	} else
1911 		return (0);
1912 
1913 	for (i = 0; i < len; i++) {
1914 		hash <<= 1;
1915 		hash += (uint_t)ptr[i];
1916 	}
1917 	return (hash);
1918 }
1919 
1920 static bool_t
1921 clntip_compare(rfs4_entry_t entry, void *key)
1922 {
1923 	rfs4_clntip_t *cp = (rfs4_clntip_t *)entry;
1924 	struct sockaddr *addr = key;
1925 	int len = 0;
1926 	char *p1, *p2;
1927 
1928 	if (addr->sa_family == AF_INET) {
1929 		struct sockaddr_in *a1 = (struct sockaddr_in *)&cp->ri_addr;
1930 		struct sockaddr_in *a2 = (struct sockaddr_in *)addr;
1931 		len = sizeof (struct in_addr);
1932 		p1 = (char *)&a1->sin_addr;
1933 		p2 = (char *)&a2->sin_addr;
1934 	} else if (addr->sa_family == AF_INET6) {
1935 		struct sockaddr_in6 *a1 = (struct sockaddr_in6 *)&cp->ri_addr;
1936 		struct sockaddr_in6 *a2 = (struct sockaddr_in6 *)addr;
1937 		len = sizeof (struct in6_addr);
1938 		p1 = (char *)&a1->sin6_addr;
1939 		p2 = (char *)&a2->sin6_addr;
1940 	} else
1941 		return (0);
1942 
1943 	return (bcmp(p1, p2, len) == 0);
1944 }
1945 
1946 static void *
1947 clntip_mkkey(rfs4_entry_t entry)
1948 {
1949 	rfs4_clntip_t *cp = (rfs4_clntip_t *)entry;
1950 
1951 	return (&cp->ri_addr);
1952 }
1953 
1954 static bool_t
1955 rfs4_clntip_expiry(rfs4_entry_t u_entry)
1956 {
1957 	rfs4_clntip_t *cp = (rfs4_clntip_t *)u_entry;
1958 
1959 	if (rfs4_dbe_is_invalid(cp->ri_dbe))
1960 		return (TRUE);
1961 	return (FALSE);
1962 }
1963 
/*
 * Destroy callback of the ClntIP table.  Intentionally empty:
 * rfs4_clntip_create() attaches nothing to the entry that would need
 * to be released here.
 */
/* ARGSUSED */
static void
rfs4_clntip_destroy(rfs4_entry_t u_entry)
{
}
1969 
1970 static bool_t
1971 rfs4_clntip_create(rfs4_entry_t u_entry, void *arg)
1972 {
1973 	rfs4_clntip_t *cp = (rfs4_clntip_t *)u_entry;
1974 	struct sockaddr *ca = (struct sockaddr *)arg;
1975 
1976 	/* Copy client's IP address */
1977 	if (ca->sa_family == AF_INET)
1978 		bcopy(ca, &cp->ri_addr, sizeof (struct sockaddr_in));
1979 	else if (ca->sa_family == AF_INET6)
1980 		bcopy(ca, &cp->ri_addr, sizeof (struct sockaddr_in6));
1981 	else
1982 		return (FALSE);
1983 	cp->ri_no_referrals = 1;
1984 
1985 	return (TRUE);
1986 }
1987 
1988 rfs4_clntip_t *
1989 rfs4_find_clntip(struct sockaddr *addr, bool_t *create)
1990 {
1991 	rfs4_clntip_t *cp;
1992 	nfs4_srv_t *nsrv4;
1993 
1994 	nsrv4 = nfs4_get_srv();
1995 
1996 	rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
1997 
1998 	cp = (rfs4_clntip_t *)rfs4_dbsearch(nsrv4->rfs4_clntip_idx, addr,
1999 	    create, addr, RFS4_DBS_VALID);
2000 
2001 	rw_exit(&nsrv4->rfs4_findclient_lock);
2002 
2003 	return (cp);
2004 }
2005 
2006 void
2007 rfs4_invalidate_clntip(struct sockaddr *addr)
2008 {
2009 	rfs4_clntip_t *cp;
2010 	bool_t create = FALSE;
2011 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2012 
2013 	rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
2014 
2015 	cp = (rfs4_clntip_t *)rfs4_dbsearch(nsrv4->rfs4_clntip_idx, addr,
2016 	    &create, NULL, RFS4_DBS_VALID);
2017 	if (cp == NULL) {
2018 		rw_exit(&nsrv4->rfs4_findclient_lock);
2019 		return;
2020 	}
2021 	rfs4_dbe_invalidate(cp->ri_dbe);
2022 	rfs4_dbe_rele(cp->ri_dbe);
2023 
2024 	rw_exit(&nsrv4->rfs4_findclient_lock);
2025 }
2026 
2027 bool_t
2028 rfs4_lease_expired(rfs4_client_t *cp)
2029 {
2030 	bool_t rc;
2031 
2032 	rfs4_dbe_lock(cp->rc_dbe);
2033 
2034 	/*
2035 	 * If the admin has executed clear_locks for this
2036 	 * client id, force expire will be set, so no need
2037 	 * to calculate anything because it's "outa here".
2038 	 */
2039 	if (cp->rc_forced_expire) {
2040 		rc = TRUE;
2041 	} else {
2042 		rc = (gethrestime_sec() - cp->rc_last_access > rfs4_lease_time);
2043 	}
2044 
2045 	/*
2046 	 * If the lease has expired we will also want
2047 	 * to remove any stable storage state data. So
2048 	 * mark the client id accordingly.
2049 	 */
2050 	if (!cp->rc_ss_remove)
2051 		cp->rc_ss_remove = (rc == TRUE);
2052 
2053 	rfs4_dbe_unlock(cp->rc_dbe);
2054 
2055 	return (rc);
2056 }
2057 
2058 void
2059 rfs4_update_lease(rfs4_client_t *cp)
2060 {
2061 	rfs4_dbe_lock(cp->rc_dbe);
2062 	if (!cp->rc_forced_expire)
2063 		cp->rc_last_access = gethrestime_sec();
2064 	rfs4_dbe_unlock(cp->rc_dbe);
2065 }
2066 
2067 
2068 static bool_t
2069 EQOPENOWNER(open_owner4 *a, open_owner4 *b)
2070 {
2071 	bool_t rc;
2072 
2073 	if (a->clientid != b->clientid)
2074 		return (FALSE);
2075 
2076 	if (a->owner_len != b->owner_len)
2077 		return (FALSE);
2078 
2079 	rc = (bcmp(a->owner_val, b->owner_val, a->owner_len) == 0);
2080 
2081 	return (rc);
2082 }
2083 
2084 static uint_t
2085 openowner_hash(void *key)
2086 {
2087 	int i;
2088 	open_owner4 *openowner = key;
2089 	uint_t hash = 0;
2090 
2091 	for (i = 0; i < openowner->owner_len; i++) {
2092 		hash <<= 4;
2093 		hash += (uint_t)openowner->owner_val[i];
2094 	}
2095 	hash += (uint_t)openowner->clientid;
2096 	hash |= (openowner->clientid >> 32);
2097 
2098 	return (hash);
2099 }
2100 
2101 static bool_t
2102 openowner_compare(rfs4_entry_t u_entry, void *key)
2103 {
2104 	rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2105 	open_owner4 *arg = key;
2106 
2107 	return (EQOPENOWNER(&oo->ro_owner, arg));
2108 }
2109 
2110 void *
2111 openowner_mkkey(rfs4_entry_t u_entry)
2112 {
2113 	rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2114 
2115 	return (&oo->ro_owner);
2116 }
2117 
/*
 * Expiry check for open owner entries.  Always answers TRUE; the entry
 * argument is not examined.
 */
/* ARGSUSED */
static bool_t
rfs4_openowner_expiry(rfs4_entry_t u_entry)
{
	/* openstateid held us and did all needed delay */
	return (TRUE);
}
2125 
2126 static void
2127 rfs4_openowner_destroy(rfs4_entry_t u_entry)
2128 {
2129 	rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2130 
2131 	/* Remove open owner from client's lists of open owners */
2132 	rfs4_dbe_lock(oo->ro_client->rc_dbe);
2133 	list_remove(&oo->ro_client->rc_openownerlist, oo);
2134 	rfs4_dbe_unlock(oo->ro_client->rc_dbe);
2135 
2136 	/* One less reference to the client */
2137 	rfs4_client_rele(oo->ro_client);
2138 	oo->ro_client = NULL;
2139 
2140 	/* Free the last reply for this lock owner */
2141 	rfs4_free_reply(&oo->ro_reply);
2142 
2143 	if (oo->ro_reply_fh.nfs_fh4_val) {
2144 		kmem_free(oo->ro_reply_fh.nfs_fh4_val,
2145 		    oo->ro_reply_fh.nfs_fh4_len);
2146 		oo->ro_reply_fh.nfs_fh4_val = NULL;
2147 		oo->ro_reply_fh.nfs_fh4_len = 0;
2148 	}
2149 
2150 	rfs4_sw_destroy(&oo->ro_sw);
2151 	list_destroy(&oo->ro_statelist);
2152 
2153 	/* Free the lock owner id */
2154 	kmem_free(oo->ro_owner.owner_val, oo->ro_owner.owner_len);
2155 }
2156 
/*
 * Release the open owner's database entry via rfs4_dbe_rele().
 */
void
rfs4_openowner_rele(rfs4_openowner_t *oo)
{
	rfs4_dbe_rele(oo->ro_dbe);
}
2162 
2163 static bool_t
2164 rfs4_openowner_create(rfs4_entry_t u_entry, void *arg)
2165 {
2166 	rfs4_openowner_t *oo = (rfs4_openowner_t *)u_entry;
2167 	rfs4_openowner_t *argp = (rfs4_openowner_t *)arg;
2168 	open_owner4 *openowner = &argp->ro_owner;
2169 	seqid4 seqid = argp->ro_open_seqid;
2170 	rfs4_client_t *cp;
2171 	bool_t create = FALSE;
2172 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2173 
2174 	rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
2175 
2176 	cp = (rfs4_client_t *)rfs4_dbsearch(nsrv4->rfs4_clientid_idx,
2177 	    &openowner->clientid,
2178 	    &create, NULL, RFS4_DBS_VALID);
2179 
2180 	rw_exit(&nsrv4->rfs4_findclient_lock);
2181 
2182 	if (cp == NULL)
2183 		return (FALSE);
2184 
2185 	oo->ro_reply_fh.nfs_fh4_len = 0;
2186 	oo->ro_reply_fh.nfs_fh4_val = NULL;
2187 
2188 	oo->ro_owner.clientid = openowner->clientid;
2189 	oo->ro_owner.owner_val =
2190 	    kmem_alloc(openowner->owner_len, KM_SLEEP);
2191 
2192 	bcopy(openowner->owner_val,
2193 	    oo->ro_owner.owner_val, openowner->owner_len);
2194 
2195 	oo->ro_owner.owner_len = openowner->owner_len;
2196 
2197 	oo->ro_need_confirm = TRUE;
2198 
2199 	rfs4_sw_init(&oo->ro_sw);
2200 
2201 	oo->ro_open_seqid = seqid;
2202 	bzero(&oo->ro_reply, sizeof (nfs_resop4));
2203 	oo->ro_client = cp;
2204 	oo->ro_cr_set = NULL;
2205 
2206 	list_create(&oo->ro_statelist, sizeof (rfs4_state_t),
2207 	    offsetof(rfs4_state_t, rs_node));
2208 
2209 	/* Insert openowner into client's open owner list */
2210 	rfs4_dbe_lock(cp->rc_dbe);
2211 	list_insert_tail(&cp->rc_openownerlist, oo);
2212 	rfs4_dbe_unlock(cp->rc_dbe);
2213 
2214 	return (TRUE);
2215 }
2216 
2217 rfs4_openowner_t *
2218 rfs4_findopenowner(open_owner4 *openowner, bool_t *create, seqid4 seqid)
2219 {
2220 	rfs4_openowner_t *oo;
2221 	rfs4_openowner_t arg;
2222 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2223 
2224 	arg.ro_owner = *openowner;
2225 	arg.ro_open_seqid = seqid;
2226 	/* CSTYLED */
2227 	oo = (rfs4_openowner_t *)rfs4_dbsearch(nsrv4->rfs4_openowner_idx, openowner,
2228 	    create, &arg, RFS4_DBS_VALID);
2229 
2230 	return (oo);
2231 }
2232 
2233 void
2234 rfs4_update_open_sequence(rfs4_openowner_t *oo)
2235 {
2236 
2237 	rfs4_dbe_lock(oo->ro_dbe);
2238 
2239 	oo->ro_open_seqid++;
2240 
2241 	rfs4_dbe_unlock(oo->ro_dbe);
2242 }
2243 
2244 void
2245 rfs4_update_open_resp(rfs4_openowner_t *oo, nfs_resop4 *resp, nfs_fh4 *fh)
2246 {
2247 
2248 	rfs4_dbe_lock(oo->ro_dbe);
2249 
2250 	rfs4_free_reply(&oo->ro_reply);
2251 
2252 	rfs4_copy_reply(&oo->ro_reply, resp);
2253 
2254 	/* Save the filehandle if provided and free if not used */
2255 	if (resp->nfs_resop4_u.opopen.status == NFS4_OK &&
2256 	    fh && fh->nfs_fh4_len) {
2257 		if (oo->ro_reply_fh.nfs_fh4_val == NULL)
2258 			oo->ro_reply_fh.nfs_fh4_val =
2259 			    kmem_alloc(fh->nfs_fh4_len, KM_SLEEP);
2260 		nfs_fh4_copy(fh, &oo->ro_reply_fh);
2261 	} else {
2262 		if (oo->ro_reply_fh.nfs_fh4_val) {
2263 			kmem_free(oo->ro_reply_fh.nfs_fh4_val,
2264 			    oo->ro_reply_fh.nfs_fh4_len);
2265 			oo->ro_reply_fh.nfs_fh4_val = NULL;
2266 			oo->ro_reply_fh.nfs_fh4_len = 0;
2267 		}
2268 	}
2269 
2270 	rfs4_dbe_unlock(oo->ro_dbe);
2271 }
2272 
2273 static bool_t
2274 lockowner_compare(rfs4_entry_t u_entry, void *key)
2275 {
2276 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2277 	lock_owner4 *b = (lock_owner4 *)key;
2278 
2279 	if (lo->rl_owner.clientid != b->clientid)
2280 		return (FALSE);
2281 
2282 	if (lo->rl_owner.owner_len != b->owner_len)
2283 		return (FALSE);
2284 
2285 	return (bcmp(lo->rl_owner.owner_val, b->owner_val,
2286 	    lo->rl_owner.owner_len) == 0);
2287 }
2288 
2289 void *
2290 lockowner_mkkey(rfs4_entry_t u_entry)
2291 {
2292 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2293 
2294 	return (&lo->rl_owner);
2295 }
2296 
2297 static uint32_t
2298 lockowner_hash(void *key)
2299 {
2300 	int i;
2301 	lock_owner4 *lockowner = key;
2302 	uint_t hash = 0;
2303 
2304 	for (i = 0; i < lockowner->owner_len; i++) {
2305 		hash <<= 4;
2306 		hash += (uint_t)lockowner->owner_val[i];
2307 	}
2308 	hash += (uint_t)lockowner->clientid;
2309 	hash |= (lockowner->clientid >> 32);
2310 
2311 	return (hash);
2312 }
2313 
/*
 * The key is a pid stored directly in the pointer value; the hash is
 * that value truncated to 32 bits.
 */
static uint32_t
pid_hash(void *key)
{
	uintptr_t raw = (uintptr_t)key;

	return ((uint32_t)raw);
}
2319 
2320 static void *
2321 pid_mkkey(rfs4_entry_t u_entry)
2322 {
2323 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2324 
2325 	return ((void *)(uintptr_t)lo->rl_pid);
2326 }
2327 
2328 static bool_t
2329 pid_compare(rfs4_entry_t u_entry, void *key)
2330 {
2331 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2332 
2333 	return (lo->rl_pid == (pid_t)(uintptr_t)key);
2334 }
2335 
2336 static void
2337 rfs4_lockowner_destroy(rfs4_entry_t u_entry)
2338 {
2339 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2340 
2341 	/* Free the lock owner id */
2342 	kmem_free(lo->rl_owner.owner_val, lo->rl_owner.owner_len);
2343 	rfs4_client_rele(lo->rl_client);
2344 }
2345 
/*
 * Release the lock owner's database entry via rfs4_dbe_rele().
 */
void
rfs4_lockowner_rele(rfs4_lockowner_t *lo)
{
	rfs4_dbe_rele(lo->rl_dbe);
}
2351 
/*
 * Expiry check for lock owner entries.  Always answers TRUE; the entry
 * argument is not examined.
 */
/* ARGSUSED */
static bool_t
rfs4_lockowner_expiry(rfs4_entry_t u_entry)
{
	/*
	 * Since expiry is called with no other references on
	 * this struct, go ahead and have it removed.
	 */
	return (TRUE);
}
2362 
2363 static bool_t
2364 rfs4_lockowner_create(rfs4_entry_t u_entry, void *arg)
2365 {
2366 	rfs4_lockowner_t *lo = (rfs4_lockowner_t *)u_entry;
2367 	lock_owner4 *lockowner = (lock_owner4 *)arg;
2368 	rfs4_client_t *cp;
2369 	bool_t create = FALSE;
2370 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2371 
2372 	rw_enter(&nsrv4->rfs4_findclient_lock, RW_READER);
2373 
2374 	cp = (rfs4_client_t *)rfs4_dbsearch(nsrv4->rfs4_clientid_idx,
2375 	    &lockowner->clientid,
2376 	    &create, NULL, RFS4_DBS_VALID);
2377 
2378 	rw_exit(&nsrv4->rfs4_findclient_lock);
2379 
2380 	if (cp == NULL)
2381 		return (FALSE);
2382 
2383 	/* Reference client */
2384 	lo->rl_client = cp;
2385 	lo->rl_owner.clientid = lockowner->clientid;
2386 	lo->rl_owner.owner_val = kmem_alloc(lockowner->owner_len, KM_SLEEP);
2387 	bcopy(lockowner->owner_val, lo->rl_owner.owner_val,
2388 	    lockowner->owner_len);
2389 	lo->rl_owner.owner_len = lockowner->owner_len;
2390 	lo->rl_pid = rfs4_dbe_getid(lo->rl_dbe);
2391 
2392 	return (TRUE);
2393 }
2394 
2395 rfs4_lockowner_t *
2396 rfs4_findlockowner(lock_owner4 *lockowner, bool_t *create)
2397 {
2398 	rfs4_lockowner_t *lo;
2399 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2400 
2401 	/* CSTYLED */
2402 	lo = (rfs4_lockowner_t *)rfs4_dbsearch(nsrv4->rfs4_lockowner_idx, lockowner,
2403 	    create, lockowner, RFS4_DBS_VALID);
2404 
2405 	return (lo);
2406 }
2407 
2408 rfs4_lockowner_t *
2409 rfs4_findlockowner_by_pid(pid_t pid)
2410 {
2411 	rfs4_lockowner_t *lo;
2412 	bool_t create = FALSE;
2413 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2414 
2415 	lo = (rfs4_lockowner_t *)rfs4_dbsearch(nsrv4->rfs4_lockowner_pid_idx,
2416 	    (void *)(uintptr_t)pid, &create, NULL, RFS4_DBS_VALID);
2417 
2418 	return (lo);
2419 }
2420 
2421 
/*
 * Hash a file key by applying ADDRHASH() to the key pointer itself.
 */
static uint32_t
file_hash(void *key)
{
	return (ADDRHASH(key));
}
2427 
2428 static void *
2429 file_mkkey(rfs4_entry_t u_entry)
2430 {
2431 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2432 
2433 	return (fp->rf_vp);
2434 }
2435 
2436 static bool_t
2437 file_compare(rfs4_entry_t u_entry, void *key)
2438 {
2439 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2440 
2441 	return (fp->rf_vp == (vnode_t *)key);
2442 }
2443 
2444 static void
2445 rfs4_file_destroy(rfs4_entry_t u_entry)
2446 {
2447 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2448 
2449 	list_destroy(&fp->rf_delegstatelist);
2450 
2451 	if (fp->rf_filehandle.nfs_fh4_val)
2452 		kmem_free(fp->rf_filehandle.nfs_fh4_val,
2453 		    fp->rf_filehandle.nfs_fh4_len);
2454 	cv_destroy(fp->rf_dinfo.rd_recall_cv);
2455 	if (fp->rf_vp) {
2456 		vnode_t *vp = fp->rf_vp;
2457 
2458 		mutex_enter(&vp->v_vsd_lock);
2459 		(void) vsd_set(vp, nfs4_srv_vkey, NULL);
2460 		mutex_exit(&vp->v_vsd_lock);
2461 		VN_RELE(vp);
2462 		fp->rf_vp = NULL;
2463 	}
2464 	rw_destroy(&fp->rf_file_rwlock);
2465 }
2466 
/*
 * Release the underlying dbe struct only; the per-file rwlock
 * (rf_file_rwlock) is not touched here.
 */
void
rfs4_file_rele(rfs4_file_t *fp)
{
	rfs4_dbe_rele(fp->rf_dbe);
}
2475 
2476 typedef struct {
2477     vnode_t *vp;
2478     nfs_fh4 *fh;
2479 } rfs4_fcreate_arg;
2480 
2481 static bool_t
2482 rfs4_file_create(rfs4_entry_t u_entry, void *arg)
2483 {
2484 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
2485 	rfs4_fcreate_arg *ap = (rfs4_fcreate_arg *)arg;
2486 	vnode_t *vp = ap->vp;
2487 	nfs_fh4 *fh = ap->fh;
2488 
2489 	VN_HOLD(vp);
2490 
2491 	fp->rf_filehandle.nfs_fh4_len = 0;
2492 	fp->rf_filehandle.nfs_fh4_val = NULL;
2493 	ASSERT(fh && fh->nfs_fh4_len);
2494 	if (fh && fh->nfs_fh4_len) {
2495 		fp->rf_filehandle.nfs_fh4_val =
2496 		    kmem_alloc(fh->nfs_fh4_len, KM_SLEEP);
2497 		nfs_fh4_copy(fh, &fp->rf_filehandle);
2498 	}
2499 	fp->rf_vp = vp;
2500 
2501 	list_create(&fp->rf_delegstatelist, sizeof (rfs4_deleg_state_t),
2502 	    offsetof(rfs4_deleg_state_t, rds_node));
2503 
2504 	fp->rf_share_deny = fp->rf_share_access = fp->rf_access_read = 0;
2505 	fp->rf_access_write = fp->rf_deny_read = fp->rf_deny_write = 0;
2506 
2507 	mutex_init(fp->rf_dinfo.rd_recall_lock, NULL, MUTEX_DEFAULT, NULL);
2508 	cv_init(fp->rf_dinfo.rd_recall_cv, NULL, CV_DEFAULT, NULL);
2509 
2510 	fp->rf_dinfo.rd_dtype = OPEN_DELEGATE_NONE;
2511 
2512 	rw_init(&fp->rf_file_rwlock, NULL, RW_DEFAULT, NULL);
2513 
2514 	mutex_enter(&vp->v_vsd_lock);
2515 	VERIFY(vsd_set(vp, nfs4_srv_vkey, (void *)fp) == 0);
2516 	mutex_exit(&vp->v_vsd_lock);
2517 
2518 	return (TRUE);
2519 }
2520 
2521 rfs4_file_t *
2522 rfs4_findfile(vnode_t *vp, nfs_fh4 *fh, bool_t *create)
2523 {
2524 	rfs4_file_t *fp;
2525 	rfs4_fcreate_arg arg;
2526 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2527 
2528 	arg.vp = vp;
2529 	arg.fh = fh;
2530 
2531 	if (*create == TRUE)
2532 		/* CSTYLED */
2533 		fp = (rfs4_file_t *)rfs4_dbsearch(nsrv4->rfs4_file_idx, vp, create,
2534 		    &arg, RFS4_DBS_VALID);
2535 	else {
2536 		mutex_enter(&vp->v_vsd_lock);
2537 		fp = (rfs4_file_t *)vsd_get(vp, nfs4_srv_vkey);
2538 		if (fp) {
2539 			rfs4_dbe_lock(fp->rf_dbe);
2540 			if (rfs4_dbe_is_invalid(fp->rf_dbe) ||
2541 			    (rfs4_dbe_refcnt(fp->rf_dbe) == 0)) {
2542 				rfs4_dbe_unlock(fp->rf_dbe);
2543 				fp = NULL;
2544 			} else {
2545 				rfs4_dbe_hold(fp->rf_dbe);
2546 				rfs4_dbe_unlock(fp->rf_dbe);
2547 			}
2548 		}
2549 		mutex_exit(&vp->v_vsd_lock);
2550 	}
2551 	return (fp);
2552 }
2553 
2554 /*
2555  * Find a file in the db and once it is located, take the rw lock.
2556  * Need to check the vnode pointer and if it does not exist (it was
2557  * removed between the db location and check) redo the find.  This
2558  * assumes that a file struct that has a NULL vnode pointer is marked
2559  * at 'invalid' and will not be found in the db the second time
2560  * around.
2561  */
2562 rfs4_file_t *
2563 rfs4_findfile_withlock(vnode_t *vp, nfs_fh4 *fh, bool_t *create)
2564 {
2565 	rfs4_file_t *fp;
2566 	rfs4_fcreate_arg arg;
2567 	bool_t screate = *create;
2568 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2569 
2570 	if (screate == FALSE) {
2571 		mutex_enter(&vp->v_vsd_lock);
2572 		fp = (rfs4_file_t *)vsd_get(vp, nfs4_srv_vkey);
2573 		if (fp) {
2574 			rfs4_dbe_lock(fp->rf_dbe);
2575 			if (rfs4_dbe_is_invalid(fp->rf_dbe) ||
2576 			    (rfs4_dbe_refcnt(fp->rf_dbe) == 0)) {
2577 				rfs4_dbe_unlock(fp->rf_dbe);
2578 				mutex_exit(&vp->v_vsd_lock);
2579 				fp = NULL;
2580 			} else {
2581 				rfs4_dbe_hold(fp->rf_dbe);
2582 				rfs4_dbe_unlock(fp->rf_dbe);
2583 				mutex_exit(&vp->v_vsd_lock);
2584 				rw_enter(&fp->rf_file_rwlock, RW_WRITER);
2585 				if (fp->rf_vp == NULL) {
2586 					rw_exit(&fp->rf_file_rwlock);
2587 					rfs4_file_rele(fp);
2588 					fp = NULL;
2589 				}
2590 			}
2591 		} else {
2592 			mutex_exit(&vp->v_vsd_lock);
2593 		}
2594 	} else {
2595 retry:
2596 		arg.vp = vp;
2597 		arg.fh = fh;
2598 
2599 		fp = (rfs4_file_t *)rfs4_dbsearch(nsrv4->rfs4_file_idx, vp,
2600 		    create, &arg, RFS4_DBS_VALID);
2601 		if (fp != NULL) {
2602 			rw_enter(&fp->rf_file_rwlock, RW_WRITER);
2603 			if (fp->rf_vp == NULL) {
2604 				rw_exit(&fp->rf_file_rwlock);
2605 				rfs4_file_rele(fp);
2606 				*create = screate;
2607 				goto retry;
2608 			}
2609 		}
2610 	}
2611 
2612 	return (fp);
2613 }
2614 
2615 static uint32_t
2616 lo_state_hash(void *key)
2617 {
2618 	stateid_t *id = key;
2619 
2620 	return (id->bits.ident+id->bits.pid);
2621 }
2622 
2623 static bool_t
2624 lo_state_compare(rfs4_entry_t u_entry, void *key)
2625 {
2626 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2627 	stateid_t *id = key;
2628 	bool_t rc;
2629 
2630 	rc = (lsp->rls_lockid.bits.boottime == id->bits.boottime &&
2631 	    lsp->rls_lockid.bits.type == id->bits.type &&
2632 	    lsp->rls_lockid.bits.ident == id->bits.ident &&
2633 	    lsp->rls_lockid.bits.pid == id->bits.pid);
2634 
2635 	return (rc);
2636 }
2637 
2638 static void *
2639 lo_state_mkkey(rfs4_entry_t u_entry)
2640 {
2641 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2642 
2643 	return (&lsp->rls_lockid);
2644 }
2645 
2646 static bool_t
2647 rfs4_lo_state_expiry(rfs4_entry_t u_entry)
2648 {
2649 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2650 
2651 	if (rfs4_dbe_is_invalid(lsp->rls_dbe))
2652 		return (TRUE);
2653 	if (lsp->rls_state->rs_closed)
2654 		return (TRUE);
2655 	return ((gethrestime_sec() -
2656 	    lsp->rls_state->rs_owner->ro_client->rc_last_access
2657 	    > rfs4_lease_time));
2658 }
2659 
2660 static void
2661 rfs4_lo_state_destroy(rfs4_entry_t u_entry)
2662 {
2663 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2664 
2665 	rfs4_dbe_lock(lsp->rls_state->rs_dbe);
2666 	list_remove(&lsp->rls_state->rs_lostatelist, lsp);
2667 	rfs4_dbe_unlock(lsp->rls_state->rs_dbe);
2668 
2669 	rfs4_sw_destroy(&lsp->rls_sw);
2670 
2671 	/* Make sure to release the file locks */
2672 	if (lsp->rls_locks_cleaned == FALSE) {
2673 		lsp->rls_locks_cleaned = TRUE;
2674 		if (lsp->rls_locker->rl_client->rc_sysidt != LM_NOSYSID) {
2675 			/* Is the PxFS kernel module loaded? */
2676 			if (lm_remove_file_locks != NULL) {
2677 				int new_sysid;
2678 
2679 				/* Encode the cluster nodeid in new sysid */
2680 				new_sysid =
2681 				    lsp->rls_locker->rl_client->rc_sysidt;
2682 				lm_set_nlmid_flk(&new_sysid);
2683 
2684 				/*
2685 				 * This PxFS routine removes file locks for a
2686 				 * client over all nodes of a cluster.
2687 				 */
2688 				DTRACE_PROBE1(nfss_i_clust_rm_lck,
2689 				    int, new_sysid);
2690 				(*lm_remove_file_locks)(new_sysid);
2691 			} else {
2692 				(void) cleanlocks(
2693 				    lsp->rls_state->rs_finfo->rf_vp,
2694 				    lsp->rls_locker->rl_pid,
2695 				    lsp->rls_locker->rl_client->rc_sysidt);
2696 			}
2697 		}
2698 	}
2699 
2700 	/* Free the last reply for this state */
2701 	rfs4_free_reply(&lsp->rls_reply);
2702 
2703 	rfs4_lockowner_rele(lsp->rls_locker);
2704 	lsp->rls_locker = NULL;
2705 
2706 	rfs4_state_rele_nounlock(lsp->rls_state);
2707 	lsp->rls_state = NULL;
2708 }
2709 
2710 static bool_t
2711 rfs4_lo_state_create(rfs4_entry_t u_entry, void *arg)
2712 {
2713 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2714 	rfs4_lo_state_t *argp = (rfs4_lo_state_t *)arg;
2715 	rfs4_lockowner_t *lo = argp->rls_locker;
2716 	rfs4_state_t *sp = argp->rls_state;
2717 
2718 	lsp->rls_state = sp;
2719 
2720 	lsp->rls_lockid = sp->rs_stateid;
2721 	lsp->rls_lockid.bits.type = LOCKID;
2722 	lsp->rls_lockid.bits.chgseq = 0;
2723 	lsp->rls_lockid.bits.pid = lo->rl_pid;
2724 
2725 	lsp->rls_locks_cleaned = FALSE;
2726 	lsp->rls_lock_completed = FALSE;
2727 
2728 	rfs4_sw_init(&lsp->rls_sw);
2729 
2730 	/* Attached the supplied lock owner */
2731 	rfs4_dbe_hold(lo->rl_dbe);
2732 	lsp->rls_locker = lo;
2733 
2734 	rfs4_dbe_lock(sp->rs_dbe);
2735 	list_insert_tail(&sp->rs_lostatelist, lsp);
2736 	rfs4_dbe_hold(sp->rs_dbe);
2737 	rfs4_dbe_unlock(sp->rs_dbe);
2738 
2739 	return (TRUE);
2740 }
2741 
/*
 * Release a lock state entry.  When 'unlock_fp' is TRUE the rwlock of
 * the file behind the associated open state is exited first.
 */
void
rfs4_lo_state_rele(rfs4_lo_state_t *lsp, bool_t unlock_fp)
{
	if (unlock_fp == TRUE)
		rw_exit(&lsp->rls_state->rs_finfo->rf_file_rwlock);
	rfs4_dbe_rele(lsp->rls_dbe);
}
2749 
2750 static rfs4_lo_state_t *
2751 rfs4_findlo_state(stateid_t *id, bool_t lock_fp)
2752 {
2753 	rfs4_lo_state_t *lsp;
2754 	bool_t create = FALSE;
2755 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2756 
2757 	lsp = (rfs4_lo_state_t *)rfs4_dbsearch(nsrv4->rfs4_lo_state_idx, id,
2758 	    &create, NULL, RFS4_DBS_VALID);
2759 	if (lock_fp == TRUE && lsp != NULL)
2760 		rw_enter(&lsp->rls_state->rs_finfo->rf_file_rwlock, RW_READER);
2761 
2762 	return (lsp);
2763 }
2764 
2765 
2766 static uint32_t
2767 lo_state_lo_hash(void *key)
2768 {
2769 	rfs4_lo_state_t *lsp = key;
2770 
2771 	return (ADDRHASH(lsp->rls_locker) ^ ADDRHASH(lsp->rls_state));
2772 }
2773 
2774 static bool_t
2775 lo_state_lo_compare(rfs4_entry_t u_entry, void *key)
2776 {
2777 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
2778 	rfs4_lo_state_t *keyp = key;
2779 
2780 	return (keyp->rls_locker == lsp->rls_locker &&
2781 	    keyp->rls_state == lsp->rls_state);
2782 }
2783 
/*
 * The entry itself serves as its key in this index.
 */
static void *
lo_state_lo_mkkey(rfs4_entry_t u_entry)
{
	return (u_entry);
}
2789 
2790 rfs4_lo_state_t *
2791 rfs4_findlo_state_by_owner(rfs4_lockowner_t *lo, rfs4_state_t *sp,
2792     bool_t *create)
2793 {
2794 	rfs4_lo_state_t *lsp;
2795 	rfs4_lo_state_t arg;
2796 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
2797 
2798 	arg.rls_locker = lo;
2799 	arg.rls_state = sp;
2800 
2801 	lsp = (rfs4_lo_state_t *)rfs4_dbsearch(nsrv4->rfs4_lo_state_owner_idx,
2802 	    &arg, create, &arg, RFS4_DBS_VALID);
2803 
2804 	return (lsp);
2805 }
2806 
2807 static stateid_t
2808 get_stateid(id_t eid)
2809 {
2810 	stateid_t id;
2811 	nfs4_srv_t *nsrv4;
2812 
2813 	nsrv4 = nfs4_get_srv();
2814 
2815 	id.bits.boottime = nsrv4->rfs4_start_time;
2816 	id.bits.ident = eid;
2817 	id.bits.chgseq = 0;
2818 	id.bits.type = 0;
2819 	id.bits.pid = 0;
2820 
2821 	/*
2822 	 * If we are booted as a cluster node, embed our nodeid.
2823 	 * We've already done sanity checks in rfs4_client_create() so no
2824 	 * need to repeat them here.
2825 	 */
2826 	id.bits.clnodeid = (cluster_bootflags & CLUSTER_BOOTED) ?
2827 	    clconf_get_nodeid() : 0;
2828 
2829 	return (id);
2830 }
2831 
/*
 * For use only when booted as a cluster node (asserted below).
 * Returns non-zero if the nodeid embedded in the stateid differs from
 * this node's id, i.e. the stateid was generated on another node.
 */
static int
foreign_stateid(stateid_t *id)
{
	ASSERT(cluster_bootflags & CLUSTER_BOOTED);
	return (id->bits.clnodeid != (uint32_t)clconf_get_nodeid());
}
2843 
/*
 * For use only when booted as a cluster node (asserted below).
 * Returns non-zero if the nodeid embedded in the clientid, i.e. the
 * bits of c_id above CLUSTER_NODEID_SHIFT, differs from this node's id,
 * meaning the clientid was generated on another node.
 */
static int
foreign_clientid(cid *cidp)
{
	ASSERT(cluster_bootflags & CLUSTER_BOOTED);
	return (cidp->impl_id.c_id >> CLUSTER_NODEID_SHIFT !=
	    (uint32_t)clconf_get_nodeid());
}
2856 
/*
 * For use only when booted as a cluster node.
 * Embed our cluster nodeid into the clientid by ORing it into the bits
 * of c_id above CLUSTER_NODEID_SHIFT.
 */
static void
embed_nodeid(cid *cidp)
{
	int clnodeid;
	/*
	 * Currently, our state tables are small enough that their
	 * ids will leave enough bits free for the nodeid. If the
	 * tables become larger, we mustn't overwrite the id.
	 * Equally, we only have room for so many bits of nodeid, so
	 * must check that too.
	 */
	ASSERT(cluster_bootflags & CLUSTER_BOOTED);
	/* The nodeid bits must still be unused */
	ASSERT(cidp->impl_id.c_id >> CLUSTER_NODEID_SHIFT == 0);
	clnodeid = clconf_get_nodeid();
	ASSERT(clnodeid <= CLUSTER_MAX_NODEID);
	ASSERT(clnodeid != NODEID_UNKNOWN);
	cidp->impl_id.c_id |= (clnodeid << CLUSTER_NODEID_SHIFT);
}
2879 
2880 static uint32_t
2881 state_hash(void *key)
2882 {
2883 	stateid_t *ip = (stateid_t *)key;
2884 
2885 	return (ip->bits.ident);
2886 }
2887 
2888 static bool_t
2889 state_compare(rfs4_entry_t u_entry, void *key)
2890 {
2891 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2892 	stateid_t *id = (stateid_t *)key;
2893 	bool_t rc;
2894 
2895 	rc = (sp->rs_stateid.bits.boottime == id->bits.boottime &&
2896 	    sp->rs_stateid.bits.ident == id->bits.ident);
2897 
2898 	return (rc);
2899 }
2900 
2901 static void *
2902 state_mkkey(rfs4_entry_t u_entry)
2903 {
2904 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2905 
2906 	return (&sp->rs_stateid);
2907 }
2908 
2909 static void
2910 rfs4_state_destroy(rfs4_entry_t u_entry)
2911 {
2912 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
2913 
2914 	/* remove from openowner list */
2915 	rfs4_dbe_lock(sp->rs_owner->ro_dbe);
2916 	list_remove(&sp->rs_owner->ro_statelist, sp);
2917 	rfs4_dbe_unlock(sp->rs_owner->ro_dbe);
2918 
2919 	list_destroy(&sp->rs_lostatelist);
2920 
2921 	/* release any share locks for this stateid if it's still open */
2922 	if (!sp->rs_closed) {
2923 		rfs4_dbe_lock(sp->rs_dbe);
2924 		(void) rfs4_unshare(sp);
2925 		rfs4_dbe_unlock(sp->rs_dbe);
2926 	}
2927 
2928 	/* Were done with the file */
2929 	rfs4_file_rele(sp->rs_finfo);
2930 	sp->rs_finfo = NULL;
2931 
2932 	/* And now with the openowner */
2933 	rfs4_openowner_rele(sp->rs_owner);
2934 	sp->rs_owner = NULL;
2935 }
2936 
/*
 * Release the open state's database entry without touching the file's
 * rwlock (compare rfs4_state_rele()).
 */
static void
rfs4_state_rele_nounlock(rfs4_state_t *sp)
{
	rfs4_dbe_rele(sp->rs_dbe);
}
2942 
/*
 * Exit the rwlock of the file behind this open state, then release the
 * state's database entry.
 */
void
rfs4_state_rele(rfs4_state_t *sp)
{
	rw_exit(&sp->rs_finfo->rf_file_rwlock);
	rfs4_dbe_rele(sp->rs_dbe);
}
2949 
2950 static uint32_t
2951 deleg_hash(void *key)
2952 {
2953 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)key;
2954 
2955 	return (ADDRHASH(dsp->rds_client) ^ ADDRHASH(dsp->rds_finfo));
2956 }
2957 
2958 static bool_t
2959 deleg_compare(rfs4_entry_t u_entry, void *key)
2960 {
2961 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2962 	rfs4_deleg_state_t *kdsp = (rfs4_deleg_state_t *)key;
2963 
2964 	return (dsp->rds_client == kdsp->rds_client &&
2965 	    dsp->rds_finfo == kdsp->rds_finfo);
2966 }
2967 
/*
 * The entry itself serves as its key in this index.
 */
static void *
deleg_mkkey(rfs4_entry_t u_entry)
{
	return (u_entry);
}
2973 
2974 static uint32_t
2975 deleg_state_hash(void *key)
2976 {
2977 	stateid_t *ip = (stateid_t *)key;
2978 
2979 	return (ip->bits.ident);
2980 }
2981 
2982 static bool_t
2983 deleg_state_compare(rfs4_entry_t u_entry, void *key)
2984 {
2985 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
2986 	stateid_t *id = (stateid_t *)key;
2987 	bool_t rc;
2988 
2989 	if (id->bits.type != DELEGID)
2990 		return (FALSE);
2991 
2992 	rc = (dsp->rds_delegid.bits.boottime == id->bits.boottime &&
2993 	    dsp->rds_delegid.bits.ident == id->bits.ident);
2994 
2995 	return (rc);
2996 }
2997 
2998 static void *
2999 deleg_state_mkkey(rfs4_entry_t u_entry)
3000 {
3001 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
3002 
3003 	return (&dsp->rds_delegid);
3004 }
3005 
3006 static bool_t
3007 rfs4_deleg_state_expiry(rfs4_entry_t u_entry)
3008 {
3009 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
3010 
3011 	if (rfs4_dbe_is_invalid(dsp->rds_dbe))
3012 		return (TRUE);
3013 
3014 	if (dsp->rds_dtype == OPEN_DELEGATE_NONE)
3015 		return (TRUE);
3016 
3017 	if ((gethrestime_sec() - dsp->rds_client->rc_last_access
3018 	    > rfs4_lease_time)) {
3019 		rfs4_dbe_invalidate(dsp->rds_dbe);
3020 		return (TRUE);
3021 	}
3022 
3023 	return (FALSE);
3024 }
3025 
3026 static bool_t
3027 rfs4_deleg_state_create(rfs4_entry_t u_entry, void *argp)
3028 {
3029 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
3030 	rfs4_file_t *fp = ((rfs4_deleg_state_t *)argp)->rds_finfo;
3031 	rfs4_client_t *cp = ((rfs4_deleg_state_t *)argp)->rds_client;
3032 
3033 	rfs4_dbe_hold(fp->rf_dbe);
3034 	rfs4_dbe_hold(cp->rc_dbe);
3035 
3036 	dsp->rds_delegid = get_stateid(rfs4_dbe_getid(dsp->rds_dbe));
3037 	dsp->rds_delegid.bits.type = DELEGID;
3038 	dsp->rds_finfo = fp;
3039 	dsp->rds_client = cp;
3040 	dsp->rds_dtype = OPEN_DELEGATE_NONE;
3041 
3042 	dsp->rds_time_granted = gethrestime_sec();	/* observability */
3043 	dsp->rds_time_revoked = 0;
3044 
3045 	list_link_init(&dsp->rds_node);
3046 
3047 	return (TRUE);
3048 }
3049 
/*
 * Tear down a delegation state entry: return the delegation if
 * necessary, then release the file and the client it refers to.
 */
static void
rfs4_deleg_state_destroy(rfs4_entry_t u_entry)
{
	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;

	/* return delegation if necessary */
	rfs4_return_deleg(dsp, FALSE);

	/* We're done with the file */
	rfs4_file_rele(dsp->rds_finfo);
	dsp->rds_finfo = NULL;

	/* And now with the client */
	rfs4_client_rele(dsp->rds_client);
	dsp->rds_client = NULL;
}
3066 
3067 rfs4_deleg_state_t *
3068 rfs4_finddeleg(rfs4_state_t *sp, bool_t *create)
3069 {
3070 	rfs4_deleg_state_t ds, *dsp;
3071 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
3072 
3073 	ds.rds_client = sp->rs_owner->ro_client;
3074 	ds.rds_finfo = sp->rs_finfo;
3075 
3076 	dsp = (rfs4_deleg_state_t *)rfs4_dbsearch(nsrv4->rfs4_deleg_idx, &ds,
3077 	    create, &ds, RFS4_DBS_VALID);
3078 
3079 	return (dsp);
3080 }
3081 
3082 rfs4_deleg_state_t *
3083 rfs4_finddelegstate(stateid_t *id)
3084 {
3085 	rfs4_deleg_state_t *dsp;
3086 	bool_t create = FALSE;
3087 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
3088 
3089 	dsp = (rfs4_deleg_state_t *)rfs4_dbsearch(nsrv4->rfs4_deleg_state_idx,
3090 	    id, &create, NULL, RFS4_DBS_VALID);
3091 
3092 	return (dsp);
3093 }
3094 
/*
 * Release the delegation state's database entry via rfs4_dbe_rele().
 */
void
rfs4_deleg_state_rele(rfs4_deleg_state_t *dsp)
{
	rfs4_dbe_rele(dsp->rds_dbe);
}
3100 
3101 void
3102 rfs4_update_lock_sequence(rfs4_lo_state_t *lsp)
3103 {
3104 
3105 	rfs4_dbe_lock(lsp->rls_dbe);
3106 
3107 	/*
3108 	 * If we are skipping sequence id checking, this means that
3109 	 * this is the first lock request and therefore the sequence
3110 	 * id does not need to be updated.  This only happens on the
3111 	 * first lock request for a lockowner
3112 	 */
3113 	if (!lsp->rls_skip_seqid_check)
3114 		lsp->rls_seqid++;
3115 
3116 	rfs4_dbe_unlock(lsp->rls_dbe);
3117 }
3118 
/*
 * Replace the reply cached in the lock state with a copy of 'resp',
 * under the dbe lock.
 */
void
rfs4_update_lock_resp(rfs4_lo_state_t *lsp, nfs_resop4 *resp)
{

	rfs4_dbe_lock(lsp->rls_dbe);

	/* Free the previously cached reply before storing the new one */
	rfs4_free_reply(&lsp->rls_reply);

	rfs4_copy_reply(&lsp->rls_reply, resp);

	rfs4_dbe_unlock(lsp->rls_dbe);
}
3131 
3132 void
3133 rfs4_free_opens(rfs4_openowner_t *oo, bool_t invalidate,
3134     bool_t close_of_client)
3135 {
3136 	rfs4_state_t *sp;
3137 
3138 	rfs4_dbe_lock(oo->ro_dbe);
3139 
3140 	for (sp = list_head(&oo->ro_statelist); sp != NULL;
3141 	    sp = list_next(&oo->ro_statelist, sp)) {
3142 		rfs4_state_close(sp, FALSE, close_of_client, CRED());
3143 		if (invalidate == TRUE)
3144 			rfs4_dbe_invalidate(sp->rs_dbe);
3145 	}
3146 
3147 	rfs4_dbe_invalidate(oo->ro_dbe);
3148 	rfs4_dbe_unlock(oo->ro_dbe);
3149 }
3150 
3151 static uint32_t
3152 state_owner_file_hash(void *key)
3153 {
3154 	rfs4_state_t *sp = key;
3155 
3156 	return (ADDRHASH(sp->rs_owner) ^ ADDRHASH(sp->rs_finfo));
3157 }
3158 
3159 static bool_t
3160 state_owner_file_compare(rfs4_entry_t u_entry, void *key)
3161 {
3162 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3163 	rfs4_state_t *arg = key;
3164 
3165 	if (sp->rs_closed == TRUE)
3166 		return (FALSE);
3167 
3168 	return (arg->rs_owner == sp->rs_owner && arg->rs_finfo == sp->rs_finfo);
3169 }
3170 
/*
 * The entry itself serves as its key in this index.
 */
static void *
state_owner_file_mkkey(rfs4_entry_t u_entry)
{
	return (u_entry);
}
3176 
/*
 * Hash a file key by applying ADDRHASH() to the key pointer itself.
 */
static uint32_t
state_file_hash(void *key)
{
	return (ADDRHASH(key));
}
3182 
3183 static bool_t
3184 state_file_compare(rfs4_entry_t u_entry, void *key)
3185 {
3186 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3187 	rfs4_file_t *fp = key;
3188 
3189 	if (sp->rs_closed == TRUE)
3190 		return (FALSE);
3191 
3192 	return (fp == sp->rs_finfo);
3193 }
3194 
3195 static void *
3196 state_file_mkkey(rfs4_entry_t u_entry)
3197 {
3198 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3199 
3200 	return (sp->rs_finfo);
3201 }
3202 
3203 rfs4_state_t *
3204 rfs4_findstate_by_owner_file(rfs4_openowner_t *oo, rfs4_file_t *fp,
3205     bool_t *create)
3206 {
3207 	rfs4_state_t *sp;
3208 	rfs4_state_t key;
3209 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
3210 
3211 	key.rs_owner = oo;
3212 	key.rs_finfo = fp;
3213 
3214 	sp = (rfs4_state_t *)rfs4_dbsearch(nsrv4->rfs4_state_owner_file_idx,
3215 	    &key, create, &key, RFS4_DBS_VALID);
3216 
3217 	return (sp);
3218 }
3219 
3220 /* This returns ANY state struct that refers to this file */
3221 static rfs4_state_t *
3222 rfs4_findstate_by_file(rfs4_file_t *fp)
3223 {
3224 	bool_t create = FALSE;
3225 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
3226 
3227 	return ((rfs4_state_t *)rfs4_dbsearch(nsrv4->rfs4_state_file_idx, fp,
3228 	    &create, fp, RFS4_DBS_VALID));
3229 }
3230 
3231 static bool_t
3232 rfs4_state_expiry(rfs4_entry_t u_entry)
3233 {
3234 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3235 
3236 	if (rfs4_dbe_is_invalid(sp->rs_dbe))
3237 		return (TRUE);
3238 
3239 	if (sp->rs_closed == TRUE &&
3240 	    ((gethrestime_sec() - rfs4_dbe_get_timerele(sp->rs_dbe))
3241 	    > rfs4_lease_time))
3242 		return (TRUE);
3243 
3244 	return ((gethrestime_sec() - sp->rs_owner->ro_client->rc_last_access
3245 	    > rfs4_lease_time));
3246 }
3247 
3248 static bool_t
3249 rfs4_state_create(rfs4_entry_t u_entry, void *argp)
3250 {
3251 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
3252 	rfs4_file_t *fp = ((rfs4_state_t *)argp)->rs_finfo;
3253 	rfs4_openowner_t *oo = ((rfs4_state_t *)argp)->rs_owner;
3254 
3255 	rfs4_dbe_hold(fp->rf_dbe);
3256 	rfs4_dbe_hold(oo->ro_dbe);
3257 	sp->rs_stateid = get_stateid(rfs4_dbe_getid(sp->rs_dbe));
3258 	sp->rs_stateid.bits.type = OPENID;
3259 	sp->rs_owner = oo;
3260 	sp->rs_finfo = fp;
3261 
3262 	list_create(&sp->rs_lostatelist, sizeof (rfs4_lo_state_t),
3263 	    offsetof(rfs4_lo_state_t, rls_node));
3264 
3265 	/* Insert state on per open owner's list */
3266 	rfs4_dbe_lock(oo->ro_dbe);
3267 	list_insert_tail(&oo->ro_statelist, sp);
3268 	rfs4_dbe_unlock(oo->ro_dbe);
3269 
3270 	return (TRUE);
3271 }
3272 
3273 static rfs4_state_t *
3274 rfs4_findstate(stateid_t *id, rfs4_dbsearch_type_t find_invalid, bool_t lock_fp)
3275 {
3276 	rfs4_state_t *sp;
3277 	bool_t create = FALSE;
3278 	nfs4_srv_t *nsrv4 = nfs4_get_srv();
3279 
3280 	sp = (rfs4_state_t *)rfs4_dbsearch(nsrv4->rfs4_state_idx, id,
3281 	    &create, NULL, find_invalid);
3282 	if (lock_fp == TRUE && sp != NULL)
3283 		rw_enter(&sp->rs_finfo->rf_file_rwlock, RW_READER);
3284 
3285 	return (sp);
3286 }
3287 
3288 void
3289 rfs4_state_close(rfs4_state_t *sp, bool_t lock_held, bool_t close_of_client,
3290     cred_t *cr)
3291 {
3292 	/* Remove the associated lo_state owners */
3293 	if (!lock_held)
3294 		rfs4_dbe_lock(sp->rs_dbe);
3295 
3296 	/*
3297 	 * If refcnt == 0, the dbe is about to be destroyed.
3298 	 * lock state will be released by the reaper thread.
3299 	 */
3300 
3301 	if (rfs4_dbe_refcnt(sp->rs_dbe) > 0) {
3302 		if (sp->rs_closed == FALSE) {
3303 			rfs4_release_share_lock_state(sp, cr, close_of_client);
3304 			sp->rs_closed = TRUE;
3305 		}
3306 	}
3307 
3308 	if (!lock_held)
3309 		rfs4_dbe_unlock(sp->rs_dbe);
3310 }
3311 
3312 /*
3313  * Remove all state associated with the given client.
3314  */
3315 void
3316 rfs4_client_state_remove(rfs4_client_t *cp)
3317 {
3318 	rfs4_openowner_t *oo;
3319 
3320 	rfs4_dbe_lock(cp->rc_dbe);
3321 
3322 	for (oo = list_head(&cp->rc_openownerlist); oo != NULL;
3323 	    oo = list_next(&cp->rc_openownerlist, oo)) {
3324 		rfs4_free_opens(oo, TRUE, TRUE);
3325 	}
3326 
3327 	rfs4_dbe_unlock(cp->rc_dbe);
3328 }
3329 
3330 void
3331 rfs4_client_close(rfs4_client_t *cp)
3332 {
3333 	/* Mark client as going away. */
3334 	rfs4_dbe_lock(cp->rc_dbe);
3335 	rfs4_dbe_invalidate(cp->rc_dbe);
3336 	rfs4_dbe_unlock(cp->rc_dbe);
3337 
3338 	rfs4_client_state_remove(cp);
3339 
3340 	/* Release the client */
3341 	rfs4_client_rele(cp);
3342 }
3343 
3344 nfsstat4
3345 rfs4_check_clientid(clientid4 *cp, int setclid_confirm)
3346 {
3347 	cid *cidp = (cid *) cp;
3348 	nfs4_srv_t *nsrv4;
3349 
3350 	nsrv4 = nfs4_get_srv();
3351 
3352 	/*
3353 	 * If we are booted as a cluster node, check the embedded nodeid.
3354 	 * If it indicates that this clientid was generated on another node,
3355 	 * inform the client accordingly.
3356 	 */
3357 	if (cluster_bootflags & CLUSTER_BOOTED && foreign_clientid(cidp))
3358 		return (NFS4ERR_STALE_CLIENTID);
3359 
3360 	/*
3361 	 * If the server start time matches the time provided
3362 	 * by the client (via the clientid) and this is NOT a
3363 	 * setclientid_confirm then return EXPIRED.
3364 	 */
3365 	if (!setclid_confirm &&
3366 	    cidp->impl_id.start_time == nsrv4->rfs4_start_time)
3367 		return (NFS4ERR_EXPIRED);
3368 
3369 	return (NFS4ERR_STALE_CLIENTID);
3370 }
3371 
3372 /*
3373  * This is used when a stateid has not been found amongst the
3374  * current server's state.  Check the stateid to see if it
3375  * was from this server instantiation or not.
3376  */
3377 static nfsstat4
3378 what_stateid_error(stateid_t *id, stateid_type_t type)
3379 {
3380 	nfs4_srv_t *nsrv4;
3381 
3382 	nsrv4 = nfs4_get_srv();
3383 
3384 	/* If we are booted as a cluster node, was stateid locally generated? */
3385 	if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3386 		return (NFS4ERR_STALE_STATEID);
3387 
3388 	/* If types don't match then no use checking further */
3389 	if (type != id->bits.type)
3390 		return (NFS4ERR_BAD_STATEID);
3391 
3392 	/* From a different server instantiation, return STALE */
3393 	if (id->bits.boottime != nsrv4->rfs4_start_time)
3394 		return (NFS4ERR_STALE_STATEID);
3395 
3396 	/*
3397 	 * From this server but the state is most likely beyond lease
3398 	 * timeout: return NFS4ERR_EXPIRED.  However, there is the
3399 	 * case of a delegation stateid.  For delegations, there is a
3400 	 * case where the state can be removed without the client's
3401 	 * knowledge/consent: revocation.  In the case of delegation
3402 	 * revocation, the delegation state will be removed and will
3403 	 * not be found.  If the client does something like a
3404 	 * DELEGRETURN or even a READ/WRITE with a delegatoin stateid
3405 	 * that has been revoked, the server should return BAD_STATEID
3406 	 * instead of the more common EXPIRED error.
3407 	 */
3408 	if (id->bits.boottime == nsrv4->rfs4_start_time) {
3409 		if (type == DELEGID)
3410 			return (NFS4ERR_BAD_STATEID);
3411 		else
3412 			return (NFS4ERR_EXPIRED);
3413 	}
3414 
3415 	return (NFS4ERR_BAD_STATEID);
3416 }
3417 
3418 /*
3419  * Used later on to find the various state structs.  When called from
3420  * rfs4_check_stateid()->rfs4_get_all_state(), no file struct lock is
3421  * taken (it is not needed) and helps on the read/write path with
3422  * respect to performance.
3423  */
3424 static nfsstat4
3425 rfs4_get_state_lockit(stateid4 *stateid, rfs4_state_t **spp,
3426     rfs4_dbsearch_type_t find_invalid, bool_t lock_fp)
3427 {
3428 	stateid_t *id = (stateid_t *)stateid;
3429 	rfs4_state_t *sp;
3430 
3431 	*spp = NULL;
3432 
3433 	/* If we are booted as a cluster node, was stateid locally generated? */
3434 	if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3435 		return (NFS4ERR_STALE_STATEID);
3436 
3437 	sp = rfs4_findstate(id, find_invalid, lock_fp);
3438 	if (sp == NULL) {
3439 		return (what_stateid_error(id, OPENID));
3440 	}
3441 
3442 	if (rfs4_lease_expired(sp->rs_owner->ro_client)) {
3443 		if (lock_fp == TRUE)
3444 			rfs4_state_rele(sp);
3445 		else
3446 			rfs4_state_rele_nounlock(sp);
3447 		return (NFS4ERR_EXPIRED);
3448 	}
3449 
3450 	*spp = sp;
3451 
3452 	return (NFS4_OK);
3453 }
3454 
3455 nfsstat4
3456 rfs4_get_state(stateid4 *stateid, rfs4_state_t **spp,
3457     rfs4_dbsearch_type_t find_invalid)
3458 {
3459 	return (rfs4_get_state_lockit(stateid, spp, find_invalid, TRUE));
3460 }
3461 
3462 int
3463 rfs4_check_stateid_seqid(rfs4_state_t *sp, stateid4 *stateid)
3464 {
3465 	stateid_t *id = (stateid_t *)stateid;
3466 
3467 	if (rfs4_lease_expired(sp->rs_owner->ro_client))
3468 		return (NFS4_CHECK_STATEID_EXPIRED);
3469 
3470 	/* Stateid is some time in the future - that's bad */
3471 	if (sp->rs_stateid.bits.chgseq < id->bits.chgseq)
3472 		return (NFS4_CHECK_STATEID_BAD);
3473 
3474 	if (sp->rs_stateid.bits.chgseq == id->bits.chgseq + 1)
3475 		return (NFS4_CHECK_STATEID_REPLAY);
3476 
3477 	/* Stateid is some time in the past - that's old */
3478 	if (sp->rs_stateid.bits.chgseq > id->bits.chgseq)
3479 		return (NFS4_CHECK_STATEID_OLD);
3480 
3481 	/* Caller needs to know about confirmation before closure */
3482 	if (sp->rs_owner->ro_need_confirm)
3483 		return (NFS4_CHECK_STATEID_UNCONFIRMED);
3484 
3485 	if (sp->rs_closed == TRUE)
3486 		return (NFS4_CHECK_STATEID_CLOSED);
3487 
3488 	return (NFS4_CHECK_STATEID_OKAY);
3489 }
3490 
3491 int
3492 rfs4_check_lo_stateid_seqid(rfs4_lo_state_t *lsp, stateid4 *stateid)
3493 {
3494 	stateid_t *id = (stateid_t *)stateid;
3495 
3496 	if (rfs4_lease_expired(lsp->rls_state->rs_owner->ro_client))
3497 		return (NFS4_CHECK_STATEID_EXPIRED);
3498 
3499 	/* Stateid is some time in the future - that's bad */
3500 	if (lsp->rls_lockid.bits.chgseq < id->bits.chgseq)
3501 		return (NFS4_CHECK_STATEID_BAD);
3502 
3503 	if (lsp->rls_lockid.bits.chgseq == id->bits.chgseq + 1)
3504 		return (NFS4_CHECK_STATEID_REPLAY);
3505 
3506 	/* Stateid is some time in the past - that's old */
3507 	if (lsp->rls_lockid.bits.chgseq > id->bits.chgseq)
3508 		return (NFS4_CHECK_STATEID_OLD);
3509 
3510 	if (lsp->rls_state->rs_closed == TRUE)
3511 		return (NFS4_CHECK_STATEID_CLOSED);
3512 
3513 	return (NFS4_CHECK_STATEID_OKAY);
3514 }
3515 
3516 nfsstat4
3517 rfs4_get_deleg_state(stateid4 *stateid, rfs4_deleg_state_t **dspp)
3518 {
3519 	stateid_t *id = (stateid_t *)stateid;
3520 	rfs4_deleg_state_t *dsp;
3521 
3522 	*dspp = NULL;
3523 
3524 	/* If we are booted as a cluster node, was stateid locally generated? */
3525 	if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3526 		return (NFS4ERR_STALE_STATEID);
3527 
3528 	dsp = rfs4_finddelegstate(id);
3529 	if (dsp == NULL) {
3530 		return (what_stateid_error(id, DELEGID));
3531 	}
3532 
3533 	if (rfs4_lease_expired(dsp->rds_client)) {
3534 		rfs4_deleg_state_rele(dsp);
3535 		return (NFS4ERR_EXPIRED);
3536 	}
3537 
3538 	*dspp = dsp;
3539 
3540 	return (NFS4_OK);
3541 }
3542 
3543 nfsstat4
3544 rfs4_get_lo_state(stateid4 *stateid, rfs4_lo_state_t **lspp, bool_t lock_fp)
3545 {
3546 	stateid_t *id = (stateid_t *)stateid;
3547 	rfs4_lo_state_t *lsp;
3548 
3549 	*lspp = NULL;
3550 
3551 	/* If we are booted as a cluster node, was stateid locally generated? */
3552 	if ((cluster_bootflags & CLUSTER_BOOTED) && foreign_stateid(id))
3553 		return (NFS4ERR_STALE_STATEID);
3554 
3555 	lsp = rfs4_findlo_state(id, lock_fp);
3556 	if (lsp == NULL) {
3557 		return (what_stateid_error(id, LOCKID));
3558 	}
3559 
3560 	if (rfs4_lease_expired(lsp->rls_state->rs_owner->ro_client)) {
3561 		rfs4_lo_state_rele(lsp, lock_fp);
3562 		return (NFS4ERR_EXPIRED);
3563 	}
3564 
3565 	*lspp = lsp;
3566 
3567 	return (NFS4_OK);
3568 }
3569 
3570 static nfsstat4
3571 rfs4_get_all_state(stateid4 *sid, rfs4_state_t **spp,
3572     rfs4_deleg_state_t **dspp, rfs4_lo_state_t **lspp)
3573 {
3574 	rfs4_state_t *sp = NULL;
3575 	rfs4_deleg_state_t *dsp = NULL;
3576 	rfs4_lo_state_t *lsp = NULL;
3577 	stateid_t *id;
3578 	nfsstat4 status;
3579 
3580 	*spp = NULL; *dspp = NULL; *lspp = NULL;
3581 
3582 	id = (stateid_t *)sid;
3583 	switch (id->bits.type) {
3584 	case OPENID:
3585 		status = rfs4_get_state_lockit(sid, &sp, FALSE, FALSE);
3586 		break;
3587 	case DELEGID:
3588 		status = rfs4_get_deleg_state(sid, &dsp);
3589 		break;
3590 	case LOCKID:
3591 		status = rfs4_get_lo_state(sid, &lsp, FALSE);
3592 		if (status == NFS4_OK) {
3593 			sp = lsp->rls_state;
3594 			rfs4_dbe_hold(sp->rs_dbe);
3595 		}
3596 		break;
3597 	default:
3598 		status = NFS4ERR_BAD_STATEID;
3599 	}
3600 
3601 	if (status == NFS4_OK) {
3602 		*spp = sp;
3603 		*dspp = dsp;
3604 		*lspp = lsp;
3605 	}
3606 
3607 	return (status);
3608 }
3609 
3610 /*
3611  * Given the I/O mode (FREAD or FWRITE), this checks whether the
3612  * rfs4_state_t struct has access to do this operation and if so
3613  * return NFS4_OK; otherwise the proper NFSv4 error is returned.
3614  */
3615 nfsstat4
3616 rfs4_state_has_access(rfs4_state_t *sp, int mode, vnode_t *vp)
3617 {
3618 	nfsstat4 stat = NFS4_OK;
3619 	rfs4_file_t *fp;
3620 	bool_t create = FALSE;
3621 
3622 	rfs4_dbe_lock(sp->rs_dbe);
3623 	if (mode == FWRITE) {
3624 		if (!(sp->rs_share_access & OPEN4_SHARE_ACCESS_WRITE)) {
3625 			stat = NFS4ERR_OPENMODE;
3626 		}
3627 	} else if (mode == FREAD) {
3628 		if (!(sp->rs_share_access & OPEN4_SHARE_ACCESS_READ)) {
3629 			/*
3630 			 * If we have OPENed the file with DENYing access
3631 			 * to both READ and WRITE then no one else could
3632 			 * have OPENed the file, hence no conflicting READ
3633 			 * deny.  This check is merely an optimization.
3634 			 */
3635 			if (sp->rs_share_deny == OPEN4_SHARE_DENY_BOTH)
3636 				goto out;
3637 
3638 			/* Check against file struct's DENY mode */
3639 			fp = rfs4_findfile(vp, NULL, &create);
3640 			if (fp != NULL) {
3641 				int deny_read = 0;
3642 				rfs4_dbe_lock(fp->rf_dbe);
3643 				/*
3644 				 * Check if any other open owner has the file
3645 				 * OPENed with deny READ.
3646 				 */
3647 				if (sp->rs_share_deny & OPEN4_SHARE_DENY_READ)
3648 					deny_read = 1;
3649 				ASSERT(fp->rf_deny_read >= deny_read);
3650 				if (fp->rf_deny_read > deny_read)
3651 					stat = NFS4ERR_OPENMODE;
3652 				rfs4_dbe_unlock(fp->rf_dbe);
3653 				rfs4_file_rele(fp);
3654 			}
3655 		}
3656 	} else {
3657 		/* Illegal I/O mode */
3658 		stat = NFS4ERR_INVAL;
3659 	}
3660 out:
3661 	rfs4_dbe_unlock(sp->rs_dbe);
3662 	return (stat);
3663 }
3664 
3665 /*
3666  * Given the I/O mode (FREAD or FWRITE), the vnode, the stateid and whether
3667  * the file is being truncated, return NFS4_OK if allowed or appropriate
3668  * V4 error if not. Note NFS4ERR_DELAY will be returned and a recall on
3669  * the associated file will be done if the I/O is not consistent with any
3670  * delegation in effect on the file. Should be holding VOP_RWLOCK, either
3671  * as reader or writer as appropriate. rfs4_op_open will acquire the
3672  * VOP_RWLOCK as writer when setting up delegation. If the stateid is bad
3673  * this routine will return NFS4ERR_BAD_STATEID. In addition, through the
3674  * deleg parameter, we will return whether a write delegation is held by
3675  * the client associated with this stateid.
3676  * If the server instance associated with the relevant client is in its
3677  * grace period, return NFS4ERR_GRACE.
3678  */
3679 
3680 nfsstat4
3681 rfs4_check_stateid(int mode, vnode_t *vp,
3682     stateid4 *stateid, bool_t trunc, bool_t *deleg,
3683     bool_t do_access, caller_context_t *ct)
3684 {
3685 	rfs4_file_t *fp;
3686 	bool_t create = FALSE;
3687 	rfs4_state_t *sp;
3688 	rfs4_deleg_state_t *dsp;
3689 	rfs4_lo_state_t *lsp;
3690 	stateid_t *id = (stateid_t *)stateid;
3691 	nfsstat4 stat = NFS4_OK;
3692 
3693 	if (ct != NULL) {
3694 		ct->cc_sysid = 0;
3695 		ct->cc_pid = 0;
3696 		ct->cc_caller_id = nfs4_srv_caller_id;
3697 		ct->cc_flags = CC_DONTBLOCK;
3698 	}
3699 
3700 	if (ISSPECIAL(stateid)) {
3701 		fp = rfs4_findfile(vp, NULL, &create);
3702 		if (fp == NULL)
3703 			return (NFS4_OK);
3704 		if (fp->rf_dinfo.rd_dtype == OPEN_DELEGATE_NONE) {
3705 			rfs4_file_rele(fp);
3706 			return (NFS4_OK);
3707 		}
3708 		if (mode == FWRITE ||
3709 		    fp->rf_dinfo.rd_dtype == OPEN_DELEGATE_WRITE) {
3710 			rfs4_recall_deleg(fp, trunc, NULL);
3711 			rfs4_file_rele(fp);
3712 			return (NFS4ERR_DELAY);
3713 		}
3714 		rfs4_file_rele(fp);
3715 		return (NFS4_OK);
3716 	} else {
3717 		stat = rfs4_get_all_state(stateid, &sp, &dsp, &lsp);
3718 		if (stat != NFS4_OK)
3719 			return (stat);
3720 		if (lsp != NULL) {
3721 			/* Is associated server instance in its grace period? */
3722 			if (rfs4_clnt_in_grace(lsp->rls_locker->rl_client)) {
3723 				rfs4_lo_state_rele(lsp, FALSE);
3724 				if (sp != NULL)
3725 					rfs4_state_rele_nounlock(sp);
3726 				return (NFS4ERR_GRACE);
3727 			}
3728 			if (id->bits.type == LOCKID) {
3729 				/* Seqid in the future? - that's bad */
3730 				if (lsp->rls_lockid.bits.chgseq <
3731 				    id->bits.chgseq) {
3732 					rfs4_lo_state_rele(lsp, FALSE);
3733 					if (sp != NULL)
3734 						rfs4_state_rele_nounlock(sp);
3735 					return (NFS4ERR_BAD_STATEID);
3736 				}
3737 				/* Seqid in the past? - that's old */
3738 				if (lsp->rls_lockid.bits.chgseq >
3739 				    id->bits.chgseq) {
3740 					rfs4_lo_state_rele(lsp, FALSE);
3741 					if (sp != NULL)
3742 						rfs4_state_rele_nounlock(sp);
3743 					return (NFS4ERR_OLD_STATEID);
3744 				}
3745 				/* Ensure specified filehandle matches */
3746 				if (lsp->rls_state->rs_finfo->rf_vp != vp) {
3747 					rfs4_lo_state_rele(lsp, FALSE);
3748 					if (sp != NULL)
3749 						rfs4_state_rele_nounlock(sp);
3750 					return (NFS4ERR_BAD_STATEID);
3751 				}
3752 			}
3753 			if (ct != NULL) {
3754 				ct->cc_sysid =
3755 				    lsp->rls_locker->rl_client->rc_sysidt;
3756 				ct->cc_pid = lsp->rls_locker->rl_pid;
3757 			}
3758 			rfs4_lo_state_rele(lsp, FALSE);
3759 		}
3760 
3761 		/* Stateid provided was an "open" stateid */
3762 		if (sp != NULL) {
3763 			/* Is associated server instance in its grace period? */
3764 			if (rfs4_clnt_in_grace(sp->rs_owner->ro_client)) {
3765 				rfs4_state_rele_nounlock(sp);
3766 				return (NFS4ERR_GRACE);
3767 			}
3768 			if (id->bits.type == OPENID) {
3769 				/* Seqid in the future? - that's bad */
3770 				if (sp->rs_stateid.bits.chgseq <
3771 				    id->bits.chgseq) {
3772 					rfs4_state_rele_nounlock(sp);
3773 					return (NFS4ERR_BAD_STATEID);
3774 				}
3775 				/* Seqid in the past - that's old */
3776 				if (sp->rs_stateid.bits.chgseq >
3777 				    id->bits.chgseq) {
3778 					rfs4_state_rele_nounlock(sp);
3779 					return (NFS4ERR_OLD_STATEID);
3780 				}
3781 			}
3782 			/* Ensure specified filehandle matches */
3783 			if (sp->rs_finfo->rf_vp != vp) {
3784 				rfs4_state_rele_nounlock(sp);
3785 				return (NFS4ERR_BAD_STATEID);
3786 			}
3787 
3788 			if (sp->rs_owner->ro_need_confirm) {
3789 				rfs4_state_rele_nounlock(sp);
3790 				return (NFS4ERR_BAD_STATEID);
3791 			}
3792 
3793 			if (sp->rs_closed == TRUE) {
3794 				rfs4_state_rele_nounlock(sp);
3795 				return (NFS4ERR_OLD_STATEID);
3796 			}
3797 
3798 			if (do_access)
3799 				stat = rfs4_state_has_access(sp, mode, vp);
3800 			else
3801 				stat = NFS4_OK;
3802 
3803 			/*
3804 			 * Return whether this state has write
3805 			 * delegation if desired
3806 			 */
3807 			if (deleg && (sp->rs_finfo->rf_dinfo.rd_dtype ==
3808 			    OPEN_DELEGATE_WRITE))
3809 				*deleg = TRUE;
3810 
3811 			/*
3812 			 * We got a valid stateid, so we update the
3813 			 * lease on the client. Ideally we would like
3814 			 * to do this after the calling op succeeds,
3815 			 * but for now this will be good
3816 			 * enough. Callers of this routine are
3817 			 * currently insulated from the state stuff.
3818 			 */
3819 			rfs4_update_lease(sp->rs_owner->ro_client);
3820 
3821 			/*
3822 			 * If a delegation is present on this file and
3823 			 * this is a WRITE, then update the lastwrite
3824 			 * time to indicate that activity is present.
3825 			 */
3826 			if (sp->rs_finfo->rf_dinfo.rd_dtype ==
3827 			    OPEN_DELEGATE_WRITE &&
3828 			    mode == FWRITE) {
3829 				sp->rs_finfo->rf_dinfo.rd_time_lastwrite =
3830 				    gethrestime_sec();
3831 			}
3832 
3833 			rfs4_state_rele_nounlock(sp);
3834 
3835 			return (stat);
3836 		}
3837 
3838 		if (dsp != NULL) {
3839 			/* Is associated server instance in its grace period? */
3840 			if (rfs4_clnt_in_grace(dsp->rds_client)) {
3841 				rfs4_deleg_state_rele(dsp);
3842 				return (NFS4ERR_GRACE);
3843 			}
3844 			if (dsp->rds_delegid.bits.chgseq != id->bits.chgseq) {
3845 				rfs4_deleg_state_rele(dsp);
3846 				return (NFS4ERR_BAD_STATEID);
3847 			}
3848 
3849 			/* Ensure specified filehandle matches */
3850 			if (dsp->rds_finfo->rf_vp != vp) {
3851 				rfs4_deleg_state_rele(dsp);
3852 				return (NFS4ERR_BAD_STATEID);
3853 			}
3854 			/*
3855 			 * Return whether this state has write
3856 			 * delegation if desired
3857 			 */
3858 			if (deleg && (dsp->rds_finfo->rf_dinfo.rd_dtype ==
3859 			    OPEN_DELEGATE_WRITE))
3860 				*deleg = TRUE;
3861 
3862 			rfs4_update_lease(dsp->rds_client);
3863 
3864 			/*
3865 			 * If a delegation is present on this file and
3866 			 * this is a WRITE, then update the lastwrite
3867 			 * time to indicate that activity is present.
3868 			 */
3869 			if (dsp->rds_finfo->rf_dinfo.rd_dtype ==
3870 			    OPEN_DELEGATE_WRITE && mode == FWRITE) {
3871 				dsp->rds_finfo->rf_dinfo.rd_time_lastwrite =
3872 				    gethrestime_sec();
3873 			}
3874 
3875 			/*
3876 			 * XXX - what happens if this is a WRITE and the
3877 			 * delegation type of for READ.
3878 			 */
3879 			rfs4_deleg_state_rele(dsp);
3880 
3881 			return (stat);
3882 		}
3883 		/*
3884 		 * If we got this far, something bad happened
3885 		 */
3886 		return (NFS4ERR_BAD_STATEID);
3887 	}
3888 }
3889 
3890 
3891 /*
3892  * This is a special function in that for the file struct provided the
3893  * server wants to remove/close all current state associated with the
3894  * file.  The prime use of this would be with OP_REMOVE to force the
3895  * release of state and particularly of file locks.
3896  *
3897  * There is an assumption that there is no delegations outstanding on
3898  * this file at this point.  The caller should have waited for those
3899  * to be returned or revoked.
3900  */
3901 void
3902 rfs4_close_all_state(rfs4_file_t *fp)
3903 {
3904 	rfs4_state_t *sp;
3905 
3906 	rfs4_dbe_lock(fp->rf_dbe);
3907 
3908 #ifdef DEBUG
3909 	/* only applies when server is handing out delegations */
3910 	if (nfs4_get_deleg_policy() != SRV_NEVER_DELEGATE)
3911 		ASSERT(fp->rf_dinfo.rd_hold_grant > 0);
3912 #endif
3913 
3914 	/* No delegations for this file */
3915 	ASSERT(list_is_empty(&fp->rf_delegstatelist));
3916 
3917 	/* Make sure that it can not be found */
3918 	rfs4_dbe_invalidate(fp->rf_dbe);
3919 
3920 	if (fp->rf_vp == NULL) {
3921 		rfs4_dbe_unlock(fp->rf_dbe);
3922 		return;
3923 	}
3924 	rfs4_dbe_unlock(fp->rf_dbe);
3925 
3926 	/*
3927 	 * Hold as writer to prevent other server threads from
3928 	 * processing requests related to the file while all state is
3929 	 * being removed.
3930 	 */
3931 	rw_enter(&fp->rf_file_rwlock, RW_WRITER);
3932 
3933 	/* Remove ALL state from the file */
3934 	while (sp = rfs4_findstate_by_file(fp)) {
3935 		rfs4_state_close(sp, FALSE, FALSE, CRED());
3936 		rfs4_state_rele_nounlock(sp);
3937 	}
3938 
3939 	/*
3940 	 * This is only safe since there are no further references to
3941 	 * the file.
3942 	 */
3943 	rfs4_dbe_lock(fp->rf_dbe);
3944 	if (fp->rf_vp) {
3945 		vnode_t *vp = fp->rf_vp;
3946 
3947 		mutex_enter(&vp->v_vsd_lock);
3948 		(void) vsd_set(vp, nfs4_srv_vkey, NULL);
3949 		mutex_exit(&vp->v_vsd_lock);
3950 		VN_RELE(vp);
3951 		fp->rf_vp = NULL;
3952 	}
3953 	rfs4_dbe_unlock(fp->rf_dbe);
3954 
3955 	/* Finally let other references to proceed */
3956 	rw_exit(&fp->rf_file_rwlock);
3957 }
3958 
3959 /*
3960  * This function is used as a target for the rfs4_dbe_walk() call
3961  * below.  The purpose of this function is to see if the
3962  * lockowner_state refers to a file that resides within the exportinfo
3963  * export.  If so, then remove the lock_owner state (file locks and
3964  * share "locks") for this object since the intent is the server is
3965  * unexporting the specified directory.  Be sure to invalidate the
3966  * object after the state has been released
3967  */
3968 static void
3969 rfs4_lo_state_walk_callout(rfs4_entry_t u_entry, void *e)
3970 {
3971 	rfs4_lo_state_t *lsp = (rfs4_lo_state_t *)u_entry;
3972 	struct exportinfo *exi = (struct exportinfo *)e;
3973 	nfs_fh4_fmt_t   fhfmt4, *exi_fhp, *finfo_fhp;
3974 	fhandle_t *efhp;
3975 
3976 	efhp = (fhandle_t *)&exi->exi_fh;
3977 	exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
3978 
3979 	FH_TO_FMT4(efhp, exi_fhp);
3980 
3981 	finfo_fhp = (nfs_fh4_fmt_t *)lsp->rls_state->rs_finfo->
3982 	    rf_filehandle.nfs_fh4_val;
3983 
3984 	if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
3985 	    bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
3986 	    exi_fhp->fh4_xlen) == 0) {
3987 		rfs4_state_close(lsp->rls_state, FALSE, FALSE, CRED());
3988 		rfs4_dbe_invalidate(lsp->rls_dbe);
3989 		rfs4_dbe_invalidate(lsp->rls_state->rs_dbe);
3990 	}
3991 }
3992 
3993 /*
3994  * This function is used as a target for the rfs4_dbe_walk() call
3995  * below.  The purpose of this function is to see if the state refers
3996  * to a file that resides within the exportinfo export.  If so, then
3997  * remove the open state for this object since the intent is the
3998  * server is unexporting the specified directory.  The main result for
3999  * this type of entry is to invalidate it such it will not be found in
4000  * the future.
4001  */
4002 static void
4003 rfs4_state_walk_callout(rfs4_entry_t u_entry, void *e)
4004 {
4005 	rfs4_state_t *sp = (rfs4_state_t *)u_entry;
4006 	struct exportinfo *exi = (struct exportinfo *)e;
4007 	nfs_fh4_fmt_t   fhfmt4, *exi_fhp, *finfo_fhp;
4008 	fhandle_t *efhp;
4009 
4010 	efhp = (fhandle_t *)&exi->exi_fh;
4011 	exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
4012 
4013 	FH_TO_FMT4(efhp, exi_fhp);
4014 
4015 	finfo_fhp =
4016 	    (nfs_fh4_fmt_t *)sp->rs_finfo->rf_filehandle.nfs_fh4_val;
4017 
4018 	if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
4019 	    bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
4020 	    exi_fhp->fh4_xlen) == 0) {
4021 		rfs4_state_close(sp, TRUE, FALSE, CRED());
4022 		rfs4_dbe_invalidate(sp->rs_dbe);
4023 	}
4024 }
4025 
4026 /*
4027  * This function is used as a target for the rfs4_dbe_walk() call
4028  * below.  The purpose of this function is to see if the state refers
4029  * to a file that resides within the exportinfo export.  If so, then
4030  * remove the deleg state for this object since the intent is the
4031  * server is unexporting the specified directory.  The main result for
4032  * this type of entry is to invalidate it such it will not be found in
4033  * the future.
4034  */
4035 static void
4036 rfs4_deleg_state_walk_callout(rfs4_entry_t u_entry, void *e)
4037 {
4038 	rfs4_deleg_state_t *dsp = (rfs4_deleg_state_t *)u_entry;
4039 	struct exportinfo *exi = (struct exportinfo *)e;
4040 	nfs_fh4_fmt_t   fhfmt4, *exi_fhp, *finfo_fhp;
4041 	fhandle_t *efhp;
4042 
4043 	efhp = (fhandle_t *)&exi->exi_fh;
4044 	exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
4045 
4046 	FH_TO_FMT4(efhp, exi_fhp);
4047 
4048 	finfo_fhp =
4049 	    (nfs_fh4_fmt_t *)dsp->rds_finfo->rf_filehandle.nfs_fh4_val;
4050 
4051 	if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
4052 	    bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
4053 	    exi_fhp->fh4_xlen) == 0) {
4054 		rfs4_dbe_invalidate(dsp->rds_dbe);
4055 	}
4056 }
4057 
4058 /*
4059  * This function is used as a target for the rfs4_dbe_walk() call
4060  * below.  The purpose of this function is to see if the state refers
4061  * to a file that resides within the exportinfo export.  If so, then
4062  * release vnode hold for this object since the intent is the server
4063  * is unexporting the specified directory.  Invalidation will prevent
4064  * this struct from being found in the future.
4065  */
4066 static void
4067 rfs4_file_walk_callout(rfs4_entry_t u_entry, void *e)
4068 {
4069 	rfs4_file_t *fp = (rfs4_file_t *)u_entry;
4070 	struct exportinfo *exi = (struct exportinfo *)e;
4071 	nfs_fh4_fmt_t   fhfmt4, *exi_fhp, *finfo_fhp;
4072 	fhandle_t *efhp;
4073 
4074 	efhp = (fhandle_t *)&exi->exi_fh;
4075 	exi_fhp = (nfs_fh4_fmt_t *)&fhfmt4;
4076 
4077 	FH_TO_FMT4(efhp, exi_fhp);
4078 
4079 	finfo_fhp = (nfs_fh4_fmt_t *)fp->rf_filehandle.nfs_fh4_val;
4080 
4081 	if (EQFSID(&finfo_fhp->fh4_fsid, &exi_fhp->fh4_fsid) &&
4082 	    bcmp(&finfo_fhp->fh4_xdata, &exi_fhp->fh4_xdata,
4083 	    exi_fhp->fh4_xlen) == 0) {
4084 		if (fp->rf_vp) {
4085 			vnode_t *vp = fp->rf_vp;
4086 
4087 			/*
4088 			 * don't leak monitors and remove the reference
4089 			 * put on the vnode when the delegation was granted.
4090 			 */
4091 			if (fp->rf_dinfo.rd_dtype == OPEN_DELEGATE_READ) {
4092 				(void) fem_uninstall(vp, deleg_rdops,
4093 				    (void *)fp);
4094 				vn_open_downgrade(vp, FREAD);
4095 			} else if (fp->rf_dinfo.rd_dtype ==
4096 			    OPEN_DELEGATE_WRITE) {
4097 				(void) fem_uninstall(vp, deleg_wrops,
4098 				    (void *)fp);
4099 				vn_open_downgrade(vp, FREAD|FWRITE);
4100 			}
4101 			mutex_enter(&vp->v_vsd_lock);
4102 			(void) vsd_set(vp, nfs4_srv_vkey, NULL);
4103 			mutex_exit(&vp->v_vsd_lock);
4104 			VN_RELE(vp);
4105 			fp->rf_vp = NULL;
4106 		}
4107 		rfs4_dbe_invalidate(fp->rf_dbe);
4108 	}
4109 }
4110 
4111 /*
4112  * Given a directory that is being unexported, cleanup/release all
4113  * state in the server that refers to objects residing underneath this
4114  * particular export.  The ordering of the release is important.
4115  * Lock_owner, then state and then file.
4116  *
4117  * NFS zones note: nfs_export.c:unexport() calls this from a
4118  * thread in the global zone for NGZ data structures, so we
4119  * CANNOT use zone_getspecific anywhere in this code path.
4120  */
4121 void
4122 rfs4_clean_state_exi(nfs_export_t *ne, struct exportinfo *exi)
4123 {
4124 	nfs_globals_t *ng;
4125 	nfs4_srv_t *nsrv4;
4126 
4127 	ng = ne->ne_globals;
4128 	ASSERT(ng->nfs_zoneid == exi->exi_zoneid);
4129 	nsrv4 = ng->nfs4_srv;
4130 
4131 	mutex_enter(&nsrv4->state_lock);
4132 
4133 	if (nsrv4->nfs4_server_state == NULL) {
4134 		mutex_exit(&nsrv4->state_lock);
4135 		return;
4136 	}
4137 
4138 	rfs4_dbe_walk(nsrv4->rfs4_lo_state_tab,
4139 	    rfs4_lo_state_walk_callout, exi);
4140 	rfs4_dbe_walk(nsrv4->rfs4_state_tab, rfs4_state_walk_callout, exi);
4141 	rfs4_dbe_walk(nsrv4->rfs4_deleg_state_tab,
4142 	    rfs4_deleg_state_walk_callout, exi);
4143 	rfs4_dbe_walk(nsrv4->rfs4_file_tab, rfs4_file_walk_callout, exi);
4144 
4145 	mutex_exit(&nsrv4->state_lock);
4146 }
4147