xref: /freebsd/sys/fs/nfsserver/nfs_nfsdstate.c (revision 182b21d4627655ff95f9ecd8a8e1066184391f16)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2009 Rick Macklem, University of Guelph
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  *
28  */
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 
33 #include "opt_inet.h"
34 #include "opt_inet6.h"
35 #include <sys/extattr.h>
36 #include <fs/nfs/nfsport.h>
37 
/*
 * Global NFSv4 server state defined in this file, followed by declarations
 * of variables and locks that are defined elsewhere.
 */
/*
 * nfsrv_issuedelegs: when zero, nfsrv_setclient() clears lc_program on the
 * new client structure so that no callbacks are set up for it.
 */
38 int nfsrv_issuedelegs = 0;
39 int nfsrv_dolocallocks = 0;
/*
 * Acquired via nfsv4_lock() by nfsrv_setclient(), nfsrv_getclient() (for
 * CLOPS_CONFIRM) and nfsrv_destroyclient() to lock out other nfsd threads
 * while the client hash lists are being changed.
 */
40 struct nfsv4lock nfsv4rootfs_lock;
41 time_t nfsdev_time = 0;
42 int nfsrv_layouthashsize;
43 volatile int nfsrv_layoutcnt = 0;
44 
45 NFSD_VNET_DEFINE(struct nfsrv_stablefirst, nfsrv_stablefirst);
46 
/* nfsstatsv1_p->srvclients is bumped each time a client is hashed in. */
47 NFSD_VNET_DECLARE(int, nfsrv_numnfsd);
48 NFSD_VNET_DECLARE(struct nfsstatsv1 *, nfsstatsv1_p);
49 
/* Everything declared extern below is defined in other files. */
50 extern uint32_t nfs_srvmaxio;
51 extern int nfsrv_lease;
52 extern struct timeval nfsboottime;
53 extern u_int32_t newnfs_true, newnfs_false;
54 extern struct mtx nfsrv_dslock_mtx;
55 extern struct mtx nfsrv_recalllock_mtx;
56 extern struct mtx nfsrv_dontlistlock_mtx;
57 extern int nfsd_debuglevel;
58 extern u_int nfsrv_dsdirsize;
59 extern struct nfsdevicehead nfsrv_devidhead;
60 extern int nfsrv_doflexfile;
61 extern int nfsrv_maxpnfsmirror;
/*
 * NOTE(review): these two macros presumably expand to declarations of the
 * mutexes behind NFSLOCKV4ROOTMUTEX() and NFSLOCKSTATE() - confirm against
 * the header that defines them.
 */
62 NFSV4ROOTLOCKMUTEX;
63 NFSSTATESPINLOCK;
64 extern struct nfsdontlisthead nfsrv_dontlisthead;
65 extern volatile int nfsrv_devidcnt;
66 extern struct nfslayouthead nfsrv_recalllisthead;
67 extern char *nfsrv_zeropnfsdat;
68 
/*
 * Knobs exported under the vfs.nfsd sysctl node.  The four hash table sizes
 * and the layout high water mark are described as being set via
 * loader.conf; the remaining knobs are registered read/write.
 */
69 SYSCTL_DECL(_vfs_nfsd);
/* Number of lc_stateid[] hash chains in each client structure. */
70 int	nfsrv_statehashsize = NFSSTATEHASHSIZE;
71 SYSCTL_INT(_vfs_nfsd, OID_AUTO, statehashsize, CTLFLAG_RDTUN,
72     &nfsrv_statehashsize, 0,
73     "Size of state hash table set via loader.conf");
74 
/* Number of chains in nfsclienthash[]. */
75 int	nfsrv_clienthashsize = NFSCLIENTHASHSIZE;
76 SYSCTL_INT(_vfs_nfsd, OID_AUTO, clienthashsize, CTLFLAG_RDTUN,
77     &nfsrv_clienthashsize, 0,
78     "Size of client hash table set via loader.conf");
79 
/* Note: exported as "fhhashsize" although the variable is nfsrv_lockhashsize. */
80 int	nfsrv_lockhashsize = NFSLOCKHASHSIZE;
81 SYSCTL_INT(_vfs_nfsd, OID_AUTO, fhhashsize, CTLFLAG_RDTUN,
82     &nfsrv_lockhashsize, 0,
83     "Size of file handle hash table set via loader.conf");
84 
85 int	nfsrv_sessionhashsize = NFSSESSIONHASHSIZE;
86 SYSCTL_INT(_vfs_nfsd, OID_AUTO, sessionhashsize, CTLFLAG_RDTUN,
87     &nfsrv_sessionhashsize, 0,
88     "Size of session hash table set via loader.conf");
89 
90 int	nfsrv_layouthighwater = NFSLAYOUTHIGHWATER;
91 SYSCTL_INT(_vfs_nfsd, OID_AUTO, layouthighwater, CTLFLAG_RDTUN,
92     &nfsrv_layouthighwater, 0,
93     "High water mark for number of layouts set via loader.conf");
94 
/*
 * nfsrv_setclient() replies NFSERR_RESOURCE once nfsrv_openpluslock
 * exceeds this value.
 */
95 static int	nfsrv_v4statelimit = NFSRV_V4STATELIMIT;
96 SYSCTL_INT(_vfs_nfsd, OID_AUTO, v4statelimit, CTLFLAG_RWTUN,
97     &nfsrv_v4statelimit, 0,
98     "High water limit for NFSv4 opens+locks+delegations");
99 
100 static int	nfsrv_writedelegifpos = 0;
101 SYSCTL_INT(_vfs_nfsd, OID_AUTO, writedelegifpos, CTLFLAG_RW,
102     &nfsrv_writedelegifpos, 0,
103     "Issue a write delegation for read opens if possible");
104 
105 static int	nfsrv_allowreadforwriteopen = 1;
106 SYSCTL_INT(_vfs_nfsd, OID_AUTO, allowreadforwriteopen, CTLFLAG_RW,
107     &nfsrv_allowreadforwriteopen, 0,
108     "Allow Reads to be done with Write Access StateIDs");
109 
/* Note: exported as "pnfsstrictatime" although the variable is nfsrv_pnfsatime. */
110 int	nfsrv_pnfsatime = 0;
111 SYSCTL_INT(_vfs_nfsd, OID_AUTO, pnfsstrictatime, CTLFLAG_RW,
112     &nfsrv_pnfsatime, 0,
113     "For pNFS service, do Getattr ops to keep atime up-to-date");
114 
115 int	nfsrv_flexlinuxhack = 0;
116 SYSCTL_INT(_vfs_nfsd, OID_AUTO, flexlinuxhack, CTLFLAG_RW,
117     &nfsrv_flexlinuxhack, 0,
118     "For Linux clients, hack around Flex File Layout bug");
119 
120 /*
121  * Hash lists for nfs V4.
122  */
/*
 * nfsclienthash is indexed through NFSCLIENTHASH(clientid); its chains link
 * client structures through lc_hash.
 */
123 NFSD_VNET_DEFINE(struct nfsclienthashhead *, nfsclienthash);
124 NFSD_VNET_DEFINE(struct nfslockhashhead *, nfslockhash);
125 NFSD_VNET_DEFINE(struct nfssessionhash *, nfssessionhash);
126 
127 struct nfslayouthash		*nfslayouthash;
128 volatile int nfsrv_dontlistlen = 0;
129 
/*
 * nfsrv_openpluslock and nfsrv_clients are both incremented whenever
 * nfsrv_setclient() links a client structure into the hash;
 * nfsrv_openpluslock is the count compared against nfsrv_v4statelimit.
 */
130 static u_int32_t nfsrv_openpluslock = 0, nfsrv_delegatecnt = 0;
131 static int nfsrv_returnoldstateid = 0, nfsrv_clients = 0;
132 static int nfsrv_clienthighwater = NFSRV_CLIENTHIGHWATER;
/* When non-zero, nfsrv_setclient() disables callbacks for ND_GSS requests. */
133 static int nfsrv_nogsscallback = 0;
134 static volatile int nfsrv_writedelegcnt = 0;
135 static int nfsrv_faildscnt;
136 
/*
 * Used as lval[0] of every client id issued by nfsrv_setclient(); a client
 * id whose lval[0] differs is answered with NFSERR_STALECLIENTID.
 */
137 NFSD_VNET_DEFINE_STATIC(time_t, nfsrvboottime);
138 
139 /* local functions */
140 static void nfsrv_dumpaclient(struct nfsclient *clp,
141     struct nfsd_dumpclients *dumpp);
142 static void nfsrv_freeopenowner(struct nfsstate *stp, int cansleep,
143     NFSPROC_T *p);
144 static int nfsrv_freeopen(struct nfsstate *stp, vnode_t vp, int cansleep,
145     NFSPROC_T *p);
146 static void nfsrv_freelockowner(struct nfsstate *stp, vnode_t vp, int cansleep,
147     NFSPROC_T *p);
148 static void nfsrv_freeallnfslocks(struct nfsstate *stp, vnode_t vp,
149     int cansleep, NFSPROC_T *p);
150 static void nfsrv_freenfslock(struct nfslock *lop);
151 static void nfsrv_freenfslockfile(struct nfslockfile *lfp);
152 static void nfsrv_freedeleg(struct nfsstate *);
153 static int nfsrv_getstate(struct nfsclient *clp, nfsv4stateid_t *stateidp,
154     u_int32_t flags, struct nfsstate **stpp);
155 static void nfsrv_getowner(struct nfsstatehead *hp, struct nfsstate *new_stp,
156     struct nfsstate **stpp);
157 static int nfsrv_getlockfh(vnode_t vp, u_short flags,
158     struct nfslockfile *new_lfp, fhandle_t *nfhp, NFSPROC_T *p);
159 static int nfsrv_getlockfile(u_short flags, struct nfslockfile **new_lfpp,
160     struct nfslockfile **lfpp, fhandle_t *nfhp, int lockit);
161 static void nfsrv_insertlock(struct nfslock *new_lop,
162     struct nfslock *insert_lop, struct nfsstate *stp, struct nfslockfile *lfp);
163 static void nfsrv_updatelock(struct nfsstate *stp, struct nfslock **new_lopp,
164     struct nfslock **other_lopp, struct nfslockfile *lfp);
165 static int nfsrv_getipnumber(u_char *cp);
166 static int nfsrv_checkrestart(nfsquad_t clientid, u_int32_t flags,
167     nfsv4stateid_t *stateidp, int specialid);
168 static int nfsrv_checkgrace(struct nfsrv_descript *nd, struct nfsclient *clp,
169     u_int32_t flags);
170 static int nfsrv_docallback(struct nfsclient *clp, int procnum,
171     nfsv4stateid_t *stateidp, int trunc, fhandle_t *fhp,
172     struct nfsvattr *nap, nfsattrbit_t *attrbitp, int laytype, NFSPROC_T *p);
173 static int nfsrv_cbcallargs(struct nfsrv_descript *nd, struct nfsclient *clp,
174     uint32_t callback, int op, const char *optag, struct nfsdsession **sepp,
175     int *slotposp);
176 static u_int32_t nfsrv_nextclientindex(void);
177 static u_int32_t nfsrv_nextstateindex(struct nfsclient *clp);
178 static void nfsrv_markstable(struct nfsclient *clp);
179 static void nfsrv_markreclaim(struct nfsclient *clp);
180 static int nfsrv_checkstable(struct nfsclient *clp);
181 static int nfsrv_clientconflict(struct nfsclient *clp, int *haslockp, struct
182     vnode *vp, NFSPROC_T *p);
183 static int nfsrv_delegconflict(struct nfsstate *stp, int *haslockp,
184     NFSPROC_T *p, vnode_t vp);
185 static int nfsrv_cleandeleg(vnode_t vp, struct nfslockfile *lfp,
186     struct nfsclient *clp, int *haslockp, NFSPROC_T *p);
187 static int nfsrv_notsamecredname(int op, struct nfsrv_descript *nd,
188     struct nfsclient *clp);
189 static time_t nfsrv_leaseexpiry(void);
190 static void nfsrv_delaydelegtimeout(struct nfsstate *stp);
191 static int nfsrv_checkseqid(struct nfsrv_descript *nd, u_int32_t seqid,
192     struct nfsstate *stp, struct nfsrvcache *op);
193 static int nfsrv_nootherstate(struct nfsstate *stp);
194 static int nfsrv_locallock(vnode_t vp, struct nfslockfile *lfp, int flags,
195     uint64_t first, uint64_t end, struct nfslockconflict *cfp, NFSPROC_T *p);
196 static void nfsrv_localunlock(vnode_t vp, struct nfslockfile *lfp,
197     uint64_t init_first, uint64_t init_end, NFSPROC_T *p);
198 static int nfsrv_dolocal(vnode_t vp, struct nfslockfile *lfp, int flags,
199     int oldflags, uint64_t first, uint64_t end, struct nfslockconflict *cfp,
200     NFSPROC_T *p);
201 static void nfsrv_locallock_rollback(vnode_t vp, struct nfslockfile *lfp,
202     NFSPROC_T *p);
203 static void nfsrv_locallock_commit(struct nfslockfile *lfp, int flags,
204     uint64_t first, uint64_t end);
205 static void nfsrv_locklf(struct nfslockfile *lfp);
206 static void nfsrv_unlocklf(struct nfslockfile *lfp);
207 static struct nfsdsession *nfsrv_findsession(uint8_t *sessionid);
208 static int nfsrv_freesession(struct nfsrv_descript *nd, struct nfsdsession *sep,
209     uint8_t *sessionid);
210 static int nfsv4_setcbsequence(struct nfsrv_descript *nd, struct nfsclient *clp,
211     int dont_replycache, struct nfsdsession **sepp, int *slotposp);
212 static int nfsv4_getcbsession(struct nfsclient *clp, struct nfsdsession **sepp);
213 static int nfsrv_addlayout(struct nfsrv_descript *nd, struct nfslayout **lypp,
214     nfsv4stateid_t *stateidp, char *layp, int *layoutlenp, NFSPROC_T *p);
215 static void nfsrv_freelayout(struct nfslayouthead *lhp, struct nfslayout *lyp);
216 static void nfsrv_freelayoutlist(nfsquad_t clientid);
217 static void nfsrv_freelayouts(nfsquad_t *clid, fsid_t *fs, int laytype,
218     int iomode);
219 static void nfsrv_freealllayouts(void);
220 static void nfsrv_freedevid(struct nfsdevice *ds);
221 static int nfsrv_setdsserver(char *dspathp, char *mdspathp, NFSPROC_T *p,
222     struct nfsdevice **dsp);
223 static void nfsrv_deleteds(struct nfsdevice *fndds);
224 static void nfsrv_allocdevid(struct nfsdevice *ds, char *addr, char *dnshost);
225 static void nfsrv_freealldevids(void);
226 static void nfsrv_flexlayouterr(struct nfsrv_descript *nd, uint32_t *layp,
227     int maxcnt, NFSPROC_T *p);
228 static int nfsrv_recalllayout(nfsquad_t clid, nfsv4stateid_t *stateidp,
229     fhandle_t *fhp, struct nfslayout *lyp, int changed, int laytype,
230     NFSPROC_T *p);
231 static int nfsrv_findlayout(nfsquad_t *clientidp, fhandle_t *fhp, int laytype,
232     NFSPROC_T *, struct nfslayout **lypp);
233 static int nfsrv_fndclid(nfsquad_t *clidvec, nfsquad_t clid, int clidcnt);
234 static struct nfslayout *nfsrv_filelayout(struct nfsrv_descript *nd, int iomode,
235     fhandle_t *fhp, fhandle_t *dsfhp, char *devid, fsid_t fs);
236 static struct nfslayout *nfsrv_flexlayout(struct nfsrv_descript *nd, int iomode,
237     int mirrorcnt, fhandle_t *fhp, fhandle_t *dsfhp, char *devid, fsid_t fs);
238 static int nfsrv_dontlayout(fhandle_t *fhp);
239 static int nfsrv_createdsfile(vnode_t vp, fhandle_t *fhp, struct pnfsdsfile *pf,
240     vnode_t dvp, struct nfsdevice *ds, struct ucred *cred, NFSPROC_T *p,
241     vnode_t *tvpp);
242 static struct nfsdevice *nfsrv_findmirroredds(struct nfsmount *nmp);
243 static int nfsrv_checkmachcred(int op, struct nfsrv_descript *nd,
244     struct nfsclient *clp);
245 
/*
 * nfsrv_setclient() - find or create the client structure for a client
 * that is establishing (or re-establishing) its client id.
 *
 * Parameters:
 *   nd        - request descriptor; nd_flag is tested for ND_NFSV41 and
 *               ND_GSS.
 *   new_clpp  - points at the caller-built client structure.  *new_clpp is
 *               set to NULL on every path that links the structure into the
 *               client hash.  It is left untouched on error and when an
 *               already confirmed NFSv4.1 client id is found, so the caller
 *               still owns the structure in those cases.
 *   clientidp - filled in with the client id (newly issued or existing).
 *   confirmp  - filled in with the confirm value.  For NFSv4.1 a non-zero
 *               lval[1] on entry means the client is updating a confirmed
 *               client id; lval[1] is set to 1 on return when a confirmed
 *               client id was found and reset to 0 on NFSERR_NOENT.
 *   p         - passed through to nfsrv_cleanclient() and
 *               nfsrv_zapclient().
 *
 * Returns 0 on success, or NFSERR_RESOURCE, NFSERR_NOENT or
 * NFSERR_CLIDINUSE.  For NFSERR_CLIDINUSE the address and port of the
 * existing client are copied into new_clp's lc_req.nr_nam.
 */
246 /*
247  * Scan the client list for a match and either return the current one,
248  * create a new entry or return an error.
249  * If returning a non-error, the clp structure must either be linked into
250  * the client list or free'd.
251  */
252 int
253 nfsrv_setclient(struct nfsrv_descript *nd, struct nfsclient **new_clpp,
254     nfsquad_t *clientidp, nfsquad_t *confirmp, NFSPROC_T *p)
255 {
256 	struct nfsclient *clp = NULL, *new_clp = *new_clpp;
257 	int i, error = 0, ret;
258 	struct nfsstate *stp, *tstp;
259 #ifdef INET
260 	struct sockaddr_in *sin, *rin;
261 #endif
262 #ifdef INET6
263 	struct sockaddr_in6 *sin6, *rin6;
264 #endif
265 	struct nfsdsession *sep, *nsep;
266 	int zapit = 0, gotit, hasstate = 0, igotlock;
	/*
	 * Source of confirm values.  Being static it is shared by all calls;
	 * every increment below happens while nfsv4rootfs_lock is held.
	 */
267 	static u_int64_t confirm_index = 0;
268 
269 	/*
270 	 * Check for state resource limit exceeded.
271 	 */
272 	if (nfsrv_openpluslock > nfsrv_v4statelimit) {
273 		error = NFSERR_RESOURCE;
274 		goto out;
275 	}
276 
277 	if (nfsrv_issuedelegs == 0 ||
278 	    ((nd->nd_flag & ND_GSS) != 0 && nfsrv_nogsscallback != 0))
279 		/*
280 		 * Don't do callbacks when delegations are disabled or
281 		 * for AUTH_GSS when nfsrv_nogsscallback is non-zero.
282 		 * If establishing a callback connection is attempted
283 		 * when a firewall is blocking the callback path, the
284 		 * server may wait too long for the connect attempt to
285 		 * succeed during the Open. Some clients, such as Linux,
286 		 * may timeout and give up on the Open before the server
287 		 * replies. Also, since AUTH_GSS callbacks are not
288 		 * yet interoperability tested, they might cause the
289 		 * server to fail, if they get past the Init call to
290 		 * the client.
291 		 */
292 		new_clp->lc_program = 0;
293 
294 	/* Lock out other nfsd threads */
	/*
	 * A reference on nfsv4rootfs_lock is released first (presumably one
	 * taken by the caller - TODO confirm), then nfsv4_lock() is retried
	 * until it reports that the lock was obtained.
	 */
295 	NFSLOCKV4ROOTMUTEX();
296 	nfsv4_relref(&nfsv4rootfs_lock);
297 	do {
298 		igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
299 		    NFSV4ROOTLOCKMUTEXPTR, NULL);
300 	} while (!igotlock);
301 	NFSUNLOCKV4ROOTMUTEX();
302 
303 	/*
304 	 * Search for a match in the client list.
305 	 */
	/*
	 * The match is on the client supplied lc_id, whereas entries are
	 * hashed by client id, so every chain is walked.  When the loop ends
	 * i == nfsrv_clienthashsize exactly when nothing matched.
	 */
306 	gotit = i = 0;
307 	while (i < nfsrv_clienthashsize && !gotit) {
308 	    LIST_FOREACH(clp, &NFSD_VNET(nfsclienthash)[i], lc_hash) {
309 		if (new_clp->lc_idlen == clp->lc_idlen &&
310 		    !NFSBCMP(new_clp->lc_id, clp->lc_id, clp->lc_idlen)) {
311 			gotit = 1;
312 			break;
313 		}
314 	    }
315 	    if (gotit == 0)
316 		i++;
317 	}
	/*
	 * Case 1: no entry, or the existing entry is unconfirmed or has been
	 * admin revoked.  Issue a brand new client id.
	 */
318 	if (!gotit ||
319 	    (clp->lc_flags & (LCL_NEEDSCONFIRM | LCL_ADMINREVOKED))) {
320 		if ((nd->nd_flag & ND_NFSV41) != 0 && confirmp->lval[1] != 0) {
321 			/*
322 			 * For NFSv4.1, if confirmp->lval[1] is non-zero, the
323 			 * client is trying to update a confirmed clientid.
324 			 */
325 			NFSLOCKV4ROOTMUTEX();
326 			nfsv4_unlock(&nfsv4rootfs_lock, 1);
327 			NFSUNLOCKV4ROOTMUTEX();
328 			confirmp->lval[1] = 0;
329 			error = NFSERR_NOENT;
330 			goto out;
331 		}
332 		/*
333 		 * Get rid of the old one.
334 		 */
		/* i is still the scan index, so this is true only on a match. */
335 		if (i != nfsrv_clienthashsize) {
336 			LIST_REMOVE(clp, lc_hash);
337 			nfsrv_cleanclient(clp, p);
338 			nfsrv_freedeleglist(&clp->lc_deleg);
339 			nfsrv_freedeleglist(&clp->lc_olddeleg);
340 			zapit = 1;
341 		}
342 		/*
343 		 * Add it after assigning a client id to it.
344 		 */
345 		new_clp->lc_flags |= LCL_NEEDSCONFIRM;
		/*
		 * For NFSv4.1 the stored lc_confirm.lval[0] is one less than
		 * the value handed back; nfsrv_getclient() treats
		 * lc_confirm.lval[0] + 1 == confirm.lval[0] as the first
		 * confirmation.
		 */
346 		if ((nd->nd_flag & ND_NFSV41) != 0) {
347 			confirmp->lval[0] = ++confirm_index;
348 			new_clp->lc_confirm.lval[0] = confirmp->lval[0] - 1;
349 		} else
350 			confirmp->qval = new_clp->lc_confirm.qval =
351 			    ++confirm_index;
352 		clientidp->lval[0] = new_clp->lc_clientid.lval[0] =
353 		    NFSD_VNET(nfsrvboottime);
354 		clientidp->lval[1] = new_clp->lc_clientid.lval[1] =
355 		    nfsrv_nextclientindex();
356 		new_clp->lc_stateindex = 0;
357 		new_clp->lc_statemaxindex = 0;
358 		new_clp->lc_prevsess = 0;
359 		new_clp->lc_cbref = 0;
360 		new_clp->lc_expiry = nfsrv_leaseexpiry();
361 		LIST_INIT(&new_clp->lc_open);
362 		LIST_INIT(&new_clp->lc_deleg);
363 		LIST_INIT(&new_clp->lc_olddeleg);
364 		LIST_INIT(&new_clp->lc_session);
		/* i is reused here; the scan index is no longer needed. */
365 		for (i = 0; i < nfsrv_statehashsize; i++)
366 			LIST_INIT(&new_clp->lc_stateid[i]);
367 		LIST_INSERT_HEAD(NFSCLIENTHASH(new_clp->lc_clientid), new_clp,
368 		    lc_hash);
369 		NFSD_VNET(nfsstatsv1_p)->srvclients++;
370 		nfsrv_openpluslock++;
371 		nfsrv_clients++;
372 		NFSLOCKV4ROOTMUTEX();
373 		nfsv4_unlock(&nfsv4rootfs_lock, 1);
374 		NFSUNLOCKV4ROOTMUTEX();
		/* The old entry was unhashed above; zap it after unlocking. */
375 		if (zapit)
376 			nfsrv_zapclient(clp, p);
377 		*new_clpp = NULL;
378 		goto out;
379 	}
380 
381 	/*
382 	 * Now, handle the cases where the id is already issued.
383 	 */
384 	if (nfsrv_notsamecredname(NFSV4OP_EXCHANGEID, nd, clp)) {
385 	    /*
386 	     * Check to see if there is expired state that should go away.
387 	     */
388 	    if (clp->lc_expiry < NFSD_MONOSEC &&
389 	        (!LIST_EMPTY(&clp->lc_open) || !LIST_EMPTY(&clp->lc_deleg))) {
390 		nfsrv_cleanclient(clp, p);
391 		nfsrv_freedeleglist(&clp->lc_deleg);
392 	    }
393 
394 	    /*
395 	     * If there is outstanding state, then reply NFSERR_CLIDINUSE per
396 	     * RFC3530 Sec. 8.1.2 last para.
397 	     */
398 	    if (!LIST_EMPTY(&clp->lc_deleg)) {
399 		hasstate = 1;
400 	    } else if (LIST_EMPTY(&clp->lc_open)) {
401 		hasstate = 0;
402 	    } else {
403 		hasstate = 0;
404 		/* Look for an Open on the OpenOwner */
405 		LIST_FOREACH(stp, &clp->lc_open, ls_list) {
406 		    if (!LIST_EMPTY(&stp->ls_open)) {
407 			hasstate = 1;
408 			break;
409 		    }
410 		}
411 	    }
412 	    if (hasstate) {
413 		/*
414 		 * If the uid doesn't match, return NFSERR_CLIDINUSE after
415 		 * filling out the correct ipaddr and portnum.
416 		 */
		/*
		 * Only the address families compiled in are handled; for any
		 * other family new_clp's address is left as it was.
		 */
417 		switch (clp->lc_req.nr_nam->sa_family) {
418 #ifdef INET
419 		case AF_INET:
420 			sin = (struct sockaddr_in *)new_clp->lc_req.nr_nam;
421 			rin = (struct sockaddr_in *)clp->lc_req.nr_nam;
422 			sin->sin_addr.s_addr = rin->sin_addr.s_addr;
423 			sin->sin_port = rin->sin_port;
424 			break;
425 #endif
426 #ifdef INET6
427 		case AF_INET6:
428 			sin6 = (struct sockaddr_in6 *)new_clp->lc_req.nr_nam;
429 			rin6 = (struct sockaddr_in6 *)clp->lc_req.nr_nam;
430 			sin6->sin6_addr = rin6->sin6_addr;
431 			sin6->sin6_port = rin6->sin6_port;
432 			break;
433 #endif
434 		}
435 		NFSLOCKV4ROOTMUTEX();
436 		nfsv4_unlock(&nfsv4rootfs_lock, 1);
437 		NFSUNLOCKV4ROOTMUTEX();
438 		error = NFSERR_CLIDINUSE;
439 		goto out;
440 	    }
441 	}
442 
	/* Case 2: same id but a different verifier. */
443 	if (NFSBCMP(new_clp->lc_verf, clp->lc_verf, NFSX_VERF)) {
444 		/*
445 		 * If the verifier has changed, the client has rebooted
446 		 * and a new client id is issued. The old state info
447 		 * can be thrown away once the SETCLIENTID_CONFIRM occurs.
448 		 */
449 		LIST_REMOVE(clp, lc_hash);
450 
451 		/* Get rid of all sessions on this clientid. */
		/* A failure to free a session is only logged, not returned. */
452 		LIST_FOREACH_SAFE(sep, &clp->lc_session, sess_list, nsep) {
453 			ret = nfsrv_freesession(NULL, sep, NULL);
454 			if (ret != 0)
455 				printf("nfsrv_setclient: verifier changed free"
456 				    " session failed=%d\n", ret);
457 		}
458 
459 		new_clp->lc_flags |= LCL_NEEDSCONFIRM;
460 		if ((nd->nd_flag & ND_NFSV41) != 0) {
461 			confirmp->lval[0] = ++confirm_index;
462 			new_clp->lc_confirm.lval[0] = confirmp->lval[0] - 1;
463 		} else
464 			confirmp->qval = new_clp->lc_confirm.qval =
465 			    ++confirm_index;
466 		clientidp->lval[0] = new_clp->lc_clientid.lval[0] =
467 		    NFSD_VNET(nfsrvboottime);
468 		clientidp->lval[1] = new_clp->lc_clientid.lval[1] =
469 		    nfsrv_nextclientindex();
470 		new_clp->lc_stateindex = 0;
471 		new_clp->lc_statemaxindex = 0;
472 		new_clp->lc_prevsess = 0;
473 		new_clp->lc_cbref = 0;
474 		new_clp->lc_expiry = nfsrv_leaseexpiry();
475 
476 		/*
477 		 * Save the state until confirmed.
478 		 */
		/*
		 * Each of the old client's lists is handed to new_clp and
		 * every state entry is repointed at new_clp.
		 */
479 		LIST_NEWHEAD(&new_clp->lc_open, &clp->lc_open, ls_list);
480 		LIST_FOREACH(tstp, &new_clp->lc_open, ls_list)
481 			tstp->ls_clp = new_clp;
482 		LIST_NEWHEAD(&new_clp->lc_deleg, &clp->lc_deleg, ls_list);
483 		LIST_FOREACH(tstp, &new_clp->lc_deleg, ls_list)
484 			tstp->ls_clp = new_clp;
485 		LIST_NEWHEAD(&new_clp->lc_olddeleg, &clp->lc_olddeleg,
486 		    ls_list);
487 		LIST_FOREACH(tstp, &new_clp->lc_olddeleg, ls_list)
488 			tstp->ls_clp = new_clp;
489 		for (i = 0; i < nfsrv_statehashsize; i++) {
490 			LIST_NEWHEAD(&new_clp->lc_stateid[i],
491 			    &clp->lc_stateid[i], ls_hash);
492 			LIST_FOREACH(tstp, &new_clp->lc_stateid[i], ls_hash)
493 				tstp->ls_clp = new_clp;
494 		}
495 		LIST_INIT(&new_clp->lc_session);
496 		LIST_INSERT_HEAD(NFSCLIENTHASH(new_clp->lc_clientid), new_clp,
497 		    lc_hash);
498 		NFSD_VNET(nfsstatsv1_p)->srvclients++;
499 		nfsrv_openpluslock++;
500 		nfsrv_clients++;
501 		NFSLOCKV4ROOTMUTEX();
502 		nfsv4_unlock(&nfsv4rootfs_lock, 1);
503 		NFSUNLOCKV4ROOTMUTEX();
504 
505 		/*
506 		 * Must wait until any outstanding callback on the old clp
507 		 * completes.
508 		 */
		/* lc_cbref is re-tested after every sleep (timeout 10 * hz). */
509 		NFSLOCKSTATE();
510 		while (clp->lc_cbref) {
511 			clp->lc_flags |= LCL_WAKEUPWANTED;
512 			(void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PZERO - 1,
513 			    "nfsd clp", 10 * hz);
514 		}
515 		NFSUNLOCKSTATE();
516 		nfsrv_zapclient(clp, p);
517 		*new_clpp = NULL;
518 		goto out;
519 	}
520 
	/* Case 3: same id and same verifier. */
521 	/* For NFSv4.1, mark that we found a confirmed clientid. */
	/*
	 * In the NFSv4.1 branch new_clp is not linked in and *new_clpp is
	 * left set, so the caller still owns the structure.
	 */
522 	if ((nd->nd_flag & ND_NFSV41) != 0) {
523 		clientidp->lval[0] = clp->lc_clientid.lval[0];
524 		clientidp->lval[1] = clp->lc_clientid.lval[1];
525 		confirmp->lval[0] = 0;	/* Ignored by client */
526 		confirmp->lval[1] = 1;
527 	} else {
528 		/*
529 		 * id and verifier match, so update the net address info
530 		 * and get rid of any existing callback authentication
531 		 * handle, so a new one will be acquired.
532 		 */
		/*
		 * LCL_DONTCLEAN makes nfsrv_getclient() skip its state
		 * cleanup when this replacement entry is confirmed; the
		 * existing client id and state indices are carried over.
		 */
533 		LIST_REMOVE(clp, lc_hash);
534 		new_clp->lc_flags |= (LCL_NEEDSCONFIRM | LCL_DONTCLEAN);
535 		new_clp->lc_expiry = nfsrv_leaseexpiry();
536 		confirmp->qval = new_clp->lc_confirm.qval = ++confirm_index;
537 		clientidp->lval[0] = new_clp->lc_clientid.lval[0] =
538 		    clp->lc_clientid.lval[0];
539 		clientidp->lval[1] = new_clp->lc_clientid.lval[1] =
540 		    clp->lc_clientid.lval[1];
541 		new_clp->lc_delegtime = clp->lc_delegtime;
542 		new_clp->lc_stateindex = clp->lc_stateindex;
543 		new_clp->lc_statemaxindex = clp->lc_statemaxindex;
544 		new_clp->lc_cbref = 0;
545 		LIST_NEWHEAD(&new_clp->lc_open, &clp->lc_open, ls_list);
546 		LIST_FOREACH(tstp, &new_clp->lc_open, ls_list)
547 			tstp->ls_clp = new_clp;
548 		LIST_NEWHEAD(&new_clp->lc_deleg, &clp->lc_deleg, ls_list);
549 		LIST_FOREACH(tstp, &new_clp->lc_deleg, ls_list)
550 			tstp->ls_clp = new_clp;
551 		LIST_NEWHEAD(&new_clp->lc_olddeleg, &clp->lc_olddeleg, ls_list);
552 		LIST_FOREACH(tstp, &new_clp->lc_olddeleg, ls_list)
553 			tstp->ls_clp = new_clp;
554 		for (i = 0; i < nfsrv_statehashsize; i++) {
555 			LIST_NEWHEAD(&new_clp->lc_stateid[i],
556 			    &clp->lc_stateid[i], ls_hash);
557 			LIST_FOREACH(tstp, &new_clp->lc_stateid[i], ls_hash)
558 				tstp->ls_clp = new_clp;
559 		}
560 		LIST_INIT(&new_clp->lc_session);
561 		LIST_INSERT_HEAD(NFSCLIENTHASH(new_clp->lc_clientid), new_clp,
562 		    lc_hash);
563 		NFSD_VNET(nfsstatsv1_p)->srvclients++;
564 		nfsrv_openpluslock++;
565 		nfsrv_clients++;
566 	}
567 	NFSLOCKV4ROOTMUTEX();
568 	nfsv4_unlock(&nfsv4rootfs_lock, 1);
569 	NFSUNLOCKV4ROOTMUTEX();
570 
	/* Only the non-NFSv4.1 branch above replaced clp, so only it zaps it. */
571 	if ((nd->nd_flag & ND_NFSV41) == 0) {
572 		/*
573 		 * Must wait until any outstanding callback on the old clp
574 		 * completes.
575 		 */
576 		NFSLOCKSTATE();
577 		while (clp->lc_cbref) {
578 			clp->lc_flags |= LCL_WAKEUPWANTED;
579 			(void)mtx_sleep(clp, NFSSTATEMUTEXPTR, PZERO - 1,
580 			    "nfsdclp", 10 * hz);
581 		}
582 		NFSUNLOCKSTATE();
583 		nfsrv_zapclient(clp, p);
584 		*new_clpp = NULL;
585 	}
586 
587 out:
588 	NFSEXITCODE2(error, nd);
589 	return (error);
590 }
591 
/*
 * nfsrv_getclient() - look up a client by client id (or, for an NFSv4.1
 * renew, through the request's session) and apply the operations selected
 * by opflags.
 *
 * Parameters:
 *   clientid  - client id to look up.  lval[0] must equal nfsrvboottime
 *               unless this is an NFSv4.1 request with
 *               opflags == CLOPS_RENEW.
 *   opflags   - combination of CLOPS_CONFIRM, CLOPS_RENEW, CLOPS_RENEWOP.
 *   clpp      - if non-NULL, *clpp is cleared on entry and set to the
 *               client found once the lookup itself has succeeded, even
 *               when a later check sets an error.
 *   nsep      - session to attach to the client on confirm, or NULL.
 *   confirm   - confirm value checked against the client's lc_confirm.
 *   cbprogram - callback program number; stored in lc_program and passed
 *               to clnt_bck_create() on an NFSv4.1 confirm.
 *   nd        - request descriptor.  The early checks tolerate NULL, but
 *               nd is dereferenced unconditionally when CLOPS_CONFIRM is
 *               set.
 *   p         - passed through to nfsrv_cleanclient().
 *
 * Locking: with opflags == CLOPS_RENEW no lock is taken here (the comment
 * in the body states the state lock is already held).  With CLOPS_CONFIRM,
 * nfsv4rootfs_lock is taken for the duration of the call.  Otherwise
 * NFSLOCKSTATE() is held for the duration of the call.
 *
 * Returns 0 or one of NFSERR_STALECLIENTID, NFSERR_EXPIRED,
 * NFSERR_ADMINREVOKED, NFSERR_SEQMISORDERED, NFSERR_CLIDINUSE,
 * NFSERR_ACCES or NFSERR_CBPATHDOWN.
 */
592 /*
593  * Check to see if the client id exists and optionally confirm it.
594  */
595 int
596 nfsrv_getclient(nfsquad_t clientid, int opflags, struct nfsclient **clpp,
597     struct nfsdsession *nsep, nfsquad_t confirm, uint32_t cbprogram,
598     struct nfsrv_descript *nd, NFSPROC_T *p)
599 {
600 	struct nfsclient *clp;
601 	struct nfsstate *stp;
602 	int i;
603 	struct nfsclienthashhead *hp;
604 	int error = 0, igotlock, doneok;
605 	struct nfssessionhash *shp;
606 	struct nfsdsession *sep;
607 	uint64_t sessid[2];
608 	bool sess_replay;
	/* Source of session ids; advanced under the V4 root mutex below. */
609 	static uint64_t next_sess = 0;
610 
611 	if (clpp)
612 		*clpp = NULL;
	/*
	 * The boot time check is skipped only for an NFSv4.1 request with
	 * opflags == CLOPS_RENEW, where the client comes from the session.
	 */
613 	if ((nd == NULL || (nd->nd_flag & ND_NFSV41) == 0 ||
614 	    opflags != CLOPS_RENEW) && NFSD_VNET(nfsrvboottime) !=
615 	    clientid.lval[0]) {
616 		error = NFSERR_STALECLIENTID;
617 		goto out;
618 	}
619 
620 	/*
621 	 * If called with opflags == CLOPS_RENEW, the State Lock is
622 	 * already held. Otherwise, we need to get either that or,
623 	 * for the case of Confirm, lock out the nfsd threads.
624 	 */
625 	if (opflags & CLOPS_CONFIRM) {
626 		NFSLOCKV4ROOTMUTEX();
627 		nfsv4_relref(&nfsv4rootfs_lock);
628 		do {
629 			igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
630 			    NFSV4ROOTLOCKMUTEXPTR, NULL);
631 		} while (!igotlock);
632 		/*
633 		 * Create a new sessionid here, since we need to do it where
634 		 * there is a mutex held to serialize update of next_sess.
635 		 */
		/* NOTE(review): nd is dereferenced here without a NULL check. */
636 		if ((nd->nd_flag & ND_NFSV41) != 0) {
637 			sessid[0] = ++next_sess;
638 			sessid[1] = clientid.qval;
639 		}
640 		NFSUNLOCKV4ROOTMUTEX();
641 	} else if (opflags != CLOPS_RENEW) {
642 		NFSLOCKSTATE();
643 	}
644 
645 	/* For NFSv4.1, the clp is acquired from the associated session. */
	/* Without ND_HASSEQUENCE, or if no session matches, clp stays NULL. */
646 	if (nd != NULL && (nd->nd_flag & ND_NFSV41) != 0 &&
647 	    opflags == CLOPS_RENEW) {
648 		clp = NULL;
649 		if ((nd->nd_flag & ND_HASSEQUENCE) != 0) {
650 			shp = NFSSESSIONHASH(nd->nd_sessionid);
651 			NFSLOCKSESSION(shp);
652 			sep = nfsrv_findsession(nd->nd_sessionid);
653 			if (sep != NULL)
654 				clp = sep->sess_clp;
655 			NFSUNLOCKSESSION(shp);
656 		}
657 	} else {
		/* LIST_FOREACH leaves clp NULL when no entry matches. */
658 		hp = NFSCLIENTHASH(clientid);
659 		LIST_FOREACH(clp, hp, lc_hash) {
660 			if (clp->lc_clientid.lval[1] == clientid.lval[1])
661 				break;
662 		}
663 	}
664 	if (clp == NULL) {
665 		if (opflags & CLOPS_CONFIRM)
666 			error = NFSERR_STALECLIENTID;
667 		else
668 			error = NFSERR_EXPIRED;
669 	} else if (clp->lc_flags & LCL_ADMINREVOKED) {
670 		/*
671 		 * If marked admin revoked, just return the error.
672 		 */
673 		error = NFSERR_ADMINREVOKED;
674 	}
	/* Lookup failures leave *clpp NULL; drop whichever lock was taken. */
675 	if (error) {
676 		if (opflags & CLOPS_CONFIRM) {
677 			NFSLOCKV4ROOTMUTEX();
678 			nfsv4_unlock(&nfsv4rootfs_lock, 1);
679 			NFSUNLOCKV4ROOTMUTEX();
680 		} else if (opflags != CLOPS_RENEW) {
681 			NFSUNLOCKSTATE();
682 		}
683 		goto out;
684 	}
685 
686 	/*
687 	 * Perform any operations specified by the opflags.
688 	 */
689 	if (opflags & CLOPS_CONFIRM) {
690 		sess_replay = false;
691 		if ((nd->nd_flag & ND_NFSV41) != 0) {
692 		    /*
693 		     * For the case where lc_confirm.lval[0] == confirm.lval[0],
694 		     * use the new session, but with the previous sessionid.
695 		     * This is not exactly what the RFC describes, but should
696 		     * result in the same reply as the previous CreateSession.
697 		     */
		    /*
		     * First branch: first confirmation; remember the session id
		     * in lc_prevsess.  Second branch: replay; reuse lc_prevsess
		     * or fail if none was recorded.
		     */
698 		    if (clp->lc_confirm.lval[0] + 1 == confirm.lval[0]) {
699 			clp->lc_confirm.lval[0] = confirm.lval[0];
700 			clp->lc_prevsess = sessid[0];
701 		    } else if (clp->lc_confirm.lval[0] == confirm.lval[0]) {
702 			if (clp->lc_prevsess == 0)
703 			    error = NFSERR_SEQMISORDERED;
704 			else
705 			    sessid[0] = clp->lc_prevsess;
706 			sess_replay = true;
707 		    } else
708 			error = NFSERR_SEQMISORDERED;
709 		} else if ((nd->nd_flag & ND_NFSV41) == 0 &&
710 		     clp->lc_confirm.qval != confirm.qval)
711 			error = NFSERR_STALECLIENTID;
712 		if (error == 0 && nfsrv_notsamecredname(NFSV4OP_CREATESESSION,
713 		    nd, clp))
714 			error = NFSERR_CLIDINUSE;
715 
716 		if (!error) {
		    /*
		     * State is cleaned only when LCL_NEEDSCONFIRM is set without
		     * LCL_DONTCLEAN.
		     */
717 		    if ((clp->lc_flags & (LCL_NEEDSCONFIRM | LCL_DONTCLEAN)) ==
718 			LCL_NEEDSCONFIRM) {
719 			/*
720 			 * Hang onto the delegations (as old delegations)
721 			 * for an Open with CLAIM_DELEGATE_PREV unless in
722 			 * grace, but get rid of the rest of the state.
723 			 */
724 			nfsrv_cleanclient(clp, p);
725 			nfsrv_freedeleglist(&clp->lc_olddeleg);
726 			if (nfsrv_checkgrace(nd, clp, 0)) {
727 			    /* In grace, so just delete delegations */
728 			    nfsrv_freedeleglist(&clp->lc_deleg);
729 			} else {
730 			    LIST_FOREACH(stp, &clp->lc_deleg, ls_list)
731 				stp->ls_flags |= NFSLCK_OLDDELEG;
732 			    clp->lc_delegtime = NFSD_MONOSEC +
733 				nfsrv_lease + NFSRV_LEASEDELTA;
734 			    LIST_NEWHEAD(&clp->lc_olddeleg, &clp->lc_deleg,
735 				ls_list);
736 			}
737 			if ((nd->nd_flag & ND_NFSV41) != 0)
738 			    clp->lc_program = cbprogram;
739 		    }
740 		    clp->lc_flags &= ~(LCL_NEEDSCONFIRM | LCL_DONTCLEAN);
741 		    if (clp->lc_program)
742 			clp->lc_flags |= LCL_NEEDSCBNULL;
743 		    /* For NFSv4.1, link the session onto the client. */
744 		    if (nsep != NULL) {
745 			/* Hold a reference on the xprt for a backchannel. */
			/*
			 * If no backchannel client can be created, the
			 * NFSV4CRSESS_CONNBACKCHAN flag is cleared instead.
			 */
746 			if ((nsep->sess_crflags & NFSV4CRSESS_CONNBACKCHAN)
747 			    != 0 && !sess_replay) {
748 			    if (clp->lc_req.nr_client == NULL)
749 				clp->lc_req.nr_client = (struct __rpc_client *)
750 				    clnt_bck_create(nd->nd_xprt->xp_socket,
751 				    cbprogram, NFSV4_CBVERS);
752 			    if (clp->lc_req.nr_client != NULL) {
753 				SVC_ACQUIRE(nd->nd_xprt);
754 				CLNT_ACQUIRE(clp->lc_req.nr_client);
755 				nd->nd_xprt->xp_p2 = clp->lc_req.nr_client;
756 				/* Disable idle timeout. */
757 				nd->nd_xprt->xp_idletimeout = 0;
758 				nsep->sess_cbsess.nfsess_xprt = nd->nd_xprt;
759 			    } else
760 				nsep->sess_crflags &= ~NFSV4CRSESS_CONNBACKCHAN;
761 			}
762 			NFSBCOPY(sessid, nsep->sess_sessionid,
763 			    NFSX_V4SESSIONID);
764 			NFSBCOPY(sessid, nsep->sess_cbsess.nfsess_sessionid,
765 			    NFSX_V4SESSIONID);
			/*
			 * A replayed session is not linked in.  The state lock
			 * is taken before the session hash lock.
			 */
766 			if (!sess_replay) {
767 			    shp = NFSSESSIONHASH(nsep->sess_sessionid);
768 			    NFSLOCKSTATE();
769 			    NFSLOCKSESSION(shp);
770 			    LIST_INSERT_HEAD(&shp->list, nsep, sess_hash);
771 			    LIST_INSERT_HEAD(&clp->lc_session, nsep, sess_list);
772 			    nsep->sess_clp = clp;
773 			    NFSUNLOCKSESSION(shp);
774 			    NFSUNLOCKSTATE();
775 			}
776 		    }
777 		}
778 	} else if (clp->lc_flags & LCL_NEEDSCONFIRM) {
779 		error = NFSERR_EXPIRED;
780 	}
781 
782 	/*
783 	 * If called by the Renew Op, we must check the principal.
784 	 */
	/*
	 * On a credential mismatch the renew is still allowed if any open
	 * state on the client was created by the requesting uid.
	 */
785 	if (!error && (opflags & CLOPS_RENEWOP)) {
786 	    if (nfsrv_notsamecredname(0, nd, clp)) {
787 		doneok = 0;
788 		for (i = 0; i < nfsrv_statehashsize && doneok == 0; i++) {
789 		    LIST_FOREACH(stp, &clp->lc_stateid[i], ls_hash) {
790 			if ((stp->ls_flags & NFSLCK_OPEN) &&
791 			    stp->ls_uid == nd->nd_cred->cr_uid) {
792 				doneok = 1;
793 				break;
794 			}
795 		    }
796 		}
797 		if (!doneok)
798 			error = NFSERR_ACCES;
799 	    }
800 	    if (!error && (clp->lc_flags & LCL_CBDOWN))
801 		error = NFSERR_CBPATHDOWN;
802 	}
	/* The lease is renewed even when NFSERR_CBPATHDOWN is being returned. */
803 	if ((!error || error == NFSERR_CBPATHDOWN) &&
804 	     (opflags & CLOPS_RENEW)) {
805 		clp->lc_expiry = nfsrv_leaseexpiry();
806 	}
807 	if (opflags & CLOPS_CONFIRM) {
808 		NFSLOCKV4ROOTMUTEX();
809 		nfsv4_unlock(&nfsv4rootfs_lock, 1);
810 		NFSUNLOCKV4ROOTMUTEX();
811 	} else if (opflags != CLOPS_RENEW) {
812 		NFSUNLOCKSTATE();
813 	}
	/* Reached with error possibly non-zero; clp is returned regardless. */
814 	if (clpp)
815 		*clpp = clp;
816 
817 out:
818 	NFSEXITCODE2(error, nd);
819 	return (error);
820 }
821 
/*
 * nfsrv_destroyclient() - remove a client id that holds no state.
 *
 * Parameters:
 *   nd       - request descriptor, passed to nfsrv_checkmachcred().
 *   clientid - client id to destroy.
 *   p        - passed through to nfsrv_cleanclient() and nfsrv_zapclient().
 *
 * Returns 0 when the client was destroyed or was not found,
 * NFSERR_STALECLIENTID when clientid.lval[0] does not match nfsrvboottime,
 * the error from nfsrv_checkmachcred(), or NFSERR_CLIENTIDBUSY when any
 * lc_stateid[] chain, lc_session or lc_deleg is non-empty.
 *
 * NOTE(review): nfsrv_freelayoutlist() runs before the busy checks, so the
 * client's layouts are freed even when NFSERR_CLIENTIDBUSY is returned.
 */
822 /*
823  * Perform the NFSv4.1 destroy clientid.
824  */
825 int
826 nfsrv_destroyclient(struct nfsrv_descript *nd, nfsquad_t clientid, NFSPROC_T *p)
827 {
828 	struct nfsclient *clp;
829 	struct nfsclienthashhead *hp;
830 	int error = 0, i, igotlock;
831 
832 	if (NFSD_VNET(nfsrvboottime) != clientid.lval[0]) {
833 		error = NFSERR_STALECLIENTID;
834 		goto out;
835 	}
836 
837 	/* Lock out other nfsd threads */
838 	NFSLOCKV4ROOTMUTEX();
839 	nfsv4_relref(&nfsv4rootfs_lock);
840 	do {
841 		igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
842 		    NFSV4ROOTLOCKMUTEXPTR, NULL);
843 	} while (igotlock == 0);
844 	NFSUNLOCKV4ROOTMUTEX();
845 
	/* LIST_FOREACH leaves clp NULL when no entry matches. */
846 	hp = NFSCLIENTHASH(clientid);
847 	LIST_FOREACH(clp, hp, lc_hash) {
848 		if (clp->lc_clientid.lval[1] == clientid.lval[1])
849 			break;
850 	}
851 	if (clp == NULL) {
852 		NFSLOCKV4ROOTMUTEX();
853 		nfsv4_unlock(&nfsv4rootfs_lock, 1);
854 		NFSUNLOCKV4ROOTMUTEX();
855 		/* Just return ok, since it is gone. */
856 		goto out;
857 	}
858 
859 	/* Check for the SP4_MACH_CRED case. */
860 	error = nfsrv_checkmachcred(NFSV4OP_DESTROYCLIENTID, nd, clp);
861 	if (error != 0) {
862 		NFSLOCKV4ROOTMUTEX();
863 		nfsv4_unlock(&nfsv4rootfs_lock, 1);
864 		NFSUNLOCKV4ROOTMUTEX();
865 		goto out;
866 	}
867 
868 	/*
869 	 * Free up all layouts on the clientid.  Should the client return the
870 	 * layouts?
871 	 */
872 	nfsrv_freelayoutlist(clientid);
873 
874 	/* Scan for state on the clientid. */
875 	for (i = 0; i < nfsrv_statehashsize; i++)
876 		if (!LIST_EMPTY(&clp->lc_stateid[i])) {
877 			NFSLOCKV4ROOTMUTEX();
878 			nfsv4_unlock(&nfsv4rootfs_lock, 1);
879 			NFSUNLOCKV4ROOTMUTEX();
880 			error = NFSERR_CLIENTIDBUSY;
881 			goto out;
882 		}
883 	if (!LIST_EMPTY(&clp->lc_session) || !LIST_EMPTY(&clp->lc_deleg)) {
884 		NFSLOCKV4ROOTMUTEX();
885 		nfsv4_unlock(&nfsv4rootfs_lock, 1);
886 		NFSUNLOCKV4ROOTMUTEX();
887 		error = NFSERR_CLIENTIDBUSY;
888 		goto out;
889 	}
890 
891 	/* Destroy the clientid and return ok. */
	/*
	 * lc_deleg was found empty just above.  The entry is unhashed while
	 * the lock is held and zapped only after the lock is released.
	 */
892 	nfsrv_cleanclient(clp, p);
893 	nfsrv_freedeleglist(&clp->lc_deleg);
894 	nfsrv_freedeleglist(&clp->lc_olddeleg);
895 	LIST_REMOVE(clp, lc_hash);
896 	NFSLOCKV4ROOTMUTEX();
897 	nfsv4_unlock(&nfsv4rootfs_lock, 1);
898 	NFSUNLOCKV4ROOTMUTEX();
899 	nfsrv_zapclient(clp, p);
900 out:
901 	NFSEXITCODE2(error, nd);
902 	return (error);
903 }
904 
905 /*
906  * Called from the new nfssvc syscall to admin revoke a clientid.
907  * Returns 0 for success, error otherwise.
908  */
909 int
910 nfsrv_adminrevoke(struct nfsd_clid *revokep, NFSPROC_T *p)
911 {
912 	struct nfsclient *clp = NULL;
913 	int i, error = 0;
914 	int gotit, igotlock;
915 
916 	/*
917 	 * First, lock out the nfsd so that state won't change while the
918 	 * revocation record is being written to the stable storage restart
919 	 * file.
920 	 */
921 	NFSLOCKV4ROOTMUTEX();
922 	do {
923 		igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
924 		    NFSV4ROOTLOCKMUTEXPTR, NULL);
925 	} while (!igotlock);
926 	NFSUNLOCKV4ROOTMUTEX();
927 
928 	/*
929 	 * Search for a match in the client list.
930 	 */
931 	gotit = i = 0;
932 	while (i < nfsrv_clienthashsize && !gotit) {
933 	    LIST_FOREACH(clp, &NFSD_VNET(nfsclienthash)[i], lc_hash) {
934 		if (revokep->nclid_idlen == clp->lc_idlen &&
935 		    !NFSBCMP(revokep->nclid_id, clp->lc_id, clp->lc_idlen)) {
936 			gotit = 1;
937 			break;
938 		}
939 	    }
940 	    i++;
941 	}
942 	if (!gotit) {
943 		NFSLOCKV4ROOTMUTEX();
944 		nfsv4_unlock(&nfsv4rootfs_lock, 0);
945 		NFSUNLOCKV4ROOTMUTEX();
946 		error = EPERM;
947 		goto out;
948 	}
949 
950 	/*
951 	 * Now, write out the revocation record
952 	 */
953 	nfsrv_writestable(clp->lc_id, clp->lc_idlen, NFSNST_REVOKE, p);
954 	nfsrv_backupstable();
955 
956 	/*
957 	 * and clear out the state, marking the clientid revoked.
958 	 */
959 	clp->lc_flags &= ~LCL_CALLBACKSON;
960 	clp->lc_flags |= LCL_ADMINREVOKED;
961 	nfsrv_cleanclient(clp, p);
962 	nfsrv_freedeleglist(&clp->lc_deleg);
963 	nfsrv_freedeleglist(&clp->lc_olddeleg);
964 	NFSLOCKV4ROOTMUTEX();
965 	nfsv4_unlock(&nfsv4rootfs_lock, 0);
966 	NFSUNLOCKV4ROOTMUTEX();
967 
968 out:
969 	NFSEXITCODE(error);
970 	return (error);
971 }
972 
973 /*
974  * Dump out stats for all clients. Called from nfssvc(2), that is used
975  * nfsstatsv1.
976  */
977 void
978 nfsrv_dumpclients(struct nfsd_dumpclients *dumpp, int maxcnt)
979 {
980 	struct nfsclient *clp;
981 	int i = 0, cnt = 0;
982 
983 	/*
984 	 * First, get a reference on the nfsv4rootfs_lock so that an
985 	 * exclusive lock cannot be acquired while dumping the clients.
986 	 */
987 	NFSLOCKV4ROOTMUTEX();
988 	nfsv4_getref(&nfsv4rootfs_lock, NULL, NFSV4ROOTLOCKMUTEXPTR, NULL);
989 	NFSUNLOCKV4ROOTMUTEX();
990 	NFSLOCKSTATE();
991 	/*
992 	 * Rattle through the client lists until done.
993 	 */
994 	while (i < nfsrv_clienthashsize && cnt < maxcnt) {
995 	    clp = LIST_FIRST(&NFSD_VNET(nfsclienthash)[i]);
996 	    while (clp != LIST_END(&NFSD_VNET(nfsclienthash)[i]) && cnt <
997 		maxcnt) {
998 		nfsrv_dumpaclient(clp, &dumpp[cnt]);
999 		cnt++;
1000 		clp = LIST_NEXT(clp, lc_hash);
1001 	    }
1002 	    i++;
1003 	}
1004 	if (cnt < maxcnt)
1005 	    dumpp[cnt].ndcl_clid.nclid_idlen = 0;
1006 	NFSUNLOCKSTATE();
1007 	NFSLOCKV4ROOTMUTEX();
1008 	nfsv4_relref(&nfsv4rootfs_lock);
1009 	NFSUNLOCKV4ROOTMUTEX();
1010 }
1011 
1012 /*
1013  * Dump stats for a client. Must be called with the NFSSTATELOCK and spl'd.
1014  */
1015 static void
1016 nfsrv_dumpaclient(struct nfsclient *clp, struct nfsd_dumpclients *dumpp)
1017 {
1018 	struct nfsstate *stp, *openstp, *lckownstp;
1019 	struct nfslock *lop;
1020 	sa_family_t af;
1021 #ifdef INET
1022 	struct sockaddr_in *rin;
1023 #endif
1024 #ifdef INET6
1025 	struct sockaddr_in6 *rin6;
1026 #endif
1027 
1028 	dumpp->ndcl_nopenowners = dumpp->ndcl_nlockowners = 0;
1029 	dumpp->ndcl_nopens = dumpp->ndcl_nlocks = 0;
1030 	dumpp->ndcl_ndelegs = dumpp->ndcl_nolddelegs = 0;
1031 	dumpp->ndcl_flags = clp->lc_flags;
1032 	dumpp->ndcl_clid.nclid_idlen = clp->lc_idlen;
1033 	NFSBCOPY(clp->lc_id, dumpp->ndcl_clid.nclid_id, clp->lc_idlen);
1034 	af = clp->lc_req.nr_nam->sa_family;
1035 	dumpp->ndcl_addrfam = af;
1036 	switch (af) {
1037 #ifdef INET
1038 	case AF_INET:
1039 		rin = (struct sockaddr_in *)clp->lc_req.nr_nam;
1040 		dumpp->ndcl_cbaddr.sin_addr = rin->sin_addr;
1041 		break;
1042 #endif
1043 #ifdef INET6
1044 	case AF_INET6:
1045 		rin6 = (struct sockaddr_in6 *)clp->lc_req.nr_nam;
1046 		dumpp->ndcl_cbaddr.sin6_addr = rin6->sin6_addr;
1047 		break;
1048 #endif
1049 	}
1050 
1051 	/*
1052 	 * Now, scan the state lists and total up the opens and locks.
1053 	 */
1054 	LIST_FOREACH(stp, &clp->lc_open, ls_list) {
1055 	    dumpp->ndcl_nopenowners++;
1056 	    LIST_FOREACH(openstp, &stp->ls_open, ls_list) {
1057 		dumpp->ndcl_nopens++;
1058 		LIST_FOREACH(lckownstp, &openstp->ls_open, ls_list) {
1059 		    dumpp->ndcl_nlockowners++;
1060 		    LIST_FOREACH(lop, &lckownstp->ls_lock, lo_lckowner) {
1061 			dumpp->ndcl_nlocks++;
1062 		    }
1063 		}
1064 	    }
1065 	}
1066 
1067 	/*
1068 	 * and the delegation lists.
1069 	 */
1070 	LIST_FOREACH(stp, &clp->lc_deleg, ls_list) {
1071 	    dumpp->ndcl_ndelegs++;
1072 	}
1073 	LIST_FOREACH(stp, &clp->lc_olddeleg, ls_list) {
1074 	    dumpp->ndcl_nolddelegs++;
1075 	}
1076 }
1077 
1078 /*
1079  * Dump out lock stats for a file.
1080  */
1081 void
1082 nfsrv_dumplocks(vnode_t vp, struct nfsd_dumplocks *ldumpp, int maxcnt,
1083     NFSPROC_T *p)
1084 {
1085 	struct nfsstate *stp;
1086 	struct nfslock *lop;
1087 	int cnt = 0;
1088 	struct nfslockfile *lfp;
1089 	sa_family_t af;
1090 #ifdef INET
1091 	struct sockaddr_in *rin;
1092 #endif
1093 #ifdef INET6
1094 	struct sockaddr_in6 *rin6;
1095 #endif
1096 	int ret;
1097 	fhandle_t nfh;
1098 
1099 	ret = nfsrv_getlockfh(vp, 0, NULL, &nfh, p);
1100 	/*
1101 	 * First, get a reference on the nfsv4rootfs_lock so that an
1102 	 * exclusive lock on it cannot be acquired while dumping the locks.
1103 	 */
1104 	NFSLOCKV4ROOTMUTEX();
1105 	nfsv4_getref(&nfsv4rootfs_lock, NULL, NFSV4ROOTLOCKMUTEXPTR, NULL);
1106 	NFSUNLOCKV4ROOTMUTEX();
1107 	NFSLOCKSTATE();
1108 	if (!ret)
1109 		ret = nfsrv_getlockfile(0, NULL, &lfp, &nfh, 0);
1110 	if (ret) {
1111 		ldumpp[0].ndlck_clid.nclid_idlen = 0;
1112 		NFSUNLOCKSTATE();
1113 		NFSLOCKV4ROOTMUTEX();
1114 		nfsv4_relref(&nfsv4rootfs_lock);
1115 		NFSUNLOCKV4ROOTMUTEX();
1116 		return;
1117 	}
1118 
1119 	/*
1120 	 * For each open share on file, dump it out.
1121 	 */
1122 	stp = LIST_FIRST(&lfp->lf_open);
1123 	while (stp != LIST_END(&lfp->lf_open) && cnt < maxcnt) {
1124 		ldumpp[cnt].ndlck_flags = stp->ls_flags;
1125 		ldumpp[cnt].ndlck_stateid.seqid = stp->ls_stateid.seqid;
1126 		ldumpp[cnt].ndlck_stateid.other[0] = stp->ls_stateid.other[0];
1127 		ldumpp[cnt].ndlck_stateid.other[1] = stp->ls_stateid.other[1];
1128 		ldumpp[cnt].ndlck_stateid.other[2] = stp->ls_stateid.other[2];
1129 		ldumpp[cnt].ndlck_owner.nclid_idlen =
1130 		    stp->ls_openowner->ls_ownerlen;
1131 		NFSBCOPY(stp->ls_openowner->ls_owner,
1132 		    ldumpp[cnt].ndlck_owner.nclid_id,
1133 		    stp->ls_openowner->ls_ownerlen);
1134 		ldumpp[cnt].ndlck_clid.nclid_idlen = stp->ls_clp->lc_idlen;
1135 		NFSBCOPY(stp->ls_clp->lc_id, ldumpp[cnt].ndlck_clid.nclid_id,
1136 		    stp->ls_clp->lc_idlen);
1137 		af = stp->ls_clp->lc_req.nr_nam->sa_family;
1138 		ldumpp[cnt].ndlck_addrfam = af;
1139 		switch (af) {
1140 #ifdef INET
1141 		case AF_INET:
1142 			rin = (struct sockaddr_in *)stp->ls_clp->lc_req.nr_nam;
1143 			ldumpp[cnt].ndlck_cbaddr.sin_addr = rin->sin_addr;
1144 			break;
1145 #endif
1146 #ifdef INET6
1147 		case AF_INET6:
1148 			rin6 = (struct sockaddr_in6 *)
1149 			    stp->ls_clp->lc_req.nr_nam;
1150 			ldumpp[cnt].ndlck_cbaddr.sin6_addr = rin6->sin6_addr;
1151 			break;
1152 #endif
1153 		}
1154 		stp = LIST_NEXT(stp, ls_file);
1155 		cnt++;
1156 	}
1157 
1158 	/*
1159 	 * and all locks.
1160 	 */
1161 	lop = LIST_FIRST(&lfp->lf_lock);
1162 	while (lop != LIST_END(&lfp->lf_lock) && cnt < maxcnt) {
1163 		stp = lop->lo_stp;
1164 		ldumpp[cnt].ndlck_flags = lop->lo_flags;
1165 		ldumpp[cnt].ndlck_first = lop->lo_first;
1166 		ldumpp[cnt].ndlck_end = lop->lo_end;
1167 		ldumpp[cnt].ndlck_stateid.seqid = stp->ls_stateid.seqid;
1168 		ldumpp[cnt].ndlck_stateid.other[0] = stp->ls_stateid.other[0];
1169 		ldumpp[cnt].ndlck_stateid.other[1] = stp->ls_stateid.other[1];
1170 		ldumpp[cnt].ndlck_stateid.other[2] = stp->ls_stateid.other[2];
1171 		ldumpp[cnt].ndlck_owner.nclid_idlen = stp->ls_ownerlen;
1172 		NFSBCOPY(stp->ls_owner, ldumpp[cnt].ndlck_owner.nclid_id,
1173 		    stp->ls_ownerlen);
1174 		ldumpp[cnt].ndlck_clid.nclid_idlen = stp->ls_clp->lc_idlen;
1175 		NFSBCOPY(stp->ls_clp->lc_id, ldumpp[cnt].ndlck_clid.nclid_id,
1176 		    stp->ls_clp->lc_idlen);
1177 		af = stp->ls_clp->lc_req.nr_nam->sa_family;
1178 		ldumpp[cnt].ndlck_addrfam = af;
1179 		switch (af) {
1180 #ifdef INET
1181 		case AF_INET:
1182 			rin = (struct sockaddr_in *)stp->ls_clp->lc_req.nr_nam;
1183 			ldumpp[cnt].ndlck_cbaddr.sin_addr = rin->sin_addr;
1184 			break;
1185 #endif
1186 #ifdef INET6
1187 		case AF_INET6:
1188 			rin6 = (struct sockaddr_in6 *)
1189 			    stp->ls_clp->lc_req.nr_nam;
1190 			ldumpp[cnt].ndlck_cbaddr.sin6_addr = rin6->sin6_addr;
1191 			break;
1192 #endif
1193 		}
1194 		lop = LIST_NEXT(lop, lo_lckfile);
1195 		cnt++;
1196 	}
1197 
1198 	/*
1199 	 * and the delegations.
1200 	 */
1201 	stp = LIST_FIRST(&lfp->lf_deleg);
1202 	while (stp != LIST_END(&lfp->lf_deleg) && cnt < maxcnt) {
1203 		ldumpp[cnt].ndlck_flags = stp->ls_flags;
1204 		ldumpp[cnt].ndlck_stateid.seqid = stp->ls_stateid.seqid;
1205 		ldumpp[cnt].ndlck_stateid.other[0] = stp->ls_stateid.other[0];
1206 		ldumpp[cnt].ndlck_stateid.other[1] = stp->ls_stateid.other[1];
1207 		ldumpp[cnt].ndlck_stateid.other[2] = stp->ls_stateid.other[2];
1208 		ldumpp[cnt].ndlck_owner.nclid_idlen = 0;
1209 		ldumpp[cnt].ndlck_clid.nclid_idlen = stp->ls_clp->lc_idlen;
1210 		NFSBCOPY(stp->ls_clp->lc_id, ldumpp[cnt].ndlck_clid.nclid_id,
1211 		    stp->ls_clp->lc_idlen);
1212 		af = stp->ls_clp->lc_req.nr_nam->sa_family;
1213 		ldumpp[cnt].ndlck_addrfam = af;
1214 		switch (af) {
1215 #ifdef INET
1216 		case AF_INET:
1217 			rin = (struct sockaddr_in *)stp->ls_clp->lc_req.nr_nam;
1218 			ldumpp[cnt].ndlck_cbaddr.sin_addr = rin->sin_addr;
1219 			break;
1220 #endif
1221 #ifdef INET6
1222 		case AF_INET6:
1223 			rin6 = (struct sockaddr_in6 *)
1224 			    stp->ls_clp->lc_req.nr_nam;
1225 			ldumpp[cnt].ndlck_cbaddr.sin6_addr = rin6->sin6_addr;
1226 			break;
1227 #endif
1228 		}
1229 		stp = LIST_NEXT(stp, ls_file);
1230 		cnt++;
1231 	}
1232 
1233 	/*
1234 	 * If list isn't full, mark end of list by setting the client name
1235 	 * to zero length.
1236 	 */
1237 	if (cnt < maxcnt)
1238 		ldumpp[cnt].ndlck_clid.nclid_idlen = 0;
1239 	NFSUNLOCKSTATE();
1240 	NFSLOCKV4ROOTMUTEX();
1241 	nfsv4_relref(&nfsv4rootfs_lock);
1242 	NFSUNLOCKV4ROOTMUTEX();
1243 }
1244 
1245 /*
1246  * Server timer routine. It can scan any linked list, so long
1247  * as it holds the spin/mutex lock and there is no exclusive lock on
1248  * nfsv4rootfs_lock.
1249  * (For OpenBSD, a kthread is ok. For FreeBSD, I think it is ok
1250  *  to do this from a callout, since the spin locks work. For
1251  *  Darwin, I'm not sure what will work correctly yet.)
1252  * Should be called once per second.
1253  */
1254 void
1255 nfsrv_servertimer(void *arg __unused)
1256 {
1257 	struct nfsclient *clp, *nclp;
1258 	struct nfsstate *stp, *nstp;
1259 	int got_ref, i;
1260 
1261 	/*
1262 	 * Make sure nfsboottime is set. This is used by V3 as well
1263 	 * as V4. Note that nfsboottime is not nfsrvboottime, which is
1264 	 * only used by the V4 server for leases.
1265 	 */
1266 	if (nfsboottime.tv_sec == 0)
1267 		NFSSETBOOTTIME(nfsboottime);
1268 
1269 	/*
1270 	 * If server hasn't started yet, just return.
1271 	 */
1272 	NFSLOCKSTATE();
1273 	if (NFSD_VNET(nfsrv_stablefirst).nsf_eograce == 0) {
1274 		NFSUNLOCKSTATE();
1275 		return;
1276 	}
1277 	if (!(NFSD_VNET(nfsrv_stablefirst).nsf_flags & NFSNSF_UPDATEDONE)) {
1278 		if (!(NFSD_VNET(nfsrv_stablefirst).nsf_flags &
1279 		      NFSNSF_GRACEOVER) &&
1280 		    NFSD_MONOSEC > NFSD_VNET(nfsrv_stablefirst).nsf_eograce)
1281 			NFSD_VNET(nfsrv_stablefirst).nsf_flags |=
1282 			    (NFSNSF_GRACEOVER | NFSNSF_NEEDLOCK);
1283 		NFSUNLOCKSTATE();
1284 		return;
1285 	}
1286 
1287 	/*
1288 	 * Try and get a reference count on the nfsv4rootfs_lock so that
1289 	 * no nfsd thread can acquire an exclusive lock on it before this
1290 	 * call is done. If it is already exclusively locked, just return.
1291 	 */
1292 	NFSLOCKV4ROOTMUTEX();
1293 	got_ref = nfsv4_getref_nonblock(&nfsv4rootfs_lock);
1294 	NFSUNLOCKV4ROOTMUTEX();
1295 	if (got_ref == 0) {
1296 		NFSUNLOCKSTATE();
1297 		return;
1298 	}
1299 
1300 	/*
1301 	 * For each client...
1302 	 */
1303 	for (i = 0; i < nfsrv_clienthashsize; i++) {
1304 	    clp = LIST_FIRST(&NFSD_VNET(nfsclienthash)[i]);
1305 	    while (clp != LIST_END(&NFSD_VNET(nfsclienthash)[i])) {
1306 		nclp = LIST_NEXT(clp, lc_hash);
1307 		if (!(clp->lc_flags & LCL_EXPIREIT)) {
1308 		    if (((clp->lc_expiry + NFSRV_STALELEASE) < NFSD_MONOSEC
1309 			 && ((LIST_EMPTY(&clp->lc_deleg)
1310 			      && LIST_EMPTY(&clp->lc_open)) ||
1311 			     nfsrv_clients > nfsrv_clienthighwater)) ||
1312 			(clp->lc_expiry + NFSRV_MOULDYLEASE) < NFSD_MONOSEC ||
1313 			(clp->lc_expiry < NFSD_MONOSEC &&
1314 			 (nfsrv_openpluslock * 10 / 9) > nfsrv_v4statelimit)) {
1315 			/*
1316 			 * Lease has expired several nfsrv_lease times ago:
1317 			 * PLUS
1318 			 *    - no state is associated with it
1319 			 *    OR
1320 			 *    - above high water mark for number of clients
1321 			 *      (nfsrv_clienthighwater should be large enough
1322 			 *       that this only occurs when clients fail to
1323 			 *       use the same nfs_client_id4.id. Maybe somewhat
1324 			 *       higher that the maximum number of clients that
1325 			 *       will mount this server?)
1326 			 * OR
1327 			 * Lease has expired a very long time ago
1328 			 * OR
1329 			 * Lease has expired PLUS the number of opens + locks
1330 			 * has exceeded 90% of capacity
1331 			 *
1332 			 * --> Mark for expiry. The actual expiry will be done
1333 			 *     by an nfsd sometime soon.
1334 			 */
1335 			clp->lc_flags |= LCL_EXPIREIT;
1336 			NFSD_VNET(nfsrv_stablefirst).nsf_flags |=
1337 			    (NFSNSF_NEEDLOCK | NFSNSF_EXPIREDCLIENT);
1338 		    } else {
1339 			/*
1340 			 * If there are no opens, increment no open tick cnt
1341 			 * If time exceeds NFSNOOPEN, mark it to be thrown away
1342 			 * otherwise, if there is an open, reset no open time
1343 			 * Hopefully, this will avoid excessive re-creation
1344 			 * of open owners and subsequent open confirms.
1345 			 */
1346 			stp = LIST_FIRST(&clp->lc_open);
1347 			while (stp != LIST_END(&clp->lc_open)) {
1348 				nstp = LIST_NEXT(stp, ls_list);
1349 				if (LIST_EMPTY(&stp->ls_open)) {
1350 					stp->ls_noopens++;
1351 					if (stp->ls_noopens > NFSNOOPEN ||
1352 					    (nfsrv_openpluslock * 2) >
1353 					    nfsrv_v4statelimit)
1354 						NFSD_VNET(nfsrv_stablefirst).nsf_flags |=
1355 							NFSNSF_NOOPENS;
1356 				} else {
1357 					stp->ls_noopens = 0;
1358 				}
1359 				stp = nstp;
1360 			}
1361 		    }
1362 		}
1363 		clp = nclp;
1364 	    }
1365 	}
1366 	NFSUNLOCKSTATE();
1367 	NFSLOCKV4ROOTMUTEX();
1368 	nfsv4_relref(&nfsv4rootfs_lock);
1369 	NFSUNLOCKV4ROOTMUTEX();
1370 }
1371 
1372 /*
1373  * The following set of functions free up the various data structures.
1374  */
1375 /*
1376  * Clear out all open/lock state related to this nfsclient.
1377  * Caller must hold an exclusive lock on nfsv4rootfs_lock, so that
1378  * there are no other active nfsd threads.
1379  */
1380 void
1381 nfsrv_cleanclient(struct nfsclient *clp, NFSPROC_T *p)
1382 {
1383 	struct nfsstate *stp, *nstp;
1384 	struct nfsdsession *sep, *nsep;
1385 
1386 	LIST_FOREACH_SAFE(stp, &clp->lc_open, ls_list, nstp)
1387 		nfsrv_freeopenowner(stp, 1, p);
1388 	if ((clp->lc_flags & LCL_ADMINREVOKED) == 0)
1389 		LIST_FOREACH_SAFE(sep, &clp->lc_session, sess_list, nsep)
1390 			(void)nfsrv_freesession(NULL, sep, NULL);
1391 }
1392 
1393 /*
1394  * Free a client that has been cleaned. It should also already have been
1395  * removed from the lists.
1396  * (Just to be safe w.r.t. newnfs_disconnect(), call this function when
1397  *  softclock interrupts are enabled.)
1398  */
1399 void
1400 nfsrv_zapclient(struct nfsclient *clp, NFSPROC_T *p)
1401 {
1402 
1403 #ifdef notyet
1404 	if ((clp->lc_flags & (LCL_GSS | LCL_CALLBACKSON)) ==
1405 	     (LCL_GSS | LCL_CALLBACKSON) &&
1406 	    (clp->lc_hand.nfsh_flag & NFSG_COMPLETE) &&
1407 	    clp->lc_handlelen > 0) {
1408 		clp->lc_hand.nfsh_flag &= ~NFSG_COMPLETE;
1409 		clp->lc_hand.nfsh_flag |= NFSG_DESTROYED;
1410 		(void) nfsrv_docallback(clp, NFSV4PROC_CBNULL,
1411 			NULL, 0, NULL, NULL, NULL, 0, p);
1412 	}
1413 #endif
1414 	newnfs_disconnect(NULL, &clp->lc_req);
1415 	free(clp->lc_req.nr_nam, M_SONAME);
1416 	NFSFREEMUTEX(&clp->lc_req.nr_mtx);
1417 	free(clp->lc_stateid, M_NFSDCLIENT);
1418 	free(clp, M_NFSDCLIENT);
1419 	NFSLOCKSTATE();
1420 	NFSD_VNET(nfsstatsv1_p)->srvclients--;
1421 	nfsrv_openpluslock--;
1422 	nfsrv_clients--;
1423 	NFSUNLOCKSTATE();
1424 }
1425 
/*
 * Free a list of delegation state structures.
 * (This function will also free all nfslockfile structures that no
 *  longer have associated state.)
 * The list head is reinitialized, so it is empty on return.
 */
void
nfsrv_freedeleglist(struct nfsstatehead *sthp)
{
	struct nfsstate *delegp;

	/*
	 * nfsrv_freedeleg() unlinks the entry from this list (ls_list),
	 * so keep taking the first entry until none are left.
	 */
	while (!LIST_EMPTY(sthp)) {
		delegp = LIST_FIRST(sthp);
		nfsrv_freedeleg(delegp);
	}
	LIST_INIT(sthp);
}
1441 
1442 /*
1443  * Free up a delegation.
1444  */
1445 static void
1446 nfsrv_freedeleg(struct nfsstate *stp)
1447 {
1448 	struct nfslockfile *lfp;
1449 
1450 	LIST_REMOVE(stp, ls_hash);
1451 	LIST_REMOVE(stp, ls_list);
1452 	LIST_REMOVE(stp, ls_file);
1453 	if ((stp->ls_flags & NFSLCK_DELEGWRITE) != 0)
1454 		nfsrv_writedelegcnt--;
1455 	lfp = stp->ls_lfp;
1456 	if (LIST_EMPTY(&lfp->lf_open) &&
1457 	    LIST_EMPTY(&lfp->lf_lock) && LIST_EMPTY(&lfp->lf_deleg) &&
1458 	    LIST_EMPTY(&lfp->lf_locallock) && LIST_EMPTY(&lfp->lf_rollback) &&
1459 	    lfp->lf_usecount == 0 &&
1460 	    nfsv4_testlock(&lfp->lf_locallock_lck) == 0)
1461 		nfsrv_freenfslockfile(lfp);
1462 	free(stp, M_NFSDSTATE);
1463 	NFSD_VNET(nfsstatsv1_p)->srvdelegates--;
1464 	nfsrv_openpluslock--;
1465 	nfsrv_delegatecnt--;
1466 }
1467 
1468 /*
1469  * This function frees an open owner and all associated opens.
1470  */
1471 static void
1472 nfsrv_freeopenowner(struct nfsstate *stp, int cansleep, NFSPROC_T *p)
1473 {
1474 	struct nfsstate *nstp, *tstp;
1475 
1476 	LIST_REMOVE(stp, ls_list);
1477 	/*
1478 	 * Now, free all associated opens.
1479 	 */
1480 	nstp = LIST_FIRST(&stp->ls_open);
1481 	while (nstp != LIST_END(&stp->ls_open)) {
1482 		tstp = nstp;
1483 		nstp = LIST_NEXT(nstp, ls_list);
1484 		(void) nfsrv_freeopen(tstp, NULL, cansleep, p);
1485 	}
1486 	if (stp->ls_op)
1487 		nfsrvd_derefcache(stp->ls_op);
1488 	free(stp, M_NFSDSTATE);
1489 	NFSD_VNET(nfsstatsv1_p)->srvopenowners--;
1490 	nfsrv_openpluslock--;
1491 }
1492 
1493 /*
1494  * This function frees an open (nfsstate open structure) with all associated
1495  * lock_owners and locks. It also frees the nfslockfile structure iff there
1496  * are no other opens on the file.
1497  * Returns 1 if it free'd the nfslockfile, 0 otherwise.
1498  */
1499 static int
1500 nfsrv_freeopen(struct nfsstate *stp, vnode_t vp, int cansleep, NFSPROC_T *p)
1501 {
1502 	struct nfsstate *nstp, *tstp;
1503 	struct nfslockfile *lfp;
1504 	int ret;
1505 
1506 	LIST_REMOVE(stp, ls_hash);
1507 	LIST_REMOVE(stp, ls_list);
1508 	LIST_REMOVE(stp, ls_file);
1509 
1510 	lfp = stp->ls_lfp;
1511 	/*
1512 	 * Now, free all lockowners associated with this open.
1513 	 */
1514 	LIST_FOREACH_SAFE(tstp, &stp->ls_open, ls_list, nstp)
1515 		nfsrv_freelockowner(tstp, vp, cansleep, p);
1516 
1517 	/*
1518 	 * The nfslockfile is freed here if there are no locks
1519 	 * associated with the open.
1520 	 * If there are locks associated with the open, the
1521 	 * nfslockfile structure can be freed via nfsrv_freelockowner().
1522 	 * Acquire the state mutex to avoid races with calls to
1523 	 * nfsrv_getlockfile().
1524 	 */
1525 	if (cansleep != 0)
1526 		NFSLOCKSTATE();
1527 	if (lfp != NULL && LIST_EMPTY(&lfp->lf_open) &&
1528 	    LIST_EMPTY(&lfp->lf_deleg) && LIST_EMPTY(&lfp->lf_lock) &&
1529 	    LIST_EMPTY(&lfp->lf_locallock) && LIST_EMPTY(&lfp->lf_rollback) &&
1530 	    lfp->lf_usecount == 0 &&
1531 	    (cansleep != 0 || nfsv4_testlock(&lfp->lf_locallock_lck) == 0)) {
1532 		nfsrv_freenfslockfile(lfp);
1533 		ret = 1;
1534 	} else
1535 		ret = 0;
1536 	if (cansleep != 0)
1537 		NFSUNLOCKSTATE();
1538 	free(stp, M_NFSDSTATE);
1539 	NFSD_VNET(nfsstatsv1_p)->srvopens--;
1540 	nfsrv_openpluslock--;
1541 	return (ret);
1542 }
1543 
1544 /*
1545  * Frees a lockowner and all associated locks.
1546  */
1547 static void
1548 nfsrv_freelockowner(struct nfsstate *stp, vnode_t vp, int cansleep,
1549     NFSPROC_T *p)
1550 {
1551 
1552 	LIST_REMOVE(stp, ls_hash);
1553 	LIST_REMOVE(stp, ls_list);
1554 	nfsrv_freeallnfslocks(stp, vp, cansleep, p);
1555 	if (stp->ls_op)
1556 		nfsrvd_derefcache(stp->ls_op);
1557 	free(stp, M_NFSDSTATE);
1558 	NFSD_VNET(nfsstatsv1_p)->srvlockowners--;
1559 	nfsrv_openpluslock--;
1560 }
1561 
1562 /*
1563  * Free all the nfs locks on a lockowner.
1564  */
1565 static void
1566 nfsrv_freeallnfslocks(struct nfsstate *stp, vnode_t vp, int cansleep,
1567     NFSPROC_T *p)
1568 {
1569 	struct nfslock *lop, *nlop;
1570 	struct nfsrollback *rlp, *nrlp;
1571 	struct nfslockfile *lfp = NULL;
1572 	int gottvp = 0;
1573 	vnode_t tvp = NULL;
1574 	uint64_t first, end;
1575 
1576 	if (vp != NULL)
1577 		ASSERT_VOP_UNLOCKED(vp, "nfsrv_freeallnfslocks: vnode locked");
1578 	lop = LIST_FIRST(&stp->ls_lock);
1579 	while (lop != LIST_END(&stp->ls_lock)) {
1580 		nlop = LIST_NEXT(lop, lo_lckowner);
1581 		/*
1582 		 * Since all locks should be for the same file, lfp should
1583 		 * not change.
1584 		 */
1585 		if (lfp == NULL)
1586 			lfp = lop->lo_lfp;
1587 		else if (lfp != lop->lo_lfp)
1588 			panic("allnfslocks");
1589 		/*
1590 		 * If vp is NULL and cansleep != 0, a vnode must be acquired
1591 		 * from the file handle. This only occurs when called from
1592 		 * nfsrv_cleanclient().
1593 		 */
1594 		if (gottvp == 0) {
1595 			if (nfsrv_dolocallocks == 0)
1596 				tvp = NULL;
1597 			else if (vp == NULL && cansleep != 0) {
1598 				tvp = nfsvno_getvp(&lfp->lf_fh);
1599 				if (tvp != NULL)
1600 					NFSVOPUNLOCK(tvp);
1601 			} else
1602 				tvp = vp;
1603 			gottvp = 1;
1604 		}
1605 
1606 		if (tvp != NULL) {
1607 			if (cansleep == 0)
1608 				panic("allnfs2");
1609 			first = lop->lo_first;
1610 			end = lop->lo_end;
1611 			nfsrv_freenfslock(lop);
1612 			nfsrv_localunlock(tvp, lfp, first, end, p);
1613 			LIST_FOREACH_SAFE(rlp, &lfp->lf_rollback, rlck_list,
1614 			    nrlp)
1615 				free(rlp, M_NFSDROLLBACK);
1616 			LIST_INIT(&lfp->lf_rollback);
1617 		} else
1618 			nfsrv_freenfslock(lop);
1619 		lop = nlop;
1620 	}
1621 	if (vp == NULL && tvp != NULL)
1622 		vrele(tvp);
1623 }
1624 
1625 /*
1626  * Free an nfslock structure.
1627  */
1628 static void
1629 nfsrv_freenfslock(struct nfslock *lop)
1630 {
1631 
1632 	if (lop->lo_lckfile.le_prev != NULL) {
1633 		LIST_REMOVE(lop, lo_lckfile);
1634 		NFSD_VNET(nfsstatsv1_p)->srvlocks--;
1635 		nfsrv_openpluslock--;
1636 	}
1637 	LIST_REMOVE(lop, lo_lckowner);
1638 	free(lop, M_NFSDLOCK);
1639 }
1640 
/*
 * This function frees an nfslockfile structure.
 * The structure is unlinked from the list it is on via lf_hash and its
 * memory is released.  The callers in this file check that it no longer
 * has any associated state before calling this.
 */
static void
nfsrv_freenfslockfile(struct nfslockfile *lfp)
{

	LIST_REMOVE(lfp, lf_hash);
	free(lfp, M_NFSDLOCKFILE);
}
1651 
1652 /*
1653  * This function looks up an nfsstate structure via stateid.
1654  */
1655 static int
1656 nfsrv_getstate(struct nfsclient *clp, nfsv4stateid_t *stateidp, __unused u_int32_t flags,
1657     struct nfsstate **stpp)
1658 {
1659 	struct nfsstate *stp;
1660 	struct nfsstatehead *hp;
1661 	int error = 0;
1662 
1663 	*stpp = NULL;
1664 	hp = NFSSTATEHASH(clp, *stateidp);
1665 	LIST_FOREACH(stp, hp, ls_hash) {
1666 		if (!NFSBCMP(stp->ls_stateid.other, stateidp->other,
1667 			NFSX_STATEIDOTHER))
1668 			break;
1669 	}
1670 
1671 	/*
1672 	 * If no state id in list, return NFSERR_BADSTATEID.
1673 	 */
1674 	if (stp == LIST_END(hp)) {
1675 		error = NFSERR_BADSTATEID;
1676 		goto out;
1677 	}
1678 	*stpp = stp;
1679 
1680 out:
1681 	NFSEXITCODE(error);
1682 	return (error);
1683 }
1684 
1685 /*
1686  * This function gets an nfsstate structure via owner string.
1687  */
1688 static void
1689 nfsrv_getowner(struct nfsstatehead *hp, struct nfsstate *new_stp,
1690     struct nfsstate **stpp)
1691 {
1692 	struct nfsstate *stp;
1693 
1694 	*stpp = NULL;
1695 	LIST_FOREACH(stp, hp, ls_list) {
1696 		if (new_stp->ls_ownerlen == stp->ls_ownerlen &&
1697 		  !NFSBCMP(new_stp->ls_owner,stp->ls_owner,stp->ls_ownerlen)) {
1698 			*stpp = stp;
1699 			return;
1700 		}
1701 	}
1702 }
1703 
1704 /*
1705  * Lock control function called to update lock status.
1706  * Returns 0 upon success, -1 if there is no lock and the flags indicate
1707  * that one isn't to be created and an NFSERR_xxx for other errors.
1708  * The structures new_stp and new_lop are passed in as pointers that should
1709  * be set to NULL if the structure is used and shouldn't be free'd.
1710  * For the NFSLCK_TEST and NFSLCK_CHECK cases, the structures are
1711  * never used and can safely be allocated on the stack. For all other
1712  * cases, *new_stpp and *new_lopp should be malloc'd before the call,
1713  * in case they are used.
1714  */
1715 int
1716 nfsrv_lockctrl(vnode_t vp, struct nfsstate **new_stpp,
1717     struct nfslock **new_lopp, struct nfslockconflict *cfp,
1718     nfsquad_t clientid, nfsv4stateid_t *stateidp,
1719     __unused struct nfsexstuff *exp,
1720     struct nfsrv_descript *nd, NFSPROC_T *p)
1721 {
1722 	struct nfslock *lop;
1723 	struct nfsstate *new_stp = *new_stpp;
1724 	struct nfslock *new_lop = *new_lopp;
1725 	struct nfsstate *tstp, *mystp, *nstp;
1726 	int specialid = 0;
1727 	struct nfslockfile *lfp;
1728 	struct nfslock *other_lop = NULL;
1729 	struct nfsstate *stp, *lckstp = NULL;
1730 	struct nfsclient *clp = NULL;
1731 	u_int32_t bits;
1732 	int error = 0, haslock = 0, ret, reterr;
1733 	int getlckret, delegation = 0, filestruct_locked, vnode_unlocked = 0;
1734 	fhandle_t nfh;
1735 	uint64_t first, end;
1736 	uint32_t lock_flags;
1737 
1738 	if (new_stp->ls_flags & (NFSLCK_CHECK | NFSLCK_SETATTR)) {
1739 		/*
1740 		 * Note the special cases of "all 1s" or "all 0s" stateids and
1741 		 * let reads with all 1s go ahead.
1742 		 */
1743 		if (new_stp->ls_stateid.seqid == 0x0 &&
1744 		    new_stp->ls_stateid.other[0] == 0x0 &&
1745 		    new_stp->ls_stateid.other[1] == 0x0 &&
1746 		    new_stp->ls_stateid.other[2] == 0x0)
1747 			specialid = 1;
1748 		else if (new_stp->ls_stateid.seqid == 0xffffffff &&
1749 		    new_stp->ls_stateid.other[0] == 0xffffffff &&
1750 		    new_stp->ls_stateid.other[1] == 0xffffffff &&
1751 		    new_stp->ls_stateid.other[2] == 0xffffffff)
1752 			specialid = 2;
1753 	}
1754 
1755 	/*
1756 	 * Check for restart conditions (client and server).
1757 	 */
1758 	error = nfsrv_checkrestart(clientid, new_stp->ls_flags,
1759 	    &new_stp->ls_stateid, specialid);
1760 	if (error)
1761 		goto out;
1762 
1763 	/*
1764 	 * Check for state resource limit exceeded.
1765 	 */
1766 	if ((new_stp->ls_flags & NFSLCK_LOCK) &&
1767 	    nfsrv_openpluslock > nfsrv_v4statelimit) {
1768 		error = NFSERR_RESOURCE;
1769 		goto out;
1770 	}
1771 
1772 	/*
1773 	 * For the lock case, get another nfslock structure,
1774 	 * just in case we need it.
1775 	 * Malloc now, before we start sifting through the linked lists,
1776 	 * in case we have to wait for memory.
1777 	 */
1778 tryagain:
1779 	if (new_stp->ls_flags & NFSLCK_LOCK)
1780 		other_lop = malloc(sizeof (struct nfslock),
1781 		    M_NFSDLOCK, M_WAITOK);
1782 	filestruct_locked = 0;
1783 	reterr = 0;
1784 	lfp = NULL;
1785 
1786 	/*
1787 	 * Get the lockfile structure for CFH now, so we can do a sanity
1788 	 * check against the stateid, before incrementing the seqid#, since
1789 	 * we want to return NFSERR_BADSTATEID on failure and the seqid#
1790 	 * shouldn't be incremented for this case.
1791 	 * If nfsrv_getlockfile() returns -1, it means "not found", which
1792 	 * will be handled later.
1793 	 * If we are doing Lock/LockU and local locking is enabled, sleep
1794 	 * lock the nfslockfile structure.
1795 	 */
1796 	getlckret = nfsrv_getlockfh(vp, new_stp->ls_flags, NULL, &nfh, p);
1797 	NFSLOCKSTATE();
1798 	if (getlckret == 0) {
1799 		if ((new_stp->ls_flags & (NFSLCK_LOCK | NFSLCK_UNLOCK)) != 0 &&
1800 		    nfsrv_dolocallocks != 0 && nd->nd_repstat == 0) {
1801 			getlckret = nfsrv_getlockfile(new_stp->ls_flags, NULL,
1802 			    &lfp, &nfh, 1);
1803 			if (getlckret == 0)
1804 				filestruct_locked = 1;
1805 		} else
1806 			getlckret = nfsrv_getlockfile(new_stp->ls_flags, NULL,
1807 			    &lfp, &nfh, 0);
1808 	}
1809 	if (getlckret != 0 && getlckret != -1)
1810 		reterr = getlckret;
1811 
1812 	if (filestruct_locked != 0) {
1813 		LIST_INIT(&lfp->lf_rollback);
1814 		if ((new_stp->ls_flags & NFSLCK_LOCK)) {
1815 			/*
1816 			 * For local locking, do the advisory locking now, so
1817 			 * that any conflict can be detected. A failure later
1818 			 * can be rolled back locally. If an error is returned,
1819 			 * struct nfslockfile has been unlocked and any local
1820 			 * locking rolled back.
1821 			 */
1822 			NFSUNLOCKSTATE();
1823 			if (vnode_unlocked == 0) {
1824 				ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl1");
1825 				vnode_unlocked = 1;
1826 				NFSVOPUNLOCK(vp);
1827 			}
1828 			reterr = nfsrv_locallock(vp, lfp,
1829 			    (new_lop->lo_flags & (NFSLCK_READ | NFSLCK_WRITE)),
1830 			    new_lop->lo_first, new_lop->lo_end, cfp, p);
1831 			NFSLOCKSTATE();
1832 		}
1833 	}
1834 
1835 	if (specialid == 0) {
1836 	    if (new_stp->ls_flags & NFSLCK_TEST) {
1837 		/*
1838 		 * RFC 3530 does not list LockT as an op that renews a
1839 		 * lease, but the consensus seems to be that it is ok
1840 		 * for a server to do so.
1841 		 */
1842 		error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
1843 		    (nfsquad_t)((u_quad_t)0), 0, nd, p);
1844 
1845 		/*
1846 		 * Since NFSERR_EXPIRED, NFSERR_ADMINREVOKED are not valid
1847 		 * error returns for LockT, just go ahead and test for a lock,
1848 		 * since there are no locks for this client, but other locks
1849 		 * can conflict. (ie. same client will always be false)
1850 		 */
1851 		if (error == NFSERR_EXPIRED || error == NFSERR_ADMINREVOKED)
1852 		    error = 0;
1853 		lckstp = new_stp;
1854 	    } else {
1855 	      error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
1856 		(nfsquad_t)((u_quad_t)0), 0, nd, p);
1857 	      if (error == 0)
1858 		/*
1859 		 * Look up the stateid
1860 		 */
1861 		error = nfsrv_getstate(clp, &new_stp->ls_stateid,
1862 		  new_stp->ls_flags, &stp);
1863 	      /*
1864 	       * do some sanity checks for an unconfirmed open or a
1865 	       * stateid that refers to the wrong file, for an open stateid
1866 	       */
1867 	      if (error == 0 && (stp->ls_flags & NFSLCK_OPEN) &&
1868 		  ((stp->ls_openowner->ls_flags & NFSLCK_NEEDSCONFIRM) ||
1869 		   (getlckret == 0 && stp->ls_lfp != lfp))){
1870 		      /*
1871 		       * NFSLCK_SETATTR should return OK rather than NFSERR_BADSTATEID
1872 		       * The only exception is using SETATTR with SIZE.
1873 		       * */
1874                     if ((new_stp->ls_flags &
1875                          (NFSLCK_SETATTR | NFSLCK_CHECK)) != NFSLCK_SETATTR)
1876 			     error = NFSERR_BADSTATEID;
1877 	      }
1878 
1879 		if (error == 0 &&
1880 		  (stp->ls_flags & (NFSLCK_DELEGREAD | NFSLCK_DELEGWRITE)) &&
1881 		  getlckret == 0 && stp->ls_lfp != lfp)
1882 			error = NFSERR_BADSTATEID;
1883 
1884 	      /*
1885 	       * If the lockowner stateid doesn't refer to the same file,
1886 	       * I believe that is considered ok, since some clients will
1887 	       * only create a single lockowner and use that for all locks
1888 	       * on all files.
1889 	       * For now, log it as a diagnostic, instead of considering it
1890 	       * a BadStateid.
1891 	       */
1892 	      if (error == 0 && (stp->ls_flags &
1893 		  (NFSLCK_OPEN | NFSLCK_DELEGREAD | NFSLCK_DELEGWRITE)) == 0 &&
1894 		  getlckret == 0 && stp->ls_lfp != lfp) {
1895 #ifdef DIAGNOSTIC
1896 		  printf("Got a lock statid for different file open\n");
1897 #endif
1898 		  /*
1899 		  error = NFSERR_BADSTATEID;
1900 		  */
1901 	      }
1902 
1903 	      if (error == 0) {
1904 		    if (new_stp->ls_flags & NFSLCK_OPENTOLOCK) {
1905 			/*
1906 			 * If haslock set, we've already checked the seqid.
1907 			 */
1908 			if (!haslock) {
1909 			    if (stp->ls_flags & NFSLCK_OPEN)
1910 				error = nfsrv_checkseqid(nd, new_stp->ls_seq,
1911 				    stp->ls_openowner, new_stp->ls_op);
1912 			    else
1913 				error = NFSERR_BADSTATEID;
1914 			}
1915 			if (!error)
1916 			    nfsrv_getowner(&stp->ls_open, new_stp, &lckstp);
1917 			if (lckstp) {
1918 			    /*
1919 			     * For NFSv4.1 and NFSv4.2 allow an
1920 			     * open_to_lock_owner when the lock_owner already
1921 			     * exists.  Just clear NFSLCK_OPENTOLOCK so that
1922 			     * a new lock_owner will not be created.
1923 			     * RFC7530 states that the error for NFSv4.0
1924 			     * is NFS4ERR_BAD_SEQID.
1925 			     */
1926 			    if ((nd->nd_flag & ND_NFSV41) != 0)
1927 				new_stp->ls_flags &= ~NFSLCK_OPENTOLOCK;
1928 			    else
1929 				error = NFSERR_BADSEQID;
1930 			} else
1931 			    lckstp = new_stp;
1932 		    } else if (new_stp->ls_flags&(NFSLCK_LOCK|NFSLCK_UNLOCK)) {
1933 			/*
1934 			 * If haslock set, ditto above.
1935 			 */
1936 			if (!haslock) {
1937 			    if (stp->ls_flags & NFSLCK_OPEN)
1938 				error = NFSERR_BADSTATEID;
1939 			    else
1940 				error = nfsrv_checkseqid(nd, new_stp->ls_seq,
1941 				    stp, new_stp->ls_op);
1942 			}
1943 			lckstp = stp;
1944 		    } else {
1945 			lckstp = stp;
1946 		    }
1947 	      }
1948 	      /*
1949 	       * If the seqid part of the stateid isn't the same, return
1950 	       * NFSERR_OLDSTATEID for cases other than I/O Ops.
1951 	       * For I/O Ops, only return NFSERR_OLDSTATEID if
1952 	       * nfsrv_returnoldstateid is set. (The consensus on the email
1953 	       * list was that most clients would prefer to not receive
1954 	       * NFSERR_OLDSTATEID for I/O Ops, but the RFC suggests that that
1955 	       * is what will happen, so I use the nfsrv_returnoldstateid to
1956 	       * allow for either server configuration.)
1957 	       */
1958 	      if (!error && stp->ls_stateid.seqid!=new_stp->ls_stateid.seqid &&
1959 		  (((nd->nd_flag & ND_NFSV41) == 0 &&
1960 		   (!(new_stp->ls_flags & NFSLCK_CHECK) ||
1961 		    nfsrv_returnoldstateid)) ||
1962 		   ((nd->nd_flag & ND_NFSV41) != 0 &&
1963 		    new_stp->ls_stateid.seqid != 0)))
1964 		    error = NFSERR_OLDSTATEID;
1965 	    }
1966 	}
1967 
1968 	/*
1969 	 * Now we can check for grace.
1970 	 */
1971 	if (!error)
1972 		error = nfsrv_checkgrace(nd, clp, new_stp->ls_flags);
1973 	if ((new_stp->ls_flags & NFSLCK_RECLAIM) && !error &&
1974 		nfsrv_checkstable(clp))
1975 		error = NFSERR_NOGRACE;
1976 	/*
1977 	 * If we successfully Reclaimed state, note that.
1978 	 */
1979 	if ((new_stp->ls_flags & NFSLCK_RECLAIM) && !error)
1980 		nfsrv_markstable(clp);
1981 
1982 	/*
1983 	 * At this point, either error == NFSERR_BADSTATEID or the
1984 	 * seqid# has been updated, so we can return any error.
1985 	 * If error == 0, there may be an error in:
1986 	 *    nd_repstat - Set by the calling function.
1987 	 *    reterr - Set above, if getting the nfslockfile structure
1988 	 *       or acquiring the local lock failed.
1989 	 *    (If both of these are set, nd_repstat should probably be
1990 	 *     returned, since that error was detected before this
1991 	 *     function call.)
1992 	 */
1993 	if (error != 0 || nd->nd_repstat != 0 || reterr != 0) {
1994 		if (error == 0) {
1995 			if (nd->nd_repstat != 0)
1996 				error = nd->nd_repstat;
1997 			else
1998 				error = reterr;
1999 		}
2000 		if (filestruct_locked != 0) {
2001 			/* Roll back local locks. */
2002 			NFSUNLOCKSTATE();
2003 			if (vnode_unlocked == 0) {
2004 				ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl2");
2005 				vnode_unlocked = 1;
2006 				NFSVOPUNLOCK(vp);
2007 			}
2008 			nfsrv_locallock_rollback(vp, lfp, p);
2009 			NFSLOCKSTATE();
2010 			nfsrv_unlocklf(lfp);
2011 		}
2012 		NFSUNLOCKSTATE();
2013 		goto out;
2014 	}
2015 
2016 	/*
2017 	 * Check the nfsrv_getlockfile return.
2018 	 * Returned -1 if no structure found.
2019 	 */
2020 	if (getlckret == -1) {
2021 		error = NFSERR_EXPIRED;
2022 		/*
2023 		 * Called from lockt, so no lock is OK.
2024 		 */
2025 		if (new_stp->ls_flags & NFSLCK_TEST) {
2026 			error = 0;
2027 		} else if (new_stp->ls_flags &
2028 		    (NFSLCK_CHECK | NFSLCK_SETATTR)) {
2029 			/*
2030 			 * Called to check for a lock, OK if the stateid is all
2031 			 * 1s or all 0s, but there should be an nfsstate
2032 			 * otherwise.
2033 			 * (ie. If there is no open, I'll assume no share
2034 			 *  deny bits.)
2035 			 */
2036 			if (specialid)
2037 				error = 0;
2038 			else
2039 				error = NFSERR_BADSTATEID;
2040 		}
2041 		NFSUNLOCKSTATE();
2042 		goto out;
2043 	}
2044 
2045 	/*
2046 	 * For NFSLCK_CHECK and NFSLCK_LOCK, test for a share conflict.
2047 	 * For NFSLCK_CHECK, allow a read if write access is granted,
2048 	 * but check for a deny. For NFSLCK_LOCK, require correct access,
2049 	 * which implies a conflicting deny can't exist.
2050 	 */
2051 	if (new_stp->ls_flags & (NFSLCK_CHECK | NFSLCK_LOCK)) {
2052 	    /*
2053 	     * Four kinds of state id:
2054 	     * - specialid (all 0s or all 1s), only for NFSLCK_CHECK
2055 	     * - stateid for an open
2056 	     * - stateid for a delegation
2057 	     * - stateid for a lock owner
2058 	     */
2059 	    if (!specialid) {
2060 		if (stp->ls_flags & (NFSLCK_DELEGREAD | NFSLCK_DELEGWRITE)) {
2061 		    delegation = 1;
2062 		    mystp = stp;
2063 		    nfsrv_delaydelegtimeout(stp);
2064 	        } else if (stp->ls_flags & NFSLCK_OPEN) {
2065 		    mystp = stp;
2066 		} else {
2067 		    mystp = stp->ls_openstp;
2068 		}
2069 		/*
2070 		 * If locking or checking, require correct access
2071 		 * bit set.
2072 		 */
2073 		if (((new_stp->ls_flags & NFSLCK_LOCK) &&
2074 		     !((new_lop->lo_flags >> NFSLCK_LOCKSHIFT) &
2075 		       mystp->ls_flags & NFSLCK_ACCESSBITS)) ||
2076 		    ((new_stp->ls_flags & (NFSLCK_CHECK|NFSLCK_READACCESS)) ==
2077 		      (NFSLCK_CHECK | NFSLCK_READACCESS) &&
2078 		     !(mystp->ls_flags & NFSLCK_READACCESS) &&
2079 		     nfsrv_allowreadforwriteopen == 0) ||
2080 		    ((new_stp->ls_flags & (NFSLCK_CHECK|NFSLCK_WRITEACCESS)) ==
2081 		      (NFSLCK_CHECK | NFSLCK_WRITEACCESS) &&
2082 		     !(mystp->ls_flags & NFSLCK_WRITEACCESS))) {
2083 			if (filestruct_locked != 0) {
2084 				/* Roll back local locks. */
2085 				NFSUNLOCKSTATE();
2086 				if (vnode_unlocked == 0) {
2087 					ASSERT_VOP_ELOCKED(vp,
2088 					    "nfsrv_lockctrl3");
2089 					vnode_unlocked = 1;
2090 					NFSVOPUNLOCK(vp);
2091 				}
2092 				nfsrv_locallock_rollback(vp, lfp, p);
2093 				NFSLOCKSTATE();
2094 				nfsrv_unlocklf(lfp);
2095 			}
2096 			NFSUNLOCKSTATE();
2097 			error = NFSERR_OPENMODE;
2098 			goto out;
2099 		}
2100 	    } else
2101 		mystp = NULL;
2102 	    if ((new_stp->ls_flags & NFSLCK_CHECK) && !delegation) {
2103 		/*
2104 		 * Check for a conflicting deny bit.
2105 		 */
2106 		LIST_FOREACH(tstp, &lfp->lf_open, ls_file) {
2107 		    if (tstp != mystp) {
2108 			bits = tstp->ls_flags;
2109 			bits >>= NFSLCK_SHIFT;
2110 			if (new_stp->ls_flags & bits & NFSLCK_ACCESSBITS) {
2111 			    KASSERT(vnode_unlocked == 0,
2112 				("nfsrv_lockctrl: vnode unlocked1"));
2113 			    ret = nfsrv_clientconflict(tstp->ls_clp, &haslock,
2114 				vp, p);
2115 			    if (ret == 1) {
2116 				/*
2117 				* nfsrv_clientconflict unlocks state
2118 				 * when it returns non-zero.
2119 				 */
2120 				lckstp = NULL;
2121 				goto tryagain;
2122 			    }
2123 			    if (ret == 0)
2124 				NFSUNLOCKSTATE();
2125 			    if (ret == 2)
2126 				error = NFSERR_PERM;
2127 			    else
2128 				error = NFSERR_OPENMODE;
2129 			    goto out;
2130 			}
2131 		    }
2132 		}
2133 
2134 		/* We're outta here */
2135 		NFSUNLOCKSTATE();
2136 		goto out;
2137 	    }
2138 	}
2139 
2140 	/*
2141 	 * For setattr, just get rid of all the Delegations for other clients.
2142 	 */
2143 	if (new_stp->ls_flags & NFSLCK_SETATTR) {
2144 		KASSERT(vnode_unlocked == 0,
2145 		    ("nfsrv_lockctrl: vnode unlocked2"));
2146 		ret = nfsrv_cleandeleg(vp, lfp, clp, &haslock, p);
2147 		if (ret) {
2148 			/*
2149 			 * nfsrv_cleandeleg() unlocks state when it
2150 			 * returns non-zero.
2151 			 */
2152 			if (ret == -1) {
2153 				lckstp = NULL;
2154 				goto tryagain;
2155 			}
2156 			error = ret;
2157 			goto out;
2158 		}
2159 		if (!(new_stp->ls_flags & NFSLCK_CHECK) ||
2160 		    (LIST_EMPTY(&lfp->lf_open) && LIST_EMPTY(&lfp->lf_lock) &&
2161 		     LIST_EMPTY(&lfp->lf_deleg))) {
2162 			NFSUNLOCKSTATE();
2163 			goto out;
2164 		}
2165 	}
2166 
2167 	/*
2168 	 * Check for a conflicting delegation. If one is found, call
2169 	 * nfsrv_delegconflict() to handle it. If the v4root lock hasn't
2170 	 * been set yet, it will get the lock. Otherwise, it will recall
2171 	 * the delegation. Then, we try try again...
2172 	 * I currently believe the conflict algorithm to be:
2173 	 * For Lock Ops (Lock/LockT/LockU)
2174 	 * - there is a conflict iff a different client has a write delegation
2175 	 * For Reading (Read Op)
2176 	 * - there is a conflict iff a different client has a write delegation
2177 	 *   (the specialids are always a different client)
2178 	 * For Writing (Write/Setattr of size)
2179 	 * - there is a conflict if a different client has any delegation
2180 	 * - there is a conflict if the same client has a read delegation
2181 	 *   (I don't understand why this isn't allowed, but that seems to be
2182 	 *    the current consensus?)
2183 	 */
2184 	tstp = LIST_FIRST(&lfp->lf_deleg);
2185 	while (tstp != LIST_END(&lfp->lf_deleg)) {
2186 	    nstp = LIST_NEXT(tstp, ls_file);
2187 	    if ((((new_stp->ls_flags&(NFSLCK_LOCK|NFSLCK_UNLOCK|NFSLCK_TEST))||
2188 		 ((new_stp->ls_flags & NFSLCK_CHECK) &&
2189 		  (new_lop->lo_flags & NFSLCK_READ))) &&
2190 		  clp != tstp->ls_clp &&
2191 		 (tstp->ls_flags & NFSLCK_DELEGWRITE)) ||
2192 		 ((new_stp->ls_flags & NFSLCK_CHECK) &&
2193 		   (new_lop->lo_flags & NFSLCK_WRITE) &&
2194 		  (clp != tstp->ls_clp ||
2195 		   (tstp->ls_flags & NFSLCK_DELEGREAD)))) {
2196 		ret = 0;
2197 		if (filestruct_locked != 0) {
2198 			/* Roll back local locks. */
2199 			NFSUNLOCKSTATE();
2200 			if (vnode_unlocked == 0) {
2201 				ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl4");
2202 				NFSVOPUNLOCK(vp);
2203 			}
2204 			nfsrv_locallock_rollback(vp, lfp, p);
2205 			NFSLOCKSTATE();
2206 			nfsrv_unlocklf(lfp);
2207 			NFSUNLOCKSTATE();
2208 			NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
2209 			vnode_unlocked = 0;
2210 			if (VN_IS_DOOMED(vp))
2211 				ret = NFSERR_SERVERFAULT;
2212 			NFSLOCKSTATE();
2213 		}
2214 		if (ret == 0)
2215 			ret = nfsrv_delegconflict(tstp, &haslock, p, vp);
2216 		if (ret) {
2217 		    /*
2218 		     * nfsrv_delegconflict unlocks state when it
2219 		     * returns non-zero, which it always does.
2220 		     */
2221 		    if (other_lop) {
2222 			free(other_lop, M_NFSDLOCK);
2223 			other_lop = NULL;
2224 		    }
2225 		    if (ret == -1) {
2226 			lckstp = NULL;
2227 			goto tryagain;
2228 		    }
2229 		    error = ret;
2230 		    goto out;
2231 		}
2232 		/* Never gets here. */
2233 	    }
2234 	    tstp = nstp;
2235 	}
2236 
2237 	/*
2238 	 * Handle the unlock case by calling nfsrv_updatelock().
2239 	 * (Should I have done some access checking above for unlock? For now,
2240 	 *  just let it happen.)
2241 	 */
2242 	if (new_stp->ls_flags & NFSLCK_UNLOCK) {
2243 		first = new_lop->lo_first;
2244 		end = new_lop->lo_end;
2245 		nfsrv_updatelock(stp, new_lopp, &other_lop, lfp);
2246 		stateidp->seqid = ++(stp->ls_stateid.seqid);
2247 		if ((nd->nd_flag & ND_NFSV41) != 0 && stateidp->seqid == 0)
2248 			stateidp->seqid = stp->ls_stateid.seqid = 1;
2249 		stateidp->other[0] = stp->ls_stateid.other[0];
2250 		stateidp->other[1] = stp->ls_stateid.other[1];
2251 		stateidp->other[2] = stp->ls_stateid.other[2];
2252 		if (filestruct_locked != 0) {
2253 			NFSUNLOCKSTATE();
2254 			if (vnode_unlocked == 0) {
2255 				ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl5");
2256 				vnode_unlocked = 1;
2257 				NFSVOPUNLOCK(vp);
2258 			}
2259 			/* Update the local locks. */
2260 			nfsrv_localunlock(vp, lfp, first, end, p);
2261 			NFSLOCKSTATE();
2262 			nfsrv_unlocklf(lfp);
2263 		}
2264 		NFSUNLOCKSTATE();
2265 		goto out;
2266 	}
2267 
2268 	/*
2269 	 * Search for a conflicting lock. A lock conflicts if:
2270 	 * - the lock range overlaps and
2271 	 * - at least one lock is a write lock and
2272 	 * - it is not owned by the same lock owner
2273 	 */
2274 	if (!delegation) {
2275 	  LIST_FOREACH(lop, &lfp->lf_lock, lo_lckfile) {
2276 	    if (new_lop->lo_end > lop->lo_first &&
2277 		new_lop->lo_first < lop->lo_end &&
2278 		(new_lop->lo_flags == NFSLCK_WRITE ||
2279 		 lop->lo_flags == NFSLCK_WRITE) &&
2280 		lckstp != lop->lo_stp &&
2281 		(clp != lop->lo_stp->ls_clp ||
2282 		 lckstp->ls_ownerlen != lop->lo_stp->ls_ownerlen ||
2283 		 NFSBCMP(lckstp->ls_owner, lop->lo_stp->ls_owner,
2284 		    lckstp->ls_ownerlen))) {
2285 		if (other_lop) {
2286 		    free(other_lop, M_NFSDLOCK);
2287 		    other_lop = NULL;
2288 		}
2289 		if (vnode_unlocked != 0)
2290 		    ret = nfsrv_clientconflict(lop->lo_stp->ls_clp, &haslock,
2291 			NULL, p);
2292 		else
2293 		    ret = nfsrv_clientconflict(lop->lo_stp->ls_clp, &haslock,
2294 			vp, p);
2295 		if (ret == 1) {
2296 		    if (filestruct_locked != 0) {
2297 			if (vnode_unlocked == 0) {
2298 				ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl6");
2299 				NFSVOPUNLOCK(vp);
2300 			}
2301 			/* Roll back local locks. */
2302 			nfsrv_locallock_rollback(vp, lfp, p);
2303 			NFSLOCKSTATE();
2304 			nfsrv_unlocklf(lfp);
2305 			NFSUNLOCKSTATE();
2306 			NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
2307 			vnode_unlocked = 0;
2308 			if (VN_IS_DOOMED(vp)) {
2309 				error = NFSERR_SERVERFAULT;
2310 				goto out;
2311 			}
2312 		    }
2313 		    /*
2314 		     * nfsrv_clientconflict() unlocks state when it
2315 		     * returns non-zero.
2316 		     */
2317 		    lckstp = NULL;
2318 		    goto tryagain;
2319 		}
2320 		/*
2321 		 * Found a conflicting lock, so record the conflict and
2322 		 * return the error.
2323 		 */
2324 		if (cfp != NULL && ret == 0) {
2325 		    cfp->cl_clientid.lval[0]=lop->lo_stp->ls_stateid.other[0];
2326 		    cfp->cl_clientid.lval[1]=lop->lo_stp->ls_stateid.other[1];
2327 		    cfp->cl_first = lop->lo_first;
2328 		    cfp->cl_end = lop->lo_end;
2329 		    cfp->cl_flags = lop->lo_flags;
2330 		    cfp->cl_ownerlen = lop->lo_stp->ls_ownerlen;
2331 		    NFSBCOPY(lop->lo_stp->ls_owner, cfp->cl_owner,
2332 			cfp->cl_ownerlen);
2333 		}
2334 		if (ret == 2)
2335 		    error = NFSERR_PERM;
2336 		else if (new_stp->ls_flags & NFSLCK_RECLAIM)
2337 		    error = NFSERR_RECLAIMCONFLICT;
2338 		else if (new_stp->ls_flags & NFSLCK_CHECK)
2339 		    error = NFSERR_LOCKED;
2340 		else
2341 		    error = NFSERR_DENIED;
2342 		if (filestruct_locked != 0 && ret == 0) {
2343 			/* Roll back local locks. */
2344 			NFSUNLOCKSTATE();
2345 			if (vnode_unlocked == 0) {
2346 				ASSERT_VOP_ELOCKED(vp, "nfsrv_lockctrl7");
2347 				vnode_unlocked = 1;
2348 				NFSVOPUNLOCK(vp);
2349 			}
2350 			nfsrv_locallock_rollback(vp, lfp, p);
2351 			NFSLOCKSTATE();
2352 			nfsrv_unlocklf(lfp);
2353 		}
2354 		if (ret == 0)
2355 			NFSUNLOCKSTATE();
2356 		goto out;
2357 	    }
2358 	  }
2359 	}
2360 
2361 	/*
2362 	 * We only get here if there was no lock that conflicted.
2363 	 */
2364 	if (new_stp->ls_flags & (NFSLCK_TEST | NFSLCK_CHECK)) {
2365 		NFSUNLOCKSTATE();
2366 		goto out;
2367 	}
2368 
2369 	/*
2370 	 * We only get here when we are creating or modifying a lock.
2371 	 * There are two variants:
2372 	 * - exist_lock_owner where lock_owner exists
2373 	 * - open_to_lock_owner with new lock_owner
2374 	 */
2375 	first = new_lop->lo_first;
2376 	end = new_lop->lo_end;
2377 	lock_flags = new_lop->lo_flags;
2378 	if (!(new_stp->ls_flags & NFSLCK_OPENTOLOCK)) {
2379 		nfsrv_updatelock(lckstp, new_lopp, &other_lop, lfp);
2380 		stateidp->seqid = ++(lckstp->ls_stateid.seqid);
2381 		if ((nd->nd_flag & ND_NFSV41) != 0 && stateidp->seqid == 0)
2382 			stateidp->seqid = lckstp->ls_stateid.seqid = 1;
2383 		stateidp->other[0] = lckstp->ls_stateid.other[0];
2384 		stateidp->other[1] = lckstp->ls_stateid.other[1];
2385 		stateidp->other[2] = lckstp->ls_stateid.other[2];
2386 	} else {
2387 		/*
2388 		 * The new open_to_lock_owner case.
2389 		 * Link the new nfsstate into the lists.
2390 		 */
2391 		new_stp->ls_seq = new_stp->ls_opentolockseq;
2392 		nfsrvd_refcache(new_stp->ls_op);
2393 		stateidp->seqid = new_stp->ls_stateid.seqid = 1;
2394 		stateidp->other[0] = new_stp->ls_stateid.other[0] =
2395 		    clp->lc_clientid.lval[0];
2396 		stateidp->other[1] = new_stp->ls_stateid.other[1] =
2397 		    clp->lc_clientid.lval[1];
2398 		stateidp->other[2] = new_stp->ls_stateid.other[2] =
2399 		    nfsrv_nextstateindex(clp);
2400 		new_stp->ls_clp = clp;
2401 		LIST_INIT(&new_stp->ls_lock);
2402 		new_stp->ls_openstp = stp;
2403 		new_stp->ls_lfp = lfp;
2404 		nfsrv_insertlock(new_lop, (struct nfslock *)new_stp, new_stp,
2405 		    lfp);
2406 		LIST_INSERT_HEAD(NFSSTATEHASH(clp, new_stp->ls_stateid),
2407 		    new_stp, ls_hash);
2408 		LIST_INSERT_HEAD(&stp->ls_open, new_stp, ls_list);
2409 		*new_lopp = NULL;
2410 		*new_stpp = NULL;
2411 		NFSD_VNET(nfsstatsv1_p)->srvlockowners++;
2412 		nfsrv_openpluslock++;
2413 	}
2414 	if (filestruct_locked != 0) {
2415 		NFSUNLOCKSTATE();
2416 		nfsrv_locallock_commit(lfp, lock_flags, first, end);
2417 		NFSLOCKSTATE();
2418 		nfsrv_unlocklf(lfp);
2419 	}
2420 	NFSUNLOCKSTATE();
2421 
2422 out:
2423 	if (haslock) {
2424 		NFSLOCKV4ROOTMUTEX();
2425 		nfsv4_unlock(&nfsv4rootfs_lock, 1);
2426 		NFSUNLOCKV4ROOTMUTEX();
2427 	}
2428 	if (vnode_unlocked != 0) {
2429 		NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
2430 		if (error == 0 && VN_IS_DOOMED(vp))
2431 			error = NFSERR_SERVERFAULT;
2432 	}
2433 	if (other_lop)
2434 		free(other_lop, M_NFSDLOCK);
2435 	NFSEXITCODE2(error, nd);
2436 	return (error);
2437 }
2438 
2439 /*
2440  * Check for state errors for Open.
2441  * repstat is passed back out as an error if more critical errors
2442  * are not detected.
 *
 * Arguments, as used by the code below:
 *   clientid - handed to nfsrv_checkrestart() and nfsrv_getclient().
 *   stateidp - only examined when NFSLCK_DELEGCUR is set in
 *              new_stp->ls_flags, where it is matched against the
 *              delegations recorded for the file.
 *   new_stp  - the state describing the Open being checked; ls_flags,
 *              ls_stateid, ls_seq and ls_op are used.
 *   vp       - vnode of the file being opened, or NULL when the file
 *              doesn't exist yet, in which case the per-file conflict
 *              checks are skipped.
 *   nd       - request descriptor; nd_flag is tested for ND_NFSV41 and
 *              nd is passed to the helper functions.
 *   p        - passed through to the helper functions.
 *   repstat  - status already determined by the caller; returned only
 *              if the restart, resource limit, client lookup, seqid and
 *              grace checks all succeed.
 *
 * Returns 0 if no state error was found. Otherwise returns one of the
 * values set directly here (NFSERR_RESOURCE, NFSERR_NOGRACE,
 * NFSERR_EXPIRED, NFSERR_PERM, NFSERR_RECLAIMCONFLICT,
 * NFSERR_SHAREDENIED, repstat) or whatever non-zero value came back from
 * nfsrv_checkrestart(), nfsrv_getclient(), nfsrv_checkseqid(),
 * nfsrv_checkgrace(), nfsrv_getlockfh(), nfsrv_getlockfile() or
 * nfsrv_delegconflict().
 *
 * Locking: the state lock (NFSLOCKSTATE()) is taken on every pass
 * through "tryagain" and is dropped again before every return, either
 * explicitly here or, per the comments at the call sites, by
 * nfsrv_clientconflict()/nfsrv_delegconflict() when they return non-zero.
 * haslock tracks the nfsv4rootfs_lock; it is only ever changed through
 * the &haslock argument given to those two helpers.
2443  */
2444 int
2445 nfsrv_opencheck(nfsquad_t clientid, nfsv4stateid_t *stateidp,
2446     struct nfsstate *new_stp, vnode_t vp, struct nfsrv_descript *nd,
2447     NFSPROC_T *p, int repstat)
2448 {
2449 	struct nfsstate *stp, *nstp;
2450 	struct nfsclient *clp;
2451 	struct nfsstate *ownerstp;
2452 	struct nfslockfile *lfp, *new_lfp;
2453 	int error = 0, haslock = 0, ret, readonly = 0, getfhret = 0;
2454 
	/*
	 * readonly is set only when the share bits are exactly
	 * NFSLCK_READACCESS. It selects the weaker of the two delegation
	 * conflict rules applied near the end of this function.
	 */
2455 	if ((new_stp->ls_flags & NFSLCK_SHAREBITS) == NFSLCK_READACCESS)
2456 		readonly = 1;
2457 	/*
2458 	 * Check for restart conditions (client and server).
2459 	 */
2460 	error = nfsrv_checkrestart(clientid, new_stp->ls_flags,
2461 		&new_stp->ls_stateid, 0);
2462 	if (error)
2463 		goto out;
2464 
2465 	/*
2466 	 * Check for state resource limit exceeded.
2467 	 * Technically this should be SMP protected, but the worst
2468 	 * case error is "out by one or two" on the count when it
2469 	 * returns NFSERR_RESOURCE and the limit is just a rather
2470 	 * arbitrary high water mark, so no harm is done.
2471 	 */
2472 	if (nfsrv_openpluslock > nfsrv_v4statelimit) {
2473 		error = NFSERR_RESOURCE;
2474 		goto out;
2475 	}
2476 
	/*
	 * Each pass allocates a fresh nfslockfile and fetches the file
	 * handle information before the state lock is taken. Both
	 * "goto tryagain" sites below are reached only after the previous
	 * new_lfp has gone through the "if (new_lfp) free()" step, so the
	 * pointer is simply overwritten here.
	 * NOTE(review): presumably the M_WAITOK allocation is done first
	 * because it may sleep and must not happen with the state lock
	 * held - confirm.
	 */
2477 tryagain:
2478 	new_lfp = malloc(sizeof (struct nfslockfile),
2479 	    M_NFSDLOCKFILE, M_WAITOK);
2480 	if (vp)
2481 		getfhret = nfsrv_getlockfh(vp, new_stp->ls_flags, new_lfp,
2482 		    NULL, p);
2483 	NFSLOCKSTATE();
2484 	/*
2485 	 * Get the nfsclient structure.
2486 	 */
2487 	error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
2488 	    (nfsquad_t)((u_quad_t)0), 0, nd, p);
2489 
2490 	/*
2491 	 * Look up the open owner. See if it needs confirmation and
2492 	 * check the seq#, as required.
	 * (clp and ownerstp are only read below when error is 0, i.e.
	 *  after the calls that set them have been made.)
2493 	 */
2494 	if (!error)
2495 		nfsrv_getowner(&clp->lc_open, new_stp, &ownerstp);
2496 
2497 	if (!error && ownerstp) {
2498 		error = nfsrv_checkseqid(nd, new_stp->ls_seq, ownerstp,
2499 		    new_stp->ls_op);
2500 		/*
2501 		 * If the OpenOwner hasn't been confirmed, assume the
2502 		 * old one was a replay and this one is ok.
2503 		 * See: RFC3530 Sec. 14.2.18.
2504 		 */
2505 		if (error == NFSERR_BADSEQID &&
2506 		    (ownerstp->ls_flags & NFSLCK_NEEDSCONFIRM))
2507 			error = 0;
2508 	}
2509 
2510 	/*
2511 	 * Check for grace.
2512 	 */
2513 	if (!error)
2514 		error = nfsrv_checkgrace(nd, clp, new_stp->ls_flags);
2515 	if ((new_stp->ls_flags & NFSLCK_RECLAIM) && !error &&
2516 		nfsrv_checkstable(clp))
2517 		error = NFSERR_NOGRACE;
2518 
2519 	/*
2520 	 * If none of the above errors occurred, let repstat be
2521 	 * returned.
	 * No conflict helper has been called yet on this pass, so the
	 * state lock is still held here and is released explicitly;
	 * haslock can only be non-zero from an earlier pass.
2522 	 */
2523 	if (repstat && !error)
2524 		error = repstat;
2525 	if (error) {
2526 		NFSUNLOCKSTATE();
2527 		if (haslock) {
2528 			NFSLOCKV4ROOTMUTEX();
2529 			nfsv4_unlock(&nfsv4rootfs_lock, 1);
2530 			NFSUNLOCKV4ROOTMUTEX();
2531 		}
2532 		free(new_lfp, M_NFSDLOCKFILE);
2533 		goto out;
2534 	}
2535 
2536 	/*
2537 	 * If vp == NULL, the file doesn't exist yet, so return ok.
2538 	 * (This always happens on the first pass, so haslock must be 0.)
2539 	 */
2540 	if (vp == NULL) {
2541 		NFSUNLOCKSTATE();
2542 		free(new_lfp, M_NFSDLOCKFILE);
2543 		goto out;
2544 	}
2545 
2546 	/*
2547 	 * Get the structure for the underlying file.
	 * getfhret was obtained before the state lock was taken; it is
	 * only acted upon here, so the client/seqid/grace errors above
	 * take precedence over it.
	 * NOTE(review): nfsrv_getlockfile() is given &new_lfp; presumably
	 * it clears new_lfp when it keeps the preallocated structure, so
	 * a non-NULL new_lfp afterwards means it was not used - confirm.
2548 	 */
2549 	if (getfhret)
2550 		error = getfhret;
2551 	else
2552 		error = nfsrv_getlockfile(new_stp->ls_flags, &new_lfp, &lfp,
2553 		    NULL, 0);
2554 	if (new_lfp)
2555 		free(new_lfp, M_NFSDLOCKFILE);
2556 	if (error) {
2557 		NFSUNLOCKSTATE();
2558 		if (haslock) {
2559 			NFSLOCKV4ROOTMUTEX();
2560 			nfsv4_unlock(&nfsv4rootfs_lock, 1);
2561 			NFSUNLOCKV4ROOTMUTEX();
2562 		}
2563 		goto out;
2564 	}
2565 
2566 	/*
2567 	 * Search for a conflicting open/share.
2568 	 */
2569 	if (new_stp->ls_flags & NFSLCK_DELEGCUR) {
2570 	    /*
2571 	     * For Delegate_Cur, search for the matching Delegation,
2572 	     * which indicates no conflict.
2573 	     * An old delegation should have been recovered by the
2574 	     * client doing a Claim_DELEGATE_Prev, so I won't let
2575 	     * it match and return NFSERR_EXPIRED. Should I let it
2576 	     * match?
	     * A delegation matches when it is not NFSLCK_OLDDELEG, the
	     * "other" part of the stateid is identical and the seqid
	     * is equal (or, for ND_NFSV41 requests, the supplied seqid
	     * is 0).
2577 	     */
2578 	    LIST_FOREACH(stp, &lfp->lf_deleg, ls_file) {
2579 		if (!(stp->ls_flags & NFSLCK_OLDDELEG) &&
2580 		    (((nd->nd_flag & ND_NFSV41) != 0 &&
2581 		    stateidp->seqid == 0) ||
2582 		    stateidp->seqid == stp->ls_stateid.seqid) &&
2583 		    !NFSBCMP(stateidp->other, stp->ls_stateid.other,
2584 			  NFSX_STATEIDOTHER))
2585 			break;
2586 	    }
	    /*
	     * Fail with NFSERR_EXPIRED if the loop ran off the end of
	     * the list (no match), or if write access is requested but
	     * the matching delegation is only a read delegation.
	     */
2587 	    if (stp == LIST_END(&lfp->lf_deleg) ||
2588 		((new_stp->ls_flags & NFSLCK_WRITEACCESS) &&
2589 		 (stp->ls_flags & NFSLCK_DELEGREAD))) {
2590 		NFSUNLOCKSTATE();
2591 		if (haslock) {
2592 			NFSLOCKV4ROOTMUTEX();
2593 			nfsv4_unlock(&nfsv4rootfs_lock, 1);
2594 			NFSUNLOCKV4ROOTMUTEX();
2595 		}
2596 		error = NFSERR_EXPIRED;
2597 		goto out;
2598 	    }
2599 	}
2600 
2601 	/*
2602 	 * Check for access/deny bit conflicts. I check for the same
2603 	 * owner as well, in case the client didn't bother.
	 * The test is skipped entirely for NFSLCK_DELEGCUR. Otherwise an
	 * existing open conflicts when the new access bits overlap its
	 * flags shifted right by NFSLCK_SHIFT, or when its access bits
	 * overlap the new flags shifted the same way.
	 * NOTE(review): this assumes the shift lines the deny bits up
	 * with the matching access bits - confirm against the NFSLCK_*
	 * definitions.
	 *
	 * Handling of the nfsrv_clientconflict() return value:
	 *   1 - retry from tryagain; the helper has already dropped the
	 *       state lock.
	 *   2 - fail with NFSERR_PERM; the state lock was dropped by the
	 *       helper.
	 *   0 - a real conflict; the state lock is still held and is
	 *       dropped here.
2604 	 */
2605 	LIST_FOREACH(stp, &lfp->lf_open, ls_file) {
2606 		if (!(new_stp->ls_flags & NFSLCK_DELEGCUR) &&
2607 		    (((new_stp->ls_flags & NFSLCK_ACCESSBITS) &
2608 		      ((stp->ls_flags>>NFSLCK_SHIFT) & NFSLCK_ACCESSBITS))||
2609 		     ((stp->ls_flags & NFSLCK_ACCESSBITS) &
2610 		      ((new_stp->ls_flags>>NFSLCK_SHIFT)&NFSLCK_ACCESSBITS)))){
2611 			ret = nfsrv_clientconflict(stp->ls_clp,&haslock,vp,p);
2612 			if (ret == 1) {
2613 				/*
2614 				 * nfsrv_clientconflict() unlocks
2615 				 * state when it returns non-zero.
2616 				 */
2617 				goto tryagain;
2618 			}
2619 			if (ret == 2)
2620 				error = NFSERR_PERM;
2621 			else if (new_stp->ls_flags & NFSLCK_RECLAIM)
2622 				error = NFSERR_RECLAIMCONFLICT;
2623 			else
2624 				error = NFSERR_SHAREDENIED;
2625 			if (ret == 0)
2626 				NFSUNLOCKSTATE();
2627 			if (haslock) {
2628 				NFSLOCKV4ROOTMUTEX();
2629 				nfsv4_unlock(&nfsv4rootfs_lock, 1);
2630 				NFSUNLOCKV4ROOTMUTEX();
2631 			}
2632 			goto out;
2633 		}
2634 	}
2635 
2636 	/*
2637 	 * Check for a conflicting delegation. If one is found, call
2638 	 * nfsrv_delegconflict() to handle it. If the v4root lock hasn't
2639 	 * been set yet, it will get the lock. Otherwise, it will recall
2640 	 * the delegation. Then, we try try again...
2641 	 * (If NFSLCK_DELEGCUR is set, it has a delegation, so there
2642 	 *  isn't a conflict.)
2643 	 * I currently believe the conflict algorithm to be:
2644 	 * For Open with Read Access and Deny None
2645 	 * - there is a conflict iff a different client has a write delegation
2646 	 * For Open with other Write Access or any Deny except None
2647 	 * - there is a conflict if a different client has any delegation
2648 	 * - there is a conflict if the same client has a read delegation
2649 	 *   (The current consensus is that this last case should be
2650 	 *    considered a conflict since the client with a read delegation
2651 	 *    could have done an Open with ReadAccess and WriteDeny
2652 	 *    locally and then not have checked for the WriteDeny.)
2653 	 * Don't check for a Reclaim, since that will be dealt with
2654 	 * by nfsrv_openctrl().
	 *
	 * The list is walked by hand with the successor (nstp) fetched
	 * before nfsrv_delegconflict() is called.
	 * NOTE(review): presumably this is because the helper may remove
	 * stp from the list - confirm.
	 * NOTE(review): when the helper returns a value other than 0 or
	 * -1, this function goes to "out" without touching haslock;
	 * presumably nfsrv_delegconflict() has already released the
	 * v4root lock (or never held it) on that path - confirm.
2655 	 */
2656 	if (!(new_stp->ls_flags &
2657 		(NFSLCK_DELEGPREV | NFSLCK_DELEGCUR | NFSLCK_RECLAIM))) {
2658 	    stp = LIST_FIRST(&lfp->lf_deleg);
2659 	    while (stp != LIST_END(&lfp->lf_deleg)) {
2660 		nstp = LIST_NEXT(stp, ls_file);
2661 		if ((readonly && stp->ls_clp != clp &&
2662 		       (stp->ls_flags & NFSLCK_DELEGWRITE)) ||
2663 		    (!readonly && (stp->ls_clp != clp ||
2664 		         (stp->ls_flags & NFSLCK_DELEGREAD)))) {
2665 			ret = nfsrv_delegconflict(stp, &haslock, p, vp);
2666 			if (ret) {
2667 			    /*
2668 			     * nfsrv_delegconflict() unlocks state
2669 			     * when it returns non-zero.
2670 			     */
2671 			    if (ret == -1)
2672 				goto tryagain;
2673 			    error = ret;
2674 			    goto out;
2675 			}
2676 		}
2677 		stp = nstp;
2678 	    }
2679 	}
	/*
	 * No conflict was found: drop the state lock and, if it was
	 * acquired on an earlier pass, the v4root lock. error is 0 here.
	 */
2680 	NFSUNLOCKSTATE();
2681 	if (haslock) {
2682 		NFSLOCKV4ROOTMUTEX();
2683 		nfsv4_unlock(&nfsv4rootfs_lock, 1);
2684 		NFSUNLOCKV4ROOTMUTEX();
2685 	}
2686 
2687 out:
2688 	NFSEXITCODE2(error, nd);
2689 	return (error);
2690 }
2691 
2692 /*
2693  * Open control function to create/update open state for an open.
2694  */
2695 int
2696 nfsrv_openctrl(struct nfsrv_descript *nd, vnode_t vp,
2697     struct nfsstate **new_stpp, nfsquad_t clientid, nfsv4stateid_t *stateidp,
2698     nfsv4stateid_t *delegstateidp, u_int32_t *rflagsp, struct nfsexstuff *exp,
2699     NFSPROC_T *p, u_quad_t filerev)
2700 {
2701 	struct nfsstate *new_stp = *new_stpp;
2702 	struct nfsstate *stp, *nstp;
2703 	struct nfsstate *openstp = NULL, *new_open, *ownerstp, *new_deleg;
2704 	struct nfslockfile *lfp, *new_lfp;
2705 	struct nfsclient *clp;
2706 	int error = 0, haslock = 0, ret, delegate = 1, writedeleg = 1;
2707 	int readonly = 0, cbret = 1, getfhret = 0;
2708 	int gotstate = 0, len = 0;
2709 	u_char *clidp = NULL;
2710 
2711 	if ((new_stp->ls_flags & NFSLCK_SHAREBITS) == NFSLCK_READACCESS)
2712 		readonly = 1;
2713 	/*
2714 	 * Check for restart conditions (client and server).
2715 	 * (Paranoia, should have been detected by nfsrv_opencheck().)
2716 	 * If an error does show up, return NFSERR_EXPIRED, since the
2717 	 * seqid# has already been incremented.
2718 	 */
2719 	error = nfsrv_checkrestart(clientid, new_stp->ls_flags,
2720 	    &new_stp->ls_stateid, 0);
2721 	if (error) {
2722 		printf("Nfsd: openctrl unexpected restart err=%d\n",
2723 		    error);
2724 		error = NFSERR_EXPIRED;
2725 		goto out;
2726 	}
2727 
2728 	clidp = malloc(NFSV4_OPAQUELIMIT, M_TEMP, M_WAITOK);
2729 tryagain:
2730 	new_lfp = malloc(sizeof (struct nfslockfile),
2731 	    M_NFSDLOCKFILE, M_WAITOK);
2732 	new_open = malloc(sizeof (struct nfsstate),
2733 	    M_NFSDSTATE, M_WAITOK);
2734 	new_deleg = malloc(sizeof (struct nfsstate),
2735 	    M_NFSDSTATE, M_WAITOK);
2736 	getfhret = nfsrv_getlockfh(vp, new_stp->ls_flags, new_lfp,
2737 	    NULL, p);
2738 	NFSLOCKSTATE();
2739 	/*
2740 	 * Get the client structure. Since the linked lists could be changed
2741 	 * by other nfsd processes if this process does a tsleep(), one of
2742 	 * two things must be done.
2743 	 * 1 - don't tsleep()
2744 	 * or
2745 	 * 2 - get the nfsv4_lock() { indicated by haslock == 1 }
2746 	 *     before using the lists, since this lock stops the other
2747 	 *     nfsd. This should only be used for rare cases, since it
2748 	 *     essentially single threads the nfsd.
2749 	 *     At this time, it is only done for cases where the stable
2750 	 *     storage file must be written prior to completion of state
2751 	 *     expiration.
2752 	 */
2753 	error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
2754 	    (nfsquad_t)((u_quad_t)0), 0, nd, p);
2755 	if (!error && (clp->lc_flags & LCL_NEEDSCBNULL) &&
2756 	    clp->lc_program) {
2757 		/*
2758 		 * This happens on the first open for a client
2759 		 * that supports callbacks.
2760 		 */
2761 		NFSUNLOCKSTATE();
2762 		/*
2763 		 * Although nfsrv_docallback() will sleep, clp won't
2764 		 * go away, since they are only removed when the
2765 		 * nfsv4_lock() has blocked the nfsd threads. The
2766 		 * fields in clp can change, but having multiple
2767 		 * threads do this Null callback RPC should be
2768 		 * harmless.
2769 		 */
2770 		cbret = nfsrv_docallback(clp, NFSV4PROC_CBNULL,
2771 		    NULL, 0, NULL, NULL, NULL, 0, p);
2772 		NFSLOCKSTATE();
2773 		clp->lc_flags &= ~LCL_NEEDSCBNULL;
2774 		if (!cbret)
2775 			clp->lc_flags |= LCL_CALLBACKSON;
2776 	}
2777 
2778 	/*
2779 	 * Look up the open owner. See if it needs confirmation and
2780 	 * check the seq#, as required.
2781 	 */
2782 	if (!error)
2783 		nfsrv_getowner(&clp->lc_open, new_stp, &ownerstp);
2784 
2785 	if (error) {
2786 		NFSUNLOCKSTATE();
2787 		printf("Nfsd: openctrl unexpected state err=%d\n",
2788 			error);
2789 		free(new_lfp, M_NFSDLOCKFILE);
2790 		free(new_open, M_NFSDSTATE);
2791 		free(new_deleg, M_NFSDSTATE);
2792 		if (haslock) {
2793 			NFSLOCKV4ROOTMUTEX();
2794 			nfsv4_unlock(&nfsv4rootfs_lock, 1);
2795 			NFSUNLOCKV4ROOTMUTEX();
2796 		}
2797 		error = NFSERR_EXPIRED;
2798 		goto out;
2799 	}
2800 
2801 	if (new_stp->ls_flags & NFSLCK_RECLAIM)
2802 		nfsrv_markstable(clp);
2803 
2804 	/*
2805 	 * Get the structure for the underlying file.
2806 	 */
2807 	if (getfhret)
2808 		error = getfhret;
2809 	else
2810 		error = nfsrv_getlockfile(new_stp->ls_flags, &new_lfp, &lfp,
2811 		    NULL, 0);
2812 	if (new_lfp)
2813 		free(new_lfp, M_NFSDLOCKFILE);
2814 	if (error) {
2815 		NFSUNLOCKSTATE();
2816 		printf("Nfsd openctrl unexpected getlockfile err=%d\n",
2817 		    error);
2818 		free(new_open, M_NFSDSTATE);
2819 		free(new_deleg, M_NFSDSTATE);
2820 		if (haslock) {
2821 			NFSLOCKV4ROOTMUTEX();
2822 			nfsv4_unlock(&nfsv4rootfs_lock, 1);
2823 			NFSUNLOCKV4ROOTMUTEX();
2824 		}
2825 		goto out;
2826 	}
2827 
2828 	/*
2829 	 * Search for a conflicting open/share.
2830 	 */
2831 	if (new_stp->ls_flags & NFSLCK_DELEGCUR) {
2832 	    /*
2833 	     * For Delegate_Cur, search for the matching Delegation,
2834 	     * which indicates no conflict.
2835 	     * An old delegation should have been recovered by the
2836 	     * client doing a Claim_DELEGATE_Prev, so I won't let
2837 	     * it match and return NFSERR_EXPIRED. Should I let it
2838 	     * match?
2839 	     */
2840 	    LIST_FOREACH(stp, &lfp->lf_deleg, ls_file) {
2841 		if (!(stp->ls_flags & NFSLCK_OLDDELEG) &&
2842 		    (((nd->nd_flag & ND_NFSV41) != 0 &&
2843 		    stateidp->seqid == 0) ||
2844 		    stateidp->seqid == stp->ls_stateid.seqid) &&
2845 		    !NFSBCMP(stateidp->other, stp->ls_stateid.other,
2846 			NFSX_STATEIDOTHER))
2847 			break;
2848 	    }
2849 	    if (stp == LIST_END(&lfp->lf_deleg) ||
2850 		((new_stp->ls_flags & NFSLCK_WRITEACCESS) &&
2851 		 (stp->ls_flags & NFSLCK_DELEGREAD))) {
2852 		NFSUNLOCKSTATE();
2853 		printf("Nfsd openctrl unexpected expiry\n");
2854 		free(new_open, M_NFSDSTATE);
2855 		free(new_deleg, M_NFSDSTATE);
2856 		if (haslock) {
2857 			NFSLOCKV4ROOTMUTEX();
2858 			nfsv4_unlock(&nfsv4rootfs_lock, 1);
2859 			NFSUNLOCKV4ROOTMUTEX();
2860 		}
2861 		error = NFSERR_EXPIRED;
2862 		goto out;
2863 	    }
2864 
2865 	    /*
2866 	     * Don't issue a Delegation, since one already exists and
2867 	     * delay delegation timeout, as required.
2868 	     */
2869 	    delegate = 0;
2870 	    nfsrv_delaydelegtimeout(stp);
2871 	}
2872 
2873 	/*
2874 	 * Check for access/deny bit conflicts. I also check for the
2875 	 * same owner, since the client might not have bothered to check.
2876 	 * Also, note an open for the same file and owner, if found,
2877 	 * which is all we do here for Delegate_Cur, since conflict
2878 	 * checking is already done.
2879 	 */
2880 	LIST_FOREACH(stp, &lfp->lf_open, ls_file) {
2881 		if (ownerstp && stp->ls_openowner == ownerstp)
2882 			openstp = stp;
2883 		if (!(new_stp->ls_flags & NFSLCK_DELEGCUR)) {
2884 		    /*
2885 		     * If another client has the file open, the only
2886 		     * delegation that can be issued is a Read delegation
2887 		     * and only if it is a Read open with Deny none.
2888 		     */
2889 		    if (clp != stp->ls_clp) {
2890 			if ((stp->ls_flags & NFSLCK_SHAREBITS) ==
2891 			    NFSLCK_READACCESS)
2892 			    writedeleg = 0;
2893 			else
2894 			    delegate = 0;
2895 		    }
2896 		    if(((new_stp->ls_flags & NFSLCK_ACCESSBITS) &
2897 		        ((stp->ls_flags>>NFSLCK_SHIFT) & NFSLCK_ACCESSBITS))||
2898 		       ((stp->ls_flags & NFSLCK_ACCESSBITS) &
2899 		        ((new_stp->ls_flags>>NFSLCK_SHIFT)&NFSLCK_ACCESSBITS))){
2900 			ret = nfsrv_clientconflict(stp->ls_clp,&haslock,vp,p);
2901 			if (ret == 1) {
2902 				/*
2903 				 * nfsrv_clientconflict() unlocks state
2904 				 * when it returns non-zero.
2905 				 */
2906 				free(new_open, M_NFSDSTATE);
2907 				free(new_deleg, M_NFSDSTATE);
2908 				openstp = NULL;
2909 				goto tryagain;
2910 			}
2911 			if (ret == 2)
2912 				error = NFSERR_PERM;
2913 			else if (new_stp->ls_flags & NFSLCK_RECLAIM)
2914 				error = NFSERR_RECLAIMCONFLICT;
2915 			else
2916 				error = NFSERR_SHAREDENIED;
2917 			if (ret == 0)
2918 				NFSUNLOCKSTATE();
2919 			if (haslock) {
2920 				NFSLOCKV4ROOTMUTEX();
2921 				nfsv4_unlock(&nfsv4rootfs_lock, 1);
2922 				NFSUNLOCKV4ROOTMUTEX();
2923 			}
2924 			free(new_open, M_NFSDSTATE);
2925 			free(new_deleg, M_NFSDSTATE);
2926 			printf("nfsd openctrl unexpected client cnfl\n");
2927 			goto out;
2928 		    }
2929 		}
2930 	}
2931 
2932 	/*
2933 	 * Check for a conflicting delegation. If one is found, call
2934 	 * nfsrv_delegconflict() to handle it. If the v4root lock hasn't
2935 	 * been set yet, it will get the lock. Otherwise, it will recall
2936 	 * the delegation. Then, we try try again...
2937 	 * (If NFSLCK_DELEGCUR is set, it has a delegation, so there
2938 	 *  isn't a conflict.)
2939 	 * I currently believe the conflict algorithm to be:
2940 	 * For Open with Read Access and Deny None
2941 	 * - there is a conflict iff a different client has a write delegation
2942 	 * For Open with other Write Access or any Deny except None
2943 	 * - there is a conflict if a different client has any delegation
2944 	 * - there is a conflict if the same client has a read delegation
2945 	 *   (The current consensus is that this last case should be
2946 	 *    considered a conflict since the client with a read delegation
2947 	 *    could have done an Open with ReadAccess and WriteDeny
2948 	 *    locally and then not have checked for the WriteDeny.)
2949 	 */
2950 	if (!(new_stp->ls_flags & (NFSLCK_DELEGPREV | NFSLCK_DELEGCUR))) {
2951 	    stp = LIST_FIRST(&lfp->lf_deleg);
2952 	    while (stp != LIST_END(&lfp->lf_deleg)) {
2953 		nstp = LIST_NEXT(stp, ls_file);
2954 		if (stp->ls_clp != clp && (stp->ls_flags & NFSLCK_DELEGREAD))
2955 			writedeleg = 0;
2956 		else
2957 			delegate = 0;
2958 		if ((readonly && stp->ls_clp != clp &&
2959 		       (stp->ls_flags & NFSLCK_DELEGWRITE)) ||
2960 		    (!readonly && (stp->ls_clp != clp ||
2961 		         (stp->ls_flags & NFSLCK_DELEGREAD)))) {
2962 		    if (new_stp->ls_flags & NFSLCK_RECLAIM) {
2963 			delegate = 2;
2964 		    } else {
2965 			ret = nfsrv_delegconflict(stp, &haslock, p, vp);
2966 			if (ret) {
2967 			    /*
2968 			     * nfsrv_delegconflict() unlocks state
2969 			     * when it returns non-zero.
2970 			     */
2971 			    printf("Nfsd openctrl unexpected deleg cnfl\n");
2972 			    free(new_open, M_NFSDSTATE);
2973 			    free(new_deleg, M_NFSDSTATE);
2974 			    if (ret == -1) {
2975 				openstp = NULL;
2976 				goto tryagain;
2977 			    }
2978 			    error = ret;
2979 			    goto out;
2980 			}
2981 		    }
2982 		}
2983 		stp = nstp;
2984 	    }
2985 	}
2986 
2987 	/*
2988 	 * We only get here if there was no open that conflicted.
2989 	 * If an open for the owner exists, or in the access/deny bits.
2990 	 * Otherwise it is a new open. If the open_owner hasn't been
2991 	 * confirmed, replace the open with the new one needing confirmation,
2992 	 * otherwise add the open.
2993 	 */
2994 	if (new_stp->ls_flags & NFSLCK_DELEGPREV) {
2995 	    /*
2996 	     * Handle NFSLCK_DELEGPREV by searching the old delegations for
2997 	     * a match. If found, just move the old delegation to the current
2998 	     * delegation list and issue open. If not found, return
2999 	     * NFSERR_EXPIRED.
3000 	     */
3001 	    LIST_FOREACH(stp, &clp->lc_olddeleg, ls_list) {
3002 		if (stp->ls_lfp == lfp) {
3003 		    /* Found it */
3004 		    if (stp->ls_clp != clp)
3005 			panic("olddeleg clp");
3006 		    LIST_REMOVE(stp, ls_list);
3007 		    LIST_REMOVE(stp, ls_hash);
3008 		    stp->ls_flags &= ~NFSLCK_OLDDELEG;
3009 		    stp->ls_stateid.seqid = delegstateidp->seqid = 1;
3010 		    stp->ls_stateid.other[0] = delegstateidp->other[0] =
3011 			clp->lc_clientid.lval[0];
3012 		    stp->ls_stateid.other[1] = delegstateidp->other[1] =
3013 			clp->lc_clientid.lval[1];
3014 		    stp->ls_stateid.other[2] = delegstateidp->other[2] =
3015 			nfsrv_nextstateindex(clp);
3016 		    stp->ls_compref = nd->nd_compref;
3017 		    LIST_INSERT_HEAD(&clp->lc_deleg, stp, ls_list);
3018 		    LIST_INSERT_HEAD(NFSSTATEHASH(clp,
3019 			stp->ls_stateid), stp, ls_hash);
3020 		    if (stp->ls_flags & NFSLCK_DELEGWRITE)
3021 			*rflagsp |= NFSV4OPEN_WRITEDELEGATE;
3022 		    else
3023 			*rflagsp |= NFSV4OPEN_READDELEGATE;
3024 		    clp->lc_delegtime = NFSD_MONOSEC +
3025 			nfsrv_lease + NFSRV_LEASEDELTA;
3026 
3027 		    /*
3028 		     * Now, do the associated open.
3029 		     */
3030 		    new_open->ls_stateid.seqid = 1;
3031 		    new_open->ls_stateid.other[0] = clp->lc_clientid.lval[0];
3032 		    new_open->ls_stateid.other[1] = clp->lc_clientid.lval[1];
3033 		    new_open->ls_stateid.other[2] = nfsrv_nextstateindex(clp);
3034 		    new_open->ls_flags = (new_stp->ls_flags&NFSLCK_DENYBITS)|
3035 			NFSLCK_OPEN;
3036 		    if (stp->ls_flags & NFSLCK_DELEGWRITE)
3037 			new_open->ls_flags |= (NFSLCK_READACCESS |
3038 			    NFSLCK_WRITEACCESS);
3039 		    else
3040 			new_open->ls_flags |= NFSLCK_READACCESS;
3041 		    new_open->ls_uid = new_stp->ls_uid;
3042 		    new_open->ls_lfp = lfp;
3043 		    new_open->ls_clp = clp;
3044 		    LIST_INIT(&new_open->ls_open);
3045 		    LIST_INSERT_HEAD(&lfp->lf_open, new_open, ls_file);
3046 		    LIST_INSERT_HEAD(NFSSTATEHASH(clp, new_open->ls_stateid),
3047 			new_open, ls_hash);
3048 		    /*
3049 		     * and handle the open owner
3050 		     */
3051 		    if (ownerstp) {
3052 			new_open->ls_openowner = ownerstp;
3053 			LIST_INSERT_HEAD(&ownerstp->ls_open,new_open,ls_list);
3054 		    } else {
3055 			new_open->ls_openowner = new_stp;
3056 			new_stp->ls_flags = 0;
3057 			nfsrvd_refcache(new_stp->ls_op);
3058 			new_stp->ls_noopens = 0;
3059 			LIST_INIT(&new_stp->ls_open);
3060 			LIST_INSERT_HEAD(&new_stp->ls_open, new_open, ls_list);
3061 			LIST_INSERT_HEAD(&clp->lc_open, new_stp, ls_list);
3062 			*new_stpp = NULL;
3063 			NFSD_VNET(nfsstatsv1_p)->srvopenowners++;
3064 			nfsrv_openpluslock++;
3065 		    }
3066 		    openstp = new_open;
3067 		    new_open = NULL;
3068 		    NFSD_VNET(nfsstatsv1_p)->srvopens++;
3069 		    nfsrv_openpluslock++;
3070 		    break;
3071 		}
3072 	    }
3073 	    if (stp == LIST_END(&clp->lc_olddeleg))
3074 		error = NFSERR_EXPIRED;
3075 	} else if (new_stp->ls_flags & (NFSLCK_DELEGREAD | NFSLCK_DELEGWRITE)) {
3076 	    /*
3077 	     * Scan to see that a delegation for this client and file
3078 	     * doesn't already exist.
3079 	     * There also shouldn't yet be an Open for this file and
3080 	     * openowner.
3081 	     */
3082 	    LIST_FOREACH(stp, &lfp->lf_deleg, ls_file) {
3083 		if (stp->ls_clp == clp)
3084 		    break;
3085 	    }
3086 	    if (stp == LIST_END(&lfp->lf_deleg) && openstp == NULL) {
3087 		/*
3088 		 * This is the Claim_Previous case with a delegation
3089 		 * type != Delegate_None.
3090 		 */
3091 		/*
3092 		 * First, add the delegation. (Although we must issue the
3093 		 * delegation, we can also ask for an immediate return.)
3094 		 */
3095 		new_deleg->ls_stateid.seqid = delegstateidp->seqid = 1;
3096 		new_deleg->ls_stateid.other[0] = delegstateidp->other[0] =
3097 		    clp->lc_clientid.lval[0];
3098 		new_deleg->ls_stateid.other[1] = delegstateidp->other[1] =
3099 		    clp->lc_clientid.lval[1];
3100 		new_deleg->ls_stateid.other[2] = delegstateidp->other[2] =
3101 		    nfsrv_nextstateindex(clp);
3102 		if (new_stp->ls_flags & NFSLCK_DELEGWRITE) {
3103 		    new_deleg->ls_flags = (NFSLCK_DELEGWRITE |
3104 			NFSLCK_READACCESS | NFSLCK_WRITEACCESS);
3105 		    *rflagsp |= NFSV4OPEN_WRITEDELEGATE;
3106 		    nfsrv_writedelegcnt++;
3107 		} else {
3108 		    new_deleg->ls_flags = (NFSLCK_DELEGREAD |
3109 			NFSLCK_READACCESS);
3110 		    *rflagsp |= NFSV4OPEN_READDELEGATE;
3111 		}
3112 		new_deleg->ls_uid = new_stp->ls_uid;
3113 		new_deleg->ls_lfp = lfp;
3114 		new_deleg->ls_clp = clp;
3115 		new_deleg->ls_filerev = filerev;
3116 		new_deleg->ls_compref = nd->nd_compref;
3117 		new_deleg->ls_lastrecall = 0;
3118 		LIST_INSERT_HEAD(&lfp->lf_deleg, new_deleg, ls_file);
3119 		LIST_INSERT_HEAD(NFSSTATEHASH(clp,
3120 		    new_deleg->ls_stateid), new_deleg, ls_hash);
3121 		LIST_INSERT_HEAD(&clp->lc_deleg, new_deleg, ls_list);
3122 		new_deleg = NULL;
3123 		if (delegate == 2 || nfsrv_issuedelegs == 0 ||
3124 		    (clp->lc_flags & (LCL_CALLBACKSON | LCL_CBDOWN)) !=
3125 		     LCL_CALLBACKSON ||
3126 		    NFSRV_V4DELEGLIMIT(nfsrv_delegatecnt) ||
3127 		    !NFSVNO_DELEGOK(vp))
3128 		    *rflagsp |= NFSV4OPEN_RECALL;
3129 		NFSD_VNET(nfsstatsv1_p)->srvdelegates++;
3130 		nfsrv_openpluslock++;
3131 		nfsrv_delegatecnt++;
3132 
3133 		/*
3134 		 * Now, do the associated open.
3135 		 */
3136 		new_open->ls_stateid.seqid = 1;
3137 		new_open->ls_stateid.other[0] = clp->lc_clientid.lval[0];
3138 		new_open->ls_stateid.other[1] = clp->lc_clientid.lval[1];
3139 		new_open->ls_stateid.other[2] = nfsrv_nextstateindex(clp);
3140 		new_open->ls_flags = (new_stp->ls_flags & NFSLCK_DENYBITS) |
3141 		    NFSLCK_OPEN;
3142 		if (new_stp->ls_flags & NFSLCK_DELEGWRITE)
3143 			new_open->ls_flags |= (NFSLCK_READACCESS |
3144 			    NFSLCK_WRITEACCESS);
3145 		else
3146 			new_open->ls_flags |= NFSLCK_READACCESS;
3147 		new_open->ls_uid = new_stp->ls_uid;
3148 		new_open->ls_lfp = lfp;
3149 		new_open->ls_clp = clp;
3150 		LIST_INIT(&new_open->ls_open);
3151 		LIST_INSERT_HEAD(&lfp->lf_open, new_open, ls_file);
3152 		LIST_INSERT_HEAD(NFSSTATEHASH(clp, new_open->ls_stateid),
3153 		   new_open, ls_hash);
3154 		/*
3155 		 * and handle the open owner
3156 		 */
3157 		if (ownerstp) {
3158 		    new_open->ls_openowner = ownerstp;
3159 		    LIST_INSERT_HEAD(&ownerstp->ls_open, new_open, ls_list);
3160 		} else {
3161 		    new_open->ls_openowner = new_stp;
3162 		    new_stp->ls_flags = 0;
3163 		    nfsrvd_refcache(new_stp->ls_op);
3164 		    new_stp->ls_noopens = 0;
3165 		    LIST_INIT(&new_stp->ls_open);
3166 		    LIST_INSERT_HEAD(&new_stp->ls_open, new_open, ls_list);
3167 		    LIST_INSERT_HEAD(&clp->lc_open, new_stp, ls_list);
3168 		    *new_stpp = NULL;
3169 		    NFSD_VNET(nfsstatsv1_p)->srvopenowners++;
3170 		    nfsrv_openpluslock++;
3171 		}
3172 		openstp = new_open;
3173 		new_open = NULL;
3174 		NFSD_VNET(nfsstatsv1_p)->srvopens++;
3175 		nfsrv_openpluslock++;
3176 	    } else {
3177 		error = NFSERR_RECLAIMCONFLICT;
3178 	    }
3179 	} else if (ownerstp) {
3180 		if (ownerstp->ls_flags & NFSLCK_NEEDSCONFIRM) {
3181 		    /* Replace the open */
3182 		    if (ownerstp->ls_op)
3183 			nfsrvd_derefcache(ownerstp->ls_op);
3184 		    ownerstp->ls_op = new_stp->ls_op;
3185 		    nfsrvd_refcache(ownerstp->ls_op);
3186 		    ownerstp->ls_seq = new_stp->ls_seq;
3187 		    *rflagsp |= NFSV4OPEN_RESULTCONFIRM;
3188 		    stp = LIST_FIRST(&ownerstp->ls_open);
3189 		    stp->ls_flags = (new_stp->ls_flags & NFSLCK_SHAREBITS) |
3190 			NFSLCK_OPEN;
3191 		    stp->ls_stateid.seqid = 1;
3192 		    stp->ls_uid = new_stp->ls_uid;
3193 		    if (lfp != stp->ls_lfp) {
3194 			LIST_REMOVE(stp, ls_file);
3195 			LIST_INSERT_HEAD(&lfp->lf_open, stp, ls_file);
3196 			stp->ls_lfp = lfp;
3197 		    }
3198 		    openstp = stp;
3199 		} else if (openstp) {
3200 		    openstp->ls_flags |= (new_stp->ls_flags & NFSLCK_SHAREBITS);
3201 		    openstp->ls_stateid.seqid++;
3202 		    if ((nd->nd_flag & ND_NFSV41) != 0 &&
3203 			openstp->ls_stateid.seqid == 0)
3204 			openstp->ls_stateid.seqid = 1;
3205 
3206 		    /*
3207 		     * This is where we can choose to issue a delegation.
3208 		     */
3209 		    if ((new_stp->ls_flags & NFSLCK_WANTNODELEG) != 0)
3210 			*rflagsp |= NFSV4OPEN_WDNOTWANTED;
3211 		    else if (nfsrv_issuedelegs == 0)
3212 			*rflagsp |= NFSV4OPEN_WDSUPPFTYPE;
3213 		    else if (NFSRV_V4DELEGLIMIT(nfsrv_delegatecnt))
3214 			*rflagsp |= NFSV4OPEN_WDRESOURCE;
3215 		    else if (delegate == 0 || writedeleg == 0 ||
3216 			NFSVNO_EXRDONLY(exp) || (readonly != 0 &&
3217 			nfsrv_writedelegifpos == 0) ||
3218 			!NFSVNO_DELEGOK(vp) ||
3219 			(new_stp->ls_flags & NFSLCK_WANTRDELEG) != 0 ||
3220 			(clp->lc_flags & (LCL_CALLBACKSON | LCL_CBDOWN)) !=
3221 			 LCL_CALLBACKSON)
3222 			*rflagsp |= NFSV4OPEN_WDCONTENTION;
3223 		    else {
3224 			new_deleg->ls_stateid.seqid = delegstateidp->seqid = 1;
3225 			new_deleg->ls_stateid.other[0] = delegstateidp->other[0]
3226 			    = clp->lc_clientid.lval[0];
3227 			new_deleg->ls_stateid.other[1] = delegstateidp->other[1]
3228 			    = clp->lc_clientid.lval[1];
3229 			new_deleg->ls_stateid.other[2] = delegstateidp->other[2]
3230 			    = nfsrv_nextstateindex(clp);
3231 			new_deleg->ls_flags = (NFSLCK_DELEGWRITE |
3232 			    NFSLCK_READACCESS | NFSLCK_WRITEACCESS);
3233 			*rflagsp |= NFSV4OPEN_WRITEDELEGATE;
3234 			new_deleg->ls_uid = new_stp->ls_uid;
3235 			new_deleg->ls_lfp = lfp;
3236 			new_deleg->ls_clp = clp;
3237 			new_deleg->ls_filerev = filerev;
3238 			new_deleg->ls_compref = nd->nd_compref;
3239 			new_deleg->ls_lastrecall = 0;
3240 			nfsrv_writedelegcnt++;
3241 			LIST_INSERT_HEAD(&lfp->lf_deleg, new_deleg, ls_file);
3242 			LIST_INSERT_HEAD(NFSSTATEHASH(clp,
3243 			    new_deleg->ls_stateid), new_deleg, ls_hash);
3244 			LIST_INSERT_HEAD(&clp->lc_deleg, new_deleg, ls_list);
3245 			new_deleg = NULL;
3246 			NFSD_VNET(nfsstatsv1_p)->srvdelegates++;
3247 			nfsrv_openpluslock++;
3248 			nfsrv_delegatecnt++;
3249 		    }
3250 		} else {
3251 		    new_open->ls_stateid.seqid = 1;
3252 		    new_open->ls_stateid.other[0] = clp->lc_clientid.lval[0];
3253 		    new_open->ls_stateid.other[1] = clp->lc_clientid.lval[1];
3254 		    new_open->ls_stateid.other[2] = nfsrv_nextstateindex(clp);
3255 		    new_open->ls_flags = (new_stp->ls_flags & NFSLCK_SHAREBITS)|
3256 			NFSLCK_OPEN;
3257 		    new_open->ls_uid = new_stp->ls_uid;
3258 		    new_open->ls_openowner = ownerstp;
3259 		    new_open->ls_lfp = lfp;
3260 		    new_open->ls_clp = clp;
3261 		    LIST_INIT(&new_open->ls_open);
3262 		    LIST_INSERT_HEAD(&lfp->lf_open, new_open, ls_file);
3263 		    LIST_INSERT_HEAD(&ownerstp->ls_open, new_open, ls_list);
3264 		    LIST_INSERT_HEAD(NFSSTATEHASH(clp, new_open->ls_stateid),
3265 			new_open, ls_hash);
3266 		    openstp = new_open;
3267 		    new_open = NULL;
3268 		    NFSD_VNET(nfsstatsv1_p)->srvopens++;
3269 		    nfsrv_openpluslock++;
3270 
3271 		    /*
3272 		     * This is where we can choose to issue a delegation.
3273 		     */
3274 		    if ((new_stp->ls_flags & NFSLCK_WANTNODELEG) != 0)
3275 			*rflagsp |= NFSV4OPEN_WDNOTWANTED;
3276 		    else if (nfsrv_issuedelegs == 0)
3277 			*rflagsp |= NFSV4OPEN_WDSUPPFTYPE;
3278 		    else if (NFSRV_V4DELEGLIMIT(nfsrv_delegatecnt))
3279 			*rflagsp |= NFSV4OPEN_WDRESOURCE;
3280 		    else if (delegate == 0 || (writedeleg == 0 &&
3281 			readonly == 0) || !NFSVNO_DELEGOK(vp) ||
3282 			(clp->lc_flags & (LCL_CALLBACKSON | LCL_CBDOWN)) !=
3283 			 LCL_CALLBACKSON)
3284 			*rflagsp |= NFSV4OPEN_WDCONTENTION;
3285 		    else {
3286 			new_deleg->ls_stateid.seqid = delegstateidp->seqid = 1;
3287 			new_deleg->ls_stateid.other[0] = delegstateidp->other[0]
3288 			    = clp->lc_clientid.lval[0];
3289 			new_deleg->ls_stateid.other[1] = delegstateidp->other[1]
3290 			    = clp->lc_clientid.lval[1];
3291 			new_deleg->ls_stateid.other[2] = delegstateidp->other[2]
3292 			    = nfsrv_nextstateindex(clp);
3293 			if (writedeleg && !NFSVNO_EXRDONLY(exp) &&
3294 			    (nfsrv_writedelegifpos || !readonly) &&
3295 			    (new_stp->ls_flags & NFSLCK_WANTRDELEG) == 0) {
3296 			    new_deleg->ls_flags = (NFSLCK_DELEGWRITE |
3297 				NFSLCK_READACCESS | NFSLCK_WRITEACCESS);
3298 			    *rflagsp |= NFSV4OPEN_WRITEDELEGATE;
3299 			    nfsrv_writedelegcnt++;
3300 			} else {
3301 			    new_deleg->ls_flags = (NFSLCK_DELEGREAD |
3302 				NFSLCK_READACCESS);
3303 			    *rflagsp |= NFSV4OPEN_READDELEGATE;
3304 			}
3305 			new_deleg->ls_uid = new_stp->ls_uid;
3306 			new_deleg->ls_lfp = lfp;
3307 			new_deleg->ls_clp = clp;
3308 			new_deleg->ls_filerev = filerev;
3309 			new_deleg->ls_compref = nd->nd_compref;
3310 			new_deleg->ls_lastrecall = 0;
3311 			LIST_INSERT_HEAD(&lfp->lf_deleg, new_deleg, ls_file);
3312 			LIST_INSERT_HEAD(NFSSTATEHASH(clp,
3313 			    new_deleg->ls_stateid), new_deleg, ls_hash);
3314 			LIST_INSERT_HEAD(&clp->lc_deleg, new_deleg, ls_list);
3315 			new_deleg = NULL;
3316 			NFSD_VNET(nfsstatsv1_p)->srvdelegates++;
3317 			nfsrv_openpluslock++;
3318 			nfsrv_delegatecnt++;
3319 		    }
3320 		}
3321 	} else {
3322 		/*
3323 		 * New owner case. Start the open_owner sequence with a
3324 		 * Needs confirmation (unless a reclaim) and hang the
3325 		 * new open off it.
3326 		 */
3327 		new_open->ls_stateid.seqid = 1;
3328 		new_open->ls_stateid.other[0] = clp->lc_clientid.lval[0];
3329 		new_open->ls_stateid.other[1] = clp->lc_clientid.lval[1];
3330 		new_open->ls_stateid.other[2] = nfsrv_nextstateindex(clp);
3331 		new_open->ls_flags = (new_stp->ls_flags & NFSLCK_SHAREBITS) |
3332 		    NFSLCK_OPEN;
3333 		new_open->ls_uid = new_stp->ls_uid;
3334 		LIST_INIT(&new_open->ls_open);
3335 		new_open->ls_openowner = new_stp;
3336 		new_open->ls_lfp = lfp;
3337 		new_open->ls_clp = clp;
3338 		LIST_INSERT_HEAD(&lfp->lf_open, new_open, ls_file);
3339 		if (new_stp->ls_flags & NFSLCK_RECLAIM) {
3340 			new_stp->ls_flags = 0;
3341 		} else if ((nd->nd_flag & ND_NFSV41) != 0) {
3342 			/* NFSv4.1 never needs confirmation. */
3343 			new_stp->ls_flags = 0;
3344 
3345 			/*
3346 			 * This is where we can choose to issue a delegation.
3347 			 */
3348 			if (delegate && nfsrv_issuedelegs &&
3349 			    (writedeleg || readonly) &&
3350 			    (clp->lc_flags & (LCL_CALLBACKSON | LCL_CBDOWN)) ==
3351 			     LCL_CALLBACKSON &&
3352 			    !NFSRV_V4DELEGLIMIT(nfsrv_delegatecnt) &&
3353 			    NFSVNO_DELEGOK(vp) &&
3354 			    ((nd->nd_flag & ND_NFSV41) == 0 ||
3355 			     (new_stp->ls_flags & NFSLCK_WANTNODELEG) == 0)) {
3356 				new_deleg->ls_stateid.seqid =
3357 				    delegstateidp->seqid = 1;
3358 				new_deleg->ls_stateid.other[0] =
3359 				    delegstateidp->other[0]
3360 				    = clp->lc_clientid.lval[0];
3361 				new_deleg->ls_stateid.other[1] =
3362 				    delegstateidp->other[1]
3363 				    = clp->lc_clientid.lval[1];
3364 				new_deleg->ls_stateid.other[2] =
3365 				    delegstateidp->other[2]
3366 				    = nfsrv_nextstateindex(clp);
3367 				if (writedeleg && !NFSVNO_EXRDONLY(exp) &&
3368 				    (nfsrv_writedelegifpos || !readonly) &&
3369 				    ((nd->nd_flag & ND_NFSV41) == 0 ||
3370 				     (new_stp->ls_flags & NFSLCK_WANTRDELEG) ==
3371 				     0)) {
3372 					new_deleg->ls_flags =
3373 					    (NFSLCK_DELEGWRITE |
3374 					     NFSLCK_READACCESS |
3375 					     NFSLCK_WRITEACCESS);
3376 					*rflagsp |= NFSV4OPEN_WRITEDELEGATE;
3377 					nfsrv_writedelegcnt++;
3378 				} else {
3379 					new_deleg->ls_flags =
3380 					    (NFSLCK_DELEGREAD |
3381 					     NFSLCK_READACCESS);
3382 					*rflagsp |= NFSV4OPEN_READDELEGATE;
3383 				}
3384 				new_deleg->ls_uid = new_stp->ls_uid;
3385 				new_deleg->ls_lfp = lfp;
3386 				new_deleg->ls_clp = clp;
3387 				new_deleg->ls_filerev = filerev;
3388 				new_deleg->ls_compref = nd->nd_compref;
3389 				new_deleg->ls_lastrecall = 0;
3390 				LIST_INSERT_HEAD(&lfp->lf_deleg, new_deleg,
3391 				    ls_file);
3392 				LIST_INSERT_HEAD(NFSSTATEHASH(clp,
3393 				    new_deleg->ls_stateid), new_deleg, ls_hash);
3394 				LIST_INSERT_HEAD(&clp->lc_deleg, new_deleg,
3395 				    ls_list);
3396 				new_deleg = NULL;
3397 				NFSD_VNET(nfsstatsv1_p)->srvdelegates++;
3398 				nfsrv_openpluslock++;
3399 				nfsrv_delegatecnt++;
3400 			}
3401 			/*
3402 			 * Since NFSv4.1 never does an OpenConfirm, the first
3403 			 * open state will be acquired here.
3404 			 */
3405 			if (!(clp->lc_flags & LCL_STAMPEDSTABLE)) {
3406 				clp->lc_flags |= LCL_STAMPEDSTABLE;
3407 				len = clp->lc_idlen;
3408 				NFSBCOPY(clp->lc_id, clidp, len);
3409 				gotstate = 1;
3410 			}
3411 		} else {
3412 			*rflagsp |= NFSV4OPEN_RESULTCONFIRM;
3413 			new_stp->ls_flags = NFSLCK_NEEDSCONFIRM;
3414 		}
3415 		nfsrvd_refcache(new_stp->ls_op);
3416 		new_stp->ls_noopens = 0;
3417 		LIST_INIT(&new_stp->ls_open);
3418 		LIST_INSERT_HEAD(&new_stp->ls_open, new_open, ls_list);
3419 		LIST_INSERT_HEAD(&clp->lc_open, new_stp, ls_list);
3420 		LIST_INSERT_HEAD(NFSSTATEHASH(clp, new_open->ls_stateid),
3421 		    new_open, ls_hash);
3422 		openstp = new_open;
3423 		new_open = NULL;
3424 		*new_stpp = NULL;
3425 		NFSD_VNET(nfsstatsv1_p)->srvopens++;
3426 		nfsrv_openpluslock++;
3427 		NFSD_VNET(nfsstatsv1_p)->srvopenowners++;
3428 		nfsrv_openpluslock++;
3429 	}
3430 	if (!error) {
3431 		stateidp->seqid = openstp->ls_stateid.seqid;
3432 		stateidp->other[0] = openstp->ls_stateid.other[0];
3433 		stateidp->other[1] = openstp->ls_stateid.other[1];
3434 		stateidp->other[2] = openstp->ls_stateid.other[2];
3435 	}
3436 	NFSUNLOCKSTATE();
3437 	if (haslock) {
3438 		NFSLOCKV4ROOTMUTEX();
3439 		nfsv4_unlock(&nfsv4rootfs_lock, 1);
3440 		NFSUNLOCKV4ROOTMUTEX();
3441 	}
3442 	if (new_open)
3443 		free(new_open, M_NFSDSTATE);
3444 	if (new_deleg)
3445 		free(new_deleg, M_NFSDSTATE);
3446 
3447 	/*
3448 	 * If the NFSv4.1 client just acquired its first open, write a timestamp
3449 	 * to the stable storage file.
3450 	 */
3451 	if (gotstate != 0) {
3452 		nfsrv_writestable(clidp, len, NFSNST_NEWSTATE, p);
3453 		nfsrv_backupstable();
3454 	}
3455 
3456 out:
3457 	free(clidp, M_TEMP);
3458 	NFSEXITCODE2(error, nd);
3459 	return (error);
3460 }
3461 
3462 /*
3463  * Open update. Does the confirm, downgrade and close.
 *
 * The operation is selected by new_stp->ls_flags:
 * - NFSLCK_CONFIRM: clear the openowner's flags (including
 *   NFSLCK_NEEDSCONFIRM) and bump the open's stateid seqid.
 * - NFSLCK_CLOSE: free the open via nfsrv_freeopen().
 * - otherwise: downgrade, i.e. replace the open's share bits with the
 *   (subset) share bits supplied in new_stp->ls_flags.
 *
 * Arguments, as used by this function:
 * vp              - vnode of the file; must be VREG. In the close case with
 *                   local locking it is asserted to be exclusively locked,
 *                   is unlocked around nfsrv_freeopen() and relocked
 *                   exclusively before returning.
 * new_stp         - carries the request: ls_flags, ls_stateid, ls_seq and
 *                   ls_op are read here.
 * clientid        - client to look the open up under.
 * stateidp        - out: set to the open's stateid with seqid advanced by
 *                   one (0 is skipped for NFSv4.1). Only written once all
 *                   of the checks have passed.
 * nd, p           - request descriptor and thread, passed to helpers;
 *                   nd->nd_flag is tested for ND_NFSV41.
 * retwriteaccessp - optional out, only written in the close case: 1 if the
 *                   open being closed had NFSLCK_WRITEACCESS, else 0.
 *
 * Returns 0 on success or an NFSERR_xxx value. The values set directly
 * here are NFSERR_BADSTATEID, NFSERR_OLDSTATEID, NFSERR_ISDIR and
 * NFSERR_INVAL; anything else comes from nfsrv_checkrestart(),
 * nfsrv_getclient(), nfsrv_getstate() or nfsrv_checkseqid().
 *
 * The state lock (NFSLOCKSTATE) is taken after the restart check and is
 * released on every path before the function returns.
3464  */
3465 int
3466 nfsrv_openupdate(vnode_t vp, struct nfsstate *new_stp, nfsquad_t clientid,
3467     nfsv4stateid_t *stateidp, struct nfsrv_descript *nd, NFSPROC_T *p,
3468     int *retwriteaccessp)
3469 {
3470 	struct nfsstate *stp;
3471 	struct nfsclient *clp;
3472 	struct nfslockfile *lfp;
3473 	u_int32_t bits;
	/*
	 * gotstate/len/clidp: set in the confirm case when the client has not
	 * yet been stamped in stable storage; used after the state lock is
	 * dropped to write the stable storage record.
	 */
3474 	int error = 0, gotstate = 0, len = 0;
3475 	u_char *clidp = NULL;
3476 
3477 	/*
3478 	 * Check for restart conditions (client and server).
3479 	 */
3480 	error = nfsrv_checkrestart(clientid, new_stp->ls_flags,
3481 	    &new_stp->ls_stateid, 0);
3482 	if (error)
3483 		goto out;
3484 
	/*
	 * The client id buffer is allocated before the state lock is taken
	 * (presumably because an M_WAITOK allocation may sleep -- confirm).
	 */
3485 	clidp = malloc(NFSV4_OPAQUELIMIT, M_TEMP, M_WAITOK);
3486 	NFSLOCKSTATE();
3487 	/*
3488 	 * Get the open structure via clientid and stateid.
3489 	 */
3490 	error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
3491 	    (nfsquad_t)((u_quad_t)0), 0, nd, p);
3492 	if (!error)
3493 		error = nfsrv_getstate(clp, &new_stp->ls_stateid,
3494 		    new_stp->ls_flags, &stp);
3495 
3496 	/*
3497 	 * Sanity check the open.
	 * The state found must be an open (NFSLCK_OPEN), and the request
	 * must agree with the openowner: a confirm is only valid while the
	 * openowner has NFSLCK_NEEDSCONFIRM set, and a non-confirm request
	 * is only valid once that flag is clear.
3498 	 */
3499 	if (!error && (!(stp->ls_flags & NFSLCK_OPEN) ||
3500 		(!(new_stp->ls_flags & NFSLCK_CONFIRM) &&
3501 		 (stp->ls_openowner->ls_flags & NFSLCK_NEEDSCONFIRM)) ||
3502 		((new_stp->ls_flags & NFSLCK_CONFIRM) &&
3503 		 (!(stp->ls_openowner->ls_flags & NFSLCK_NEEDSCONFIRM)))))
3504 		error = NFSERR_BADSTATEID;
3505 
3506 	if (!error)
3507 		error = nfsrv_checkseqid(nd, new_stp->ls_seq,
3508 		    stp->ls_openowner, new_stp->ls_op);
	/*
	 * A stateid seqid that differs from the stored one is rejected as
	 * old, except for a confirm when not NFSv4.1, and except for a
	 * supplied seqid of 0 when NFSv4.1.
	 */
3509 	if (!error && stp->ls_stateid.seqid != new_stp->ls_stateid.seqid &&
3510 	    (((nd->nd_flag & ND_NFSV41) == 0 &&
3511 	      !(new_stp->ls_flags & NFSLCK_CONFIRM)) ||
3512 	     ((nd->nd_flag & ND_NFSV41) != 0 &&
3513 	      new_stp->ls_stateid.seqid != 0)))
3514 		error = NFSERR_OLDSTATEID;
	/* Only regular files are accepted. */
3515 	if (!error && vp->v_type != VREG) {
3516 		if (vp->v_type == VDIR)
3517 			error = NFSERR_ISDIR;
3518 		else
3519 			error = NFSERR_INVAL;
3520 	}
3521 
3522 	if (error) {
3523 		/*
3524 		 * If a client tries to confirm an Open with a bad
3525 		 * seqid# and there are no byte range locks or other Opens
3526 		 * on the openowner, just throw it away, so the next use of the
3527 		 * openowner will start a fresh seq#.
		 *
		 * NOTE(review): stp is only assigned when the lookups above
		 * succeeded; this relies on NFSERR_BADSEQID being produced
		 * only by nfsrv_checkseqid(), which is called after a
		 * successful lookup -- confirm.
3528 		 */
3529 		if (error == NFSERR_BADSEQID &&
3530 		    (new_stp->ls_flags & NFSLCK_CONFIRM) &&
3531 		    nfsrv_nootherstate(stp))
3532 			nfsrv_freeopenowner(stp->ls_openowner, 0, p);
3533 		NFSUNLOCKSTATE();
3534 		goto out;
3535 	}
3536 
3537 	/*
3538 	 * Set the return stateid.
	 * The returned seqid is the stored seqid plus one; the stored seqid
	 * itself is advanced in the confirm and downgrade cases below. For
	 * NFSv4.1 a wrap to 0 is replaced by 1.
3539 	 */
3540 	stateidp->seqid = stp->ls_stateid.seqid + 1;
3541 	if ((nd->nd_flag & ND_NFSV41) != 0 && stateidp->seqid == 0)
3542 		stateidp->seqid = 1;
3543 	stateidp->other[0] = stp->ls_stateid.other[0];
3544 	stateidp->other[1] = stp->ls_stateid.other[1];
3545 	stateidp->other[2] = stp->ls_stateid.other[2];
3546 	/*
3547 	 * Now, handle the three cases.
	 * Each case releases the state lock itself.
3548 	 */
3549 	if (new_stp->ls_flags & NFSLCK_CONFIRM) {
3550 		/*
3551 		 * If the open doesn't need confirmation, it seems to me that
3552 		 * there is a client error, but I'll just log it and keep going?
3553 		 */
3554 		if (!(stp->ls_openowner->ls_flags & NFSLCK_NEEDSCONFIRM))
3555 			printf("Nfsv4d: stray open confirm\n");
3556 		stp->ls_openowner->ls_flags = 0;
3557 		stp->ls_stateid.seqid++;
3558 		if ((nd->nd_flag & ND_NFSV41) != 0 &&
3559 		    stp->ls_stateid.seqid == 0)
3560 			stp->ls_stateid.seqid = 1;
		/*
		 * First confirmed open for this client: mark it stamped and
		 * copy its id while the state lock is held, so the stable
		 * storage write can be done after the lock is dropped.
		 */
3561 		if (!(clp->lc_flags & LCL_STAMPEDSTABLE)) {
3562 			clp->lc_flags |= LCL_STAMPEDSTABLE;
3563 			len = clp->lc_idlen;
3564 			NFSBCOPY(clp->lc_id, clidp, len);
3565 			gotstate = 1;
3566 		}
3567 		NFSUNLOCKSTATE();
3568 	} else if (new_stp->ls_flags & NFSLCK_CLOSE) {
3569 		lfp = stp->ls_lfp;
		/* Report whether the open being closed had write access. */
3570 		if (retwriteaccessp != NULL) {
3571 			if ((stp->ls_flags & NFSLCK_WRITEACCESS) != 0)
3572 				*retwriteaccessp = 1;
3573 			else
3574 				*retwriteaccessp = 0;
3575 		}
		/*
		 * When local locking is enabled and the open's ls_open list
		 * is not empty, the open is freed with the lf lock held
		 * instead of the state lock, and with the vnode unlocked.
		 * NOTE(review): when nfsrv_freeopen() returns non-zero the
		 * lf lock is not released here; presumably lfp no longer
		 * exists in that case -- confirm against nfsrv_freeopen().
		 */
3576 		if (nfsrv_dolocallocks != 0 && !LIST_EMPTY(&stp->ls_open)) {
3577 			/* Get the lf lock */
3578 			nfsrv_locklf(lfp);
3579 			NFSUNLOCKSTATE();
3580 			ASSERT_VOP_ELOCKED(vp, "nfsrv_openupdate");
3581 			NFSVOPUNLOCK(vp);
3582 			if (nfsrv_freeopen(stp, vp, 1, p) == 0) {
3583 				NFSLOCKSTATE();
3584 				nfsrv_unlocklf(lfp);
3585 				NFSUNLOCKSTATE();
3586 			}
3587 			NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
3588 		} else {
3589 			(void) nfsrv_freeopen(stp, NULL, 0, p);
3590 			NFSUNLOCKSTATE();
3591 		}
3592 	} else {
3593 		/*
3594 		 * Update the share bits, making sure that the new set are a
3595 		 * subset of the old ones.
		 * A requested bit that is not already set in the open fails
		 * with NFSERR_INVAL. Note that stateidp has already been
		 * updated above when that happens.
3596 		 */
3597 		bits = (new_stp->ls_flags & NFSLCK_SHAREBITS);
3598 		if (~(stp->ls_flags) & bits) {
3599 			NFSUNLOCKSTATE();
3600 			error = NFSERR_INVAL;
3601 			goto out;
3602 		}
3603 		stp->ls_flags = (bits | NFSLCK_OPEN);
3604 		stp->ls_stateid.seqid++;
3605 		if ((nd->nd_flag & ND_NFSV41) != 0 &&
3606 		    stp->ls_stateid.seqid == 0)
3607 			stp->ls_stateid.seqid = 1;
3608 		NFSUNLOCKSTATE();
3609 	}
3610 
3611 	/*
3612 	 * If the client just confirmed its first open, write a timestamp
3613 	 * to the stable storage file.
	 * This is done here, after the state lock has been released.
3614 	 */
3615 	if (gotstate != 0) {
3616 		nfsrv_writestable(clidp, len, NFSNST_NEWSTATE, p);
3617 		nfsrv_backupstable();
3618 	}
3619 
3620 out:
	/* clidp is still NULL here when the restart check failed. */
3621 	free(clidp, M_TEMP);
3622 	NFSEXITCODE2(error, nd);
3623 	return (error);
3624 }
3625 
3626 /*
3627  * Delegation update. Does the purge and return.
3628  */
3629 int
3630 nfsrv_delegupdate(struct nfsrv_descript *nd, nfsquad_t clientid,
3631     nfsv4stateid_t *stateidp, vnode_t vp, int op, struct ucred *cred,
3632     NFSPROC_T *p, int *retwriteaccessp)
3633 {
3634 	struct nfsstate *stp;
3635 	struct nfsclient *clp;
3636 	int error = 0;
3637 	fhandle_t fh;
3638 
3639 	/*
3640 	 * Do a sanity check against the file handle for DelegReturn.
3641 	 */
3642 	if (vp) {
3643 		error = nfsvno_getfh(vp, &fh, p);
3644 		if (error)
3645 			goto out;
3646 	}
3647 	/*
3648 	 * Check for restart conditions (client and server).
3649 	 */
3650 	if (op == NFSV4OP_DELEGRETURN)
3651 		error = nfsrv_checkrestart(clientid, NFSLCK_DELEGRETURN,
3652 			stateidp, 0);
3653 	else
3654 		error = nfsrv_checkrestart(clientid, NFSLCK_DELEGPURGE,
3655 			stateidp, 0);
3656 
3657 	NFSLOCKSTATE();
3658 	/*
3659 	 * Get the open structure via clientid and stateid.
3660 	 */
3661 	if (!error)
3662 	    error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
3663 		(nfsquad_t)((u_quad_t)0), 0, nd, p);
3664 	if (error) {
3665 		if (error == NFSERR_CBPATHDOWN)
3666 			error = 0;
3667 		if (error == NFSERR_STALECLIENTID && op == NFSV4OP_DELEGRETURN)
3668 			error = NFSERR_STALESTATEID;
3669 	}
3670 	if (!error && op == NFSV4OP_DELEGRETURN) {
3671 	    error = nfsrv_getstate(clp, stateidp, NFSLCK_DELEGRETURN, &stp);
3672 	    if (!error && stp->ls_stateid.seqid != stateidp->seqid &&
3673 		((nd->nd_flag & ND_NFSV41) == 0 || stateidp->seqid != 0))
3674 		error = NFSERR_OLDSTATEID;
3675 	}
3676 	/*
3677 	 * NFSERR_EXPIRED means that the state has gone away,
3678 	 * so Delegations have been purged. Just return ok.
3679 	 */
3680 	if (error == NFSERR_EXPIRED && op == NFSV4OP_DELEGPURGE) {
3681 		NFSUNLOCKSTATE();
3682 		error = 0;
3683 		goto out;
3684 	}
3685 	if (error) {
3686 		NFSUNLOCKSTATE();
3687 		goto out;
3688 	}
3689 
3690 	if (op == NFSV4OP_DELEGRETURN) {
3691 		if (NFSBCMP((caddr_t)&fh, (caddr_t)&stp->ls_lfp->lf_fh,
3692 		    sizeof (fhandle_t))) {
3693 			NFSUNLOCKSTATE();
3694 			error = NFSERR_BADSTATEID;
3695 			goto out;
3696 		}
3697 		if (retwriteaccessp != NULL) {
3698 			if ((stp->ls_flags & NFSLCK_DELEGWRITE) != 0)
3699 				*retwriteaccessp = 1;
3700 			else
3701 				*retwriteaccessp = 0;
3702 		}
3703 		nfsrv_freedeleg(stp);
3704 	} else {
3705 		nfsrv_freedeleglist(&clp->lc_olddeleg);
3706 	}
3707 	NFSUNLOCKSTATE();
3708 	error = 0;
3709 
3710 out:
3711 	NFSEXITCODE(error);
3712 	return (error);
3713 }
3714 
3715 /*
3716  * Release lock owner.
3717  */
3718 int
3719 nfsrv_releaselckown(struct nfsstate *new_stp, nfsquad_t clientid,
3720     NFSPROC_T *p)
3721 {
3722 	struct nfsstate *stp, *nstp, *openstp, *ownstp;
3723 	struct nfsclient *clp;
3724 	int error = 0;
3725 
3726 	/*
3727 	 * Check for restart conditions (client and server).
3728 	 */
3729 	error = nfsrv_checkrestart(clientid, new_stp->ls_flags,
3730 	    &new_stp->ls_stateid, 0);
3731 	if (error)
3732 		goto out;
3733 
3734 	NFSLOCKSTATE();
3735 	/*
3736 	 * Get the lock owner by name.
3737 	 */
3738 	error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
3739 	    (nfsquad_t)((u_quad_t)0), 0, NULL, p);
3740 	if (error) {
3741 		NFSUNLOCKSTATE();
3742 		goto out;
3743 	}
3744 	LIST_FOREACH(ownstp, &clp->lc_open, ls_list) {
3745 	    LIST_FOREACH(openstp, &ownstp->ls_open, ls_list) {
3746 		stp = LIST_FIRST(&openstp->ls_open);
3747 		while (stp != LIST_END(&openstp->ls_open)) {
3748 		    nstp = LIST_NEXT(stp, ls_list);
3749 		    /*
3750 		     * If the owner matches, check for locks and
3751 		     * then free or return an error.
3752 		     */
3753 		    if (stp->ls_ownerlen == new_stp->ls_ownerlen &&
3754 			!NFSBCMP(stp->ls_owner, new_stp->ls_owner,
3755 			 stp->ls_ownerlen)){
3756 			if (LIST_EMPTY(&stp->ls_lock)) {
3757 			    nfsrv_freelockowner(stp, NULL, 0, p);
3758 			} else {
3759 			    NFSUNLOCKSTATE();
3760 			    error = NFSERR_LOCKSHELD;
3761 			    goto out;
3762 			}
3763 		    }
3764 		    stp = nstp;
3765 		}
3766 	    }
3767 	}
3768 	NFSUNLOCKSTATE();
3769 
3770 out:
3771 	NFSEXITCODE(error);
3772 	return (error);
3773 }
3774 
3775 /*
3776  * Get the file handle for a lock structure.
3777  */
3778 static int
3779 nfsrv_getlockfh(vnode_t vp, u_short flags, struct nfslockfile *new_lfp,
3780     fhandle_t *nfhp, NFSPROC_T *p)
3781 {
3782 	fhandle_t *fhp = NULL;
3783 	int error;
3784 
3785 	/*
3786 	 * For lock, use the new nfslock structure, otherwise just
3787 	 * a fhandle_t on the stack.
3788 	 */
3789 	if (flags & NFSLCK_OPEN) {
3790 		KASSERT(new_lfp != NULL, ("nfsrv_getlockfh: new_lfp NULL"));
3791 		fhp = &new_lfp->lf_fh;
3792 	} else if (nfhp) {
3793 		fhp = nfhp;
3794 	} else {
3795 		panic("nfsrv_getlockfh");
3796 	}
3797 	error = nfsvno_getfh(vp, fhp, p);
3798 	NFSEXITCODE(error);
3799 	return (error);
3800 }
3801 
3802 /*
3803  * Get an nfs lock structure. Allocate one, as required, and return a
3804  * pointer to it.
3805  * Returns an NFSERR_xxx upon failure or -1 to indicate no current lock.
3806  */
3807 static int
3808 nfsrv_getlockfile(u_short flags, struct nfslockfile **new_lfpp,
3809     struct nfslockfile **lfpp, fhandle_t *nfhp, int lockit)
3810 {
3811 	struct nfslockfile *lfp;
3812 	fhandle_t *fhp = NULL, *tfhp;
3813 	struct nfslockhashhead *hp;
3814 	struct nfslockfile *new_lfp = NULL;
3815 
3816 	/*
3817 	 * For lock, use the new nfslock structure, otherwise just
3818 	 * a fhandle_t on the stack.
3819 	 */
3820 	if (flags & NFSLCK_OPEN) {
3821 		new_lfp = *new_lfpp;
3822 		fhp = &new_lfp->lf_fh;
3823 	} else if (nfhp) {
3824 		fhp = nfhp;
3825 	} else {
3826 		panic("nfsrv_getlockfile");
3827 	}
3828 
3829 	hp = NFSLOCKHASH(fhp);
3830 	LIST_FOREACH(lfp, hp, lf_hash) {
3831 		tfhp = &lfp->lf_fh;
3832 		if (NFSVNO_CMPFH(fhp, tfhp)) {
3833 			if (lockit)
3834 				nfsrv_locklf(lfp);
3835 			*lfpp = lfp;
3836 			return (0);
3837 		}
3838 	}
3839 	if (!(flags & NFSLCK_OPEN))
3840 		return (-1);
3841 
3842 	/*
3843 	 * No match, so chain the new one into the list.
3844 	 */
3845 	LIST_INIT(&new_lfp->lf_open);
3846 	LIST_INIT(&new_lfp->lf_lock);
3847 	LIST_INIT(&new_lfp->lf_deleg);
3848 	LIST_INIT(&new_lfp->lf_locallock);
3849 	LIST_INIT(&new_lfp->lf_rollback);
3850 	new_lfp->lf_locallock_lck.nfslock_usecnt = 0;
3851 	new_lfp->lf_locallock_lck.nfslock_lock = 0;
3852 	new_lfp->lf_usecount = 0;
3853 	LIST_INSERT_HEAD(hp, new_lfp, lf_hash);
3854 	*lfpp = new_lfp;
3855 	*new_lfpp = NULL;
3856 	return (0);
3857 }
3858 
3859 /*
3860  * This function adds a nfslock lock structure to the list for the associated
3861  * nfsstate and nfslockfile structures. It will be inserted after the
3862  * entry pointed at by insert_lop.
3863  */
3864 static void
3865 nfsrv_insertlock(struct nfslock *new_lop, struct nfslock *insert_lop,
3866     struct nfsstate *stp, struct nfslockfile *lfp)
3867 {
3868 	struct nfslock *lop, *nlop;
3869 
3870 	new_lop->lo_stp = stp;
3871 	new_lop->lo_lfp = lfp;
3872 
3873 	if (stp != NULL) {
3874 		/* Insert in increasing lo_first order */
3875 		lop = LIST_FIRST(&lfp->lf_lock);
3876 		if (lop == LIST_END(&lfp->lf_lock) ||
3877 		    new_lop->lo_first <= lop->lo_first) {
3878 			LIST_INSERT_HEAD(&lfp->lf_lock, new_lop, lo_lckfile);
3879 		} else {
3880 			nlop = LIST_NEXT(lop, lo_lckfile);
3881 			while (nlop != LIST_END(&lfp->lf_lock) &&
3882 			       nlop->lo_first < new_lop->lo_first) {
3883 				lop = nlop;
3884 				nlop = LIST_NEXT(lop, lo_lckfile);
3885 			}
3886 			LIST_INSERT_AFTER(lop, new_lop, lo_lckfile);
3887 		}
3888 	} else {
3889 		new_lop->lo_lckfile.le_prev = NULL;	/* list not used */
3890 	}
3891 
3892 	/*
3893 	 * Insert after insert_lop, which is overloaded as stp or lfp for
3894 	 * an empty list.
3895 	 */
3896 	if (stp == NULL && (struct nfslockfile *)insert_lop == lfp)
3897 		LIST_INSERT_HEAD(&lfp->lf_locallock, new_lop, lo_lckowner);
3898 	else if ((struct nfsstate *)insert_lop == stp)
3899 		LIST_INSERT_HEAD(&stp->ls_lock, new_lop, lo_lckowner);
3900 	else
3901 		LIST_INSERT_AFTER(insert_lop, new_lop, lo_lckowner);
3902 	if (stp != NULL) {
3903 		NFSD_VNET(nfsstatsv1_p)->srvlocks++;
3904 		nfsrv_openpluslock++;
3905 	}
3906 }
3907 
/*
 * This function updates the locking for a lock owner and given file. It
 * maintains a list of lock ranges ordered on increasing file offset that
 * are NFSLCK_READ or NFSLCK_WRITE and non-overlapping (aka POSIX style).
 * Unless new_lop is an unlock (NFSLCK_UNLOCK set in lo_flags), it is added
 * to the list and sometimes the one pointed at by other_lopp is used too.
 *
 * stp        - the lock owner's state; when NULL, the lf_locallock list
 *              of lfp is the one updated instead of stp's ls_lock list.
 * new_lopp   - the new lock/unlock range. *new_lopp is set to NULL when
 *              the structure has been linked into a list, so a non-NULL
 *              value upon return means the structure was not used.
 * other_lopp - spare structure for the case where an existing range gets
 *              split in two. *other_lopp is set to NULL when it is used.
 *              It may be NULL for an unlock only, in which case new_lop
 *              is used for the second half of the split.
 * lfp        - the file the range applies to. Entries on the owner's list
 *              that are for other files are skipped.
 *
 * NOTE(review): the boundary handling below (lo_first of one range being
 * set to lo_end of its neighbour) suggests lo_end is exclusive, i.e. one
 * past the last byte of the range - confirm against the code that fills
 * in lo_first/lo_end.
 */
static void
nfsrv_updatelock(struct nfsstate *stp, struct nfslock **new_lopp,
    struct nfslock **other_lopp, struct nfslockfile *lfp)
{
	struct nfslock *new_lop = *new_lopp;
	struct nfslock *lop, *tlop, *ilop;
	struct nfslock *other_lop = *other_lopp;
	int unlock = 0, myfile = 0;
	u_int64_t tmp;

	/*
	 * Work down the list until the lock is merged.
	 * ilop tracks the insertion point for new_lop. It starts out as the
	 * stp or lfp pointer, which nfsrv_insertlock() takes to mean
	 * "insert at the head of the list".
	 */
	if (new_lop->lo_flags & NFSLCK_UNLOCK)
		unlock = 1;
	if (stp != NULL) {
		ilop = (struct nfslock *)stp;
		lop = LIST_FIRST(&stp->ls_lock);
	} else {
		ilop = (struct nfslock *)lfp;
		lop = LIST_FIRST(&lfp->lf_locallock);
	}
	while (lop != NULL) {
	    /*
	     * Only check locks for this file that aren't before the start of
	     * new lock's range.
	     */
	    if (lop->lo_lfp == lfp) {
	      /* Remember that at least one entry for this file was seen. */
	      myfile = 1;
	      if (lop->lo_end >= new_lop->lo_first) {
		if (new_lop->lo_end < lop->lo_first) {
			/*
			 * If the new lock ends before the start of the
			 * current lock's range, no merge, just insert
			 * the new lock.
			 */
			break;
		}
		if (new_lop->lo_flags == lop->lo_flags ||
		    (new_lop->lo_first <= lop->lo_first &&
		     new_lop->lo_end >= lop->lo_end)) {
			/*
			 * This lock can be absorbed by the new lock/unlock.
			 * This happens when it covers the entire range
			 * of the old lock or is contiguous
			 * with the old lock and is of the same type or an
			 * unlock.
			 * The new range is grown to cover the old one and
			 * the old one is freed. The next pointer is fetched
			 * before the old entry is freed.
			 */
			if (lop->lo_first < new_lop->lo_first)
				new_lop->lo_first = lop->lo_first;
			if (lop->lo_end > new_lop->lo_end)
				new_lop->lo_end = lop->lo_end;
			tlop = lop;
			lop = LIST_NEXT(lop, lo_lckowner);
			nfsrv_freenfslock(tlop);
			continue;
		}

		/*
		 * All these cases are for contiguous locks that are not the
		 * same type, so they can't be merged.
		 */
		if (new_lop->lo_first <= lop->lo_first) {
			/*
			 * This case is where the new lock overlaps with the
			 * first part of the old lock. Move the start of the
			 * old lock to just past the end of the new lock. The
			 * new lock will be inserted in front of the old, since
			 * ilop hasn't been updated. (We are done now.)
			 */
			lop->lo_first = new_lop->lo_end;
			break;
		}
		if (new_lop->lo_end >= lop->lo_end) {
			/*
			 * This case is where the new lock overlaps with the
			 * end of the old lock's range. Move the old lock's
			 * end to just before the new lock's first and insert
			 * the new lock after the old lock.
			 * Might not be done yet, since the new lock could
			 * overlap further locks with higher ranges.
			 */
			lop->lo_end = new_lop->lo_first;
			ilop = lop;
			lop = LIST_NEXT(lop, lo_lckowner);
			continue;
		}
		/*
		 * The final case is where the new lock's range is in the
		 * middle of the current lock's and splits the current lock
		 * up. Use *other_lopp to handle the second part of the
		 * split old lock range. (We are done now.)
		 * For unlock, we use new_lop as other_lop and tmp, since
		 * other_lop and new_lop are the same for this case.
		 * We noted the unlock case above, so we don't need
		 * new_lop->lo_flags any longer.
		 */
		/* Saved first, since new_lop may be other_lop (see above). */
		tmp = new_lop->lo_first;
		if (other_lop == NULL) {
			if (!unlock)
				panic("nfsd srv update unlock");
			other_lop = new_lop;
			*new_lopp = NULL;
		}
		other_lop->lo_first = new_lop->lo_end;
		other_lop->lo_end = lop->lo_end;
		other_lop->lo_flags = lop->lo_flags;
		other_lop->lo_stp = stp;
		other_lop->lo_lfp = lfp;
		lop->lo_end = tmp;
		nfsrv_insertlock(other_lop, lop, stp, lfp);
		*other_lopp = NULL;
		ilop = lop;
		break;
	      }
	    }
	    /*
	     * Step to the next entry. Once entries for this file have been
	     * seen, the end of the list or an entry for a different file
	     * ends the search, with ilop left at the last entry visited.
	     */
	    ilop = lop;
	    lop = LIST_NEXT(lop, lo_lckowner);
	    if (myfile && (lop == NULL || lop->lo_lfp != lfp))
		break;
	}

	/*
	 * Insert the new lock in the list at the appropriate place.
	 * (An unlock is never inserted, it only trims/removes ranges.)
	 */
	if (!unlock) {
		nfsrv_insertlock(new_lop, ilop, stp, lfp);
		*new_lopp = NULL;
	}
}
4045 
4046 /*
4047  * This function handles sequencing of locks, etc.
4048  * It returns an error that indicates what the caller should do.
4049  */
4050 static int
4051 nfsrv_checkseqid(struct nfsrv_descript *nd, u_int32_t seqid,
4052     struct nfsstate *stp, struct nfsrvcache *op)
4053 {
4054 	int error = 0;
4055 
4056 	if ((nd->nd_flag & ND_NFSV41) != 0)
4057 		/* NFSv4.1 ignores the open_seqid and lock_seqid. */
4058 		goto out;
4059 	if (op != nd->nd_rp)
4060 		panic("nfsrvstate checkseqid");
4061 	if (!(op->rc_flag & RC_INPROG))
4062 		panic("nfsrvstate not inprog");
4063 	if (stp->ls_op && stp->ls_op->rc_refcnt <= 0) {
4064 		printf("refcnt=%d\n", stp->ls_op->rc_refcnt);
4065 		panic("nfsrvstate op refcnt");
4066 	}
4067 
4068 	/* If ND_ERELOOKUP is set, the seqid has already been handled. */
4069 	if ((nd->nd_flag & ND_ERELOOKUP) != 0)
4070 		goto out;
4071 
4072 	if ((stp->ls_seq + 1) == seqid) {
4073 		if (stp->ls_op)
4074 			nfsrvd_derefcache(stp->ls_op);
4075 		stp->ls_op = op;
4076 		nfsrvd_refcache(op);
4077 		stp->ls_seq = seqid;
4078 		goto out;
4079 	} else if (stp->ls_seq == seqid && stp->ls_op &&
4080 		op->rc_xid == stp->ls_op->rc_xid &&
4081 		op->rc_refcnt == 0 &&
4082 		op->rc_reqlen == stp->ls_op->rc_reqlen &&
4083 		op->rc_cksum == stp->ls_op->rc_cksum) {
4084 		if (stp->ls_op->rc_flag & RC_INPROG) {
4085 			error = NFSERR_DONTREPLY;
4086 			goto out;
4087 		}
4088 		nd->nd_rp = stp->ls_op;
4089 		nd->nd_rp->rc_flag |= RC_INPROG;
4090 		nfsrvd_delcache(op);
4091 		error = NFSERR_REPLYFROMCACHE;
4092 		goto out;
4093 	}
4094 	error = NFSERR_BADSEQID;
4095 
4096 out:
4097 	NFSEXITCODE2(error, nd);
4098 	return (error);
4099 }
4100 
4101 /*
4102  * Get the client ip address for callbacks. If the strings can't be parsed,
4103  * just set lc_program to 0 to indicate no callbacks are possible.
4104  * (For cases where the address can't be parsed or is 0.0.0.0.0.0, set
4105  *  the address to the client's transport address. This won't be used
4106  *  for callbacks, but can be printed out by nfsstats for info.)
4107  * Return error if the xdr can't be parsed, 0 otherwise.
4108  */
4109 int
4110 nfsrv_getclientipaddr(struct nfsrv_descript *nd, struct nfsclient *clp)
4111 {
4112 	u_int32_t *tl;
4113 	u_char *cp, *cp2;
4114 	int i, j, maxalen = 0, minalen = 0;
4115 	sa_family_t af;
4116 #ifdef INET
4117 	struct sockaddr_in *rin = NULL, *sin;
4118 #endif
4119 #ifdef INET6
4120 	struct sockaddr_in6 *rin6 = NULL, *sin6;
4121 #endif
4122 	u_char *addr;
4123 	int error = 0, cantparse = 0;
4124 	union {
4125 		in_addr_t ival;
4126 		u_char cval[4];
4127 	} ip;
4128 	union {
4129 		in_port_t sval;
4130 		u_char cval[2];
4131 	} port;
4132 
4133 	/* 8 is the maximum length of the port# string. */
4134 	addr = malloc(INET6_ADDRSTRLEN + 8, M_TEMP, M_WAITOK);
4135 	clp->lc_req.nr_client = NULL;
4136 	clp->lc_req.nr_lock = 0;
4137 	af = AF_UNSPEC;
4138 	NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
4139 	i = fxdr_unsigned(int, *tl);
4140 	if (i >= 3 && i <= 4) {
4141 		error = nfsrv_mtostr(nd, addr, i);
4142 		if (error)
4143 			goto nfsmout;
4144 #ifdef INET
4145 		if (!strcmp(addr, "tcp")) {
4146 			clp->lc_flags |= LCL_TCPCALLBACK;
4147 			clp->lc_req.nr_sotype = SOCK_STREAM;
4148 			clp->lc_req.nr_soproto = IPPROTO_TCP;
4149 			af = AF_INET;
4150 		} else if (!strcmp(addr, "udp")) {
4151 			clp->lc_req.nr_sotype = SOCK_DGRAM;
4152 			clp->lc_req.nr_soproto = IPPROTO_UDP;
4153 			af = AF_INET;
4154 		}
4155 #endif
4156 #ifdef INET6
4157 		if (af == AF_UNSPEC) {
4158 			if (!strcmp(addr, "tcp6")) {
4159 				clp->lc_flags |= LCL_TCPCALLBACK;
4160 				clp->lc_req.nr_sotype = SOCK_STREAM;
4161 				clp->lc_req.nr_soproto = IPPROTO_TCP;
4162 				af = AF_INET6;
4163 			} else if (!strcmp(addr, "udp6")) {
4164 				clp->lc_req.nr_sotype = SOCK_DGRAM;
4165 				clp->lc_req.nr_soproto = IPPROTO_UDP;
4166 				af = AF_INET6;
4167 			}
4168 		}
4169 #endif
4170 		if (af == AF_UNSPEC) {
4171 			cantparse = 1;
4172 		}
4173 	} else {
4174 		cantparse = 1;
4175 		if (i > 0) {
4176 			error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
4177 			if (error)
4178 				goto nfsmout;
4179 		}
4180 	}
4181 	/*
4182 	 * The caller has allocated clp->lc_req.nr_nam to be large enough
4183 	 * for either AF_INET or AF_INET6 and zeroed out the contents.
4184 	 * maxalen is set to the maximum length of the host IP address string
4185 	 * plus 8 for the maximum length of the port#.
4186 	 * minalen is set to the minimum length of the host IP address string
4187 	 * plus 4 for the minimum length of the port#.
4188 	 * These lengths do not include NULL termination,
4189 	 * so INET[6]_ADDRSTRLEN - 1 is used in the calculations.
4190 	 */
4191 	switch (af) {
4192 #ifdef INET
4193 	case AF_INET:
4194 		rin = (struct sockaddr_in *)clp->lc_req.nr_nam;
4195 		rin->sin_family = AF_INET;
4196 		rin->sin_len = sizeof(struct sockaddr_in);
4197 		maxalen = INET_ADDRSTRLEN - 1 + 8;
4198 		minalen = 7 + 4;
4199 		break;
4200 #endif
4201 #ifdef INET6
4202 	case AF_INET6:
4203 		rin6 = (struct sockaddr_in6 *)clp->lc_req.nr_nam;
4204 		rin6->sin6_family = AF_INET6;
4205 		rin6->sin6_len = sizeof(struct sockaddr_in6);
4206 		maxalen = INET6_ADDRSTRLEN - 1 + 8;
4207 		minalen = 3 + 4;
4208 		break;
4209 #endif
4210 	}
4211 	NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
4212 	i = fxdr_unsigned(int, *tl);
4213 	if (i < 0) {
4214 		error = NFSERR_BADXDR;
4215 		goto nfsmout;
4216 	} else if (i == 0) {
4217 		cantparse = 1;
4218 	} else if (!cantparse && i <= maxalen && i >= minalen) {
4219 		error = nfsrv_mtostr(nd, addr, i);
4220 		if (error)
4221 			goto nfsmout;
4222 
4223 		/*
4224 		 * Parse out the address fields. We expect 6 decimal numbers
4225 		 * separated by '.'s for AF_INET and two decimal numbers
4226 		 * preceeded by '.'s for AF_INET6.
4227 		 */
4228 		cp = NULL;
4229 		switch (af) {
4230 #ifdef INET6
4231 		/*
4232 		 * For AF_INET6, first parse the host address.
4233 		 */
4234 		case AF_INET6:
4235 			cp = strchr(addr, '.');
4236 			if (cp != NULL) {
4237 				*cp++ = '\0';
4238 				if (inet_pton(af, addr, &rin6->sin6_addr) == 1)
4239 					i = 4;
4240 				else {
4241 					cp = NULL;
4242 					cantparse = 1;
4243 				}
4244 			}
4245 			break;
4246 #endif
4247 #ifdef INET
4248 		case AF_INET:
4249 			cp = addr;
4250 			i = 0;
4251 			break;
4252 #endif
4253 		}
4254 		while (cp != NULL && *cp && i < 6) {
4255 			cp2 = cp;
4256 			while (*cp2 && *cp2 != '.')
4257 				cp2++;
4258 			if (*cp2)
4259 				*cp2++ = '\0';
4260 			else if (i != 5) {
4261 				cantparse = 1;
4262 				break;
4263 			}
4264 			j = nfsrv_getipnumber(cp);
4265 			if (j >= 0) {
4266 				if (i < 4)
4267 					ip.cval[3 - i] = j;
4268 				else
4269 					port.cval[5 - i] = j;
4270 			} else {
4271 				cantparse = 1;
4272 				break;
4273 			}
4274 			cp = cp2;
4275 			i++;
4276 		}
4277 		if (!cantparse) {
4278 			/*
4279 			 * The host address INADDR_ANY is (mis)used to indicate
4280 			 * "there is no valid callback address".
4281 			 */
4282 			switch (af) {
4283 #ifdef INET6
4284 			case AF_INET6:
4285 				if (!IN6_ARE_ADDR_EQUAL(&rin6->sin6_addr,
4286 				    &in6addr_any))
4287 					rin6->sin6_port = htons(port.sval);
4288 				else
4289 					cantparse = 1;
4290 				break;
4291 #endif
4292 #ifdef INET
4293 			case AF_INET:
4294 				if (ip.ival != INADDR_ANY) {
4295 					rin->sin_addr.s_addr = htonl(ip.ival);
4296 					rin->sin_port = htons(port.sval);
4297 				} else {
4298 					cantparse = 1;
4299 				}
4300 				break;
4301 #endif
4302 			}
4303 		}
4304 	} else {
4305 		cantparse = 1;
4306 		if (i > 0) {
4307 			error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
4308 			if (error)
4309 				goto nfsmout;
4310 		}
4311 	}
4312 	if (cantparse) {
4313 		switch (nd->nd_nam->sa_family) {
4314 #ifdef INET
4315 		case AF_INET:
4316 			sin = (struct sockaddr_in *)nd->nd_nam;
4317 			rin = (struct sockaddr_in *)clp->lc_req.nr_nam;
4318 			rin->sin_family = AF_INET;
4319 			rin->sin_len = sizeof(struct sockaddr_in);
4320 			rin->sin_addr.s_addr = sin->sin_addr.s_addr;
4321 			rin->sin_port = 0x0;
4322 			break;
4323 #endif
4324 #ifdef INET6
4325 		case AF_INET6:
4326 			sin6 = (struct sockaddr_in6 *)nd->nd_nam;
4327 			rin6 = (struct sockaddr_in6 *)clp->lc_req.nr_nam;
4328 			rin6->sin6_family = AF_INET6;
4329 			rin6->sin6_len = sizeof(struct sockaddr_in6);
4330 			rin6->sin6_addr = sin6->sin6_addr;
4331 			rin6->sin6_port = 0x0;
4332 			break;
4333 #endif
4334 		}
4335 		clp->lc_program = 0;
4336 	}
4337 nfsmout:
4338 	free(addr, M_TEMP);
4339 	NFSEXITCODE2(error, nd);
4340 	return (error);
4341 }
4342 
/*
 * Turn a string of one to three decimal digits into a number in the
 * range 0-255. Return -1 upon error, which is an empty string, a
 * character that is not a decimal digit, more than three digits or a
 * value greater than 255.
 */
static int
nfsrv_getipnumber(u_char *cp)
{
	int val = 0, ndigits = 0;

	/*
	 * An empty field (as in "1..2") is not a number, so do not
	 * let it be taken for a 0.
	 */
	if (*cp == '\0')
		return (-1);
	for (; *cp != '\0'; cp++) {
		if (ndigits >= 3 || *cp < '0' || *cp > '9')
			return (-1);
		val = val * 10 + (*cp - '0');
		ndigits++;
	}
	if (val < 256)
		return (val);
	return (-1);
}
4364 
4365 /*
4366  * This function checks for restart conditions.
4367  */
4368 static int
4369 nfsrv_checkrestart(nfsquad_t clientid, u_int32_t flags,
4370     nfsv4stateid_t *stateidp, int specialid)
4371 {
4372 	int ret = 0;
4373 
4374 	/*
4375 	 * First check for a server restart. Open, LockT, ReleaseLockOwner
4376 	 * and DelegPurge have a clientid, the rest a stateid.
4377 	 */
4378 	if (flags &
4379 	    (NFSLCK_OPEN | NFSLCK_TEST | NFSLCK_RELEASE | NFSLCK_DELEGPURGE)) {
4380 		if (clientid.lval[0] != NFSD_VNET(nfsrvboottime)) {
4381 			ret = NFSERR_STALECLIENTID;
4382 			goto out;
4383 		}
4384 	} else if (stateidp->other[0] != NFSD_VNET(nfsrvboottime) &&
4385 		specialid == 0) {
4386 		ret = NFSERR_STALESTATEID;
4387 		goto out;
4388 	}
4389 
4390 	/*
4391 	 * Read, Write, Setattr and LockT can return NFSERR_GRACE and do
4392 	 * not use a lock/open owner seqid#, so the check can be done now.
4393 	 * (The others will be checked, as required, later.)
4394 	 */
4395 	if (!(flags & (NFSLCK_CHECK | NFSLCK_TEST)))
4396 		goto out;
4397 
4398 	NFSLOCKSTATE();
4399 	ret = nfsrv_checkgrace(NULL, NULL, flags);
4400 	NFSUNLOCKSTATE();
4401 
4402 out:
4403 	NFSEXITCODE(ret);
4404 	return (ret);
4405 }
4406 
4407 /*
4408  * Check for grace.
4409  */
4410 static int
4411 nfsrv_checkgrace(struct nfsrv_descript *nd, struct nfsclient *clp,
4412     u_int32_t flags)
4413 {
4414 	int error = 0, notreclaimed;
4415 	struct nfsrv_stable *sp;
4416 
4417 	if ((NFSD_VNET(nfsrv_stablefirst).nsf_flags & (NFSNSF_UPDATEDONE |
4418 	     NFSNSF_GRACEOVER)) == 0) {
4419 		/*
4420 		 * First, check to see if all of the clients have done a
4421 		 * ReclaimComplete.  If so, grace can end now.
4422 		 */
4423 		notreclaimed = 0;
4424 		LIST_FOREACH(sp, &NFSD_VNET(nfsrv_stablefirst).nsf_head,
4425 		    nst_list) {
4426 			if ((sp->nst_flag & NFSNST_RECLAIMED) == 0) {
4427 				notreclaimed = 1;
4428 				break;
4429 			}
4430 		}
4431 		if (notreclaimed == 0)
4432 			NFSD_VNET(nfsrv_stablefirst).nsf_flags |=
4433 			    (NFSNSF_GRACEOVER | NFSNSF_NEEDLOCK);
4434 	}
4435 
4436 	if ((NFSD_VNET(nfsrv_stablefirst).nsf_flags & NFSNSF_GRACEOVER) != 0) {
4437 		if (flags & NFSLCK_RECLAIM) {
4438 			error = NFSERR_NOGRACE;
4439 			goto out;
4440 		}
4441 	} else {
4442 		if (!(flags & NFSLCK_RECLAIM)) {
4443 			error = NFSERR_GRACE;
4444 			goto out;
4445 		}
4446 		if (nd != NULL && clp != NULL &&
4447 		    (nd->nd_flag & ND_NFSV41) != 0 &&
4448 		    (clp->lc_flags & LCL_RECLAIMCOMPLETE) != 0) {
4449 			error = NFSERR_NOGRACE;
4450 			goto out;
4451 		}
4452 
4453 		/*
4454 		 * If grace is almost over and we are still getting Reclaims,
4455 		 * extend grace a bit.
4456 		 */
4457 		if ((NFSD_MONOSEC + NFSRV_LEASEDELTA) >
4458 		    NFSD_VNET(nfsrv_stablefirst).nsf_eograce)
4459 			NFSD_VNET(nfsrv_stablefirst).nsf_eograce =
4460 				NFSD_MONOSEC + NFSRV_LEASEDELTA;
4461 	}
4462 
4463 out:
4464 	NFSEXITCODE(error);
4465 	return (error);
4466 }
4467 
/*
 * Do a server callback.
 * The "trunc" argument is slightly overloaded and refers to different
 * boolean arguments for CBRECALL and CBLAYOUTRECALL.
 *
 * clp      - the client to call back. It must not have LCL_NEEDSCONFIRM
 *            set (panics otherwise). lc_cbref is held incremented for the
 *            duration of the call.
 * procnum  - NFSV4OP_CBGETATTR, NFSV4OP_CBRECALL, NFSV4OP_CBLAYOUTRECALL
 *            or NFSV4PROC_CBNULL. Anything else gets NFSERR_SERVERFAULT.
 * stateidp - stateid put in the CBRECALL/CBLAYOUTRECALL arguments.
 * fhp      - file handle put in the arguments of all but CBNULL.
 * nap      - where the attributes in a CBGETATTR reply are loaded.
 * attrbitp - attribute bits requested by CBGETATTR.
 * laytype  - layout type put in the CBLAYOUTRECALL arguments.
 *
 * Returns 0, the error from building/sending the request, or the
 * nd_repstat of the reply when the client failed the callback.
 */
static int
nfsrv_docallback(struct nfsclient *clp, int procnum, nfsv4stateid_t *stateidp,
    int trunc, fhandle_t *fhp, struct nfsvattr *nap, nfsattrbit_t *attrbitp,
    int laytype, NFSPROC_T *p)
{
	struct mbuf *m;
	u_int32_t *tl;
	struct nfsrv_descript *nd;
	struct ucred *cred;
	int error = 0, slotpos;
	u_int32_t callback;
	struct nfsdsession *sep = NULL;
	uint64_t tval;
	bool dotls;

	nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO);
	cred = newnfs_getcred();
	NFSLOCKSTATE();	/* mostly for lc_cbref++ */
	if (clp->lc_flags & LCL_NEEDSCONFIRM) {
		NFSUNLOCKSTATE();
		panic("docallb");
	}
	clp->lc_cbref++;

	/*
	 * Fill the callback program# and version into the request
	 * structure for newnfs_connect() to use.
	 */
	clp->lc_req.nr_prog = clp->lc_program;
#ifdef notnow
	if ((clp->lc_flags & LCL_NFSV41) != 0)
		clp->lc_req.nr_vers = NFSV41_CBVERS;
	else
#endif
		clp->lc_req.nr_vers = NFSV4_CBVERS;

	/*
	 * First, fill in some of the fields of nd and cr.
	 */
	nd->nd_flag = ND_NFSV4;
	if (clp->lc_flags & LCL_GSS)
		nd->nd_flag |= ND_KERBV;
	if ((clp->lc_flags & LCL_NFSV41) != 0)
		nd->nd_flag |= ND_NFSV41;
	if ((clp->lc_flags & LCL_NFSV42) != 0)
		nd->nd_flag |= ND_NFSV42;
	nd->nd_repstat = 0;
	cred->cr_uid = clp->lc_uid;
	cred->cr_gid = clp->lc_gid;
	callback = clp->lc_callback;
	NFSUNLOCKSTATE();
	cred->cr_ngroups = 1;

	/*
	 * Get the first mbuf for the request.
	 */
	MGET(m, M_WAITOK, MT_DATA);
	m->m_len = 0;
	nd->nd_mreq = nd->nd_mb = m;
	nd->nd_bpos = mtod(m, caddr_t);

	/*
	 * and build the callback request.
	 * (Every failure below frees the request mbuf chain itself,
	 *  before jumping to errout.)
	 */
	if (procnum == NFSV4OP_CBGETATTR) {
		/* Arguments are the file handle and the attribute bits. */
		nd->nd_procnum = NFSV4PROC_CBCOMPOUND;
		error = nfsrv_cbcallargs(nd, clp, callback, NFSV4OP_CBGETATTR,
		    "CB Getattr", &sep, &slotpos);
		if (error != 0) {
			m_freem(nd->nd_mreq);
			goto errout;
		}
		(void)nfsm_fhtom(NULL, nd, (u_int8_t *)fhp, NFSX_MYFH, 0);
		(void)nfsrv_putattrbit(nd, attrbitp);
	} else if (procnum == NFSV4OP_CBRECALL) {
		/* Arguments are the stateid, truncate flag and file handle. */
		nd->nd_procnum = NFSV4PROC_CBCOMPOUND;
		error = nfsrv_cbcallargs(nd, clp, callback, NFSV4OP_CBRECALL,
		    "CB Recall", &sep, &slotpos);
		if (error != 0) {
			m_freem(nd->nd_mreq);
			goto errout;
		}
		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_STATEID);
		*tl++ = txdr_unsigned(stateidp->seqid);
		NFSBCOPY((caddr_t)stateidp->other, (caddr_t)tl,
		    NFSX_STATEIDOTHER);
		tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED);
		if (trunc)
			*tl = newnfs_true;
		else
			*tl = newnfs_false;
		(void)nfsm_fhtom(NULL, nd, (u_int8_t *)fhp, NFSX_MYFH, 0);
	} else if (procnum == NFSV4OP_CBLAYOUTRECALL) {
		NFSD_DEBUG(4, "docallback layout recall\n");
		nd->nd_procnum = NFSV4PROC_CBCOMPOUND;
		error = nfsrv_cbcallargs(nd, clp, callback,
		    NFSV4OP_CBLAYOUTRECALL, "CB Reclayout", &sep, &slotpos);
		NFSD_DEBUG(4, "aft cbcallargs=%d\n", error);
		if (error != 0) {
			m_freem(nd->nd_mreq);
			goto errout;
		}
		/*
		 * Layout type, iomode, the boolean passed in as "trunc"
		 * and the recall type, followed by the file handle.
		 */
		NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
		*tl++ = txdr_unsigned(laytype);
		*tl++ = txdr_unsigned(NFSLAYOUTIOMODE_ANY);
		if (trunc)
			*tl++ = newnfs_true;
		else
			*tl++ = newnfs_false;
		*tl = txdr_unsigned(NFSV4LAYOUTRET_FILE);
		(void)nfsm_fhtom(NULL, nd, (uint8_t *)fhp, NFSX_MYFH, 0);
		/*
		 * Then two hypers, 0 and UINT64_MAX, and the stateid.
		 * NOTE(review): presumably the offset and length of the
		 * range being recalled (the whole file) - confirm.
		 */
		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_HYPER + NFSX_STATEID);
		tval = 0;
		txdr_hyper(tval, tl); tl += 2;
		tval = UINT64_MAX;
		txdr_hyper(tval, tl); tl += 2;
		*tl++ = txdr_unsigned(stateidp->seqid);
		NFSBCOPY(stateidp->other, tl, NFSX_STATEIDOTHER);
		tl += (NFSX_STATEIDOTHER / NFSX_UNSIGNED);
		NFSD_DEBUG(4, "aft args\n");
	} else if (procnum == NFSV4PROC_CBNULL) {
		/*
		 * No arguments. For NFSv4.1, the session still needs to be
		 * found, since it is used for the request below.
		 */
		nd->nd_procnum = NFSV4PROC_CBNULL;
		if ((clp->lc_flags & LCL_NFSV41) != 0) {
			error = nfsv4_getcbsession(clp, &sep);
			if (error != 0) {
				m_freem(nd->nd_mreq);
				goto errout;
			}
		}
	} else {
		error = NFSERR_SERVERFAULT;
		m_freem(nd->nd_mreq);
		goto errout;
	}

	/*
	 * Call newnfs_connect(), as required, and then newnfs_request().
	 * NOTE(review): unlike the error paths above, nd_mreq is not
	 * freed here when this connect step sets error - confirm whether
	 * the request mbuf chain is leaked for that case.
	 */
	dotls = false;
	if ((clp->lc_flags & LCL_TLSCB) != 0)
		dotls = true;
	(void) newnfs_sndlock(&clp->lc_req.nr_lock);
	if (clp->lc_req.nr_client == NULL) {
		if ((clp->lc_flags & LCL_NFSV41) != 0) {
			/*
			 * No connect is attempted for NFSv4.1. Give the
			 * slot (if one was acquired) and session back.
			 */
			error = ECONNREFUSED;
			if (procnum != NFSV4PROC_CBNULL)
				nfsv4_freeslot(&sep->sess_cbsess, slotpos,
				    true);
			nfsrv_freesession(NULL, sep, NULL);
		} else if (nd->nd_procnum == NFSV4PROC_CBNULL)
			error = newnfs_connect(NULL, &clp->lc_req, cred,
			    NULL, 1, dotls, &clp->lc_req.nr_client);
		else
			error = newnfs_connect(NULL, &clp->lc_req, cred,
			    NULL, 3, dotls, &clp->lc_req.nr_client);
	}
	newnfs_sndunlock(&clp->lc_req.nr_lock);
	NFSD_DEBUG(4, "aft sndunlock=%d\n", error);
	if (!error) {
		if ((nd->nd_flag & ND_NFSV41) != 0) {
			KASSERT(sep != NULL, ("sep NULL"));
			if (sep->sess_cbsess.nfsess_xprt != NULL)
				error = newnfs_request(nd, NULL, clp,
				    &clp->lc_req, NULL, NULL, cred,
				    clp->lc_program, clp->lc_req.nr_vers, NULL,
				    1, NULL, &sep->sess_cbsess);
			else {
				/*
				 * This should probably never occur, but if a
				 * client somehow does an RPC without a
				 * SequenceID Op that causes a callback just
				 * after the nfsd threads have been terminated
				 * and restarted we could conceivably get here
				 * without a backchannel xprt.
				 */
				printf("nfsrv_docallback: no xprt\n");
				error = ECONNREFUSED;
			}
			NFSD_DEBUG(4, "aft newnfs_request=%d\n", error);
			if (error != 0 && procnum != NFSV4PROC_CBNULL) {
				/*
				 * It is likely that the callback was never
				 * processed by the client and, as such,
				 * the sequence# for the session slot needs
				 * to be backed up by one to avoid a
				 * NFSERR_SEQMISORDERED error reply.
				 * For the unlikely case where the callback
				 * was processed by the client, this will
				 * make the next callback on the slot
				 * appear to be a retry.
				 * Since callbacks never specify that the
				 * reply be cached, this "apparent retry"
				 * should not be a problem.
				 */
				nfsv4_freeslot(&sep->sess_cbsess, slotpos,
				    true);
			}
			nfsrv_freesession(NULL, sep, NULL);
		} else
			error = newnfs_request(nd, NULL, clp, &clp->lc_req,
			    NULL, NULL, cred, clp->lc_program,
			    clp->lc_req.nr_vers, NULL, 1, NULL, NULL);
	}
errout:
	NFSFREECRED(cred);

	/*
	 * If error is set here, the Callback path isn't working
	 * properly, so twiddle the appropriate LCL_ flags.
	 * (nd_repstat != 0 indicates the Callback path is working,
	 *  but the callback failed on the client.)
	 */
	if (error) {
		/*
		 * Mark the callback pathway down, which disables issuing
		 * of delegations and gets Renew to return NFSERR_CBPATHDOWN.
		 */
		NFSLOCKSTATE();
		clp->lc_flags |= LCL_CBDOWN;
		NFSUNLOCKSTATE();
	} else {
		/*
		 * Callback worked. If the callback path was down, disable
		 * callbacks, so no more delegations will be issued. (This
		 * is done on the assumption that the callback pathway is
		 * flaky.)
		 */
		NFSLOCKSTATE();
		if (clp->lc_flags & LCL_CBDOWN)
			clp->lc_flags &= ~(LCL_CBDOWN | LCL_CALLBACKSON);
		NFSUNLOCKSTATE();
		if (nd->nd_repstat) {
			error = nd->nd_repstat;
			NFSD_DEBUG(1, "nfsrv_docallback op=%d err=%d\n",
			    procnum, error);
		} else if (error == 0 && procnum == NFSV4OP_CBGETATTR)
			/* Load the attributes in the reply into *nap. */
			error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0,
			    NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL,
			    p, NULL);
		m_freem(nd->nd_mrep);
	}
	/*
	 * Release the reference acquired at the start and wake up
	 * a thread that is waiting for the last one to go away.
	 */
	NFSLOCKSTATE();
	clp->lc_cbref--;
	if ((clp->lc_flags & LCL_WAKEUPWANTED) && clp->lc_cbref == 0) {
		clp->lc_flags &= ~LCL_WAKEUPWANTED;
		wakeup(clp);
	}
	NFSUNLOCKSTATE();

	free(nd, M_TEMP);
	NFSEXITCODE(error);
	return (error);
}
4726 
4727 /*
4728  * Set up the compound RPC for the callback.
4729  */
4730 static int
4731 nfsrv_cbcallargs(struct nfsrv_descript *nd, struct nfsclient *clp,
4732     uint32_t callback, int op, const char *optag, struct nfsdsession **sepp,
4733     int *slotposp)
4734 {
4735 	uint32_t *tl;
4736 	int error, len;
4737 
4738 	len = strlen(optag);
4739 	(void)nfsm_strtom(nd, optag, len);
4740 	NFSM_BUILD(tl, uint32_t *, 4 * NFSX_UNSIGNED);
4741 	if ((nd->nd_flag & ND_NFSV41) != 0) {
4742 		if ((nd->nd_flag & ND_NFSV42) != 0)
4743 			*tl++ = txdr_unsigned(NFSV42_MINORVERSION);
4744 		else
4745 			*tl++ = txdr_unsigned(NFSV41_MINORVERSION);
4746 		*tl++ = txdr_unsigned(callback);
4747 		*tl++ = txdr_unsigned(2);
4748 		*tl = txdr_unsigned(NFSV4OP_CBSEQUENCE);
4749 		error = nfsv4_setcbsequence(nd, clp, 1, sepp, slotposp);
4750 		if (error != 0)
4751 			return (error);
4752 		NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
4753 		*tl = txdr_unsigned(op);
4754 	} else {
4755 		*tl++ = txdr_unsigned(NFSV4_MINORVERSION);
4756 		*tl++ = txdr_unsigned(callback);
4757 		*tl++ = txdr_unsigned(1);
4758 		*tl = txdr_unsigned(op);
4759 	}
4760 	return (0);
4761 }
4762 
/*
 * Hand out the next clientid index#. A function-local static counter is
 * incremented and its new value returned.
 * Should the 32bit unsigned counter ever wrap around to 0, a message is
 * printed and 0 is returned.
 * At an average rate of one new client per second, the wrap takes
 * approximately 136 years. (I think the server will have been shut
 * down or rebooted before then.)
 */
static u_int32_t
nfsrv_nextclientindex(void)
{
	static u_int32_t client_index = 0;

	if (++client_index == 0)
		printf("%s: out of clientids\n", __func__);
	return (client_index);
}
4783 
4784 /*
4785  * Return the next index# for a stateid. Mostly just increment and return
4786  * the next one, but... if the 32bit unsigned does actually wrap around
4787  * (will a BSD server stay up that long?), find
4788  * new start and end values.
4789  */
4790 static u_int32_t
4791 nfsrv_nextstateindex(struct nfsclient *clp)
4792 {
4793 	struct nfsstate *stp;
4794 	int i;
4795 	u_int32_t canuse, min_index, max_index;
4796 
4797 	if (!(clp->lc_flags & LCL_INDEXNOTOK)) {
4798 		clp->lc_stateindex++;
4799 		if (clp->lc_stateindex != clp->lc_statemaxindex)
4800 			return (clp->lc_stateindex);
4801 	}
4802 
4803 	/*
4804 	 * Yuck, we've hit the end.
4805 	 * Look for a new min and max.
4806 	 */
4807 	min_index = 0;
4808 	max_index = 0xffffffff;
4809 	for (i = 0; i < nfsrv_statehashsize; i++) {
4810 	    LIST_FOREACH(stp, &clp->lc_stateid[i], ls_hash) {
4811 		if (stp->ls_stateid.other[2] > 0x80000000) {
4812 		    if (stp->ls_stateid.other[2] < max_index)
4813 			max_index = stp->ls_stateid.other[2];
4814 		} else {
4815 		    if (stp->ls_stateid.other[2] > min_index)
4816 			min_index = stp->ls_stateid.other[2];
4817 		}
4818 	    }
4819 	}
4820 
4821 	/*
4822 	 * Yikes, highly unlikely, but I'll handle it anyhow.
4823 	 */
4824 	if (min_index == 0x80000000 && max_index == 0x80000001) {
4825 	    canuse = 0;
4826 	    /*
4827 	     * Loop around until we find an unused entry. Return that
4828 	     * and set LCL_INDEXNOTOK, so the search will continue next time.
4829 	     * (This is one of those rare cases where a goto is the
4830 	     *  cleanest way to code the loop.)
4831 	     */
4832 tryagain:
4833 	    for (i = 0; i < nfsrv_statehashsize; i++) {
4834 		LIST_FOREACH(stp, &clp->lc_stateid[i], ls_hash) {
4835 		    if (stp->ls_stateid.other[2] == canuse) {
4836 			canuse++;
4837 			goto tryagain;
4838 		    }
4839 		}
4840 	    }
4841 	    clp->lc_flags |= LCL_INDEXNOTOK;
4842 	    return (canuse);
4843 	}
4844 
4845 	/*
4846 	 * Ok to start again from min + 1.
4847 	 */
4848 	clp->lc_stateindex = min_index + 1;
4849 	clp->lc_statemaxindex = max_index;
4850 	clp->lc_flags &= ~LCL_INDEXNOTOK;
4851 	return (clp->lc_stateindex);
4852 }
4853 
4854 /*
4855  * The following functions handle the stable storage file that deals with
4856  * the edge conditions described in RFC3530 Sec. 8.6.3.
4857  * The file is as follows:
4858  * - a single record at the beginning that has the lease time of the
4859  *   previous server instance (before the last reboot) and the nfsrvboottime
4860  *   values for the previous server boots.
4861  *   These previous boot times are used to ensure that the current
4862  *   nfsrvboottime does not, somehow, get set to a previous one.
4863  *   (This is important so that Stale ClientIDs and StateIDs can
4864  *    be recognized.)
4865  *   The number of previous nfsrvboottime values precedes the list.
4866  * - followed by some number of appended records with:
4867  *   - client id string
4868  *   - flag that indicates it is a record revoking state via lease
4869  *     expiration or similar
4870  *     OR has successfully acquired state.
4871  * These structures vary in length, with the client string at the end, up
4872  * to NFSV4_OPAQUELIMIT in size.
4873  *
4874  * At the end of the grace period, the file is truncated, the first
4875  * record is rewritten with updated information and any acquired state
4876  * records for successful reclaims of state are written.
4877  *
4878  * Subsequent records are appended when the first state is issued to
4879  * a client and when state is revoked for a client.
4880  *
4881  * When reading the file in, state issued records that come later in
4882  * the file override older ones, since the append log is in chronological order.
4883  * If, for some reason, the file can't be read, the grace period is
4884  * immediately terminated and all reclaims get NFSERR_NOGRACE.
4885  */
4886 
4887 /*
4888  * Read in the stable storage file. Called by nfssvc() before the nfsd
4889  * processes start servicing requests.
4890  */
4891 void
4892 nfsrv_setupstable(NFSPROC_T *p)
4893 {
4894 	struct nfsrv_stablefirst *sf = &NFSD_VNET(nfsrv_stablefirst);
4895 	struct nfsrv_stable *sp, *nsp;
4896 	struct nfst_rec *tsp;
4897 	int error, i, tryagain;
4898 	off_t off = 0;
4899 	ssize_t aresid, len;
4900 
4901 	/*
4902 	 * If NFSNSF_UPDATEDONE is set, this is a restart of the nfsds without
4903 	 * a reboot, so state has not been lost.
4904 	 */
4905 	if (sf->nsf_flags & NFSNSF_UPDATEDONE)
4906 		return;
4907 	/*
4908 	 * Set Grace over just until the file reads successfully.
4909 	 */
4910 	NFSD_VNET(nfsrvboottime) = time_second;
4911 	LIST_INIT(&sf->nsf_head);
4912 	sf->nsf_flags = (NFSNSF_GRACEOVER | NFSNSF_NEEDLOCK);
4913 	sf->nsf_eograce = NFSD_MONOSEC + NFSRV_LEASEDELTA;
4914 	if (sf->nsf_fp == NULL)
4915 		return;
4916 	error = NFSD_RDWR(UIO_READ, NFSFPVNODE(sf->nsf_fp),
4917 	    (caddr_t)&sf->nsf_rec, sizeof (struct nfsf_rec), off, UIO_SYSSPACE,
4918 	    0, NFSFPCRED(sf->nsf_fp), &aresid, p);
4919 	if (error || aresid || sf->nsf_numboots == 0 ||
4920 		sf->nsf_numboots > NFSNSF_MAXNUMBOOTS)
4921 		return;
4922 
4923 	/*
4924 	 * Now, read in the boottimes.
4925 	 */
4926 	sf->nsf_bootvals = (time_t *)malloc((sf->nsf_numboots + 1) *
4927 		sizeof(time_t), M_TEMP, M_WAITOK);
4928 	off = sizeof (struct nfsf_rec);
4929 	error = NFSD_RDWR(UIO_READ, NFSFPVNODE(sf->nsf_fp),
4930 	    (caddr_t)sf->nsf_bootvals, sf->nsf_numboots * sizeof (time_t), off,
4931 	    UIO_SYSSPACE, 0, NFSFPCRED(sf->nsf_fp), &aresid, p);
4932 	if (error || aresid) {
4933 		free(sf->nsf_bootvals, M_TEMP);
4934 		sf->nsf_bootvals = NULL;
4935 		return;
4936 	}
4937 
4938 	/*
4939 	 * Make sure this nfsrvboottime is different from all recorded
4940 	 * previous ones.
4941 	 */
4942 	do {
4943 		tryagain = 0;
4944 		for (i = 0; i < sf->nsf_numboots; i++) {
4945 			if (NFSD_VNET(nfsrvboottime) == sf->nsf_bootvals[i]) {
4946 				NFSD_VNET(nfsrvboottime)++;
4947 				tryagain = 1;
4948 				break;
4949 			}
4950 		}
4951 	} while (tryagain);
4952 
4953 	sf->nsf_flags |= NFSNSF_OK;
4954 	off += (sf->nsf_numboots * sizeof (time_t));
4955 
4956 	/*
4957 	 * Read through the file, building a list of records for grace
4958 	 * checking.
4959 	 * Each record is between sizeof (struct nfst_rec) and
4960 	 * sizeof (struct nfst_rec) + NFSV4_OPAQUELIMIT - 1
4961 	 * and is actually sizeof (struct nfst_rec) + nst_len - 1.
4962 	 */
4963 	tsp = (struct nfst_rec *)malloc(sizeof (struct nfst_rec) +
4964 		NFSV4_OPAQUELIMIT - 1, M_TEMP, M_WAITOK);
4965 	do {
4966 	    error = NFSD_RDWR(UIO_READ, NFSFPVNODE(sf->nsf_fp),
4967 	        (caddr_t)tsp, sizeof (struct nfst_rec) + NFSV4_OPAQUELIMIT - 1,
4968 	        off, UIO_SYSSPACE, 0, NFSFPCRED(sf->nsf_fp), &aresid, p);
4969 	    len = (sizeof (struct nfst_rec) + NFSV4_OPAQUELIMIT - 1) - aresid;
4970 	    if (error || (len > 0 && (len < sizeof (struct nfst_rec) ||
4971 		len < (sizeof (struct nfst_rec) + tsp->len - 1)))) {
4972 		/*
4973 		 * Yuck, the file has been corrupted, so just return
4974 		 * after clearing out any restart state, so the grace period
4975 		 * is over.
4976 		 */
4977 		LIST_FOREACH_SAFE(sp, &sf->nsf_head, nst_list, nsp) {
4978 			LIST_REMOVE(sp, nst_list);
4979 			free(sp, M_TEMP);
4980 		}
4981 		free(tsp, M_TEMP);
4982 		sf->nsf_flags &= ~NFSNSF_OK;
4983 		free(sf->nsf_bootvals, M_TEMP);
4984 		sf->nsf_bootvals = NULL;
4985 		return;
4986 	    }
4987 	    if (len > 0) {
4988 		off += sizeof (struct nfst_rec) + tsp->len - 1;
4989 		/*
4990 		 * Search the list for a matching client.
4991 		 */
4992 		LIST_FOREACH(sp, &sf->nsf_head, nst_list) {
4993 			if (tsp->len == sp->nst_len &&
4994 			    !NFSBCMP(tsp->client, sp->nst_client, tsp->len))
4995 				break;
4996 		}
4997 		if (sp == LIST_END(&sf->nsf_head)) {
4998 			sp = (struct nfsrv_stable *)malloc(tsp->len +
4999 				sizeof (struct nfsrv_stable) - 1, M_TEMP,
5000 				M_WAITOK);
5001 			NFSBCOPY((caddr_t)tsp, (caddr_t)&sp->nst_rec,
5002 				sizeof (struct nfst_rec) + tsp->len - 1);
5003 			LIST_INSERT_HEAD(&sf->nsf_head, sp, nst_list);
5004 		} else {
5005 			if (tsp->flag == NFSNST_REVOKE)
5006 				sp->nst_flag |= NFSNST_REVOKE;
5007 			else
5008 				/*
5009 				 * A subsequent timestamp indicates the client
5010 				 * did a setclientid/confirm and any previous
5011 				 * revoke is no longer relevant.
5012 				 */
5013 				sp->nst_flag &= ~NFSNST_REVOKE;
5014 		}
5015 	    }
5016 	} while (len > 0);
5017 	free(tsp, M_TEMP);
5018 	sf->nsf_flags = NFSNSF_OK;
5019 	sf->nsf_eograce = NFSD_MONOSEC + sf->nsf_lease +
5020 		NFSRV_LEASEDELTA;
5021 }
5022 
5023 /*
5024  * Update the stable storage file, now that the grace period is over.
5025  */
5026 void
5027 nfsrv_updatestable(NFSPROC_T *p)
5028 {
5029 	struct nfsrv_stablefirst *sf = &NFSD_VNET(nfsrv_stablefirst);
5030 	struct nfsrv_stable *sp, *nsp;
5031 	int i;
5032 	struct nfsvattr nva;
5033 	vnode_t vp;
5034 #if defined(__FreeBSD_version) && (__FreeBSD_version >= 500000)
5035 	mount_t mp = NULL;
5036 #endif
5037 	int error;
5038 
5039 	if (sf->nsf_fp == NULL || (sf->nsf_flags & NFSNSF_UPDATEDONE))
5040 		return;
5041 	sf->nsf_flags |= NFSNSF_UPDATEDONE;
5042 	/*
5043 	 * Ok, we need to rewrite the stable storage file.
5044 	 * - truncate to 0 length
5045 	 * - write the new first structure
5046 	 * - loop through the data structures, writing out any that
5047 	 *   have timestamps older than the old boot
5048 	 */
5049 	if (sf->nsf_bootvals) {
5050 		sf->nsf_numboots++;
5051 		for (i = sf->nsf_numboots - 2; i >= 0; i--)
5052 			sf->nsf_bootvals[i + 1] = sf->nsf_bootvals[i];
5053 	} else {
5054 		sf->nsf_numboots = 1;
5055 		sf->nsf_bootvals = (time_t *)malloc(sizeof(time_t),
5056 			M_TEMP, M_WAITOK);
5057 	}
5058 	sf->nsf_bootvals[0] = NFSD_VNET(nfsrvboottime);
5059 	sf->nsf_lease = nfsrv_lease;
5060 	NFSVNO_ATTRINIT(&nva);
5061 	NFSVNO_SETATTRVAL(&nva, size, 0);
5062 	vp = NFSFPVNODE(sf->nsf_fp);
5063 	vn_start_write(vp, &mp, V_WAIT);
5064 	if (NFSVOPLOCK(vp, LK_EXCLUSIVE) == 0) {
5065 		error = nfsvno_setattr(vp, &nva, NFSFPCRED(sf->nsf_fp), p,
5066 		    NULL);
5067 		NFSVOPUNLOCK(vp);
5068 	} else
5069 		error = EPERM;
5070 	vn_finished_write(mp);
5071 	if (!error)
5072 	    error = NFSD_RDWR(UIO_WRITE, vp,
5073 		(caddr_t)&sf->nsf_rec, sizeof (struct nfsf_rec), (off_t)0,
5074 		UIO_SYSSPACE, IO_SYNC, NFSFPCRED(sf->nsf_fp), NULL, p);
5075 	if (!error)
5076 	    error = NFSD_RDWR(UIO_WRITE, vp,
5077 		(caddr_t)sf->nsf_bootvals,
5078 		sf->nsf_numboots * sizeof (time_t),
5079 		(off_t)(sizeof (struct nfsf_rec)),
5080 		UIO_SYSSPACE, IO_SYNC, NFSFPCRED(sf->nsf_fp), NULL, p);
5081 	free(sf->nsf_bootvals, M_TEMP);
5082 	sf->nsf_bootvals = NULL;
5083 	if (error) {
5084 		sf->nsf_flags &= ~NFSNSF_OK;
5085 		printf("EEK! Can't write NfsV4 stable storage file\n");
5086 		return;
5087 	}
5088 	sf->nsf_flags |= NFSNSF_OK;
5089 
5090 	/*
5091 	 * Loop through the list and write out timestamp records for
5092 	 * any clients that successfully reclaimed state.
5093 	 */
5094 	LIST_FOREACH_SAFE(sp, &sf->nsf_head, nst_list, nsp) {
5095 		if (sp->nst_flag & NFSNST_GOTSTATE) {
5096 			nfsrv_writestable(sp->nst_client, sp->nst_len,
5097 				NFSNST_NEWSTATE, p);
5098 			sp->nst_clp->lc_flags |= LCL_STAMPEDSTABLE;
5099 		}
5100 		LIST_REMOVE(sp, nst_list);
5101 		free(sp, M_TEMP);
5102 	}
5103 	nfsrv_backupstable();
5104 }
5105 
5106 /*
5107  * Append a record to the stable storage file.
5108  */
5109 void
5110 nfsrv_writestable(u_char *client, int len, int flag, NFSPROC_T *p)
5111 {
5112 	struct nfsrv_stablefirst *sf = &NFSD_VNET(nfsrv_stablefirst);
5113 	struct nfst_rec *sp;
5114 	int error;
5115 
5116 	if (!(sf->nsf_flags & NFSNSF_OK) || sf->nsf_fp == NULL)
5117 		return;
5118 	sp = (struct nfst_rec *)malloc(sizeof (struct nfst_rec) +
5119 		len - 1, M_TEMP, M_WAITOK);
5120 	sp->len = len;
5121 	NFSBCOPY(client, sp->client, len);
5122 	sp->flag = flag;
5123 	error = NFSD_RDWR(UIO_WRITE, NFSFPVNODE(sf->nsf_fp),
5124 	    (caddr_t)sp, sizeof (struct nfst_rec) + len - 1, (off_t)0,
5125 	    UIO_SYSSPACE, (IO_SYNC | IO_APPEND), NFSFPCRED(sf->nsf_fp), NULL, p);
5126 	free(sp, M_TEMP);
5127 	if (error) {
5128 		sf->nsf_flags &= ~NFSNSF_OK;
5129 		printf("EEK! Can't write NfsV4 stable storage file\n");
5130 	}
5131 }
5132 
5133 /*
5134  * This function is called during the grace period to mark a client
5135  * that successfully reclaimed state.
5136  */
5137 static void
5138 nfsrv_markstable(struct nfsclient *clp)
5139 {
5140 	struct nfsrv_stable *sp;
5141 
5142 	/*
5143 	 * First find the client structure.
5144 	 */
5145 	LIST_FOREACH(sp, &NFSD_VNET(nfsrv_stablefirst).nsf_head, nst_list) {
5146 		if (sp->nst_len == clp->lc_idlen &&
5147 		    !NFSBCMP(sp->nst_client, clp->lc_id, sp->nst_len))
5148 			break;
5149 	}
5150 	if (sp == LIST_END(&NFSD_VNET(nfsrv_stablefirst).nsf_head))
5151 		return;
5152 
5153 	/*
5154 	 * Now, just mark it and set the nfsclient back pointer.
5155 	 */
5156 	sp->nst_flag |= NFSNST_GOTSTATE;
5157 	sp->nst_clp = clp;
5158 }
5159 
5160 /*
5161  * This function is called when a NFSv4.1 client does a ReclaimComplete.
5162  * Very similar to nfsrv_markstable(), except for the flag being set.
5163  */
5164 static void
5165 nfsrv_markreclaim(struct nfsclient *clp)
5166 {
5167 	struct nfsrv_stable *sp;
5168 
5169 	/*
5170 	 * First find the client structure.
5171 	 */
5172 	LIST_FOREACH(sp, &NFSD_VNET(nfsrv_stablefirst).nsf_head, nst_list) {
5173 		if (sp->nst_len == clp->lc_idlen &&
5174 		    !NFSBCMP(sp->nst_client, clp->lc_id, sp->nst_len))
5175 			break;
5176 	}
5177 	if (sp == LIST_END(&NFSD_VNET(nfsrv_stablefirst).nsf_head))
5178 		return;
5179 
5180 	/*
5181 	 * Now, just set the flag.
5182 	 */
5183 	sp->nst_flag |= NFSNST_RECLAIMED;
5184 }
5185 
5186 /*
5187  * This function is called for a reclaim, to see if it gets grace.
5188  * It returns 0 if a reclaim is allowed, 1 otherwise.
5189  */
5190 static int
5191 nfsrv_checkstable(struct nfsclient *clp)
5192 {
5193 	struct nfsrv_stable *sp;
5194 
5195 	/*
5196 	 * First, find the entry for the client.
5197 	 */
5198 	LIST_FOREACH(sp, &NFSD_VNET(nfsrv_stablefirst).nsf_head, nst_list) {
5199 		if (sp->nst_len == clp->lc_idlen &&
5200 		    !NFSBCMP(sp->nst_client, clp->lc_id, sp->nst_len))
5201 			break;
5202 	}
5203 
5204 	/*
5205 	 * If not in the list, state was revoked or no state was issued
5206 	 * since the previous reboot, a reclaim is denied.
5207 	 */
5208 	if (sp == LIST_END(&NFSD_VNET(nfsrv_stablefirst).nsf_head) ||
5209 	    (sp->nst_flag & NFSNST_REVOKE) ||
5210 	    !(NFSD_VNET(nfsrv_stablefirst).nsf_flags & NFSNSF_OK))
5211 		return (1);
5212 	return (0);
5213 }
5214 
5215 /*
5216  * Test for and try to clear out a conflicting client. This is called by
5217  * nfsrv_lockctrl() and nfsrv_openctrl() when conflicts with other clients
5218  * a found.
5219  * The trick here is that it can't revoke a conflicting client with an
5220  * expired lease unless it holds the v4root lock, so...
5221  * If no v4root lock, get the lock and return 1 to indicate "try again".
5222  * Return 0 to indicate the conflict can't be revoked and 1 to indicate
5223  * the revocation worked and the conflicting client is "bye, bye", so it
5224  * can be tried again.
5225  * Return 2 to indicate that the vnode is VIRF_DOOMED after NFSVOPLOCK().
5226  * Unlocks State before a non-zero value is returned.
5227  */
5228 static int
5229 nfsrv_clientconflict(struct nfsclient *clp, int *haslockp, vnode_t vp,
5230     NFSPROC_T *p)
5231 {
5232 	int gotlock, lktype = 0;
5233 
5234 	/*
5235 	 * If lease hasn't expired, we can't fix it.
5236 	 */
5237 	if (clp->lc_expiry >= NFSD_MONOSEC ||
5238 	    !(NFSD_VNET(nfsrv_stablefirst).nsf_flags & NFSNSF_UPDATEDONE))
5239 		return (0);
5240 	if (*haslockp == 0) {
5241 		NFSUNLOCKSTATE();
5242 		if (vp != NULL) {
5243 			lktype = NFSVOPISLOCKED(vp);
5244 			NFSVOPUNLOCK(vp);
5245 		}
5246 		NFSLOCKV4ROOTMUTEX();
5247 		nfsv4_relref(&nfsv4rootfs_lock);
5248 		do {
5249 			gotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
5250 			    NFSV4ROOTLOCKMUTEXPTR, NULL);
5251 		} while (!gotlock);
5252 		NFSUNLOCKV4ROOTMUTEX();
5253 		*haslockp = 1;
5254 		if (vp != NULL) {
5255 			NFSVOPLOCK(vp, lktype | LK_RETRY);
5256 			if (VN_IS_DOOMED(vp))
5257 				return (2);
5258 		}
5259 		return (1);
5260 	}
5261 	NFSUNLOCKSTATE();
5262 
5263 	/*
5264 	 * Ok, we can expire the conflicting client.
5265 	 */
5266 	nfsrv_writestable(clp->lc_id, clp->lc_idlen, NFSNST_REVOKE, p);
5267 	nfsrv_backupstable();
5268 	nfsrv_cleanclient(clp, p);
5269 	nfsrv_freedeleglist(&clp->lc_deleg);
5270 	nfsrv_freedeleglist(&clp->lc_olddeleg);
5271 	LIST_REMOVE(clp, lc_hash);
5272 	nfsrv_zapclient(clp, p);
5273 	return (1);
5274 }
5275 
5276 /*
5277  * Resolve a delegation conflict.
5278  * Returns 0 to indicate the conflict was resolved without sleeping.
5279  * Return -1 to indicate that the caller should check for conflicts again.
5280  * Return > 0 for an error that should be returned, normally NFSERR_DELAY.
5281  *
5282  * Also, manipulate the nfsv4root_lock, as required. It isn't changed
5283  * for a return of 0, since there was no sleep and it could be required
5284  * later. It is released for a return of NFSERR_DELAY, since the caller
5285  * will return that error. It is released when a sleep was done waiting
5286  * for the delegation to be returned or expire (so that other nfsds can
5287  * handle ops). Then, it must be acquired for the write to stable storage.
5288  * (This function is somewhat similar to nfsrv_clientconflict(), but
5289  *  the semantics differ in a couple of subtle ways. The return of 0
5290  *  indicates the conflict was resolved without sleeping here, not
5291  *  that the conflict can't be resolved and the handling of nfsv4root_lock
5292  *  differs, as noted above.)
5293  * Unlocks State before returning a non-zero value.
5294  */
5295 static int
5296 nfsrv_delegconflict(struct nfsstate *stp, int *haslockp, NFSPROC_T *p,
5297     vnode_t vp)
5298 {
5299 	struct nfsclient *clp = stp->ls_clp;
5300 	int gotlock, error, lktype = 0, retrycnt, zapped_clp;
5301 	nfsv4stateid_t tstateid;
5302 	fhandle_t tfh;
5303 
5304 	/*
5305 	 * If the conflict is with an old delegation...
5306 	 */
5307 	if (stp->ls_flags & NFSLCK_OLDDELEG) {
5308 		/*
5309 		 * You can delete it, if it has expired.
5310 		 */
5311 		if (clp->lc_delegtime < NFSD_MONOSEC) {
5312 			nfsrv_freedeleg(stp);
5313 			NFSUNLOCKSTATE();
5314 			error = -1;
5315 			goto out;
5316 		}
5317 		NFSUNLOCKSTATE();
5318 		/*
5319 		 * During this delay, the old delegation could expire or it
5320 		 * could be recovered by the client via an Open with
5321 		 * CLAIM_DELEGATE_PREV.
5322 		 * Release the nfsv4root_lock, if held.
5323 		 */
5324 		if (*haslockp) {
5325 			*haslockp = 0;
5326 			NFSLOCKV4ROOTMUTEX();
5327 			nfsv4_unlock(&nfsv4rootfs_lock, 1);
5328 			NFSUNLOCKV4ROOTMUTEX();
5329 		}
5330 		error = NFSERR_DELAY;
5331 		goto out;
5332 	}
5333 
5334 	/*
5335 	 * It's a current delegation, so:
5336 	 * - check to see if the delegation has expired
5337 	 *   - if so, get the v4root lock and then expire it
5338 	 */
5339 	if ((stp->ls_flags & NFSLCK_DELEGRECALL) == 0 || (stp->ls_lastrecall <
5340 	    NFSD_MONOSEC && clp->lc_expiry >= NFSD_MONOSEC &&
5341 	    stp->ls_delegtime >= NFSD_MONOSEC)) {
5342 		/*
5343 		 * - do a recall callback, since not yet done
5344 		 * For now, never allow truncate to be set. To use
5345 		 * truncate safely, it must be guaranteed that the
5346 		 * Remove, Rename or Setattr with size of 0 will
5347 		 * succeed and that would require major changes to
5348 		 * the VFS/Vnode OPs.
5349 		 * Set the expiry time large enough so that it won't expire
5350 		 * until after the callback, then set it correctly, once
5351 		 * the callback is done. (The delegation will now time
5352 		 * out whether or not the Recall worked ok. The timeout
5353 		 * will be extended when ops are done on the delegation
5354 		 * stateid, up to the timelimit.)
5355 		 */
5356 		if ((stp->ls_flags & NFSLCK_DELEGRECALL) == 0) {
5357 			stp->ls_delegtime = NFSD_MONOSEC + (2 * nfsrv_lease) +
5358 			    NFSRV_LEASEDELTA;
5359 			stp->ls_delegtimelimit = NFSD_MONOSEC + (6 *
5360 			    nfsrv_lease) + NFSRV_LEASEDELTA;
5361 			stp->ls_flags |= NFSLCK_DELEGRECALL;
5362 		}
5363 		stp->ls_lastrecall = time_uptime + 1;
5364 
5365 		/*
5366 		 * Loop NFSRV_CBRETRYCNT times while the CBRecall replies
5367 		 * NFSERR_BADSTATEID or NFSERR_BADHANDLE. This is done
5368 		 * in order to try and avoid a race that could happen
5369 		 * when a CBRecall request passed the Open reply with
5370 		 * the delegation in it when transitting the network.
5371 		 * Since nfsrv_docallback will sleep, don't use stp after
5372 		 * the call.
5373 		 */
5374 		NFSBCOPY((caddr_t)&stp->ls_stateid, (caddr_t)&tstateid,
5375 		    sizeof (tstateid));
5376 		NFSBCOPY((caddr_t)&stp->ls_lfp->lf_fh, (caddr_t)&tfh,
5377 		    sizeof (tfh));
5378 		NFSUNLOCKSTATE();
5379 		if (*haslockp) {
5380 			*haslockp = 0;
5381 			NFSLOCKV4ROOTMUTEX();
5382 			nfsv4_unlock(&nfsv4rootfs_lock, 1);
5383 			NFSUNLOCKV4ROOTMUTEX();
5384 		}
5385 		retrycnt = 0;
5386 		do {
5387 		    error = nfsrv_docallback(clp, NFSV4OP_CBRECALL,
5388 			&tstateid, 0, &tfh, NULL, NULL, 0, p);
5389 		    retrycnt++;
5390 		} while ((error == NFSERR_BADSTATEID ||
5391 		    error == NFSERR_BADHANDLE) && retrycnt < NFSV4_CBRETRYCNT);
5392 		error = NFSERR_DELAY;
5393 		goto out;
5394 	}
5395 
5396 	if (clp->lc_expiry >= NFSD_MONOSEC &&
5397 	    stp->ls_delegtime >= NFSD_MONOSEC) {
5398 		NFSUNLOCKSTATE();
5399 		/*
5400 		 * A recall has been done, but it has not yet expired.
5401 		 * So, RETURN_DELAY.
5402 		 */
5403 		if (*haslockp) {
5404 			*haslockp = 0;
5405 			NFSLOCKV4ROOTMUTEX();
5406 			nfsv4_unlock(&nfsv4rootfs_lock, 1);
5407 			NFSUNLOCKV4ROOTMUTEX();
5408 		}
5409 		error = NFSERR_DELAY;
5410 		goto out;
5411 	}
5412 
5413 	/*
5414 	 * If we don't yet have the lock, just get it and then return,
5415 	 * since we need that before deleting expired state, such as
5416 	 * this delegation.
5417 	 * When getting the lock, unlock the vnode, so other nfsds that
5418 	 * are in progress, won't get stuck waiting for the vnode lock.
5419 	 */
5420 	if (*haslockp == 0) {
5421 		NFSUNLOCKSTATE();
5422 		if (vp != NULL) {
5423 			lktype = NFSVOPISLOCKED(vp);
5424 			NFSVOPUNLOCK(vp);
5425 		}
5426 		NFSLOCKV4ROOTMUTEX();
5427 		nfsv4_relref(&nfsv4rootfs_lock);
5428 		do {
5429 			gotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
5430 			    NFSV4ROOTLOCKMUTEXPTR, NULL);
5431 		} while (!gotlock);
5432 		NFSUNLOCKV4ROOTMUTEX();
5433 		*haslockp = 1;
5434 		if (vp != NULL) {
5435 			NFSVOPLOCK(vp, lktype | LK_RETRY);
5436 			if (VN_IS_DOOMED(vp)) {
5437 				*haslockp = 0;
5438 				NFSLOCKV4ROOTMUTEX();
5439 				nfsv4_unlock(&nfsv4rootfs_lock, 1);
5440 				NFSUNLOCKV4ROOTMUTEX();
5441 				error = NFSERR_PERM;
5442 				goto out;
5443 			}
5444 		}
5445 		error = -1;
5446 		goto out;
5447 	}
5448 
5449 	NFSUNLOCKSTATE();
5450 	/*
5451 	 * Ok, we can delete the expired delegation.
5452 	 * First, write the Revoke record to stable storage and then
5453 	 * clear out the conflict.
5454 	 * Since all other nfsd threads are now blocked, we can safely
5455 	 * sleep without the state changing.
5456 	 */
5457 	nfsrv_writestable(clp->lc_id, clp->lc_idlen, NFSNST_REVOKE, p);
5458 	nfsrv_backupstable();
5459 	if (clp->lc_expiry < NFSD_MONOSEC) {
5460 		nfsrv_cleanclient(clp, p);
5461 		nfsrv_freedeleglist(&clp->lc_deleg);
5462 		nfsrv_freedeleglist(&clp->lc_olddeleg);
5463 		LIST_REMOVE(clp, lc_hash);
5464 		zapped_clp = 1;
5465 	} else {
5466 		nfsrv_freedeleg(stp);
5467 		zapped_clp = 0;
5468 	}
5469 	if (zapped_clp)
5470 		nfsrv_zapclient(clp, p);
5471 	error = -1;
5472 
5473 out:
5474 	NFSEXITCODE(error);
5475 	return (error);
5476 }
5477 
5478 /*
5479  * Check for a remove allowed, if remove is set to 1 and get rid of
5480  * delegations.
5481  */
5482 int
5483 nfsrv_checkremove(vnode_t vp, int remove, struct nfsrv_descript *nd,
5484     nfsquad_t clientid, NFSPROC_T *p)
5485 {
5486 	struct nfsclient *clp;
5487 	struct nfsstate *stp;
5488 	struct nfslockfile *lfp;
5489 	int error, haslock = 0;
5490 	fhandle_t nfh;
5491 
5492 	clp = NULL;
5493 	/*
5494 	 * First, get the lock file structure.
5495 	 * (A return of -1 means no associated state, so remove ok.)
5496 	 */
5497 	error = nfsrv_getlockfh(vp, NFSLCK_CHECK, NULL, &nfh, p);
5498 tryagain:
5499 	NFSLOCKSTATE();
5500 	if (error == 0 && clientid.qval != 0)
5501 		error = nfsrv_getclient(clientid, CLOPS_RENEW, &clp, NULL,
5502 		    (nfsquad_t)((u_quad_t)0), 0, nd, p);
5503 	if (!error)
5504 		error = nfsrv_getlockfile(NFSLCK_CHECK, NULL, &lfp, &nfh, 0);
5505 	if (error) {
5506 		NFSUNLOCKSTATE();
5507 		if (haslock) {
5508 			NFSLOCKV4ROOTMUTEX();
5509 			nfsv4_unlock(&nfsv4rootfs_lock, 1);
5510 			NFSUNLOCKV4ROOTMUTEX();
5511 		}
5512 		if (error == -1)
5513 			error = 0;
5514 		goto out;
5515 	}
5516 
5517 	/*
5518 	 * Now, we must Recall any delegations.
5519 	 */
5520 	error = nfsrv_cleandeleg(vp, lfp, clp, &haslock, p);
5521 	if (error) {
5522 		/*
5523 		 * nfsrv_cleandeleg() unlocks state for non-zero
5524 		 * return.
5525 		 */
5526 		if (error == -1)
5527 			goto tryagain;
5528 		if (haslock) {
5529 			NFSLOCKV4ROOTMUTEX();
5530 			nfsv4_unlock(&nfsv4rootfs_lock, 1);
5531 			NFSUNLOCKV4ROOTMUTEX();
5532 		}
5533 		goto out;
5534 	}
5535 
5536 	/*
5537 	 * Now, look for a conflicting open share.
5538 	 */
5539 	if (remove) {
5540 		/*
5541 		 * If the entry in the directory was the last reference to the
5542 		 * corresponding filesystem object, the object can be destroyed
5543 		 * */
5544 		if(lfp->lf_usecount>1)
5545 			LIST_FOREACH(stp, &lfp->lf_open, ls_file) {
5546 				if (stp->ls_flags & NFSLCK_WRITEDENY) {
5547 					error = NFSERR_FILEOPEN;
5548 					break;
5549 				}
5550 			}
5551 	}
5552 
5553 	NFSUNLOCKSTATE();
5554 	if (haslock) {
5555 		NFSLOCKV4ROOTMUTEX();
5556 		nfsv4_unlock(&nfsv4rootfs_lock, 1);
5557 		NFSUNLOCKV4ROOTMUTEX();
5558 	}
5559 
5560 out:
5561 	NFSEXITCODE(error);
5562 	return (error);
5563 }
5564 
5565 /*
5566  * Clear out all delegations for the file referred to by lfp.
5567  * May return NFSERR_DELAY, if there will be a delay waiting for
5568  * delegations to expire.
5569  * Returns -1 to indicate it slept while recalling a delegation.
5570  * This function has the side effect of deleting the nfslockfile structure,
5571  * if it no longer has associated state and didn't have to sleep.
5572  * Unlocks State before a non-zero value is returned.
5573  */
5574 static int
5575 nfsrv_cleandeleg(vnode_t vp, struct nfslockfile *lfp,
5576     struct nfsclient *clp, int *haslockp, NFSPROC_T *p)
5577 {
5578 	struct nfsstate *stp, *nstp;
5579 	int ret = 0;
5580 
5581 	stp = LIST_FIRST(&lfp->lf_deleg);
5582 	while (stp != LIST_END(&lfp->lf_deleg)) {
5583 		nstp = LIST_NEXT(stp, ls_file);
5584 		if (stp->ls_clp != clp) {
5585 			ret = nfsrv_delegconflict(stp, haslockp, p, vp);
5586 			if (ret) {
5587 				/*
5588 				 * nfsrv_delegconflict() unlocks state
5589 				 * when it returns non-zero.
5590 				 */
5591 				goto out;
5592 			}
5593 		}
5594 		stp = nstp;
5595 	}
5596 out:
5597 	NFSEXITCODE(ret);
5598 	return (ret);
5599 }
5600 
5601 /*
5602  * There are certain operations that, when being done outside of NFSv4,
5603  * require that any NFSv4 delegation for the file be recalled.
5604  * This function is to be called for those cases:
5605  * VOP_RENAME() - When a delegation is being recalled for any reason,
5606  *	the client may have to do Opens against the server, using the file's
5607  *	final component name. If the file has been renamed on the server,
5608  *	that component name will be incorrect and the Open will fail.
5609  * VOP_REMOVE() - Theoretically, a client could Open a file after it has
5610  *	been removed on the server, if there is a delegation issued to
5611  *	that client for the file. I say "theoretically" since clients
5612  *	normally do an Access Op before the Open and that Access Op will
5613  *	fail with ESTALE. Note that NFSv2 and 3 don't even do Opens, so
5614  *	they will detect the file's removal in the same manner. (There is
5615  *	one case where RFC3530 allows a client to do an Open without first
5616  *	doing an Access Op, which is passage of a check against the ACE
5617  *	returned with a Write delegation, but current practice is to ignore
5618  *	the ACE and always do an Access Op.)
5619  *	Since the functions can only be called with an unlocked vnode, this
5620  *	can't be done at this time.
5621  * VOP_ADVLOCK() - When a client holds a delegation, it can issue byte range
5622  *	locks locally in the client, which are not visible to the server. To
5623  *	deal with this, issuing of delegations for a vnode must be disabled
5624  *	and all delegations for the vnode recalled. This is done via the
5625  *	second function, using the VV_DISABLEDELEG vflag on the vnode.
5626  */
5627 void
5628 nfsd_recalldelegation(vnode_t vp, NFSPROC_T *p)
5629 {
5630 	time_t starttime;
5631 	int error;
5632 
5633 	/*
5634 	 * First, check to see if the server is currently running and it has
5635 	 * been called for a regular file when issuing delegations.
5636 	 */
5637 	if (NFSD_VNET(nfsrv_numnfsd) == 0 || vp->v_type != VREG ||
5638 	    nfsrv_issuedelegs == 0)
5639 		return;
5640 
5641 	KASSERT((NFSVOPISLOCKED(vp) != LK_EXCLUSIVE), ("vp %p is locked", vp));
5642 	/*
5643 	 * First, get a reference on the nfsv4rootfs_lock so that an
5644 	 * exclusive lock cannot be acquired by another thread.
5645 	 */
5646 	NFSLOCKV4ROOTMUTEX();
5647 	nfsv4_getref(&nfsv4rootfs_lock, NULL, NFSV4ROOTLOCKMUTEXPTR, NULL);
5648 	NFSUNLOCKV4ROOTMUTEX();
5649 
5650 	/*
5651 	 * Now, call nfsrv_checkremove() in a loop while it returns
5652 	 * NFSERR_DELAY. Return upon any other error or when timed out.
5653 	 */
5654 	starttime = NFSD_MONOSEC;
5655 	do {
5656 		if (NFSVOPLOCK(vp, LK_EXCLUSIVE) == 0) {
5657 			error = nfsrv_checkremove(vp, 0, NULL,
5658 			    (nfsquad_t)((u_quad_t)0), p);
5659 			NFSVOPUNLOCK(vp);
5660 		} else
5661 			error = EPERM;
5662 		if (error == NFSERR_DELAY) {
5663 			if (NFSD_MONOSEC - starttime > NFS_REMOVETIMEO)
5664 				break;
5665 			/* Sleep for a short period of time */
5666 			(void) nfs_catnap(PZERO, 0, "nfsremove");
5667 		}
5668 	} while (error == NFSERR_DELAY);
5669 	NFSLOCKV4ROOTMUTEX();
5670 	nfsv4_relref(&nfsv4rootfs_lock);
5671 	NFSUNLOCKV4ROOTMUTEX();
5672 }
5673 
5674 void
5675 nfsd_disabledelegation(vnode_t vp, NFSPROC_T *p)
5676 {
5677 
5678 #ifdef VV_DISABLEDELEG
5679 	/*
5680 	 * First, flag issuance of delegations disabled.
5681 	 */
5682 	atomic_set_long(&vp->v_vflag, VV_DISABLEDELEG);
5683 #endif
5684 
5685 	/*
5686 	 * Then call nfsd_recalldelegation() to get rid of all extant
5687 	 * delegations.
5688 	 */
5689 	nfsd_recalldelegation(vp, p);
5690 }
5691 
5692 /*
5693  * Check for conflicting locks, etc. and then get rid of delegations.
5694  * (At one point I thought that I should get rid of delegations for any
5695  *  Setattr, since it could potentially disallow the I/O op (read or write)
5696  *  allowed by the delegation. However, Setattr Ops that aren't changing
5697  *  the size get a stateid of all 0s, so you can't tell if it is a delegation
5698  *  for the same client or a different one, so I decided to only get rid
5699  *  of delegations for other clients when the size is being changed.)
 * (In general, a Setattr can disable NFS I/O Ops that are outstanding, such
 * as Write backs, even if there is no delegation, so is it really any
 * different?)
5703  */
5704 int
5705 nfsrv_checksetattr(vnode_t vp, struct nfsrv_descript *nd,
5706     nfsv4stateid_t *stateidp, struct nfsvattr *nvap, nfsattrbit_t *attrbitp,
5707     struct nfsexstuff *exp, NFSPROC_T *p)
5708 {
5709 	struct nfsstate st, *stp = &st;
5710 	struct nfslock lo, *lop = &lo;
5711 	int error = 0;
5712 	nfsquad_t clientid;
5713 
5714 	if (NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SIZE)) {
5715 		stp->ls_flags = (NFSLCK_CHECK | NFSLCK_WRITEACCESS);
5716 		lop->lo_first = nvap->na_size;
5717 	} else {
5718 		stp->ls_flags = 0;
5719 		lop->lo_first = 0;
5720 	}
5721 	if (NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_OWNER) ||
5722 	    NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_OWNERGROUP) ||
5723 	    NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_MODE) ||
5724 	    NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_ACL))
5725 		stp->ls_flags |= NFSLCK_SETATTR;
5726 	if (stp->ls_flags == 0)
5727 		goto out;
5728 	lop->lo_end = NFS64BITSSET;
5729 	lop->lo_flags = NFSLCK_WRITE;
5730 	stp->ls_ownerlen = 0;
5731 	stp->ls_op = NULL;
5732 	stp->ls_uid = nd->nd_cred->cr_uid;
5733 	stp->ls_stateid.seqid = stateidp->seqid;
5734 	clientid.lval[0] = stp->ls_stateid.other[0] = stateidp->other[0];
5735 	clientid.lval[1] = stp->ls_stateid.other[1] = stateidp->other[1];
5736 	stp->ls_stateid.other[2] = stateidp->other[2];
5737 	error = nfsrv_lockctrl(vp, &stp, &lop, NULL, clientid,
5738 	    stateidp, exp, nd, p);
5739 
5740 out:
5741 	NFSEXITCODE2(error, nd);
5742 	return (error);
5743 }
5744 
5745 /*
5746  * Check for a write delegation and do a CBGETATTR if there is one, updating
5747  * the attributes, as required.
 * Should I return an error if I can't get the attributes? (For now, I'll
 * just return ok.)
5750  */
5751 int
5752 nfsrv_checkgetattr(struct nfsrv_descript *nd, vnode_t vp,
5753     struct nfsvattr *nvap, nfsattrbit_t *attrbitp, NFSPROC_T *p)
5754 {
5755 	struct nfsstate *stp;
5756 	struct nfslockfile *lfp;
5757 	struct nfsclient *clp;
5758 	struct nfsvattr nva;
5759 	fhandle_t nfh;
5760 	int error = 0;
5761 	nfsattrbit_t cbbits;
5762 	u_quad_t delegfilerev;
5763 
5764 	NFSCBGETATTR_ATTRBIT(attrbitp, &cbbits);
5765 	if (!NFSNONZERO_ATTRBIT(&cbbits))
5766 		goto out;
5767 	if (nfsrv_writedelegcnt == 0)
5768 		goto out;
5769 
5770 	/*
5771 	 * Get the lock file structure.
5772 	 * (A return of -1 means no associated state, so return ok.)
5773 	 */
5774 	error = nfsrv_getlockfh(vp, NFSLCK_CHECK, NULL, &nfh, p);
5775 	NFSLOCKSTATE();
5776 	if (!error)
5777 		error = nfsrv_getlockfile(NFSLCK_CHECK, NULL, &lfp, &nfh, 0);
5778 	if (error) {
5779 		NFSUNLOCKSTATE();
5780 		if (error == -1)
5781 			error = 0;
5782 		goto out;
5783 	}
5784 
5785 	/*
5786 	 * Now, look for a write delegation.
5787 	 */
5788 	LIST_FOREACH(stp, &lfp->lf_deleg, ls_file) {
5789 		if (stp->ls_flags & NFSLCK_DELEGWRITE)
5790 			break;
5791 	}
5792 	if (stp == LIST_END(&lfp->lf_deleg)) {
5793 		NFSUNLOCKSTATE();
5794 		goto out;
5795 	}
5796 	clp = stp->ls_clp;
5797 
5798 	/* If the clientid is not confirmed, ignore the delegation. */
5799 	if (clp->lc_flags & LCL_NEEDSCONFIRM) {
5800 		NFSUNLOCKSTATE();
5801 		goto out;
5802 	}
5803 
5804 	delegfilerev = stp->ls_filerev;
5805 	/*
5806 	 * If the Write delegation was issued as a part of this Compound RPC
5807 	 * or if we have an Implied Clientid (used in a previous Op in this
5808 	 * compound) and it is the client the delegation was issued to,
5809 	 * just return ok.
5810 	 * I also assume that it is from the same client iff the network
5811 	 * host IP address is the same as the callback address. (Not
5812 	 * exactly correct by the RFC, but avoids a lot of Getattr
5813 	 * callbacks.)
5814 	 */
5815 	if (nd->nd_compref == stp->ls_compref ||
5816 	    ((nd->nd_flag & ND_IMPLIEDCLID) &&
5817 	     clp->lc_clientid.qval == nd->nd_clientid.qval) ||
5818 	     nfsaddr2_match(clp->lc_req.nr_nam, nd->nd_nam)) {
5819 		NFSUNLOCKSTATE();
5820 		goto out;
5821 	}
5822 
5823 	/*
5824 	 * We are now done with the delegation state structure,
5825 	 * so the statelock can be released and we can now tsleep().
5826 	 */
5827 
5828 	/*
5829 	 * Now, we must do the CB Getattr callback, to see if Change or Size
5830 	 * has changed.
5831 	 */
5832 	if (clp->lc_expiry >= NFSD_MONOSEC) {
5833 		NFSUNLOCKSTATE();
5834 		NFSVNO_ATTRINIT(&nva);
5835 		nva.na_filerev = NFS64BITSSET;
5836 		error = nfsrv_docallback(clp, NFSV4OP_CBGETATTR, NULL,
5837 		    0, &nfh, &nva, &cbbits, 0, p);
5838 		if (!error) {
5839 			if ((nva.na_filerev != NFS64BITSSET &&
5840 			    nva.na_filerev > delegfilerev) ||
5841 			    (NFSVNO_ISSETSIZE(&nva) &&
5842 			     nva.na_size != nvap->na_size)) {
5843 				error = nfsvno_updfilerev(vp, nvap, nd, p);
5844 				if (NFSVNO_ISSETSIZE(&nva))
5845 					nvap->na_size = nva.na_size;
5846 			}
5847 		} else
5848 			error = 0;	/* Ignore callback errors for now. */
5849 	} else {
5850 		NFSUNLOCKSTATE();
5851 	}
5852 
5853 out:
5854 	NFSEXITCODE2(error, nd);
5855 	return (error);
5856 }
5857 
5858 /*
5859  * This function looks for openowners that haven't had any opens for
5860  * a while and throws them away. Called by an nfsd when NFSNSF_NOOPENS
5861  * is set.
5862  */
5863 void
5864 nfsrv_throwawayopens(NFSPROC_T *p)
5865 {
5866 	struct nfsclient *clp, *nclp;
5867 	struct nfsstate *stp, *nstp;
5868 	int i;
5869 
5870 	NFSLOCKSTATE();
5871 	NFSD_VNET(nfsrv_stablefirst).nsf_flags &= ~NFSNSF_NOOPENS;
5872 	/*
5873 	 * For each client...
5874 	 */
5875 	for (i = 0; i < nfsrv_clienthashsize; i++) {
5876 	    LIST_FOREACH_SAFE(clp, &NFSD_VNET(nfsclienthash)[i], lc_hash,
5877 		nclp) {
5878 		LIST_FOREACH_SAFE(stp, &clp->lc_open, ls_list, nstp) {
5879 			if (LIST_EMPTY(&stp->ls_open) &&
5880 			    (stp->ls_noopens > NFSNOOPEN ||
5881 			     (nfsrv_openpluslock * 2) >
5882 			     nfsrv_v4statelimit))
5883 				nfsrv_freeopenowner(stp, 0, p);
5884 		}
5885 	    }
5886 	}
5887 	NFSUNLOCKSTATE();
5888 }
5889 
5890 /*
5891  * This function checks to see if the credentials are the same.
5892  * The check for same credentials is needed for state management operations
5893  * for NFSv4.0 or NFSv4.1/4.2 when SP4_MACH_CRED is configured via
5894  * ExchangeID.
5895  * Returns 1 for not same, 0 otherwise.
5896  */
5897 static int
5898 nfsrv_notsamecredname(int op, struct nfsrv_descript *nd, struct nfsclient *clp)
5899 {
5900 
5901 	/* Check for the SP4_MACH_CRED case. */
5902 	if (op != 0 && nfsrv_checkmachcred(op, nd, clp) != 0)
5903 		return (1);
5904 
5905 	/* For NFSv4.1/4.2, SP4_NONE always allows this. */
5906 	if ((nd->nd_flag & ND_NFSV41) != 0)
5907 		return (0);
5908 
5909 	if (nd->nd_flag & ND_GSS) {
5910 		if (!(clp->lc_flags & LCL_GSS))
5911 			return (1);
5912 		if (clp->lc_flags & LCL_NAME) {
5913 			if (nd->nd_princlen != clp->lc_namelen ||
5914 			    NFSBCMP(nd->nd_principal, clp->lc_name,
5915 				clp->lc_namelen))
5916 				return (1);
5917 			else
5918 				return (0);
5919 		}
5920 		if (nd->nd_cred->cr_uid == clp->lc_uid)
5921 			return (0);
5922 		else
5923 			return (1);
5924 	} else if (clp->lc_flags & LCL_GSS)
5925 		return (1);
5926 	/*
5927 	 * For AUTH_SYS, allow the same uid or root. (This is underspecified
5928 	 * in RFC3530, which talks about principals, but doesn't say anything
5929 	 * about uids for AUTH_SYS.)
5930 	 */
5931 	if (nd->nd_cred->cr_uid == clp->lc_uid || nd->nd_cred->cr_uid == 0)
5932 		return (0);
5933 	else
5934 		return (1);
5935 }
5936 
5937 /*
5938  * Calculate the lease expiry time.
5939  */
5940 static time_t
5941 nfsrv_leaseexpiry(void)
5942 {
5943 
5944 	if (NFSD_VNET(nfsrv_stablefirst).nsf_eograce > NFSD_MONOSEC)
5945 		return (NFSD_MONOSEC + 2 * (nfsrv_lease + NFSRV_LEASEDELTA));
5946 	return (NFSD_MONOSEC + nfsrv_lease + NFSRV_LEASEDELTA);
5947 }
5948 
5949 /*
5950  * Delay the delegation timeout as far as ls_delegtimelimit, as required.
5951  */
5952 static void
5953 nfsrv_delaydelegtimeout(struct nfsstate *stp)
5954 {
5955 
5956 	if ((stp->ls_flags & NFSLCK_DELEGRECALL) == 0)
5957 		return;
5958 
5959 	if ((stp->ls_delegtime + 15) > NFSD_MONOSEC &&
5960 	    stp->ls_delegtime < stp->ls_delegtimelimit) {
5961 		stp->ls_delegtime += nfsrv_lease;
5962 		if (stp->ls_delegtime > stp->ls_delegtimelimit)
5963 			stp->ls_delegtime = stp->ls_delegtimelimit;
5964 	}
5965 }
5966 
5967 /*
5968  * This function checks to see if there is any other state associated
5969  * with the openowner for this Open.
5970  * It returns 1 if there is no other state, 0 otherwise.
5971  */
5972 static int
5973 nfsrv_nootherstate(struct nfsstate *stp)
5974 {
5975 	struct nfsstate *tstp;
5976 
5977 	LIST_FOREACH(tstp, &stp->ls_openowner->ls_open, ls_list) {
5978 		if (tstp != stp || !LIST_EMPTY(&tstp->ls_lock))
5979 			return (0);
5980 	}
5981 	return (1);
5982 }
5983 
5984 /*
5985  * Create a list of lock deltas (changes to local byte range locking
5986  * that can be rolled back using the list) and apply the changes via
5987  * nfsvno_advlock(). Optionally, lock the list. It is expected that either
5988  * the rollback or update function will be called after this.
5989  * It returns an error (and rolls back, as required), if any nfsvno_advlock()
5990  * call fails. If it returns an error, it will unlock the list.
5991  */
5992 static int
5993 nfsrv_locallock(vnode_t vp, struct nfslockfile *lfp, int flags,
5994     uint64_t first, uint64_t end, struct nfslockconflict *cfp, NFSPROC_T *p)
5995 {
5996 	struct nfslock *lop, *nlop;
5997 	int error = 0;
5998 
5999 	/* Loop through the list of locks. */
6000 	lop = LIST_FIRST(&lfp->lf_locallock);
6001 	while (first < end && lop != NULL) {
6002 		nlop = LIST_NEXT(lop, lo_lckowner);
6003 		if (first >= lop->lo_end) {
6004 			/* not there yet */
6005 			lop = nlop;
6006 		} else if (first < lop->lo_first) {
6007 			/* new one starts before entry in list */
6008 			if (end <= lop->lo_first) {
6009 				/* no overlap between old and new */
6010 				error = nfsrv_dolocal(vp, lfp, flags,
6011 				    NFSLCK_UNLOCK, first, end, cfp, p);
6012 				if (error != 0)
6013 					break;
6014 				first = end;
6015 			} else {
6016 				/* handle fragment overlapped with new one */
6017 				error = nfsrv_dolocal(vp, lfp, flags,
6018 				    NFSLCK_UNLOCK, first, lop->lo_first, cfp,
6019 				    p);
6020 				if (error != 0)
6021 					break;
6022 				first = lop->lo_first;
6023 			}
6024 		} else {
6025 			/* new one overlaps this entry in list */
6026 			if (end <= lop->lo_end) {
6027 				/* overlaps all of new one */
6028 				error = nfsrv_dolocal(vp, lfp, flags,
6029 				    lop->lo_flags, first, end, cfp, p);
6030 				if (error != 0)
6031 					break;
6032 				first = end;
6033 			} else {
6034 				/* handle fragment overlapped with new one */
6035 				error = nfsrv_dolocal(vp, lfp, flags,
6036 				    lop->lo_flags, first, lop->lo_end, cfp, p);
6037 				if (error != 0)
6038 					break;
6039 				first = lop->lo_end;
6040 				lop = nlop;
6041 			}
6042 		}
6043 	}
6044 	if (first < end && error == 0)
6045 		/* handle fragment past end of list */
6046 		error = nfsrv_dolocal(vp, lfp, flags, NFSLCK_UNLOCK, first,
6047 		    end, cfp, p);
6048 
6049 	NFSEXITCODE(error);
6050 	return (error);
6051 }
6052 
6053 /*
6054  * Local lock unlock. Unlock all byte ranges that are no longer locked
6055  * by NFSv4. To do this, unlock any subranges of first-->end that
6056  * do not overlap with the byte ranges of any lock in the lfp->lf_lock
6057  * list. This list has all locks for the file held by other
6058  * <clientid, lockowner> tuples. The list is ordered by increasing
6059  * lo_first value, but may have entries that overlap each other, for
6060  * the case of read locks.
6061  */
6062 static void
6063 nfsrv_localunlock(vnode_t vp, struct nfslockfile *lfp, uint64_t init_first,
6064     uint64_t init_end, NFSPROC_T *p)
6065 {
6066 	struct nfslock *lop;
6067 	uint64_t first, end, prevfirst __unused;
6068 
6069 	first = init_first;
6070 	end = init_end;
6071 	while (first < init_end) {
6072 		/* Loop through all nfs locks, adjusting first and end */
6073 		prevfirst = 0;
6074 		LIST_FOREACH(lop, &lfp->lf_lock, lo_lckfile) {
6075 			KASSERT(prevfirst <= lop->lo_first,
6076 			    ("nfsv4 locks out of order"));
6077 			KASSERT(lop->lo_first < lop->lo_end,
6078 			    ("nfsv4 bogus lock"));
6079 			prevfirst = lop->lo_first;
6080 			if (first >= lop->lo_first &&
6081 			    first < lop->lo_end)
6082 				/*
6083 				 * Overlaps with initial part, so trim
6084 				 * off that initial part by moving first past
6085 				 * it.
6086 				 */
6087 				first = lop->lo_end;
6088 			else if (end > lop->lo_first &&
6089 			    lop->lo_first > first) {
6090 				/*
6091 				 * This lock defines the end of the
6092 				 * segment to unlock, so set end to the
6093 				 * start of it and break out of the loop.
6094 				 */
6095 				end = lop->lo_first;
6096 				break;
6097 			}
6098 			if (first >= end)
6099 				/*
6100 				 * There is no segment left to do, so
6101 				 * break out of this loop and then exit
6102 				 * the outer while() since first will be set
6103 				 * to end, which must equal init_end here.
6104 				 */
6105 				break;
6106 		}
6107 		if (first < end) {
6108 			/* Unlock this segment */
6109 			(void) nfsrv_dolocal(vp, lfp, NFSLCK_UNLOCK,
6110 			    NFSLCK_READ, first, end, NULL, p);
6111 			nfsrv_locallock_commit(lfp, NFSLCK_UNLOCK,
6112 			    first, end);
6113 		}
6114 		/*
6115 		 * Now move past this segment and look for any further
6116 		 * segment in the range, if there is one.
6117 		 */
6118 		first = end;
6119 		end = init_end;
6120 	}
6121 }
6122 
6123 /*
6124  * Do the local lock operation and update the rollback list, as required.
6125  * Perform the rollback and return the error if nfsvno_advlock() fails.
6126  */
6127 static int
6128 nfsrv_dolocal(vnode_t vp, struct nfslockfile *lfp, int flags, int oldflags,
6129     uint64_t first, uint64_t end, struct nfslockconflict *cfp, NFSPROC_T *p)
6130 {
6131 	struct nfsrollback *rlp;
6132 	int error = 0, ltype, oldltype;
6133 
6134 	if (flags & NFSLCK_WRITE)
6135 		ltype = F_WRLCK;
6136 	else if (flags & NFSLCK_READ)
6137 		ltype = F_RDLCK;
6138 	else
6139 		ltype = F_UNLCK;
6140 	if (oldflags & NFSLCK_WRITE)
6141 		oldltype = F_WRLCK;
6142 	else if (oldflags & NFSLCK_READ)
6143 		oldltype = F_RDLCK;
6144 	else
6145 		oldltype = F_UNLCK;
6146 	if (ltype == oldltype || (oldltype == F_WRLCK && ltype == F_RDLCK))
6147 		/* nothing to do */
6148 		goto out;
6149 	error = nfsvno_advlock(vp, ltype, first, end, p);
6150 	if (error != 0) {
6151 		if (cfp != NULL) {
6152 			cfp->cl_clientid.lval[0] = 0;
6153 			cfp->cl_clientid.lval[1] = 0;
6154 			cfp->cl_first = 0;
6155 			cfp->cl_end = NFS64BITSSET;
6156 			cfp->cl_flags = NFSLCK_WRITE;
6157 			cfp->cl_ownerlen = 5;
6158 			NFSBCOPY("LOCAL", cfp->cl_owner, 5);
6159 		}
6160 		nfsrv_locallock_rollback(vp, lfp, p);
6161 	} else if (ltype != F_UNLCK) {
6162 		rlp = malloc(sizeof (struct nfsrollback), M_NFSDROLLBACK,
6163 		    M_WAITOK);
6164 		rlp->rlck_first = first;
6165 		rlp->rlck_end = end;
6166 		rlp->rlck_type = oldltype;
6167 		LIST_INSERT_HEAD(&lfp->lf_rollback, rlp, rlck_list);
6168 	}
6169 
6170 out:
6171 	NFSEXITCODE(error);
6172 	return (error);
6173 }
6174 
6175 /*
6176  * Roll back local lock changes and free up the rollback list.
6177  */
6178 static void
6179 nfsrv_locallock_rollback(vnode_t vp, struct nfslockfile *lfp, NFSPROC_T *p)
6180 {
6181 	struct nfsrollback *rlp, *nrlp;
6182 
6183 	LIST_FOREACH_SAFE(rlp, &lfp->lf_rollback, rlck_list, nrlp) {
6184 		(void) nfsvno_advlock(vp, rlp->rlck_type, rlp->rlck_first,
6185 		    rlp->rlck_end, p);
6186 		free(rlp, M_NFSDROLLBACK);
6187 	}
6188 	LIST_INIT(&lfp->lf_rollback);
6189 }
6190 
6191 /*
6192  * Update local lock list and delete rollback list (ie now committed to the
6193  * local locks). Most of the work is done by the internal function.
6194  */
6195 static void
6196 nfsrv_locallock_commit(struct nfslockfile *lfp, int flags, uint64_t first,
6197     uint64_t end)
6198 {
6199 	struct nfsrollback *rlp, *nrlp;
6200 	struct nfslock *new_lop, *other_lop;
6201 
6202 	new_lop = malloc(sizeof (struct nfslock), M_NFSDLOCK, M_WAITOK);
6203 	if (flags & (NFSLCK_READ | NFSLCK_WRITE))
6204 		other_lop = malloc(sizeof (struct nfslock), M_NFSDLOCK,
6205 		    M_WAITOK);
6206 	else
6207 		other_lop = NULL;
6208 	new_lop->lo_flags = flags;
6209 	new_lop->lo_first = first;
6210 	new_lop->lo_end = end;
6211 	nfsrv_updatelock(NULL, &new_lop, &other_lop, lfp);
6212 	if (new_lop != NULL)
6213 		free(new_lop, M_NFSDLOCK);
6214 	if (other_lop != NULL)
6215 		free(other_lop, M_NFSDLOCK);
6216 
6217 	/* and get rid of the rollback list */
6218 	LIST_FOREACH_SAFE(rlp, &lfp->lf_rollback, rlck_list, nrlp)
6219 		free(rlp, M_NFSDROLLBACK);
6220 	LIST_INIT(&lfp->lf_rollback);
6221 }
6222 
6223 /*
6224  * Lock the struct nfslockfile for local lock updating.
6225  */
6226 static void
6227 nfsrv_locklf(struct nfslockfile *lfp)
6228 {
6229 	int gotlock;
6230 
6231 	/* lf_usecount ensures *lfp won't be free'd */
6232 	lfp->lf_usecount++;
6233 	do {
6234 		gotlock = nfsv4_lock(&lfp->lf_locallock_lck, 1, NULL,
6235 		    NFSSTATEMUTEXPTR, NULL);
6236 	} while (gotlock == 0);
6237 	lfp->lf_usecount--;
6238 }
6239 
6240 /*
6241  * Unlock the struct nfslockfile after local lock updating.
6242  */
6243 static void
6244 nfsrv_unlocklf(struct nfslockfile *lfp)
6245 {
6246 
6247 	nfsv4_unlock(&lfp->lf_locallock_lck, 0);
6248 }
6249 
6250 /*
6251  * Clear out all state for the NFSv4 server.
6252  * Must be called by a thread that can sleep when no nfsds are running.
6253  */
6254 void
6255 nfsrv_throwawayallstate(NFSPROC_T *p)
6256 {
6257 	struct nfsclient *clp, *nclp;
6258 	struct nfslockfile *lfp, *nlfp;
6259 	int i;
6260 
6261 	/*
6262 	 * For each client, clean out the state and then free the structure.
6263 	 */
6264 	for (i = 0; i < nfsrv_clienthashsize; i++) {
6265 		LIST_FOREACH_SAFE(clp, &NFSD_VNET(nfsclienthash)[i], lc_hash,
6266 		    nclp) {
6267 			nfsrv_cleanclient(clp, p);
6268 			nfsrv_freedeleglist(&clp->lc_deleg);
6269 			nfsrv_freedeleglist(&clp->lc_olddeleg);
6270 			free(clp->lc_stateid, M_NFSDCLIENT);
6271 			free(clp, M_NFSDCLIENT);
6272 		}
6273 	}
6274 
6275 	/*
6276 	 * Also, free up any remaining lock file structures.
6277 	 */
6278 	for (i = 0; i < nfsrv_lockhashsize; i++) {
6279 		LIST_FOREACH_SAFE(lfp, &NFSD_VNET(nfslockhash)[i], lf_hash,
6280 		    nlfp) {
6281 			printf("nfsd unload: fnd a lock file struct\n");
6282 			nfsrv_freenfslockfile(lfp);
6283 		}
6284 	}
6285 
6286 	/* And get rid of the deviceid structures and layouts. */
6287 	nfsrv_freealllayoutsanddevids();
6288 }
6289 
6290 /*
6291  * Check the sequence# for the session and slot provided as an argument.
6292  * Also, renew the lease if the session will return NFS_OK.
6293  */
6294 int
6295 nfsrv_checksequence(struct nfsrv_descript *nd, uint32_t sequenceid,
6296     uint32_t *highest_slotidp, uint32_t *target_highest_slotidp, int cache_this,
6297     uint32_t *sflagsp, NFSPROC_T *p)
6298 {
6299 	struct nfsdsession *sep;
6300 	struct nfssessionhash *shp;
6301 	int error;
6302 
6303 	shp = NFSSESSIONHASH(nd->nd_sessionid);
6304 	NFSLOCKSESSION(shp);
6305 	sep = nfsrv_findsession(nd->nd_sessionid);
6306 	if (sep == NULL) {
6307 		NFSUNLOCKSESSION(shp);
6308 		return (NFSERR_BADSESSION);
6309 	}
6310 	error = nfsv4_seqsession(sequenceid, nd->nd_slotid, *highest_slotidp,
6311 	    sep->sess_slots, NULL, NFSV4_SLOTS - 1);
6312 	if (error != 0) {
6313 		NFSUNLOCKSESSION(shp);
6314 		return (error);
6315 	}
6316 	if (cache_this != 0)
6317 		nd->nd_flag |= ND_SAVEREPLY;
6318 	/* Renew the lease. */
6319 	sep->sess_clp->lc_expiry = nfsrv_leaseexpiry();
6320 	nd->nd_clientid.qval = sep->sess_clp->lc_clientid.qval;
6321 	nd->nd_flag |= ND_IMPLIEDCLID;
6322 
6323 	/* Handle the SP4_MECH_CRED case for NFSv4.1/4.2. */
6324 	if ((sep->sess_clp->lc_flags & LCL_MACHCRED) != 0 &&
6325 	    (nd->nd_flag & (ND_GSSINTEGRITY | ND_GSSPRIVACY)) != 0 &&
6326 	    nd->nd_princlen == sep->sess_clp->lc_namelen &&
6327 	    !NFSBCMP(sep->sess_clp->lc_name, nd->nd_principal,
6328 	    nd->nd_princlen)) {
6329 		nd->nd_flag |= ND_MACHCRED;
6330 		NFSSET_OPBIT(&nd->nd_allowops, &sep->sess_clp->lc_allowops);
6331 	}
6332 
6333 	/* Save maximum request and reply sizes. */
6334 	nd->nd_maxreq = sep->sess_maxreq;
6335 	nd->nd_maxresp = sep->sess_maxresp;
6336 
6337 	*sflagsp = 0;
6338 	if (sep->sess_clp->lc_req.nr_client == NULL ||
6339 	    (sep->sess_clp->lc_flags & LCL_CBDOWN) != 0)
6340 		*sflagsp |= NFSV4SEQ_CBPATHDOWN;
6341 	NFSUNLOCKSESSION(shp);
6342 	if (error == NFSERR_EXPIRED) {
6343 		*sflagsp |= NFSV4SEQ_EXPIREDALLSTATEREVOKED;
6344 		error = 0;
6345 	} else if (error == NFSERR_ADMINREVOKED) {
6346 		*sflagsp |= NFSV4SEQ_ADMINSTATEREVOKED;
6347 		error = 0;
6348 	}
6349 	*highest_slotidp = *target_highest_slotidp = NFSV4_SLOTS - 1;
6350 	return (0);
6351 }
6352 
6353 /*
6354  * Check/set reclaim complete for this session/clientid.
6355  */
6356 int
6357 nfsrv_checkreclaimcomplete(struct nfsrv_descript *nd, int onefs)
6358 {
6359 	struct nfsdsession *sep;
6360 	struct nfssessionhash *shp;
6361 	int error = 0;
6362 
6363 	shp = NFSSESSIONHASH(nd->nd_sessionid);
6364 	NFSLOCKSTATE();
6365 	NFSLOCKSESSION(shp);
6366 	sep = nfsrv_findsession(nd->nd_sessionid);
6367 	if (sep == NULL) {
6368 		NFSUNLOCKSESSION(shp);
6369 		NFSUNLOCKSTATE();
6370 		return (NFSERR_BADSESSION);
6371 	}
6372 
6373 	if (onefs != 0)
6374 		sep->sess_clp->lc_flags |= LCL_RECLAIMONEFS;
6375 		/* Check to see if reclaim complete has already happened. */
6376 	else if ((sep->sess_clp->lc_flags & LCL_RECLAIMCOMPLETE) != 0)
6377 		error = NFSERR_COMPLETEALREADY;
6378 	else {
6379 		sep->sess_clp->lc_flags |= LCL_RECLAIMCOMPLETE;
6380 		nfsrv_markreclaim(sep->sess_clp);
6381 	}
6382 	NFSUNLOCKSESSION(shp);
6383 	NFSUNLOCKSTATE();
6384 	return (error);
6385 }
6386 
6387 /*
6388  * Cache the reply in a session slot.
6389  */
6390 void
6391 nfsrv_cache_session(struct nfsrv_descript *nd, struct mbuf **m)
6392 {
6393 	struct nfsdsession *sep;
6394 	struct nfssessionhash *shp;
6395 	char *buf, *cp;
6396 #ifdef INET
6397 	struct sockaddr_in *sin;
6398 #endif
6399 #ifdef INET6
6400 	struct sockaddr_in6 *sin6;
6401 #endif
6402 
6403 	shp = NFSSESSIONHASH(nd->nd_sessionid);
6404 	NFSLOCKSESSION(shp);
6405 	sep = nfsrv_findsession(nd->nd_sessionid);
6406 	if (sep == NULL) {
6407 		NFSUNLOCKSESSION(shp);
6408 		if ((NFSD_VNET(nfsrv_stablefirst).nsf_flags &
6409 		     NFSNSF_GRACEOVER) != 0) {
6410 			buf = malloc(INET6_ADDRSTRLEN, M_TEMP, M_WAITOK);
6411 			switch (nd->nd_nam->sa_family) {
6412 #ifdef INET
6413 			case AF_INET:
6414 				sin = (struct sockaddr_in *)nd->nd_nam;
6415 				cp = inet_ntop(sin->sin_family,
6416 				    &sin->sin_addr.s_addr, buf,
6417 				    INET6_ADDRSTRLEN);
6418 				break;
6419 #endif
6420 #ifdef INET6
6421 			case AF_INET6:
6422 				sin6 = (struct sockaddr_in6 *)nd->nd_nam;
6423 				cp = inet_ntop(sin6->sin6_family,
6424 				    &sin6->sin6_addr, buf, INET6_ADDRSTRLEN);
6425 				break;
6426 #endif
6427 			default:
6428 				cp = NULL;
6429 			}
6430 			if (cp != NULL)
6431 				printf("nfsrv_cache_session: no session "
6432 				    "IPaddr=%s, check NFS clients for unique "
6433 				    "/etc/hostid's\n", cp);
6434 			else
6435 				printf("nfsrv_cache_session: no session, "
6436 				    "check NFS clients for unique "
6437 				    "/etc/hostid's\n");
6438 			free(buf, M_TEMP);
6439 		}
6440 		m_freem(*m);
6441 		return;
6442 	}
6443 	nfsv4_seqsess_cacherep(nd->nd_slotid, sep->sess_slots, nd->nd_repstat,
6444 	    m);
6445 	NFSUNLOCKSESSION(shp);
6446 }
6447 
6448 /*
6449  * Search for a session that matches the sessionid.
6450  */
6451 static struct nfsdsession *
6452 nfsrv_findsession(uint8_t *sessionid)
6453 {
6454 	struct nfsdsession *sep;
6455 	struct nfssessionhash *shp;
6456 
6457 	shp = NFSSESSIONHASH(sessionid);
6458 	LIST_FOREACH(sep, &shp->list, sess_hash) {
6459 		if (!NFSBCMP(sessionid, sep->sess_sessionid, NFSX_V4SESSIONID))
6460 			break;
6461 	}
6462 	return (sep);
6463 }
6464 
6465 /*
6466  * Destroy a session.
6467  */
6468 int
6469 nfsrv_destroysession(struct nfsrv_descript *nd, uint8_t *sessionid)
6470 {
6471 	int error, igotlock, samesess;
6472 
6473 	samesess = 0;
6474 	if (!NFSBCMP(sessionid, nd->nd_sessionid, NFSX_V4SESSIONID) &&
6475 	    (nd->nd_flag & ND_HASSEQUENCE) != 0) {
6476 		samesess = 1;
6477 		if ((nd->nd_flag & ND_LASTOP) == 0)
6478 			return (NFSERR_BADSESSION);
6479 	}
6480 
6481 	/* Lock out other nfsd threads */
6482 	NFSLOCKV4ROOTMUTEX();
6483 	nfsv4_relref(&nfsv4rootfs_lock);
6484 	do {
6485 		igotlock = nfsv4_lock(&nfsv4rootfs_lock, 1, NULL,
6486 		    NFSV4ROOTLOCKMUTEXPTR, NULL);
6487 	} while (igotlock == 0);
6488 	NFSUNLOCKV4ROOTMUTEX();
6489 
6490 	error = nfsrv_freesession(nd, NULL, sessionid);
6491 	if (error == 0 && samesess != 0)
6492 		nd->nd_flag &= ~ND_HASSEQUENCE;
6493 
6494 	NFSLOCKV4ROOTMUTEX();
6495 	nfsv4_unlock(&nfsv4rootfs_lock, 1);
6496 	NFSUNLOCKV4ROOTMUTEX();
6497 	return (error);
6498 }
6499 
6500 /*
6501  * Bind a connection to a session.
6502  * For now, only certain variants are supported, since the current session
6503  * structure can only handle a single backchannel entry, which will be
6504  * applied to all connections if it is set.
6505  */
6506 int
6507 nfsrv_bindconnsess(struct nfsrv_descript *nd, uint8_t *sessionid, int *foreaftp)
6508 {
6509 	struct nfssessionhash *shp;
6510 	struct nfsdsession *sep;
6511 	struct nfsclient *clp;
6512 	SVCXPRT *savxprt;
6513 	int error;
6514 
6515 	error = 0;
6516 	savxprt = NULL;
6517 	shp = NFSSESSIONHASH(sessionid);
6518 	NFSLOCKSTATE();
6519 	NFSLOCKSESSION(shp);
6520 	sep = nfsrv_findsession(sessionid);
6521 	if (sep != NULL) {
6522 		clp = sep->sess_clp;
6523 		error = nfsrv_checkmachcred(NFSV4OP_BINDCONNTOSESS, nd, clp);
6524 		if (error != 0)
6525 			goto out;
6526 		if (*foreaftp == NFSCDFC4_BACK ||
6527 		    *foreaftp == NFSCDFC4_BACK_OR_BOTH ||
6528 		    *foreaftp == NFSCDFC4_FORE_OR_BOTH) {
6529 			/* Try to set up a backchannel. */
6530 			if (clp->lc_req.nr_client == NULL) {
6531 				NFSD_DEBUG(2, "nfsrv_bindconnsess: acquire "
6532 				    "backchannel\n");
6533 				clp->lc_req.nr_client = (struct __rpc_client *)
6534 				    clnt_bck_create(nd->nd_xprt->xp_socket,
6535 				    sep->sess_cbprogram, NFSV4_CBVERS);
6536 			}
6537 			if (clp->lc_req.nr_client != NULL) {
6538 				NFSD_DEBUG(2, "nfsrv_bindconnsess: set up "
6539 				    "backchannel\n");
6540 				savxprt = sep->sess_cbsess.nfsess_xprt;
6541 				SVC_ACQUIRE(nd->nd_xprt);
6542 				CLNT_ACQUIRE(clp->lc_req.nr_client);
6543 				nd->nd_xprt->xp_p2 = clp->lc_req.nr_client;
6544 				/* Disable idle timeout. */
6545 				nd->nd_xprt->xp_idletimeout = 0;
6546 				sep->sess_cbsess.nfsess_xprt = nd->nd_xprt;
6547 				sep->sess_crflags |= NFSV4CRSESS_CONNBACKCHAN;
6548 				clp->lc_flags |= LCL_DONEBINDCONN |
6549 				    LCL_NEEDSCBNULL;
6550 				clp->lc_flags &= ~LCL_CBDOWN;
6551 				if (*foreaftp == NFSCDFS4_BACK)
6552 					*foreaftp = NFSCDFS4_BACK;
6553 				else
6554 					*foreaftp = NFSCDFS4_BOTH;
6555 			} else if (*foreaftp != NFSCDFC4_BACK) {
6556 				NFSD_DEBUG(2, "nfsrv_bindconnsess: can't set "
6557 				    "up backchannel\n");
6558 				sep->sess_crflags &= ~NFSV4CRSESS_CONNBACKCHAN;
6559 				clp->lc_flags |= LCL_DONEBINDCONN;
6560 				*foreaftp = NFSCDFS4_FORE;
6561 			} else {
6562 				error = NFSERR_NOTSUPP;
6563 				printf("nfsrv_bindconnsess: Can't add "
6564 				    "backchannel\n");
6565 			}
6566 		} else {
6567 			NFSD_DEBUG(2, "nfsrv_bindconnsess: Set forechannel\n");
6568 			clp->lc_flags |= LCL_DONEBINDCONN;
6569 			*foreaftp = NFSCDFS4_FORE;
6570 		}
6571 	} else
6572 		error = NFSERR_BADSESSION;
6573 out:
6574 	NFSUNLOCKSESSION(shp);
6575 	NFSUNLOCKSTATE();
6576 	if (savxprt != NULL)
6577 		SVC_RELEASE(savxprt);
6578 	return (error);
6579 }
6580 
6581 /*
6582  * Free up a session structure.
6583  */
6584 static int
6585 nfsrv_freesession(struct nfsrv_descript *nd, struct nfsdsession *sep,
6586     uint8_t *sessionid)
6587 {
6588 	struct nfssessionhash *shp;
6589 	int i;
6590 
6591 	NFSLOCKSTATE();
6592 	if (sep == NULL) {
6593 		shp = NFSSESSIONHASH(sessionid);
6594 		NFSLOCKSESSION(shp);
6595 		sep = nfsrv_findsession(sessionid);
6596 	} else {
6597 		shp = NFSSESSIONHASH(sep->sess_sessionid);
6598 		NFSLOCKSESSION(shp);
6599 	}
6600 	if (sep != NULL) {
6601 		/* Check for the SP4_MACH_CRED case. */
6602 		if (nd != NULL && nfsrv_checkmachcred(NFSV4OP_DESTROYSESSION,
6603 		    nd, sep->sess_clp) != 0) {
6604 			NFSUNLOCKSESSION(shp);
6605 			NFSUNLOCKSTATE();
6606 			return (NFSERR_AUTHERR | AUTH_TOOWEAK);
6607 		}
6608 
6609 		sep->sess_refcnt--;
6610 		if (sep->sess_refcnt > 0) {
6611 			NFSUNLOCKSESSION(shp);
6612 			NFSUNLOCKSTATE();
6613 			return (NFSERR_BACKCHANBUSY);
6614 		}
6615 		LIST_REMOVE(sep, sess_hash);
6616 		LIST_REMOVE(sep, sess_list);
6617 	}
6618 	NFSUNLOCKSESSION(shp);
6619 	NFSUNLOCKSTATE();
6620 	if (sep == NULL)
6621 		return (NFSERR_BADSESSION);
6622 	for (i = 0; i < NFSV4_SLOTS; i++)
6623 		if (sep->sess_slots[i].nfssl_reply != NULL)
6624 			m_freem(sep->sess_slots[i].nfssl_reply);
6625 	if (sep->sess_cbsess.nfsess_xprt != NULL)
6626 		SVC_RELEASE(sep->sess_cbsess.nfsess_xprt);
6627 	free(sep, M_NFSDSESSION);
6628 	return (0);
6629 }
6630 
6631 /*
6632  * Free a stateid.
6633  * RFC5661 says that it should fail when there are associated opens, locks
6634  * or delegations. Since stateids represent opens, I don't see how you can
6635  * free an open stateid (it will be free'd when closed), so this function
6636  * only works for lock stateids (freeing the lock_owner) or delegations.
6637  */
6638 int
6639 nfsrv_freestateid(struct nfsrv_descript *nd, nfsv4stateid_t *stateidp,
6640     NFSPROC_T *p)
6641 {
6642 	struct nfsclient *clp;
6643 	struct nfsstate *stp;
6644 	int error;
6645 
6646 	NFSLOCKSTATE();
6647 	/*
6648 	 * Look up the stateid
6649 	 */
6650 	error = nfsrv_getclient((nfsquad_t)((u_quad_t)0), CLOPS_RENEW, &clp,
6651 	    NULL, (nfsquad_t)((u_quad_t)0), 0, nd, p);
6652 	if (error == 0) {
6653 		/* First, check for a delegation. */
6654 		LIST_FOREACH(stp, &clp->lc_deleg, ls_list) {
6655 			if (!NFSBCMP(stp->ls_stateid.other, stateidp->other,
6656 			    NFSX_STATEIDOTHER))
6657 				break;
6658 		}
6659 		if (stp != NULL) {
6660 			nfsrv_freedeleg(stp);
6661 			NFSUNLOCKSTATE();
6662 			return (error);
6663 		}
6664 	}
6665 	/* Not a delegation, try for a lock_owner. */
6666 	if (error == 0)
6667 		error = nfsrv_getstate(clp, stateidp, 0, &stp);
6668 	if (error == 0 && ((stp->ls_flags & (NFSLCK_OPEN | NFSLCK_DELEGREAD |
6669 	    NFSLCK_DELEGWRITE)) != 0 || (stp->ls_flags & NFSLCK_LOCK) == 0))
6670 		/* Not a lock_owner stateid. */
6671 		error = NFSERR_LOCKSHELD;
6672 	if (error == 0 && !LIST_EMPTY(&stp->ls_lock))
6673 		error = NFSERR_LOCKSHELD;
6674 	if (error == 0)
6675 		nfsrv_freelockowner(stp, NULL, 0, p);
6676 	NFSUNLOCKSTATE();
6677 	return (error);
6678 }
6679 
6680 /*
6681  * Test a stateid.
6682  */
6683 int
6684 nfsrv_teststateid(struct nfsrv_descript *nd, nfsv4stateid_t *stateidp,
6685     NFSPROC_T *p)
6686 {
6687 	struct nfsclient *clp;
6688 	struct nfsstate *stp;
6689 	int error;
6690 
6691 	NFSLOCKSTATE();
6692 	/*
6693 	 * Look up the stateid
6694 	 */
6695 	error = nfsrv_getclient((nfsquad_t)((u_quad_t)0), CLOPS_RENEW, &clp,
6696 	    NULL, (nfsquad_t)((u_quad_t)0), 0, nd, p);
6697 	if (error == 0)
6698 		error = nfsrv_getstate(clp, stateidp, 0, &stp);
6699 	if (error == 0 && stateidp->seqid != 0 &&
6700 	    SEQ_LT(stateidp->seqid, stp->ls_stateid.seqid))
6701 		error = NFSERR_OLDSTATEID;
6702 	NFSUNLOCKSTATE();
6703 	return (error);
6704 }
6705 
6706 /*
6707  * Generate the xdr for an NFSv4.1 CBSequence Operation.
6708  */
6709 static int
6710 nfsv4_setcbsequence(struct nfsrv_descript *nd, struct nfsclient *clp,
6711     int dont_replycache, struct nfsdsession **sepp, int *slotposp)
6712 {
6713 	struct nfsdsession *sep;
6714 	uint32_t *tl, slotseq = 0;
6715 	int maxslot;
6716 	uint8_t sessionid[NFSX_V4SESSIONID];
6717 	int error;
6718 
6719 	error = nfsv4_getcbsession(clp, sepp);
6720 	if (error != 0)
6721 		return (error);
6722 	sep = *sepp;
6723 	nfsv4_sequencelookup(NULL, &sep->sess_cbsess, slotposp, &maxslot,
6724 	    &slotseq, sessionid, true);
6725 	KASSERT(maxslot >= 0, ("nfsv4_setcbsequence neg maxslot"));
6726 
6727 	/* Build the Sequence arguments. */
6728 	NFSM_BUILD(tl, uint32_t *, NFSX_V4SESSIONID + 5 * NFSX_UNSIGNED);
6729 	bcopy(sessionid, tl, NFSX_V4SESSIONID);
6730 	tl += NFSX_V4SESSIONID / NFSX_UNSIGNED;
6731 	nd->nd_slotseq = tl;
6732 	nd->nd_slotid = *slotposp;
6733 	nd->nd_flag |= ND_HASSLOTID;
6734 	*tl++ = txdr_unsigned(slotseq);
6735 	*tl++ = txdr_unsigned(*slotposp);
6736 	*tl++ = txdr_unsigned(maxslot);
6737 	if (dont_replycache == 0)
6738 		*tl++ = newnfs_true;
6739 	else
6740 		*tl++ = newnfs_false;
6741 	*tl = 0;			/* No referring call list, for now. */
6742 	nd->nd_flag |= ND_HASSEQUENCE;
6743 	return (0);
6744 }
6745 
6746 /*
6747  * Get a session for the callback.
6748  */
6749 static int
6750 nfsv4_getcbsession(struct nfsclient *clp, struct nfsdsession **sepp)
6751 {
6752 	struct nfsdsession *sep;
6753 
6754 	NFSLOCKSTATE();
6755 	LIST_FOREACH(sep, &clp->lc_session, sess_list) {
6756 		if ((sep->sess_crflags & NFSV4CRSESS_CONNBACKCHAN) != 0)
6757 			break;
6758 	}
6759 	if (sep == NULL) {
6760 		NFSUNLOCKSTATE();
6761 		return (NFSERR_BADSESSION);
6762 	}
6763 	sep->sess_refcnt++;
6764 	*sepp = sep;
6765 	NFSUNLOCKSTATE();
6766 	return (0);
6767 }
6768 
6769 /*
6770  * Free up all backchannel xprts.  This needs to be done when the nfsd threads
6771  * exit, since those transports will all be going away.
6772  * This is only called after all the nfsd threads are done performing RPCs,
6773  * so locking shouldn't be an issue.
6774  */
6775 void
6776 nfsrv_freeallbackchannel_xprts(void)
6777 {
6778 	struct nfsdsession *sep;
6779 	struct nfsclient *clp;
6780 	SVCXPRT *xprt;
6781 	int i;
6782 
6783 	for (i = 0; i < nfsrv_clienthashsize; i++) {
6784 		LIST_FOREACH(clp, &NFSD_VNET(nfsclienthash)[i], lc_hash) {
6785 			LIST_FOREACH(sep, &clp->lc_session, sess_list) {
6786 				xprt = sep->sess_cbsess.nfsess_xprt;
6787 				sep->sess_cbsess.nfsess_xprt = NULL;
6788 				if (xprt != NULL)
6789 					SVC_RELEASE(xprt);
6790 			}
6791 		}
6792 	}
6793 }
6794 
6795 /*
6796  * Do a layout commit.  Actually just call nfsrv_updatemdsattr().
6797  * I have no idea if the rest of these arguments will ever be useful?
6798  */
6799 int
6800 nfsrv_layoutcommit(struct nfsrv_descript *nd, vnode_t vp, int layouttype,
6801     int hasnewoff, uint64_t newoff, uint64_t offset, uint64_t len,
6802     int hasnewmtime, struct timespec *newmtimep, int reclaim,
6803     nfsv4stateid_t *stateidp, int maxcnt, char *layp, int *hasnewsizep,
6804     uint64_t *newsizep, struct ucred *cred, NFSPROC_T *p)
6805 {
6806 	struct nfsvattr na;
6807 	int error;
6808 
6809 	error = nfsrv_updatemdsattr(vp, &na, p);
6810 	if (error == 0) {
6811 		*hasnewsizep = 1;
6812 		*newsizep = na.na_size;
6813 	}
6814 	return (error);
6815 }
6816 
/*
 * Try and get a layout.
 * If a layout for this client, file handle and layout type already exists,
 * its xdr is copied to "layp" and its stateid seqid is incremented.
 * Otherwise a new File or Flex File layout is built and handed to
 * nfsrv_addlayout().
 * On success *iomode is NFSLAYOUTIOMODE_READ or NFSLAYOUTIOMODE_RW,
 * *offset is 0, *len is UINT64_MAX, *retonclose is 0 and *layoutlenp is the
 * length of the xdr copied to "layp".
 * Returns 0 or an NFSERR_xxx value.
 */
int
nfsrv_layoutget(struct nfsrv_descript *nd, vnode_t vp, struct nfsexstuff *exp,
    int layouttype, int *iomode, uint64_t *offset, uint64_t *len,
    uint64_t minlen, nfsv4stateid_t *stateidp, int maxcnt, int *retonclose,
    int *layoutlenp, char *layp, struct ucred *cred, NFSPROC_T *p)
{
	struct nfslayouthash *lhyp;
	struct nfslayout *lyp;
	char *devid;
	fhandle_t fh, *dsfhp;
	int error, mirrorcnt;

	/* Without any device ids, no layout can be issued. */
	if (nfsrv_devidcnt == 0)
		return (NFSERR_UNKNLAYOUTTYPE);

	/* A non-zero offset is only logged, since it is reset to 0 below. */
	if (*offset != 0)
		printf("nfsrv_layoutget: off=%ju len=%ju\n", (uintmax_t)*offset,
		    (uintmax_t)*len);
	error = nfsvno_getfh(vp, &fh, p);
	NFSD_DEBUG(4, "layoutget getfh=%d\n", error);
	if (error != 0)
		return (error);

	/*
	 * For now, all layouts are for entire files.
	 * Only issue Read/Write layouts if requested for a non-readonly fs.
	 */
	if (NFSVNO_EXRDONLY(exp)) {
		if (*iomode == NFSLAYOUTIOMODE_RW)
			return (NFSERR_LAYOUTTRYLATER);
		*iomode = NFSLAYOUTIOMODE_READ;
	}
	/* Any iomode other than Read/Write is treated as Read. */
	if (*iomode != NFSLAYOUTIOMODE_RW)
		*iomode = NFSLAYOUTIOMODE_READ;

	/*
	 * Check to see if a write layout can be issued for this file.
	 * This is used during mirror recovery to avoid RW layouts being
	 * issued for a file while it is being copied to the recovered
	 * mirror.
	 */
	if (*iomode == NFSLAYOUTIOMODE_RW && nfsrv_dontlayout(&fh) != 0)
		return (NFSERR_LAYOUTTRYLATER);

	/* Reply values for a layout that covers the entire file. */
	*retonclose = 0;
	*offset = 0;
	*len = UINT64_MAX;

	/* First, see if a layout already exists and return if found. */
	lhyp = NFSLAYOUTHASH(&fh);
	NFSLOCKLAYOUT(lhyp);
	error = nfsrv_findlayout(&nd->nd_clientid, &fh, layouttype, p, &lyp);
	NFSD_DEBUG(4, "layoutget findlay=%d\n", error);
	/*
	 * Not sure if the seqid must be the same, so I won't check it.
	 */
	if (error == 0 && (stateidp->other[0] != lyp->lay_stateid.other[0] ||
	    stateidp->other[1] != lyp->lay_stateid.other[1] ||
	    stateidp->other[2] != lyp->lay_stateid.other[2])) {
		/* Mismatched stateid and no recall callback in progress. */
		if ((lyp->lay_flags & NFSLAY_CALLB) == 0) {
			NFSUNLOCKLAYOUT(lhyp);
			NFSD_DEBUG(1, "ret bad stateid\n");
			return (NFSERR_BADSTATEID);
		}
		/*
		 * I believe we get here because there is a race between
		 * the client processing the CBLAYOUTRECALL and the layout
		 * being deleted here on the server.
		 * The client has now done a LayoutGet with a non-layout
		 * stateid, as it would when there is no layout.
		 * As such, free this layout and set error == NFSERR_BADSTATEID
		 * so the code below will create a new layout structure as
		 * would happen if no layout was found.
		 * "lyp" will be set before being used below, but set it NULL
		 * as a safety belt.
		 */
		nfsrv_freelayout(&lhyp->list, lyp);
		lyp = NULL;
		error = NFSERR_BADSTATEID;
	}
	if (error == 0) {
		/* The existing layout must fit in the caller's buffer. */
		if (lyp->lay_layoutlen > maxcnt) {
			NFSUNLOCKLAYOUT(lhyp);
			NFSD_DEBUG(1, "ret layout too small\n");
			return (NFSERR_TOOSMALL);
		}
		if (*iomode == NFSLAYOUTIOMODE_RW) {
			/* No Read/Write layout while marked out of space. */
			if ((lyp->lay_flags & NFSLAY_NOSPC) != 0) {
				NFSUNLOCKLAYOUT(lhyp);
				NFSD_DEBUG(1, "ret layout nospace\n");
				return (NFSERR_NOSPC);
			}
			lyp->lay_flags |= NFSLAY_RW;
		} else
			lyp->lay_flags |= NFSLAY_READ;
		NFSBCOPY(lyp->lay_xdr, layp, lyp->lay_layoutlen);
		*layoutlenp = lyp->lay_layoutlen;
		/* Bump the seqid, skipping over zero when it wraps. */
		if (++lyp->lay_stateid.seqid == 0)
			lyp->lay_stateid.seqid = 1;
		stateidp->seqid = lyp->lay_stateid.seqid;
		NFSUNLOCKLAYOUT(lhyp);
		NFSD_DEBUG(4, "ret fnd layout\n");
		return (0);
	}
	NFSUNLOCKLAYOUT(lhyp);

	/* Find the device id and file handle. */
	dsfhp = malloc(sizeof(fhandle_t) * NFSDEV_MAXMIRRORS, M_TEMP, M_WAITOK);
	devid = malloc(NFSX_V4DEVICEID * NFSDEV_MAXMIRRORS, M_TEMP, M_WAITOK);
	error = nfsrv_dsgetdevandfh(vp, p, &mirrorcnt, dsfhp, devid);
	NFSD_DEBUG(4, "layoutget devandfh=%d\n", error);
	if (error == 0) {
		/* Only build the layout if its xdr will fit in maxcnt. */
		if (layouttype == NFSLAYOUT_NFSV4_1_FILES) {
			if (NFSX_V4FILELAYOUT > maxcnt)
				error = NFSERR_TOOSMALL;
			else
				lyp = nfsrv_filelayout(nd, *iomode, &fh, dsfhp,
				    devid, vp->v_mount->mnt_stat.f_fsid);
		} else {
			if (NFSX_V4FLEXLAYOUT(mirrorcnt) > maxcnt)
				error = NFSERR_TOOSMALL;
			else
				lyp = nfsrv_flexlayout(nd, *iomode, mirrorcnt,
				    &fh, dsfhp, devid,
				    vp->v_mount->mnt_stat.f_fsid);
		}
	}
	free(dsfhp, M_TEMP);
	free(devid, M_TEMP);
	if (error != 0)
		return (error);

	/*
	 * Now, add this layout to the list.
	 */
	error = nfsrv_addlayout(nd, &lyp, stateidp, layp, layoutlenp, p);
	NFSD_DEBUG(4, "layoutget addl=%d\n", error);
	/*
	 * The lyp will be set to NULL by nfsrv_addlayout() if it
	 * linked the new structure into the lists.
	 */
	free(lyp, M_NFSDSTATE);
	return (error);
}
6964 
6965 /*
6966  * Generate a File Layout.
6967  */
6968 static struct nfslayout *
6969 nfsrv_filelayout(struct nfsrv_descript *nd, int iomode, fhandle_t *fhp,
6970     fhandle_t *dsfhp, char *devid, fsid_t fs)
6971 {
6972 	uint32_t *tl;
6973 	struct nfslayout *lyp;
6974 	uint64_t pattern_offset;
6975 
6976 	lyp = malloc(sizeof(struct nfslayout) + NFSX_V4FILELAYOUT, M_NFSDSTATE,
6977 	    M_WAITOK | M_ZERO);
6978 	lyp->lay_type = NFSLAYOUT_NFSV4_1_FILES;
6979 	if (iomode == NFSLAYOUTIOMODE_RW)
6980 		lyp->lay_flags = NFSLAY_RW;
6981 	else
6982 		lyp->lay_flags = NFSLAY_READ;
6983 	NFSBCOPY(fhp, &lyp->lay_fh, sizeof(*fhp));
6984 	lyp->lay_clientid.qval = nd->nd_clientid.qval;
6985 	lyp->lay_fsid = fs;
6986 	NFSBCOPY(devid, lyp->lay_deviceid, NFSX_V4DEVICEID);
6987 
6988 	/* Fill in the xdr for the files layout. */
6989 	tl = (uint32_t *)lyp->lay_xdr;
6990 	NFSBCOPY(devid, tl, NFSX_V4DEVICEID);		/* Device ID. */
6991 	tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED);
6992 
6993 	/* Set the stripe size to the maximum I/O size. */
6994 	*tl++ = txdr_unsigned(nfs_srvmaxio & NFSFLAYUTIL_STRIPE_MASK);
6995 	*tl++ = 0;					/* 1st stripe index. */
6996 	pattern_offset = 0;
6997 	txdr_hyper(pattern_offset, tl); tl += 2;	/* Pattern offset. */
6998 	*tl++ = txdr_unsigned(1);			/* 1 file handle. */
6999 	*tl++ = txdr_unsigned(NFSX_V4PNFSFH);
7000 	NFSBCOPY(dsfhp, tl, sizeof(*dsfhp));
7001 	lyp->lay_layoutlen = NFSX_V4FILELAYOUT;
7002 	return (lyp);
7003 }
7004 
7005 #define	FLEX_OWNERID	"999"
7006 #define	FLEX_UID0	"0"
7007 /*
7008  * Generate a Flex File Layout.
7009  * The FLEX_OWNERID can be any string of 3 decimal digits. Although this
7010  * string goes on the wire, it isn't supposed to be used by the client,
7011  * since this server uses tight coupling.
7012  * Although not recommended by the spec., if vfs.nfsd.flexlinuxhack=1 use
7013  * a string of "0". This works around the Linux Flex File Layout driver bug
7014  * which uses the synthetic uid/gid strings for the "tightly coupled" case.
7015  */
7016 static struct nfslayout *
7017 nfsrv_flexlayout(struct nfsrv_descript *nd, int iomode, int mirrorcnt,
7018     fhandle_t *fhp, fhandle_t *dsfhp, char *devid, fsid_t fs)
7019 {
7020 	uint32_t *tl;
7021 	struct nfslayout *lyp;
7022 	uint64_t lenval;
7023 	int i;
7024 
7025 	lyp = malloc(sizeof(struct nfslayout) + NFSX_V4FLEXLAYOUT(mirrorcnt),
7026 	    M_NFSDSTATE, M_WAITOK | M_ZERO);
7027 	lyp->lay_type = NFSLAYOUT_FLEXFILE;
7028 	if (iomode == NFSLAYOUTIOMODE_RW)
7029 		lyp->lay_flags = NFSLAY_RW;
7030 	else
7031 		lyp->lay_flags = NFSLAY_READ;
7032 	NFSBCOPY(fhp, &lyp->lay_fh, sizeof(*fhp));
7033 	lyp->lay_clientid.qval = nd->nd_clientid.qval;
7034 	lyp->lay_fsid = fs;
7035 	lyp->lay_mirrorcnt = mirrorcnt;
7036 	NFSBCOPY(devid, lyp->lay_deviceid, NFSX_V4DEVICEID);
7037 
7038 	/* Fill in the xdr for the files layout. */
7039 	tl = (uint32_t *)lyp->lay_xdr;
7040 	lenval = 0;
7041 	txdr_hyper(lenval, tl); tl += 2;		/* Stripe unit. */
7042 	*tl++ = txdr_unsigned(mirrorcnt);		/* # of mirrors. */
7043 	for (i = 0; i < mirrorcnt; i++) {
7044 		*tl++ = txdr_unsigned(1);		/* One stripe. */
7045 		NFSBCOPY(devid, tl, NFSX_V4DEVICEID);	/* Device ID. */
7046 		tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED);
7047 		devid += NFSX_V4DEVICEID;
7048 		*tl++ = txdr_unsigned(1);		/* Efficiency. */
7049 		*tl++ = 0;				/* Proxy Stateid. */
7050 		*tl++ = 0x55555555;
7051 		*tl++ = 0x55555555;
7052 		*tl++ = 0x55555555;
7053 		*tl++ = txdr_unsigned(1);		/* 1 file handle. */
7054 		*tl++ = txdr_unsigned(NFSX_V4PNFSFH);
7055 		NFSBCOPY(dsfhp, tl, sizeof(*dsfhp));
7056 		tl += (NFSM_RNDUP(NFSX_V4PNFSFH) / NFSX_UNSIGNED);
7057 		dsfhp++;
7058 		if (nfsrv_flexlinuxhack != 0) {
7059 			*tl++ = txdr_unsigned(strlen(FLEX_UID0));
7060 			*tl = 0;		/* 0 pad string. */
7061 			NFSBCOPY(FLEX_UID0, tl++, strlen(FLEX_UID0));
7062 			*tl++ = txdr_unsigned(strlen(FLEX_UID0));
7063 			*tl = 0;		/* 0 pad string. */
7064 			NFSBCOPY(FLEX_UID0, tl++, strlen(FLEX_UID0));
7065 		} else {
7066 			*tl++ = txdr_unsigned(strlen(FLEX_OWNERID));
7067 			NFSBCOPY(FLEX_OWNERID, tl++, NFSX_UNSIGNED);
7068 			*tl++ = txdr_unsigned(strlen(FLEX_OWNERID));
7069 			NFSBCOPY(FLEX_OWNERID, tl++, NFSX_UNSIGNED);
7070 		}
7071 	}
7072 	*tl++ = txdr_unsigned(0);		/* ff_flags. */
7073 	*tl = txdr_unsigned(60);		/* Status interval hint. */
7074 	lyp->lay_layoutlen = NFSX_V4FLEXLAYOUT(mirrorcnt);
7075 	return (lyp);
7076 }
7077 
/*
 * Parse and process Flex File errors returned via LayoutReturn.
 * "layp" points at the xdr to be parsed and "maxcnt" is the byte count
 * used to bound the parse.  For each device error found, the mirror is
 * disabled via nfsrv_delds() unless the status is NFSERR_ACCES,
 * NFSERR_STALE or NFSERR_NOSPC.  For NFSERR_NOSPC, nfsrv_marknospc() is
 * called instead.
 */
static void
nfsrv_flexlayouterr(struct nfsrv_descript *nd, uint32_t *layp, int maxcnt,
    NFSPROC_T *p)
{
	uint32_t *tl;
	int cnt, errcnt, i, j, opnum, stat;
	char devid[NFSX_V4DEVICEID];

	tl = layp;
	/* Only get the count of entries if there is data after it. */
	maxcnt -= NFSX_UNSIGNED;
	if (maxcnt > 0)
		cnt = fxdr_unsigned(int, *tl++);
	else
		cnt = 0;
	NFSD_DEBUG(4, "flexlayouterr cnt=%d\n", cnt);
	for (i = 0; i < cnt; i++) {
		/* Charge for offset, length, stateid and the error count. */
		maxcnt -= NFSX_STATEID + 2 * NFSX_HYPER +
		    NFSX_UNSIGNED;
		if (maxcnt <= 0)
			break;
		/* Skip offset, length and stateid for now. */
		tl += (4 + NFSX_STATEID / NFSX_UNSIGNED);
		errcnt = fxdr_unsigned(int, *tl++);
		NFSD_DEBUG(4, "flexlayouterr errcnt=%d\n", errcnt);
		for (j = 0; j < errcnt; j++) {
			/* Charge for the device id, status and opnum. */
			maxcnt -= NFSX_V4DEVICEID + 2 * NFSX_UNSIGNED;
			if (maxcnt < 0)
				break;
			NFSBCOPY(tl, devid, NFSX_V4DEVICEID);
			tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED);
			stat = fxdr_unsigned(int, *tl++);
			opnum = fxdr_unsigned(int, *tl++);
			NFSD_DEBUG(4, "flexlayouterr op=%d stat=%d\n", opnum,
			    stat);
			/*
			 * Except for NFSERR_ACCES, NFSERR_STALE and
			 * NFSERR_NOSPC errors, disable the mirror.
			 */
			if (stat != NFSERR_ACCES && stat != NFSERR_STALE &&
			    stat != NFSERR_NOSPC)
				nfsrv_delds(devid, p);

			/* For NFSERR_NOSPC, mark all devids and layouts. */
			if (stat == NFSERR_NOSPC)
				nfsrv_marknospc(devid, true);
		}
	}
}
7129 
7130 /*
7131  * This function removes all flex file layouts which has a mirror with
7132  * a device id that matches the argument.
7133  * Called when the DS represented by the device id has failed.
7134  */
7135 void
7136 nfsrv_flexmirrordel(char *devid, NFSPROC_T *p)
7137 {
7138 	uint32_t *tl;
7139 	struct nfslayout *lyp, *nlyp;
7140 	struct nfslayouthash *lhyp;
7141 	struct nfslayouthead loclyp;
7142 	int i, j;
7143 
7144 	NFSD_DEBUG(4, "flexmirrordel\n");
7145 	/* Move all layouts found onto a local list. */
7146 	TAILQ_INIT(&loclyp);
7147 	for (i = 0; i < nfsrv_layouthashsize; i++) {
7148 		lhyp = &nfslayouthash[i];
7149 		NFSLOCKLAYOUT(lhyp);
7150 		TAILQ_FOREACH_SAFE(lyp, &lhyp->list, lay_list, nlyp) {
7151 			if (lyp->lay_type == NFSLAYOUT_FLEXFILE &&
7152 			    lyp->lay_mirrorcnt > 1) {
7153 				NFSD_DEBUG(4, "possible match\n");
7154 				tl = lyp->lay_xdr;
7155 				tl += 3;
7156 				for (j = 0; j < lyp->lay_mirrorcnt; j++) {
7157 					tl++;
7158 					if (NFSBCMP(devid, tl, NFSX_V4DEVICEID)
7159 					    == 0) {
7160 						/* Found one. */
7161 						NFSD_DEBUG(4, "fnd one\n");
7162 						TAILQ_REMOVE(&lhyp->list, lyp,
7163 						    lay_list);
7164 						TAILQ_INSERT_HEAD(&loclyp, lyp,
7165 						    lay_list);
7166 						break;
7167 					}
7168 					tl += (NFSX_V4DEVICEID / NFSX_UNSIGNED +
7169 					    NFSM_RNDUP(NFSX_V4PNFSFH) /
7170 					    NFSX_UNSIGNED + 11 * NFSX_UNSIGNED);
7171 				}
7172 			}
7173 		}
7174 		NFSUNLOCKLAYOUT(lhyp);
7175 	}
7176 
7177 	/* Now, try to do a Layout recall for each one found. */
7178 	TAILQ_FOREACH_SAFE(lyp, &loclyp, lay_list, nlyp) {
7179 		NFSD_DEBUG(4, "do layout recall\n");
7180 		/*
7181 		 * The layout stateid.seqid needs to be incremented
7182 		 * before doing a LAYOUT_RECALL callback.
7183 		 */
7184 		if (++lyp->lay_stateid.seqid == 0)
7185 			lyp->lay_stateid.seqid = 1;
7186 		nfsrv_recalllayout(lyp->lay_clientid, &lyp->lay_stateid,
7187 		    &lyp->lay_fh, lyp, 1, lyp->lay_type, p);
7188 		nfsrv_freelayout(&loclyp, lyp);
7189 	}
7190 }
7191 
7192 /*
7193  * Do a recall callback to the client for this layout.
7194  */
7195 static int
7196 nfsrv_recalllayout(nfsquad_t clid, nfsv4stateid_t *stateidp, fhandle_t *fhp,
7197     struct nfslayout *lyp, int changed, int laytype, NFSPROC_T *p)
7198 {
7199 	struct nfsclient *clp;
7200 	int error;
7201 
7202 	NFSD_DEBUG(4, "nfsrv_recalllayout\n");
7203 	error = nfsrv_getclient(clid, 0, &clp, NULL, (nfsquad_t)((u_quad_t)0),
7204 	    0, NULL, p);
7205 	NFSD_DEBUG(4, "aft nfsrv_getclient=%d\n", error);
7206 	if (error != 0) {
7207 		printf("nfsrv_recalllayout: getclient err=%d\n", error);
7208 		return (error);
7209 	}
7210 	if ((clp->lc_flags & LCL_NFSV41) != 0) {
7211 		error = nfsrv_docallback(clp, NFSV4OP_CBLAYOUTRECALL,
7212 		    stateidp, changed, fhp, NULL, NULL, laytype, p);
7213 		/* If lyp != NULL, handle an error return here. */
7214 		if (error != 0 && lyp != NULL) {
7215 			NFSDRECALLLOCK();
7216 			/*
7217 			 * Mark it returned, since no layout recall
7218 			 * has been done.
7219 			 * All errors seem to be non-recoverable, although
7220 			 * NFSERR_NOMATCHLAYOUT is a normal event.
7221 			 */
7222 			if ((lyp->lay_flags & NFSLAY_RECALL) != 0) {
7223 				lyp->lay_flags |= NFSLAY_RETURNED;
7224 				wakeup(lyp);
7225 			}
7226 			NFSDRECALLUNLOCK();
7227 			if (error != NFSERR_NOMATCHLAYOUT)
7228 				printf("nfsrv_recalllayout: err=%d\n", error);
7229 		}
7230 	} else
7231 		printf("nfsrv_recalllayout: clp not NFSv4.1\n");
7232 	return (error);
7233 }
7234 
7235 /*
7236  * Find a layout to recall when we exceed our high water mark.
7237  */
7238 void
7239 nfsrv_recalloldlayout(NFSPROC_T *p)
7240 {
7241 	struct nfslayouthash *lhyp;
7242 	struct nfslayout *lyp;
7243 	nfsquad_t clientid;
7244 	nfsv4stateid_t stateid;
7245 	fhandle_t fh;
7246 	int error, laytype = 0, ret;
7247 
7248 	lhyp = &nfslayouthash[arc4random() % nfsrv_layouthashsize];
7249 	NFSLOCKLAYOUT(lhyp);
7250 	TAILQ_FOREACH_REVERSE(lyp, &lhyp->list, nfslayouthead, lay_list) {
7251 		if ((lyp->lay_flags & NFSLAY_CALLB) == 0) {
7252 			lyp->lay_flags |= NFSLAY_CALLB;
7253 			/*
7254 			 * The layout stateid.seqid needs to be incremented
7255 			 * before doing a LAYOUT_RECALL callback.
7256 			 */
7257 			if (++lyp->lay_stateid.seqid == 0)
7258 				lyp->lay_stateid.seqid = 1;
7259 			clientid = lyp->lay_clientid;
7260 			stateid = lyp->lay_stateid;
7261 			NFSBCOPY(&lyp->lay_fh, &fh, sizeof(fh));
7262 			laytype = lyp->lay_type;
7263 			break;
7264 		}
7265 	}
7266 	NFSUNLOCKLAYOUT(lhyp);
7267 	if (lyp != NULL) {
7268 		error = nfsrv_recalllayout(clientid, &stateid, &fh, NULL, 0,
7269 		    laytype, p);
7270 		if (error != 0 && error != NFSERR_NOMATCHLAYOUT)
7271 			NFSD_DEBUG(4, "recallold=%d\n", error);
7272 		if (error != 0) {
7273 			NFSLOCKLAYOUT(lhyp);
7274 			/*
7275 			 * Since the hash list was unlocked, we need to
7276 			 * find it again.
7277 			 */
7278 			ret = nfsrv_findlayout(&clientid, &fh, laytype, p,
7279 			    &lyp);
7280 			if (ret == 0 &&
7281 			    (lyp->lay_flags & NFSLAY_CALLB) != 0 &&
7282 			    lyp->lay_stateid.other[0] == stateid.other[0] &&
7283 			    lyp->lay_stateid.other[1] == stateid.other[1] &&
7284 			    lyp->lay_stateid.other[2] == stateid.other[2]) {
7285 				/*
7286 				 * The client no longer knows this layout, so
7287 				 * it can be free'd now.
7288 				 */
7289 				if (error == NFSERR_NOMATCHLAYOUT)
7290 					nfsrv_freelayout(&lhyp->list, lyp);
7291 				else {
7292 					/*
7293 					 * Leave it to be tried later by
7294 					 * clearing NFSLAY_CALLB and moving
7295 					 * it to the head of the list, so it
7296 					 * won't be tried again for a while.
7297 					 */
7298 					lyp->lay_flags &= ~NFSLAY_CALLB;
7299 					TAILQ_REMOVE(&lhyp->list, lyp,
7300 					    lay_list);
7301 					TAILQ_INSERT_HEAD(&lhyp->list, lyp,
7302 					    lay_list);
7303 				}
7304 			}
7305 			NFSUNLOCKLAYOUT(lhyp);
7306 		}
7307 	}
7308 }
7309 
/*
 * Try and return layout(s).
 * "kind" selects what is returned:
 * NFSV4LAYOUTRET_FILE - the layout for the file "vp", identified by
 *	"stateidp".
 * NFSV4LAYOUTRET_FSID - handled by nfsrv_freelayouts() with the fsid of
 *	the file system "vp" is on.
 * NFSV4LAYOUTRET_ALL - handled by nfsrv_freelayouts() with a NULL fsid.
 * Any other "kind" returns NFSERR_INVAL.
 * *fndp is set to 1 when a matching layout remains after the return and
 * is 0 otherwise.
 */
int
nfsrv_layoutreturn(struct nfsrv_descript *nd, vnode_t vp,
    int layouttype, int iomode, uint64_t offset, uint64_t len, int reclaim,
    int kind, nfsv4stateid_t *stateidp, int maxcnt, uint32_t *layp, int *fndp,
    struct ucred *cred, NFSPROC_T *p)
{
	struct nfsvattr na;
	struct nfslayouthash *lhyp;
	struct nfslayout *lyp;
	fhandle_t fh;
	int error = 0;

	*fndp = 0;
	if (kind == NFSV4LAYOUTRET_FILE) {
		error = nfsvno_getfh(vp, &fh, p);
		if (error == 0) {
			error = nfsrv_updatemdsattr(vp, &na, p);
			if (error != 0)
				printf("nfsrv_layoutreturn: updatemdsattr"
				    " failed=%d\n", error);
		}
		if (error == 0) {
			/*
			 * For a reclaim, only the grace check is done and
			 * any result other than NFSERR_NOGRACE is success.
			 */
			if (reclaim == newnfs_true) {
				error = nfsrv_checkgrace(NULL, NULL,
				    NFSLCK_RECLAIM);
				if (error != NFSERR_NOGRACE)
					error = 0;
				return (error);
			}
			lhyp = NFSLAYOUTHASH(&fh);
			/* The recall lock is acquired before the hash lock. */
			NFSDRECALLLOCK();
			NFSLOCKLAYOUT(lhyp);
			error = nfsrv_findlayout(&nd->nd_clientid, &fh,
			    layouttype, p, &lyp);
			NFSD_DEBUG(4, "layoutret findlay=%d\n", error);
			if (error == 0 &&
			    stateidp->other[0] == lyp->lay_stateid.other[0] &&
			    stateidp->other[1] == lyp->lay_stateid.other[1] &&
			    stateidp->other[2] == lyp->lay_stateid.other[2]) {
				NFSD_DEBUG(4, "nfsrv_layoutreturn: stateid %d"
				    " %x %x %x laystateid %d %x %x %x"
				    " off=%ju len=%ju flgs=0x%x\n",
				    stateidp->seqid, stateidp->other[0],
				    stateidp->other[1], stateidp->other[2],
				    lyp->lay_stateid.seqid,
				    lyp->lay_stateid.other[0],
				    lyp->lay_stateid.other[1],
				    lyp->lay_stateid.other[2],
				    (uintmax_t)offset, (uintmax_t)len,
				    lyp->lay_flags);
				/* Bump the seqid, skipping zero on a wrap. */
				if (++lyp->lay_stateid.seqid == 0)
					lyp->lay_stateid.seqid = 1;
				stateidp->seqid = lyp->lay_stateid.seqid;
				/*
				 * For a return of the entire file, clear the
				 * flag(s) for the returned iomode and free
				 * the layout once neither flag is left.
				 */
				if (offset == 0 && len == UINT64_MAX) {
					if ((iomode & NFSLAYOUTIOMODE_READ) !=
					    0)
						lyp->lay_flags &= ~NFSLAY_READ;
					if ((iomode & NFSLAYOUTIOMODE_RW) != 0)
						lyp->lay_flags &= ~NFSLAY_RW;
					if ((lyp->lay_flags & (NFSLAY_READ |
					    NFSLAY_RW)) == 0)
						nfsrv_freelayout(&lhyp->list,
						    lyp);
					else
						*fndp = 1;
				} else
					*fndp = 1;
			}
			NFSUNLOCKLAYOUT(lhyp);
			/* Search the nfsrv_recalllist for a match. */
			TAILQ_FOREACH(lyp, &nfsrv_recalllisthead, lay_list) {
				if (NFSBCMP(&lyp->lay_fh, &fh,
				    sizeof(fh)) == 0 &&
				    lyp->lay_clientid.qval ==
				    nd->nd_clientid.qval &&
				    stateidp->other[0] ==
				    lyp->lay_stateid.other[0] &&
				    stateidp->other[1] ==
				    lyp->lay_stateid.other[1] &&
				    stateidp->other[2] ==
				    lyp->lay_stateid.other[2]) {
					/* Mark it returned and wake sleepers. */
					lyp->lay_flags |= NFSLAY_RETURNED;
					wakeup(lyp);
					error = 0;
				}
			}
			NFSDRECALLUNLOCK();
		}
		/* Process any Flex File errors that came with the return. */
		if (layouttype == NFSLAYOUT_FLEXFILE && layp != NULL)
			nfsrv_flexlayouterr(nd, layp, maxcnt, p);
	} else if (kind == NFSV4LAYOUTRET_FSID)
		nfsrv_freelayouts(&nd->nd_clientid,
		    &vp->v_mount->mnt_stat.f_fsid, layouttype, iomode);
	else if (kind == NFSV4LAYOUTRET_ALL)
		nfsrv_freelayouts(&nd->nd_clientid, NULL, layouttype, iomode);
	else
		error = NFSERR_INVAL;
	/* nfsrv_findlayout() returns -1 for not found, which is not an error. */
	if (error == -1)
		error = 0;
	return (error);
}
7414 
7415 /*
7416  * Look for an existing layout.
7417  */
7418 static int
7419 nfsrv_findlayout(nfsquad_t *clientidp, fhandle_t *fhp, int laytype,
7420     NFSPROC_T *p, struct nfslayout **lypp)
7421 {
7422 	struct nfslayouthash *lhyp;
7423 	struct nfslayout *lyp;
7424 	int ret;
7425 
7426 	*lypp = NULL;
7427 	ret = 0;
7428 	lhyp = NFSLAYOUTHASH(fhp);
7429 	TAILQ_FOREACH(lyp, &lhyp->list, lay_list) {
7430 		if (NFSBCMP(&lyp->lay_fh, fhp, sizeof(*fhp)) == 0 &&
7431 		    lyp->lay_clientid.qval == clientidp->qval &&
7432 		    lyp->lay_type == laytype)
7433 			break;
7434 	}
7435 	if (lyp != NULL)
7436 		*lypp = lyp;
7437 	else
7438 		ret = -1;
7439 	return (ret);
7440 }
7441 
7442 /*
7443  * Add the new layout, as required.
7444  */
7445 static int
7446 nfsrv_addlayout(struct nfsrv_descript *nd, struct nfslayout **lypp,
7447     nfsv4stateid_t *stateidp, char *layp, int *layoutlenp, NFSPROC_T *p)
7448 {
7449 	struct nfsclient *clp;
7450 	struct nfslayouthash *lhyp;
7451 	struct nfslayout *lyp, *nlyp;
7452 	fhandle_t *fhp;
7453 	int error;
7454 
7455 	KASSERT((nd->nd_flag & ND_IMPLIEDCLID) != 0,
7456 	    ("nfsrv_layoutget: no nd_clientid\n"));
7457 	lyp = *lypp;
7458 	fhp = &lyp->lay_fh;
7459 	NFSLOCKSTATE();
7460 	error = nfsrv_getclient((nfsquad_t)((u_quad_t)0), CLOPS_RENEW, &clp,
7461 	    NULL, (nfsquad_t)((u_quad_t)0), 0, nd, p);
7462 	if (error != 0) {
7463 		NFSUNLOCKSTATE();
7464 		return (error);
7465 	}
7466 	lyp->lay_stateid.seqid = stateidp->seqid = 1;
7467 	lyp->lay_stateid.other[0] = stateidp->other[0] =
7468 	    clp->lc_clientid.lval[0];
7469 	lyp->lay_stateid.other[1] = stateidp->other[1] =
7470 	    clp->lc_clientid.lval[1];
7471 	lyp->lay_stateid.other[2] = stateidp->other[2] =
7472 	    nfsrv_nextstateindex(clp);
7473 	NFSUNLOCKSTATE();
7474 
7475 	lhyp = NFSLAYOUTHASH(fhp);
7476 	NFSLOCKLAYOUT(lhyp);
7477 	TAILQ_FOREACH(nlyp, &lhyp->list, lay_list) {
7478 		if (NFSBCMP(&nlyp->lay_fh, fhp, sizeof(*fhp)) == 0 &&
7479 		    nlyp->lay_clientid.qval == nd->nd_clientid.qval)
7480 			break;
7481 	}
7482 	if (nlyp != NULL) {
7483 		/* A layout already exists, so use it. */
7484 		nlyp->lay_flags |= (lyp->lay_flags & (NFSLAY_READ | NFSLAY_RW));
7485 		NFSBCOPY(nlyp->lay_xdr, layp, nlyp->lay_layoutlen);
7486 		*layoutlenp = nlyp->lay_layoutlen;
7487 		if (++nlyp->lay_stateid.seqid == 0)
7488 			nlyp->lay_stateid.seqid = 1;
7489 		stateidp->seqid = nlyp->lay_stateid.seqid;
7490 		stateidp->other[0] = nlyp->lay_stateid.other[0];
7491 		stateidp->other[1] = nlyp->lay_stateid.other[1];
7492 		stateidp->other[2] = nlyp->lay_stateid.other[2];
7493 		NFSUNLOCKLAYOUT(lhyp);
7494 		return (0);
7495 	}
7496 
7497 	/* Insert the new layout in the lists. */
7498 	*lypp = NULL;
7499 	atomic_add_int(&nfsrv_layoutcnt, 1);
7500 	NFSD_VNET(nfsstatsv1_p)->srvlayouts++;
7501 	NFSBCOPY(lyp->lay_xdr, layp, lyp->lay_layoutlen);
7502 	*layoutlenp = lyp->lay_layoutlen;
7503 	TAILQ_INSERT_HEAD(&lhyp->list, lyp, lay_list);
7504 	NFSUNLOCKLAYOUT(lhyp);
7505 	return (0);
7506 }
7507 
7508 /*
7509  * Get the devinfo for a deviceid.
7510  */
7511 int
7512 nfsrv_getdevinfo(char *devid, int layouttype, uint32_t *maxcnt,
7513     uint32_t *notify, int *devaddrlen, char **devaddr)
7514 {
7515 	struct nfsdevice *ds;
7516 
7517 	if ((layouttype != NFSLAYOUT_NFSV4_1_FILES && layouttype !=
7518 	     NFSLAYOUT_FLEXFILE) ||
7519 	    (nfsrv_maxpnfsmirror > 1 && layouttype == NFSLAYOUT_NFSV4_1_FILES))
7520 		return (NFSERR_UNKNLAYOUTTYPE);
7521 
7522 	/*
7523 	 * Now, search for the device id.  Note that the structures won't go
7524 	 * away, but the order changes in the list.  As such, the lock only
7525 	 * needs to be held during the search through the list.
7526 	 */
7527 	NFSDDSLOCK();
7528 	TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) {
7529 		if (NFSBCMP(devid, ds->nfsdev_deviceid, NFSX_V4DEVICEID) == 0 &&
7530 		    ds->nfsdev_nmp != NULL)
7531 			break;
7532 	}
7533 	NFSDDSUNLOCK();
7534 	if (ds == NULL)
7535 		return (NFSERR_NOENT);
7536 
7537 	/* If the correct nfsdev_XXXXaddrlen is > 0, we have the device info. */
7538 	*devaddrlen = 0;
7539 	if (layouttype == NFSLAYOUT_NFSV4_1_FILES) {
7540 		*devaddrlen = ds->nfsdev_fileaddrlen;
7541 		*devaddr = ds->nfsdev_fileaddr;
7542 	} else if (layouttype == NFSLAYOUT_FLEXFILE) {
7543 		*devaddrlen = ds->nfsdev_flexaddrlen;
7544 		*devaddr = ds->nfsdev_flexaddr;
7545 	}
7546 	if (*devaddrlen == 0)
7547 		return (NFSERR_UNKNLAYOUTTYPE);
7548 
7549 	/*
7550 	 * The XDR overhead is 3 unsigned values: layout_type,
7551 	 * length_of_address and notify bitmap.
7552 	 * If the notify array is changed to not all zeros, the
7553 	 * count of unsigned values must be increased.
7554 	 */
7555 	if (*maxcnt > 0 && *maxcnt < NFSM_RNDUP(*devaddrlen) +
7556 	    3 * NFSX_UNSIGNED) {
7557 		*maxcnt = NFSM_RNDUP(*devaddrlen) + 3 * NFSX_UNSIGNED;
7558 		return (NFSERR_TOOSMALL);
7559 	}
7560 	return (0);
7561 }
7562 
7563 /*
7564  * Free a list of layout state structures.
7565  */
7566 static void
7567 nfsrv_freelayoutlist(nfsquad_t clientid)
7568 {
7569 	struct nfslayouthash *lhyp;
7570 	struct nfslayout *lyp, *nlyp;
7571 	int i;
7572 
7573 	for (i = 0; i < nfsrv_layouthashsize; i++) {
7574 		lhyp = &nfslayouthash[i];
7575 		NFSLOCKLAYOUT(lhyp);
7576 		TAILQ_FOREACH_SAFE(lyp, &lhyp->list, lay_list, nlyp) {
7577 			if (lyp->lay_clientid.qval == clientid.qval)
7578 				nfsrv_freelayout(&lhyp->list, lyp);
7579 		}
7580 		NFSUNLOCKLAYOUT(lhyp);
7581 	}
7582 }
7583 
7584 /*
7585  * Free up a layout.
7586  */
7587 static void
7588 nfsrv_freelayout(struct nfslayouthead *lhp, struct nfslayout *lyp)
7589 {
7590 
7591 	NFSD_DEBUG(4, "Freelayout=%p\n", lyp);
7592 	atomic_add_int(&nfsrv_layoutcnt, -1);
7593 	NFSD_VNET(nfsstatsv1_p)->srvlayouts--;
7594 	TAILQ_REMOVE(lhp, lyp, lay_list);
7595 	free(lyp, M_NFSDSTATE);
7596 }
7597 
7598 /*
7599  * Free up a device id.
7600  */
7601 void
7602 nfsrv_freeonedevid(struct nfsdevice *ds)
7603 {
7604 	int i;
7605 
7606 	atomic_add_int(&nfsrv_devidcnt, -1);
7607 	vrele(ds->nfsdev_dvp);
7608 	for (i = 0; i < nfsrv_dsdirsize; i++)
7609 		if (ds->nfsdev_dsdir[i] != NULL)
7610 			vrele(ds->nfsdev_dsdir[i]);
7611 	free(ds->nfsdev_fileaddr, M_NFSDSTATE);
7612 	free(ds->nfsdev_flexaddr, M_NFSDSTATE);
7613 	free(ds->nfsdev_host, M_NFSDSTATE);
7614 	free(ds, M_NFSDSTATE);
7615 }
7616 
7617 /*
7618  * Free up a device id and its mirrors.
7619  */
7620 static void
7621 nfsrv_freedevid(struct nfsdevice *ds)
7622 {
7623 
7624 	TAILQ_REMOVE(&nfsrv_devidhead, ds, nfsdev_list);
7625 	nfsrv_freeonedevid(ds);
7626 }
7627 
7628 /*
7629  * Free all layouts and device ids.
7630  * Done when the nfsd threads are shut down since there may be a new
7631  * modified device id list created when the nfsd is restarted.
7632  */
7633 void
7634 nfsrv_freealllayoutsanddevids(void)
7635 {
7636 	struct nfsdontlist *mrp, *nmrp;
7637 	struct nfslayout *lyp, *nlyp;
7638 
7639 	/* Get rid of the deviceid structures. */
7640 	nfsrv_freealldevids();
7641 	TAILQ_INIT(&nfsrv_devidhead);
7642 	nfsrv_devidcnt = 0;
7643 
7644 	/* Get rid of all layouts. */
7645 	nfsrv_freealllayouts();
7646 
7647 	/* Get rid of any nfsdontlist entries. */
7648 	LIST_FOREACH_SAFE(mrp, &nfsrv_dontlisthead, nfsmr_list, nmrp)
7649 		free(mrp, M_NFSDSTATE);
7650 	LIST_INIT(&nfsrv_dontlisthead);
7651 	nfsrv_dontlistlen = 0;
7652 
7653 	/* Free layouts in the recall list. */
7654 	TAILQ_FOREACH_SAFE(lyp, &nfsrv_recalllisthead, lay_list, nlyp)
7655 		nfsrv_freelayout(&nfsrv_recalllisthead, lyp);
7656 	TAILQ_INIT(&nfsrv_recalllisthead);
7657 }
7658 
7659 /*
7660  * Free layouts that match the arguments.
7661  */
7662 static void
7663 nfsrv_freelayouts(nfsquad_t *clid, fsid_t *fs, int laytype, int iomode)
7664 {
7665 	struct nfslayouthash *lhyp;
7666 	struct nfslayout *lyp, *nlyp;
7667 	int i;
7668 
7669 	for (i = 0; i < nfsrv_layouthashsize; i++) {
7670 		lhyp = &nfslayouthash[i];
7671 		NFSLOCKLAYOUT(lhyp);
7672 		TAILQ_FOREACH_SAFE(lyp, &lhyp->list, lay_list, nlyp) {
7673 			if (clid->qval != lyp->lay_clientid.qval)
7674 				continue;
7675 			if (fs != NULL && fsidcmp(fs, &lyp->lay_fsid) != 0)
7676 				continue;
7677 			if (laytype != lyp->lay_type)
7678 				continue;
7679 			if ((iomode & NFSLAYOUTIOMODE_READ) != 0)
7680 				lyp->lay_flags &= ~NFSLAY_READ;
7681 			if ((iomode & NFSLAYOUTIOMODE_RW) != 0)
7682 				lyp->lay_flags &= ~NFSLAY_RW;
7683 			if ((lyp->lay_flags & (NFSLAY_READ | NFSLAY_RW)) == 0)
7684 				nfsrv_freelayout(&lhyp->list, lyp);
7685 		}
7686 		NFSUNLOCKLAYOUT(lhyp);
7687 	}
7688 }
7689 
7690 /*
7691  * Free all layouts for the argument file.
7692  */
7693 void
7694 nfsrv_freefilelayouts(fhandle_t *fhp)
7695 {
7696 	struct nfslayouthash *lhyp;
7697 	struct nfslayout *lyp, *nlyp;
7698 
7699 	lhyp = NFSLAYOUTHASH(fhp);
7700 	NFSLOCKLAYOUT(lhyp);
7701 	TAILQ_FOREACH_SAFE(lyp, &lhyp->list, lay_list, nlyp) {
7702 		if (NFSBCMP(&lyp->lay_fh, fhp, sizeof(*fhp)) == 0)
7703 			nfsrv_freelayout(&lhyp->list, lyp);
7704 	}
7705 	NFSUNLOCKLAYOUT(lhyp);
7706 }
7707 
7708 /*
7709  * Free all layouts.
7710  */
7711 static void
7712 nfsrv_freealllayouts(void)
7713 {
7714 	struct nfslayouthash *lhyp;
7715 	struct nfslayout *lyp, *nlyp;
7716 	int i;
7717 
7718 	for (i = 0; i < nfsrv_layouthashsize; i++) {
7719 		lhyp = &nfslayouthash[i];
7720 		NFSLOCKLAYOUT(lhyp);
7721 		TAILQ_FOREACH_SAFE(lyp, &lhyp->list, lay_list, nlyp)
7722 			nfsrv_freelayout(&lhyp->list, lyp);
7723 		NFSUNLOCKLAYOUT(lhyp);
7724 	}
7725 }
7726 
7727 /*
7728  * Look up the mount path for the DS server.
7729  */
7730 static int
7731 nfsrv_setdsserver(char *dspathp, char *mdspathp, NFSPROC_T *p,
7732     struct nfsdevice **dsp)
7733 {
7734 	struct nameidata nd;
7735 	struct nfsdevice *ds;
7736 	struct mount *mp;
7737 	int error, i;
7738 	char *dsdirpath;
7739 	size_t dsdirsize;
7740 
7741 	NFSD_DEBUG(4, "setdssrv path=%s\n", dspathp);
7742 	*dsp = NULL;
7743 	if (jailed(p->td_ucred)) {
7744 		printf("A pNFS nfsd cannot run in a jail\n");
7745 		return (EPERM);
7746 	}
7747 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE,
7748 	    dspathp);
7749 	error = namei(&nd);
7750 	NFSD_DEBUG(4, "lookup=%d\n", error);
7751 	if (error != 0)
7752 		return (error);
7753 	if (nd.ni_vp->v_type != VDIR) {
7754 		vput(nd.ni_vp);
7755 		NFSD_DEBUG(4, "dspath not dir\n");
7756 		return (ENOTDIR);
7757 	}
7758 	if (strcmp(nd.ni_vp->v_mount->mnt_vfc->vfc_name, "nfs") != 0) {
7759 		vput(nd.ni_vp);
7760 		NFSD_DEBUG(4, "dspath not an NFS mount\n");
7761 		return (ENXIO);
7762 	}
7763 
7764 	/*
7765 	 * Allocate a DS server structure with the NFS mounted directory
7766 	 * vnode reference counted, so that a non-forced dismount will
7767 	 * fail with EBUSY.
7768 	 * This structure is always linked into the list, even if an error
7769 	 * is being returned.  The caller will free the entire list upon
7770 	 * an error return.
7771 	 */
7772 	*dsp = ds = malloc(sizeof(*ds) + nfsrv_dsdirsize * sizeof(vnode_t),
7773 	    M_NFSDSTATE, M_WAITOK | M_ZERO);
7774 	ds->nfsdev_dvp = nd.ni_vp;
7775 	ds->nfsdev_nmp = VFSTONFS(nd.ni_vp->v_mount);
7776 	NFSVOPUNLOCK(nd.ni_vp);
7777 
7778 	dsdirsize = strlen(dspathp) + 16;
7779 	dsdirpath = malloc(dsdirsize, M_TEMP, M_WAITOK);
7780 	/* Now, create the DS directory structures. */
7781 	for (i = 0; i < nfsrv_dsdirsize; i++) {
7782 		snprintf(dsdirpath, dsdirsize, "%s/ds%d", dspathp, i);
7783 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF,
7784 		    UIO_SYSSPACE, dsdirpath);
7785 		error = namei(&nd);
7786 		NFSD_DEBUG(4, "dsdirpath=%s lookup=%d\n", dsdirpath, error);
7787 		if (error != 0)
7788 			break;
7789 		if (nd.ni_vp->v_type != VDIR) {
7790 			vput(nd.ni_vp);
7791 			error = ENOTDIR;
7792 			NFSD_DEBUG(4, "dsdirpath not a VDIR\n");
7793 			break;
7794 		}
7795 		if (strcmp(nd.ni_vp->v_mount->mnt_vfc->vfc_name, "nfs") != 0) {
7796 			vput(nd.ni_vp);
7797 			error = ENXIO;
7798 			NFSD_DEBUG(4, "dsdirpath not an NFS mount\n");
7799 			break;
7800 		}
7801 		ds->nfsdev_dsdir[i] = nd.ni_vp;
7802 		NFSVOPUNLOCK(nd.ni_vp);
7803 	}
7804 	free(dsdirpath, M_TEMP);
7805 
7806 	if (strlen(mdspathp) > 0) {
7807 		/*
7808 		 * This DS stores file for a specific MDS exported file
7809 		 * system.
7810 		 */
7811 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF,
7812 		    UIO_SYSSPACE, mdspathp);
7813 		error = namei(&nd);
7814 		NFSD_DEBUG(4, "mds lookup=%d\n", error);
7815 		if (error != 0)
7816 			goto out;
7817 		if (nd.ni_vp->v_type != VDIR) {
7818 			vput(nd.ni_vp);
7819 			error = ENOTDIR;
7820 			NFSD_DEBUG(4, "mdspath not dir\n");
7821 			goto out;
7822 		}
7823 		mp = nd.ni_vp->v_mount;
7824 		if ((mp->mnt_flag & MNT_EXPORTED) == 0) {
7825 			vput(nd.ni_vp);
7826 			error = ENXIO;
7827 			NFSD_DEBUG(4, "mdspath not an exported fs\n");
7828 			goto out;
7829 		}
7830 		ds->nfsdev_mdsfsid = mp->mnt_stat.f_fsid;
7831 		ds->nfsdev_mdsisset = 1;
7832 		vput(nd.ni_vp);
7833 	}
7834 
7835 out:
7836 	TAILQ_INSERT_TAIL(&nfsrv_devidhead, ds, nfsdev_list);
7837 	atomic_add_int(&nfsrv_devidcnt, 1);
7838 	return (error);
7839 }
7840 
7841 /*
7842  * Look up the mount path for the DS server and delete it.
7843  */
7844 int
7845 nfsrv_deldsserver(int op, char *dspathp, NFSPROC_T *p)
7846 {
7847 	struct mount *mp;
7848 	struct nfsmount *nmp;
7849 	struct nfsdevice *ds;
7850 	int error;
7851 
7852 	NFSD_DEBUG(4, "deldssrv path=%s\n", dspathp);
7853 	/*
7854 	 * Search for the path in the mount list.  Avoid looking the path
7855 	 * up, since this mount point may be hung, with associated locked
7856 	 * vnodes, etc.
7857 	 * Set NFSMNTP_CANCELRPCS so that any forced dismount will be blocked
7858 	 * until this completes.
7859 	 * As noted in the man page, this should be done before any forced
7860 	 * dismount on the mount point, but at least the handshake on
7861 	 * NFSMNTP_CANCELRPCS should make it safe.
7862 	 */
7863 	error = 0;
7864 	ds = NULL;
7865 	nmp = NULL;
7866 	mtx_lock(&mountlist_mtx);
7867 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
7868 		if (strcmp(mp->mnt_stat.f_mntonname, dspathp) == 0 &&
7869 		    strcmp(mp->mnt_stat.f_fstypename, "nfs") == 0 &&
7870 		    mp->mnt_data != NULL) {
7871 			nmp = VFSTONFS(mp);
7872 			NFSLOCKMNT(nmp);
7873 			if ((nmp->nm_privflag & (NFSMNTP_FORCEDISM |
7874 			     NFSMNTP_CANCELRPCS)) == 0) {
7875 				nmp->nm_privflag |= NFSMNTP_CANCELRPCS;
7876 				NFSUNLOCKMNT(nmp);
7877 			} else {
7878 				NFSUNLOCKMNT(nmp);
7879 				nmp = NULL;
7880 			}
7881 			break;
7882 		}
7883 	}
7884 	mtx_unlock(&mountlist_mtx);
7885 
7886 	if (nmp != NULL) {
7887 		ds = nfsrv_deldsnmp(op, nmp, p);
7888 		NFSD_DEBUG(4, "deldsnmp=%p\n", ds);
7889 		if (ds != NULL) {
7890 			nfsrv_killrpcs(nmp);
7891 			NFSD_DEBUG(4, "aft killrpcs\n");
7892 		} else
7893 			error = ENXIO;
7894 		NFSLOCKMNT(nmp);
7895 		nmp->nm_privflag &= ~NFSMNTP_CANCELRPCS;
7896 		wakeup(nmp);
7897 		NFSUNLOCKMNT(nmp);
7898 	} else
7899 		error = EINVAL;
7900 	return (error);
7901 }
7902 
7903 /*
7904  * Search for and remove a DS entry which matches the "nmp" argument.
7905  * The nfsdevice structure pointer is returned so that the caller can
7906  * free it via nfsrv_freeonedevid().
7907  * For the forced case, do not try to do LayoutRecalls, since the server
7908  * must be shut down now anyhow.
7909  */
7910 struct nfsdevice *
7911 nfsrv_deldsnmp(int op, struct nfsmount *nmp, NFSPROC_T *p)
7912 {
7913 	struct nfsdevice *fndds;
7914 
7915 	NFSD_DEBUG(4, "deldsdvp\n");
7916 	NFSDDSLOCK();
7917 	if (op == PNFSDOP_FORCEDELDS)
7918 		fndds = nfsv4_findmirror(nmp);
7919 	else
7920 		fndds = nfsrv_findmirroredds(nmp);
7921 	if (fndds != NULL)
7922 		nfsrv_deleteds(fndds);
7923 	NFSDDSUNLOCK();
7924 	if (fndds != NULL) {
7925 		if (op != PNFSDOP_FORCEDELDS)
7926 			nfsrv_flexmirrordel(fndds->nfsdev_deviceid, p);
7927 		printf("pNFS server: mirror %s failed\n", fndds->nfsdev_host);
7928 	}
7929 	return (fndds);
7930 }
7931 
7932 /*
7933  * Similar to nfsrv_deldsnmp(), except that the DS is indicated by deviceid.
7934  * This function also calls nfsrv_killrpcs() to unblock RPCs on the mount
7935  * point.
7936  * Also, returns an error instead of the nfsdevice found.
7937  */
7938 int
7939 nfsrv_delds(char *devid, NFSPROC_T *p)
7940 {
7941 	struct nfsdevice *ds, *fndds;
7942 	struct nfsmount *nmp;
7943 	int fndmirror;
7944 
7945 	NFSD_DEBUG(4, "delds\n");
7946 	/*
7947 	 * Search the DS server list for a match with devid.
7948 	 * Remove the DS entry if found and there is a mirror.
7949 	 */
7950 	fndds = NULL;
7951 	nmp = NULL;
7952 	fndmirror = 0;
7953 	NFSDDSLOCK();
7954 	TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) {
7955 		if (NFSBCMP(ds->nfsdev_deviceid, devid, NFSX_V4DEVICEID) == 0 &&
7956 		    ds->nfsdev_nmp != NULL) {
7957 			NFSD_DEBUG(4, "fnd main ds\n");
7958 			fndds = ds;
7959 			break;
7960 		}
7961 	}
7962 	if (fndds == NULL) {
7963 		NFSDDSUNLOCK();
7964 		return (ENXIO);
7965 	}
7966 	if (fndds->nfsdev_mdsisset == 0 && nfsrv_faildscnt > 0)
7967 		fndmirror = 1;
7968 	else if (fndds->nfsdev_mdsisset != 0) {
7969 		/* For the fsid is set case, search for a mirror. */
7970 		TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) {
7971 			if (ds != fndds && ds->nfsdev_nmp != NULL &&
7972 			    ds->nfsdev_mdsisset != 0 &&
7973 			    fsidcmp(&ds->nfsdev_mdsfsid,
7974 			    &fndds->nfsdev_mdsfsid) == 0) {
7975 				fndmirror = 1;
7976 				break;
7977 			}
7978 		}
7979 	}
7980 	if (fndmirror != 0) {
7981 		nmp = fndds->nfsdev_nmp;
7982 		NFSLOCKMNT(nmp);
7983 		if ((nmp->nm_privflag & (NFSMNTP_FORCEDISM |
7984 		     NFSMNTP_CANCELRPCS)) == 0) {
7985 			nmp->nm_privflag |= NFSMNTP_CANCELRPCS;
7986 			NFSUNLOCKMNT(nmp);
7987 			nfsrv_deleteds(fndds);
7988 		} else {
7989 			NFSUNLOCKMNT(nmp);
7990 			nmp = NULL;
7991 		}
7992 	}
7993 	NFSDDSUNLOCK();
7994 	if (nmp != NULL) {
7995 		nfsrv_flexmirrordel(fndds->nfsdev_deviceid, p);
7996 		printf("pNFS server: mirror %s failed\n", fndds->nfsdev_host);
7997 		nfsrv_killrpcs(nmp);
7998 		NFSLOCKMNT(nmp);
7999 		nmp->nm_privflag &= ~NFSMNTP_CANCELRPCS;
8000 		wakeup(nmp);
8001 		NFSUNLOCKMNT(nmp);
8002 		return (0);
8003 	}
8004 	return (ENXIO);
8005 }
8006 
8007 /*
8008  * Mark a DS as disabled by setting nfsdev_nmp = NULL.
8009  */
8010 static void
8011 nfsrv_deleteds(struct nfsdevice *fndds)
8012 {
8013 
8014 	NFSD_DEBUG(4, "deleteds: deleting a mirror\n");
8015 	fndds->nfsdev_nmp = NULL;
8016 	if (fndds->nfsdev_mdsisset == 0)
8017 		nfsrv_faildscnt--;
8018 }
8019 
8020 /*
8021  * Fill in the addr structures for the File and Flex File layouts.
8022  */
8023 static void
8024 nfsrv_allocdevid(struct nfsdevice *ds, char *addr, char *dnshost)
8025 {
8026 	uint32_t *tl;
8027 	char *netprot;
8028 	int addrlen;
8029 	static uint64_t new_devid = 0;
8030 
8031 	if (strchr(addr, ':') != NULL)
8032 		netprot = "tcp6";
8033 	else
8034 		netprot = "tcp";
8035 
8036 	/* Fill in the device id. */
8037 	NFSBCOPY(&nfsdev_time, ds->nfsdev_deviceid, sizeof(nfsdev_time));
8038 	new_devid++;
8039 	NFSBCOPY(&new_devid, &ds->nfsdev_deviceid[sizeof(nfsdev_time)],
8040 	    sizeof(new_devid));
8041 
8042 	/*
8043 	 * Fill in the file addr (actually the nfsv4_file_layout_ds_addr4
8044 	 * as defined in RFC5661) in XDR.
8045 	 */
8046 	addrlen = NFSM_RNDUP(strlen(addr)) + NFSM_RNDUP(strlen(netprot)) +
8047 	    6 * NFSX_UNSIGNED;
8048 	NFSD_DEBUG(4, "hn=%s addr=%s netprot=%s\n", dnshost, addr, netprot);
8049 	ds->nfsdev_fileaddrlen = addrlen;
8050 	tl = malloc(addrlen, M_NFSDSTATE, M_WAITOK | M_ZERO);
8051 	ds->nfsdev_fileaddr = (char *)tl;
8052 	*tl++ = txdr_unsigned(1);		/* One stripe with index 0. */
8053 	*tl++ = 0;
8054 	*tl++ = txdr_unsigned(1);		/* One multipath list */
8055 	*tl++ = txdr_unsigned(1);		/* with one entry in it. */
8056 	/* The netaddr for this one entry. */
8057 	*tl++ = txdr_unsigned(strlen(netprot));
8058 	NFSBCOPY(netprot, tl, strlen(netprot));
8059 	tl += (NFSM_RNDUP(strlen(netprot)) / NFSX_UNSIGNED);
8060 	*tl++ = txdr_unsigned(strlen(addr));
8061 	NFSBCOPY(addr, tl, strlen(addr));
8062 
8063 	/*
8064 	 * Fill in the flex file addr (actually the ff_device_addr4
8065 	 * as defined for Flexible File Layout) in XDR.
8066 	 */
8067 	addrlen = NFSM_RNDUP(strlen(addr)) + NFSM_RNDUP(strlen(netprot)) +
8068 	    14 * NFSX_UNSIGNED;
8069 	ds->nfsdev_flexaddrlen = addrlen;
8070 	tl = malloc(addrlen, M_NFSDSTATE, M_WAITOK | M_ZERO);
8071 	ds->nfsdev_flexaddr = (char *)tl;
8072 	*tl++ = txdr_unsigned(1);		/* One multipath entry. */
8073 	/* The netaddr for this one entry. */
8074 	*tl++ = txdr_unsigned(strlen(netprot));
8075 	NFSBCOPY(netprot, tl, strlen(netprot));
8076 	tl += (NFSM_RNDUP(strlen(netprot)) / NFSX_UNSIGNED);
8077 	*tl++ = txdr_unsigned(strlen(addr));
8078 	NFSBCOPY(addr, tl, strlen(addr));
8079 	tl += (NFSM_RNDUP(strlen(addr)) / NFSX_UNSIGNED);
8080 	*tl++ = txdr_unsigned(2);		/* Two NFS Versions. */
8081 	*tl++ = txdr_unsigned(NFS_VER4);	/* NFSv4. */
8082 	*tl++ = txdr_unsigned(NFSV42_MINORVERSION); /* Minor version 2. */
8083 	*tl++ = txdr_unsigned(nfs_srvmaxio);	/* DS max rsize. */
8084 	*tl++ = txdr_unsigned(nfs_srvmaxio);	/* DS max wsize. */
8085 	*tl++ = newnfs_true;			/* Tightly coupled. */
8086 	*tl++ = txdr_unsigned(NFS_VER4);	/* NFSv4. */
8087 	*tl++ = txdr_unsigned(NFSV41_MINORVERSION); /* Minor version 1. */
8088 	*tl++ = txdr_unsigned(nfs_srvmaxio);	/* DS max rsize. */
8089 	*tl++ = txdr_unsigned(nfs_srvmaxio);	/* DS max wsize. */
8090 	*tl = newnfs_true;			/* Tightly coupled. */
8091 
8092 	ds->nfsdev_hostnamelen = strlen(dnshost);
8093 	ds->nfsdev_host = malloc(ds->nfsdev_hostnamelen + 1, M_NFSDSTATE,
8094 	    M_WAITOK);
8095 	NFSBCOPY(dnshost, ds->nfsdev_host, ds->nfsdev_hostnamelen + 1);
8096 }
8097 
8098 /*
8099  * Create the device id list.
8100  * Return 0 if the nfsd threads are to run and ENXIO if the "-p" argument
8101  * is misconfigured.
8102  */
8103 int
8104 nfsrv_createdevids(struct nfsd_nfsd_args *args, NFSPROC_T *p)
8105 {
8106 	struct nfsdevice *ds;
8107 	char *addrp, *dnshostp, *dspathp, *mdspathp;
8108 	int error, i;
8109 
8110 	addrp = args->addr;
8111 	dnshostp = args->dnshost;
8112 	dspathp = args->dspath;
8113 	mdspathp = args->mdspath;
8114 	nfsrv_maxpnfsmirror = args->mirrorcnt;
8115 	if (addrp == NULL || dnshostp == NULL || dspathp == NULL ||
8116 	    mdspathp == NULL)
8117 		return (0);
8118 
8119 	/*
8120 	 * Loop around for each nul-terminated string in args->addr,
8121 	 * args->dnshost, args->dnspath and args->mdspath.
8122 	 */
8123 	while (addrp < (args->addr + args->addrlen) &&
8124 	    dnshostp < (args->dnshost + args->dnshostlen) &&
8125 	    dspathp < (args->dspath + args->dspathlen) &&
8126 	    mdspathp < (args->mdspath + args->mdspathlen)) {
8127 		error = nfsrv_setdsserver(dspathp, mdspathp, p, &ds);
8128 		if (error != 0) {
8129 			/* Free all DS servers. */
8130 			nfsrv_freealldevids();
8131 			nfsrv_devidcnt = 0;
8132 			return (ENXIO);
8133 		}
8134 		nfsrv_allocdevid(ds, addrp, dnshostp);
8135 		addrp += (strlen(addrp) + 1);
8136 		dnshostp += (strlen(dnshostp) + 1);
8137 		dspathp += (strlen(dspathp) + 1);
8138 		mdspathp += (strlen(mdspathp) + 1);
8139 	}
8140 	if (nfsrv_devidcnt < nfsrv_maxpnfsmirror) {
8141 		/* Free all DS servers. */
8142 		nfsrv_freealldevids();
8143 		nfsrv_devidcnt = 0;
8144 		nfsrv_maxpnfsmirror = 1;
8145 		return (ENXIO);
8146 	}
8147 	/* We can fail at most one less DS than the mirror level. */
8148 	nfsrv_faildscnt = nfsrv_maxpnfsmirror - 1;
8149 
8150 	/*
8151 	 * Allocate the nfslayout hash table now, since this is a pNFS server.
8152 	 * Make it 1% of the high water mark and at least 100.
8153 	 */
8154 	if (nfslayouthash == NULL) {
8155 		nfsrv_layouthashsize = nfsrv_layouthighwater / 100;
8156 		if (nfsrv_layouthashsize < 100)
8157 			nfsrv_layouthashsize = 100;
8158 		nfslayouthash = mallocarray(nfsrv_layouthashsize,
8159 		    sizeof(struct nfslayouthash), M_NFSDSESSION, M_WAITOK |
8160 		    M_ZERO);
8161 		for (i = 0; i < nfsrv_layouthashsize; i++) {
8162 			mtx_init(&nfslayouthash[i].mtx, "nfslm", NULL, MTX_DEF);
8163 			TAILQ_INIT(&nfslayouthash[i].list);
8164 		}
8165 	}
8166 	return (0);
8167 }
8168 
8169 /*
8170  * Free all device ids.
8171  */
8172 static void
8173 nfsrv_freealldevids(void)
8174 {
8175 	struct nfsdevice *ds, *nds;
8176 
8177 	TAILQ_FOREACH_SAFE(ds, &nfsrv_devidhead, nfsdev_list, nds)
8178 		nfsrv_freedevid(ds);
8179 }
8180 
8181 /*
8182  * Check to see if there is a Read/Write Layout plus either:
8183  * - A Write Delegation
8184  * or
8185  * - An Open with Write_access.
8186  * Return 1 if this is the case and 0 otherwise.
8187  * This function is used by nfsrv_proxyds() to decide if doing a Proxy
8188  * Getattr RPC to the Data Server (DS) is necessary.
8189  */
8190 #define	NFSCLIDVECSIZE	6
8191 int
8192 nfsrv_checkdsattr(vnode_t vp, NFSPROC_T *p)
8193 {
8194 	fhandle_t fh, *tfhp;
8195 	struct nfsstate *stp;
8196 	struct nfslayout *lyp;
8197 	struct nfslayouthash *lhyp;
8198 	struct nfslockhashhead *hp;
8199 	struct nfslockfile *lfp;
8200 	nfsquad_t clid[NFSCLIDVECSIZE];
8201 	int clidcnt, ret;
8202 
8203 	ret = nfsvno_getfh(vp, &fh, p);
8204 	if (ret != 0)
8205 		return (0);
8206 
8207 	/* First check for a Read/Write Layout. */
8208 	clidcnt = 0;
8209 	lhyp = NFSLAYOUTHASH(&fh);
8210 	NFSLOCKLAYOUT(lhyp);
8211 	TAILQ_FOREACH(lyp, &lhyp->list, lay_list) {
8212 		if (NFSBCMP(&lyp->lay_fh, &fh, sizeof(fh)) == 0 &&
8213 		    ((lyp->lay_flags & NFSLAY_RW) != 0 ||
8214 		     ((lyp->lay_flags & NFSLAY_READ) != 0 &&
8215 		      nfsrv_pnfsatime != 0))) {
8216 			if (clidcnt < NFSCLIDVECSIZE)
8217 				clid[clidcnt].qval = lyp->lay_clientid.qval;
8218 			clidcnt++;
8219 		}
8220 	}
8221 	NFSUNLOCKLAYOUT(lhyp);
8222 	if (clidcnt == 0) {
8223 		/* None found, so return 0. */
8224 		return (0);
8225 	}
8226 
8227 	/* Get the nfslockfile for this fh. */
8228 	NFSLOCKSTATE();
8229 	hp = NFSLOCKHASH(&fh);
8230 	LIST_FOREACH(lfp, hp, lf_hash) {
8231 		tfhp = &lfp->lf_fh;
8232 		if (NFSVNO_CMPFH(&fh, tfhp))
8233 			break;
8234 	}
8235 	if (lfp == NULL) {
8236 		/* None found, so return 0. */
8237 		NFSUNLOCKSTATE();
8238 		return (0);
8239 	}
8240 
8241 	/* Now, look for a Write delegation for this clientid. */
8242 	LIST_FOREACH(stp, &lfp->lf_deleg, ls_file) {
8243 		if ((stp->ls_flags & NFSLCK_DELEGWRITE) != 0 &&
8244 		    nfsrv_fndclid(clid, stp->ls_clp->lc_clientid, clidcnt) != 0)
8245 			break;
8246 	}
8247 	if (stp != NULL) {
8248 		/* Found one, so return 1. */
8249 		NFSUNLOCKSTATE();
8250 		return (1);
8251 	}
8252 
8253 	/* No Write delegation, so look for an Open with Write_access. */
8254 	LIST_FOREACH(stp, &lfp->lf_open, ls_file) {
8255 		KASSERT((stp->ls_flags & NFSLCK_OPEN) != 0,
8256 		    ("nfsrv_checkdsattr: Non-open in Open list\n"));
8257 		if ((stp->ls_flags & NFSLCK_WRITEACCESS) != 0 &&
8258 		    nfsrv_fndclid(clid, stp->ls_clp->lc_clientid, clidcnt) != 0)
8259 			break;
8260 	}
8261 	NFSUNLOCKSTATE();
8262 	if (stp != NULL)
8263 		return (1);
8264 	return (0);
8265 }
8266 
8267 /*
8268  * Look for a matching clientid in the vector. Return 1 if one might match.
8269  */
8270 static int
8271 nfsrv_fndclid(nfsquad_t *clidvec, nfsquad_t clid, int clidcnt)
8272 {
8273 	int i;
8274 
8275 	/* If too many for the vector, return 1 since there might be a match. */
8276 	if (clidcnt > NFSCLIDVECSIZE)
8277 		return (1);
8278 
8279 	for (i = 0; i < clidcnt; i++)
8280 		if (clidvec[i].qval == clid.qval)
8281 			return (1);
8282 	return (0);
8283 }
8284 
8285 /*
8286  * Check the don't list for "vp" and see if issuing an rw layout is allowed.
8287  * Return 1 if issuing an rw layout isn't allowed, 0 otherwise.
8288  */
8289 static int
8290 nfsrv_dontlayout(fhandle_t *fhp)
8291 {
8292 	struct nfsdontlist *mrp;
8293 	int ret;
8294 
8295 	if (nfsrv_dontlistlen == 0)
8296 		return (0);
8297 	ret = 0;
8298 	NFSDDONTLISTLOCK();
8299 	LIST_FOREACH(mrp, &nfsrv_dontlisthead, nfsmr_list) {
8300 		if (NFSBCMP(fhp, &mrp->nfsmr_fh, sizeof(*fhp)) == 0 &&
8301 		    (mrp->nfsmr_flags & NFSMR_DONTLAYOUT) != 0) {
8302 			ret = 1;
8303 			break;
8304 		}
8305 	}
8306 	NFSDDONTLISTUNLOCK();
8307 	return (ret);
8308 }
8309 
8310 #define	PNFSDS_COPYSIZ	65536
8311 /*
8312  * Create a new file on a DS and copy the contents of an extant DS file to it.
8313  * This can be used for recovery of a DS file onto a recovered DS.
8314  * The steps are:
8315  * - When called, the MDS file's vnode is locked, blocking LayoutGet operations.
8316  * - Disable issuing of read/write layouts for the file via the nfsdontlist,
8317  *   so that they will be disabled after the MDS file's vnode is unlocked.
8318  * - Set up the nfsrv_recalllist so that recall of read/write layouts can
8319  *   be done.
8320  * - Unlock the MDS file's vnode, so that the client(s) can perform proxied
8321  *   writes, LayoutCommits and LayoutReturns for the file when completing the
8322  *   LayoutReturn requested by the LayoutRecall callback.
8323  * - Issue a LayoutRecall callback for all read/write layouts and wait for
8324  *   them to be returned. (If the LayoutRecall callback replies
8325  *   NFSERR_NOMATCHLAYOUT, they are gone and no LayoutReturn is needed.)
8326  * - Exclusively lock the MDS file's vnode.  This ensures that no proxied
8327  *   writes are in progress or can occur during the DS file copy.
8328  *   It also blocks Setattr operations.
8329  * - Create the file on the recovered mirror.
8330  * - Copy the file from the operational DS.
8331  * - Copy any ACL from the MDS file to the new DS file.
8332  * - Set the modify time of the new DS file to that of the MDS file.
8333  * - Update the extended attribute for the MDS file.
8334  * - Enable issuing of rw layouts by deleting the nfsdontlist entry.
8335  * - The caller will unlock the MDS file's vnode allowing operations
8336  *   to continue normally, since it is now on the mirror again.
8337  */
8338 int
8339 nfsrv_copymr(vnode_t vp, vnode_t fvp, vnode_t dvp, struct nfsdevice *ds,
8340     struct pnfsdsfile *pf, struct pnfsdsfile *wpf, int mirrorcnt,
8341     struct ucred *cred, NFSPROC_T *p)
8342 {
8343 	struct nfsdontlist *mrp, *nmrp;
8344 	struct nfslayouthash *lhyp;
8345 	struct nfslayout *lyp, *nlyp;
8346 	struct nfslayouthead thl;
8347 	struct mount *mp, *tvmp;
8348 	struct acl *aclp;
8349 	struct vattr va;
8350 	struct timespec mtime;
8351 	fhandle_t fh;
8352 	vnode_t tvp;
8353 	off_t rdpos, wrpos;
8354 	ssize_t aresid;
8355 	char *dat;
8356 	int didprintf, ret, retacl, xfer;
8357 
8358 	ASSERT_VOP_LOCKED(fvp, "nfsrv_copymr fvp");
8359 	ASSERT_VOP_LOCKED(vp, "nfsrv_copymr vp");
8360 	/*
8361 	 * Allocate a nfsdontlist entry and set the NFSMR_DONTLAYOUT flag
8362 	 * so that no more RW layouts will get issued.
8363 	 */
8364 	ret = nfsvno_getfh(vp, &fh, p);
8365 	if (ret != 0) {
8366 		NFSD_DEBUG(4, "nfsrv_copymr: getfh=%d\n", ret);
8367 		return (ret);
8368 	}
8369 	nmrp = malloc(sizeof(*nmrp), M_NFSDSTATE, M_WAITOK);
8370 	nmrp->nfsmr_flags = NFSMR_DONTLAYOUT;
8371 	NFSBCOPY(&fh, &nmrp->nfsmr_fh, sizeof(fh));
8372 	NFSDDONTLISTLOCK();
8373 	LIST_FOREACH(mrp, &nfsrv_dontlisthead, nfsmr_list) {
8374 		if (NFSBCMP(&fh, &mrp->nfsmr_fh, sizeof(fh)) == 0)
8375 			break;
8376 	}
8377 	if (mrp == NULL) {
8378 		LIST_INSERT_HEAD(&nfsrv_dontlisthead, nmrp, nfsmr_list);
8379 		mrp = nmrp;
8380 		nmrp = NULL;
8381 		nfsrv_dontlistlen++;
8382 		NFSD_DEBUG(4, "nfsrv_copymr: in dontlist\n");
8383 	} else {
8384 		NFSDDONTLISTUNLOCK();
8385 		free(nmrp, M_NFSDSTATE);
8386 		NFSD_DEBUG(4, "nfsrv_copymr: dup dontlist\n");
8387 		return (ENXIO);
8388 	}
8389 	NFSDDONTLISTUNLOCK();
8390 
8391 	/*
8392 	 * Search for all RW layouts for this file.  Move them to the
8393 	 * recall list, so they can be recalled and their return noted.
8394 	 */
8395 	lhyp = NFSLAYOUTHASH(&fh);
8396 	NFSDRECALLLOCK();
8397 	NFSLOCKLAYOUT(lhyp);
8398 	TAILQ_FOREACH_SAFE(lyp, &lhyp->list, lay_list, nlyp) {
8399 		if (NFSBCMP(&lyp->lay_fh, &fh, sizeof(fh)) == 0 &&
8400 		    (lyp->lay_flags & NFSLAY_RW) != 0) {
8401 			TAILQ_REMOVE(&lhyp->list, lyp, lay_list);
8402 			TAILQ_INSERT_HEAD(&nfsrv_recalllisthead, lyp, lay_list);
8403 			lyp->lay_trycnt = 0;
8404 		}
8405 	}
8406 	NFSUNLOCKLAYOUT(lhyp);
8407 	NFSDRECALLUNLOCK();
8408 
8409 	ret = 0;
8410 	mp = tvmp = NULL;
8411 	didprintf = 0;
8412 	TAILQ_INIT(&thl);
8413 	/* Unlock the MDS vp, so that a LayoutReturn can be done on it. */
8414 	NFSVOPUNLOCK(vp);
8415 	/* Now, do a recall for all layouts not yet recalled. */
8416 tryagain:
8417 	NFSDRECALLLOCK();
8418 	TAILQ_FOREACH(lyp, &nfsrv_recalllisthead, lay_list) {
8419 		if (NFSBCMP(&lyp->lay_fh, &fh, sizeof(fh)) == 0 &&
8420 		    (lyp->lay_flags & NFSLAY_RECALL) == 0) {
8421 			lyp->lay_flags |= NFSLAY_RECALL;
8422 			/*
8423 			 * The layout stateid.seqid needs to be incremented
8424 			 * before doing a LAYOUT_RECALL callback.
8425 			 */
8426 			if (++lyp->lay_stateid.seqid == 0)
8427 				lyp->lay_stateid.seqid = 1;
8428 			NFSDRECALLUNLOCK();
8429 			nfsrv_recalllayout(lyp->lay_clientid, &lyp->lay_stateid,
8430 			    &lyp->lay_fh, lyp, 0, lyp->lay_type, p);
8431 			NFSD_DEBUG(4, "nfsrv_copymr: recalled layout\n");
8432 			goto tryagain;
8433 		}
8434 	}
8435 
8436 	/* Now wait for them to be returned. */
8437 tryagain2:
8438 	TAILQ_FOREACH(lyp, &nfsrv_recalllisthead, lay_list) {
8439 		if (NFSBCMP(&lyp->lay_fh, &fh, sizeof(fh)) == 0) {
8440 			if ((lyp->lay_flags & NFSLAY_RETURNED) != 0) {
8441 				TAILQ_REMOVE(&nfsrv_recalllisthead, lyp,
8442 				    lay_list);
8443 				TAILQ_INSERT_HEAD(&thl, lyp, lay_list);
8444 				NFSD_DEBUG(4,
8445 				    "nfsrv_copymr: layout returned\n");
8446 			} else {
8447 				lyp->lay_trycnt++;
8448 				ret = mtx_sleep(lyp, NFSDRECALLMUTEXPTR,
8449 				    PVFS | PCATCH, "nfsmrl", hz);
8450 				NFSD_DEBUG(4, "nfsrv_copymr: aft sleep=%d\n",
8451 				    ret);
8452 				if (ret == EINTR || ret == ERESTART)
8453 					break;
8454 				if ((lyp->lay_flags & NFSLAY_RETURNED) == 0) {
8455 					/*
8456 					 * Give up after 60sec and return
8457 					 * ENXIO, failing the copymr.
8458 					 * This layout will remain on the
8459 					 * recalllist.  It can only be cleared
8460 					 * by restarting the nfsd.
8461 					 * This seems the safe way to handle
8462 					 * it, since it cannot be safely copied
8463 					 * with an outstanding RW layout.
8464 					 */
8465 					if (lyp->lay_trycnt >= 60) {
8466 						ret = ENXIO;
8467 						break;
8468 					}
8469 					if (didprintf == 0) {
8470 						printf("nfsrv_copymr: layout "
8471 						    "not returned\n");
8472 						didprintf = 1;
8473 					}
8474 				}
8475 			}
8476 			goto tryagain2;
8477 		}
8478 	}
8479 	NFSDRECALLUNLOCK();
8480 	/* We can now get rid of the layouts that have been returned. */
8481 	TAILQ_FOREACH_SAFE(lyp, &thl, lay_list, nlyp)
8482 		nfsrv_freelayout(&thl, lyp);
8483 
8484 	/*
8485 	 * Do the vn_start_write() calls here, before the MDS vnode is
8486 	 * locked and the tvp is created (locked) in the NFS file system
8487 	 * that dvp is in.
8488 	 * For tvmp, this probably isn't necessary, since it will be an
8489 	 * NFS mount and they are not suspendable at this time.
8490 	 */
8491 	if (ret == 0)
8492 		ret = vn_start_write(vp, &mp, V_WAIT | V_PCATCH);
8493 	if (ret == 0) {
8494 		tvmp = dvp->v_mount;
8495 		ret = vn_start_write(NULL, &tvmp, V_WAIT | V_PCATCH);
8496 	}
8497 
8498 	/*
8499 	 * LK_EXCLUSIVE lock the MDS vnode, so that any
8500 	 * proxied writes through the MDS will be blocked until we have
8501 	 * completed the copy and update of the extended attributes.
8502 	 * This will also ensure that any attributes and ACL will not be
8503 	 * changed until the copy is complete.
8504 	 */
8505 	NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
8506 	if (ret == 0 && VN_IS_DOOMED(vp)) {
8507 		NFSD_DEBUG(4, "nfsrv_copymr: lk_exclusive doomed\n");
8508 		ret = ESTALE;
8509 	}
8510 
8511 	/* Create the data file on the recovered DS. */
8512 	if (ret == 0)
8513 		ret = nfsrv_createdsfile(vp, &fh, pf, dvp, ds, cred, p, &tvp);
8514 
8515 	/* Copy the DS file, if created successfully. */
8516 	if (ret == 0) {
8517 		/*
8518 		 * Get any NFSv4 ACL on the MDS file, so that it can be set
8519 		 * on the new DS file.
8520 		 */
8521 		aclp = acl_alloc(M_WAITOK | M_ZERO);
8522 		retacl = VOP_GETACL(vp, ACL_TYPE_NFS4, aclp, cred, p);
8523 		if (retacl != 0 && retacl != ENOATTR)
8524 			NFSD_DEBUG(1, "nfsrv_copymr: vop_getacl=%d\n", retacl);
8525 		dat = malloc(PNFSDS_COPYSIZ, M_TEMP, M_WAITOK);
8526 		/* Malloc a block of 0s used to check for holes. */
8527 		if (nfsrv_zeropnfsdat == NULL)
8528 			nfsrv_zeropnfsdat = malloc(PNFSDS_COPYSIZ, M_TEMP,
8529 			    M_WAITOK | M_ZERO);
8530 		rdpos = wrpos = 0;
8531 		ret = VOP_GETATTR(fvp, &va, cred);
8532 		aresid = 0;
8533 		while (ret == 0 && aresid == 0) {
8534 			ret = vn_rdwr(UIO_READ, fvp, dat, PNFSDS_COPYSIZ,
8535 			    rdpos, UIO_SYSSPACE, IO_NODELOCKED, cred, NULL,
8536 			    &aresid, p);
8537 			xfer = PNFSDS_COPYSIZ - aresid;
8538 			if (ret == 0 && xfer > 0) {
8539 				rdpos += xfer;
8540 				/*
8541 				 * Skip the write for holes, except for the
8542 				 * last block.
8543 				 */
8544 				if (xfer < PNFSDS_COPYSIZ || rdpos ==
8545 				    va.va_size || NFSBCMP(dat,
8546 				    nfsrv_zeropnfsdat, PNFSDS_COPYSIZ) != 0)
8547 					ret = vn_rdwr(UIO_WRITE, tvp, dat, xfer,
8548 					    wrpos, UIO_SYSSPACE, IO_NODELOCKED,
8549 					    cred, NULL, NULL, p);
8550 				if (ret == 0)
8551 					wrpos += xfer;
8552 			}
8553 		}
8554 
8555 		/* If there is an ACL and the copy succeeded, set the ACL. */
8556 		if (ret == 0 && retacl == 0) {
8557 			ret = VOP_SETACL(tvp, ACL_TYPE_NFS4, aclp, cred, p);
8558 			/*
8559 			 * Don't consider these as errors, since VOP_GETACL()
8560 			 * can return an ACL when they are not actually
8561 			 * supported.  For example, for UFS, VOP_GETACL()
8562 			 * will return a trivial ACL based on the uid/gid/mode
8563 			 * when there is no ACL on the file.
8564 			 * This case should be recognized as a trivial ACL
8565 			 * by UFS's VOP_SETACL() and succeed, but...
8566 			 */
8567 			if (ret == ENOATTR || ret == EOPNOTSUPP || ret == EPERM)
8568 				ret = 0;
8569 		}
8570 
8571 		if (ret == 0)
8572 			ret = VOP_FSYNC(tvp, MNT_WAIT, p);
8573 
8574 		/* Set the DS data file's modify time that of the MDS file. */
8575 		if (ret == 0)
8576 			ret = VOP_GETATTR(vp, &va, cred);
8577 		if (ret == 0) {
8578 			mtime = va.va_mtime;
8579 			VATTR_NULL(&va);
8580 			va.va_mtime = mtime;
8581 			ret = VOP_SETATTR(tvp, &va, cred);
8582 		}
8583 
8584 		vput(tvp);
8585 		acl_free(aclp);
8586 		free(dat, M_TEMP);
8587 	}
8588 	if (tvmp != NULL)
8589 		vn_finished_write(tvmp);
8590 
8591 	/* Update the extended attributes for the newly created DS file. */
8592 	if (ret == 0)
8593 		ret = vn_extattr_set(vp, IO_NODELOCKED,
8594 		    EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsfile",
8595 		    sizeof(*wpf) * mirrorcnt, (char *)wpf, p);
8596 	if (mp != NULL)
8597 		vn_finished_write(mp);
8598 
8599 	/* Get rid of the dontlist entry, so that Layouts can be issued. */
8600 	NFSDDONTLISTLOCK();
8601 	LIST_REMOVE(mrp, nfsmr_list);
8602 	NFSDDONTLISTUNLOCK();
8603 	free(mrp, M_NFSDSTATE);
8604 	return (ret);
8605 }
8606 
8607 /*
8608  * Create a data storage file on the recovered DS.
8609  */
8610 static int
8611 nfsrv_createdsfile(vnode_t vp, fhandle_t *fhp, struct pnfsdsfile *pf,
8612     vnode_t dvp, struct nfsdevice *ds, struct ucred *cred, NFSPROC_T *p,
8613     vnode_t *tvpp)
8614 {
8615 	struct vattr va, nva;
8616 	int error;
8617 
8618 	/* Make data file name based on FH. */
8619 	error = VOP_GETATTR(vp, &va, cred);
8620 	if (error == 0) {
8621 		/* Set the attributes for "vp" to Setattr the DS vp. */
8622 		VATTR_NULL(&nva);
8623 		nva.va_uid = va.va_uid;
8624 		nva.va_gid = va.va_gid;
8625 		nva.va_mode = va.va_mode;
8626 		nva.va_size = 0;
8627 		VATTR_NULL(&va);
8628 		va.va_type = VREG;
8629 		va.va_mode = nva.va_mode;
8630 		NFSD_DEBUG(4, "nfsrv_dscreatefile: dvp=%p pf=%p\n", dvp, pf);
8631 		error = nfsrv_dscreate(dvp, &va, &nva, fhp, pf, NULL,
8632 		    pf->dsf_filename, cred, p, tvpp);
8633 	}
8634 	return (error);
8635 }
8636 
8637 /*
8638  * Look up the MDS file shared locked, and then get the extended attribute
8639  * to find the extant DS file to be copied to the new mirror.
8640  * If successful, *vpp is set to the MDS file's vp and *nvpp is
8641  * set to a DS data file for the MDS file, both exclusively locked.
8642  * The "buf" argument has the pnfsdsfile structure from the MDS file
8643  * in it and buflen is set to its length.
8644  */
8645 int
8646 nfsrv_mdscopymr(char *mdspathp, char *dspathp, char *curdspathp, char *buf,
8647     int *buflenp, char *fname, NFSPROC_T *p, struct vnode **vpp,
8648     struct vnode **nvpp, struct pnfsdsfile **pfp, struct nfsdevice **dsp,
8649     struct nfsdevice **fdsp)
8650 {
8651 	struct nameidata nd;
8652 	struct vnode *vp, *curvp;
8653 	struct pnfsdsfile *pf;
8654 	struct nfsmount *nmp, *curnmp;
8655 	int dsdir, error, mirrorcnt, ippos;
8656 
8657 	vp = NULL;
8658 	curvp = NULL;
8659 	curnmp = NULL;
8660 	*dsp = NULL;
8661 	*fdsp = NULL;
8662 	if (dspathp == NULL && curdspathp != NULL)
8663 		return (EPERM);
8664 
8665 	/*
8666 	 * Look up the MDS file shared locked.  The lock will be upgraded
8667 	 * to an exclusive lock after any rw layouts have been returned.
8668 	 */
8669 	NFSD_DEBUG(4, "mdsopen path=%s\n", mdspathp);
8670 	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF, UIO_SYSSPACE,
8671 	    mdspathp);
8672 	error = namei(&nd);
8673 	NFSD_DEBUG(4, "lookup=%d\n", error);
8674 	if (error != 0)
8675 		return (error);
8676 	if (nd.ni_vp->v_type != VREG) {
8677 		vput(nd.ni_vp);
8678 		NFSD_DEBUG(4, "mdspath not reg\n");
8679 		return (EISDIR);
8680 	}
8681 	vp = nd.ni_vp;
8682 
8683 	if (curdspathp != NULL) {
8684 		/*
8685 		 * Look up the current DS path and find the nfsdev structure for
8686 		 * it.
8687 		 */
8688 		NFSD_DEBUG(4, "curmdsdev path=%s\n", curdspathp);
8689 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF,
8690 		    UIO_SYSSPACE, curdspathp);
8691 		error = namei(&nd);
8692 		NFSD_DEBUG(4, "ds lookup=%d\n", error);
8693 		if (error != 0) {
8694 			vput(vp);
8695 			return (error);
8696 		}
8697 		if (nd.ni_vp->v_type != VDIR) {
8698 			vput(nd.ni_vp);
8699 			vput(vp);
8700 			NFSD_DEBUG(4, "curdspath not dir\n");
8701 			return (ENOTDIR);
8702 		}
8703 		if (strcmp(nd.ni_vp->v_mount->mnt_vfc->vfc_name, "nfs") != 0) {
8704 			vput(nd.ni_vp);
8705 			vput(vp);
8706 			NFSD_DEBUG(4, "curdspath not an NFS mount\n");
8707 			return (ENXIO);
8708 		}
8709 		curnmp = VFSTONFS(nd.ni_vp->v_mount);
8710 
8711 		/* Search the nfsdev list for a match. */
8712 		NFSDDSLOCK();
8713 		*fdsp = nfsv4_findmirror(curnmp);
8714 		NFSDDSUNLOCK();
8715 		if (*fdsp == NULL)
8716 			curnmp = NULL;
8717 		if (curnmp == NULL) {
8718 			vput(nd.ni_vp);
8719 			vput(vp);
8720 			NFSD_DEBUG(4, "mdscopymr: no current ds\n");
8721 			return (ENXIO);
8722 		}
8723 		curvp = nd.ni_vp;
8724 	}
8725 
8726 	if (dspathp != NULL) {
8727 		/* Look up the nfsdev path and find the nfsdev structure. */
8728 		NFSD_DEBUG(4, "mdsdev path=%s\n", dspathp);
8729 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF,
8730 		    UIO_SYSSPACE, dspathp);
8731 		error = namei(&nd);
8732 		NFSD_DEBUG(4, "ds lookup=%d\n", error);
8733 		if (error != 0) {
8734 			vput(vp);
8735 			if (curvp != NULL)
8736 				vput(curvp);
8737 			return (error);
8738 		}
8739 		if (nd.ni_vp->v_type != VDIR || nd.ni_vp == curvp) {
8740 			vput(nd.ni_vp);
8741 			vput(vp);
8742 			if (curvp != NULL)
8743 				vput(curvp);
8744 			NFSD_DEBUG(4, "dspath not dir\n");
8745 			if (nd.ni_vp == curvp)
8746 				return (EPERM);
8747 			return (ENOTDIR);
8748 		}
8749 		if (strcmp(nd.ni_vp->v_mount->mnt_vfc->vfc_name, "nfs") != 0) {
8750 			vput(nd.ni_vp);
8751 			vput(vp);
8752 			if (curvp != NULL)
8753 				vput(curvp);
8754 			NFSD_DEBUG(4, "dspath not an NFS mount\n");
8755 			return (ENXIO);
8756 		}
8757 		nmp = VFSTONFS(nd.ni_vp->v_mount);
8758 
8759 		/*
8760 		 * Search the nfsdevice list for a match.  If curnmp == NULL,
8761 		 * this is a recovery and there must be a mirror.
8762 		 */
8763 		NFSDDSLOCK();
8764 		if (curnmp == NULL)
8765 			*dsp = nfsrv_findmirroredds(nmp);
8766 		else
8767 			*dsp = nfsv4_findmirror(nmp);
8768 		NFSDDSUNLOCK();
8769 		if (*dsp == NULL) {
8770 			vput(nd.ni_vp);
8771 			vput(vp);
8772 			if (curvp != NULL)
8773 				vput(curvp);
8774 			NFSD_DEBUG(4, "mdscopymr: no ds\n");
8775 			return (ENXIO);
8776 		}
8777 	} else {
8778 		nd.ni_vp = NULL;
8779 		nmp = NULL;
8780 	}
8781 
8782 	/*
8783 	 * Get a vp for an available DS data file using the extended
8784 	 * attribute on the MDS file.
8785 	 * If there is a valid entry for the new DS in the extended attribute
8786 	 * on the MDS file (as checked via the nmp argument),
8787 	 * nfsrv_dsgetsockmnt() returns EEXIST, so no copying will occur.
8788 	 */
8789 	error = nfsrv_dsgetsockmnt(vp, 0, buf, buflenp, &mirrorcnt, p,
8790 	    NULL, NULL, NULL, fname, nvpp, &nmp, curnmp, &ippos, &dsdir);
8791 	if (curvp != NULL)
8792 		vput(curvp);
8793 	if (nd.ni_vp == NULL) {
8794 		if (error == 0 && nmp != NULL) {
8795 			/* Search the nfsdev list for a match. */
8796 			NFSDDSLOCK();
8797 			*dsp = nfsrv_findmirroredds(nmp);
8798 			NFSDDSUNLOCK();
8799 		}
8800 		if (error == 0 && (nmp == NULL || *dsp == NULL)) {
8801 			if (nvpp != NULL && *nvpp != NULL) {
8802 				vput(*nvpp);
8803 				*nvpp = NULL;
8804 			}
8805 			error = ENXIO;
8806 		}
8807 	} else
8808 		vput(nd.ni_vp);
8809 
8810 	/*
8811 	 * When dspathp != NULL and curdspathp == NULL, this is a recovery
8812 	 * and is only allowed if there is a 0.0.0.0 IP address entry.
8813 	 * When curdspathp != NULL, the ippos will be set to that entry.
8814 	 */
8815 	if (error == 0 && dspathp != NULL && ippos == -1) {
8816 		if (nvpp != NULL && *nvpp != NULL) {
8817 			vput(*nvpp);
8818 			*nvpp = NULL;
8819 		}
8820 		error = ENXIO;
8821 	}
8822 	if (error == 0) {
8823 		*vpp = vp;
8824 
8825 		pf = (struct pnfsdsfile *)buf;
8826 		if (ippos == -1) {
8827 			/* If no zeroip pnfsdsfile, add one. */
8828 			ippos = *buflenp / sizeof(*pf);
8829 			*buflenp += sizeof(*pf);
8830 			pf += ippos;
8831 			pf->dsf_dir = dsdir;
8832 			strlcpy(pf->dsf_filename, fname,
8833 			    sizeof(pf->dsf_filename));
8834 		} else
8835 			pf += ippos;
8836 		*pfp = pf;
8837 	} else
8838 		vput(vp);
8839 	return (error);
8840 }
8841 
8842 /*
8843  * Search for a matching pnfsd mirror device structure, base on the nmp arg.
8844  * Return one if found, NULL otherwise.
8845  */
8846 static struct nfsdevice *
8847 nfsrv_findmirroredds(struct nfsmount *nmp)
8848 {
8849 	struct nfsdevice *ds, *fndds;
8850 	int fndmirror;
8851 
8852 	mtx_assert(NFSDDSMUTEXPTR, MA_OWNED);
8853 	/*
8854 	 * Search the DS server list for a match with nmp.
8855 	 * Remove the DS entry if found and there is a mirror.
8856 	 */
8857 	fndds = NULL;
8858 	fndmirror = 0;
8859 	if (nfsrv_devidcnt == 0)
8860 		return (fndds);
8861 	TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) {
8862 		if (ds->nfsdev_nmp == nmp) {
8863 			NFSD_DEBUG(4, "nfsrv_findmirroredds: fnd main ds\n");
8864 			fndds = ds;
8865 			break;
8866 		}
8867 	}
8868 	if (fndds == NULL)
8869 		return (fndds);
8870 	if (fndds->nfsdev_mdsisset == 0 && nfsrv_faildscnt > 0)
8871 		fndmirror = 1;
8872 	else if (fndds->nfsdev_mdsisset != 0) {
8873 		/* For the fsid is set case, search for a mirror. */
8874 		TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) {
8875 			if (ds != fndds && ds->nfsdev_nmp != NULL &&
8876 			    ds->nfsdev_mdsisset != 0 &&
8877 			    fsidcmp(&ds->nfsdev_mdsfsid,
8878 			    &fndds->nfsdev_mdsfsid) == 0) {
8879 				fndmirror = 1;
8880 				break;
8881 			}
8882 		}
8883 	}
8884 	if (fndmirror == 0) {
8885 		NFSD_DEBUG(4, "nfsrv_findmirroredds: no mirror for DS\n");
8886 		return (NULL);
8887 	}
8888 	return (fndds);
8889 }
8890 
8891 /*
8892  * Mark the appropriate devid and all associated layout as "out of space".
8893  */
8894 void
8895 nfsrv_marknospc(char *devid, bool setit)
8896 {
8897 	struct nfsdevice *ds;
8898 	struct nfslayout *lyp;
8899 	struct nfslayouthash *lhyp;
8900 	int i;
8901 
8902 	NFSDDSLOCK();
8903 	TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) {
8904 		if (NFSBCMP(ds->nfsdev_deviceid, devid, NFSX_V4DEVICEID) == 0) {
8905 			NFSD_DEBUG(1, "nfsrv_marknospc: devid %d\n", setit);
8906 			ds->nfsdev_nospc = setit;
8907 		}
8908 	}
8909 	NFSDDSUNLOCK();
8910 
8911 	for (i = 0; i < nfsrv_layouthashsize; i++) {
8912 		lhyp = &nfslayouthash[i];
8913 		NFSLOCKLAYOUT(lhyp);
8914 		TAILQ_FOREACH(lyp, &lhyp->list, lay_list) {
8915 			if (NFSBCMP(lyp->lay_deviceid, devid,
8916 			    NFSX_V4DEVICEID) == 0) {
8917 				NFSD_DEBUG(1, "nfsrv_marknospc: layout %d\n",
8918 				    setit);
8919 				if (setit)
8920 					lyp->lay_flags |= NFSLAY_NOSPC;
8921 				else
8922 					lyp->lay_flags &= ~NFSLAY_NOSPC;
8923 			}
8924 		}
8925 		NFSUNLOCKLAYOUT(lhyp);
8926 	}
8927 }
8928 
8929 /*
8930  * Check to see if SP4_MACH_CRED is in use and, if it is, check that the
8931  * correct machine credential is being used.
8932  */
8933 static int
8934 nfsrv_checkmachcred(int op, struct nfsrv_descript *nd, struct nfsclient *clp)
8935 {
8936 
8937 	if ((clp->lc_flags & LCL_MACHCRED) == 0 ||
8938 	    !NFSISSET_OPBIT(&clp->lc_mustops, op))
8939 		return (0);
8940 	KASSERT((nd->nd_flag & ND_NFSV41) != 0,
8941 	    ("nfsrv_checkmachcred: MachCred for NFSv4.0"));
8942 	if ((nd->nd_flag & (ND_GSSINTEGRITY | ND_GSSPRIVACY)) != 0 &&
8943 	    nd->nd_princlen == clp->lc_namelen &&
8944 	    !NFSBCMP(nd->nd_principal, clp->lc_name, nd->nd_princlen))
8945 		return (0);
8946 	return (NFSERR_AUTHERR | AUTH_TOOWEAK);
8947 }
8948