xref: /freebsd/sys/fs/nfsserver/nfs_nfsdport.c (revision 6966ac055c3b7a39266fb982493330df7a097997)
1 /*-
2  * SPDX-License-Identifier: BSD-3-Clause
3  *
4  * Copyright (c) 1989, 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Rick Macklem at The University of Guelph.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  */
35 
36 #include <sys/cdefs.h>
37 __FBSDID("$FreeBSD$");
38 
39 #include <sys/capsicum.h>
40 #include <sys/extattr.h>
41 
42 /*
43  * Functions that perform the vfs operations required by the routines in
44  * nfsd_serv.c. It is hoped that this change will make the server more
45  * portable.
46  */
47 
48 #include <fs/nfs/nfsport.h>
49 #include <security/mac/mac_framework.h>
50 #include <sys/filio.h>
51 #include <sys/hash.h>
52 #include <sys/sysctl.h>
53 #include <nlm/nlm_prot.h>
54 #include <nlm/nlm.h>
55 
56 FEATURE(nfsd, "NFSv4 server");
57 
/* Objects declared extern here; their definitions are not in this part of the file. */
58 extern u_int32_t newnfs_true, newnfs_false, newnfs_xdrneg1;
59 extern int nfsrv_useacl;
60 extern int newnfs_numnfsd;
61 extern struct mount nfsv4root_mnt;
62 extern struct nfsrv_stablefirst nfsrv_stablefirst;
63 extern void (*nfsd_call_servertimer)(void);
64 extern SVCPOOL	*nfsrvd_pool;
65 extern struct nfsv4lock nfsd_suspend_lock;
66 extern struct nfsclienthashhead *nfsclienthash;
67 extern struct nfslockhashhead *nfslockhash;
68 extern struct nfssessionhash *nfssessionhash;
69 extern int nfsrv_sessionhashsize;
70 extern struct nfsstatsv1 nfsstatsv1;
71 extern struct nfslayouthash *nfslayouthash;
72 extern int nfsrv_layouthashsize;
73 extern struct mtx nfsrv_dslock_mtx;
74 extern int nfs_pnfsiothreads;
75 extern struct nfsdontlisthead nfsrv_dontlisthead;
76 extern volatile int nfsrv_dontlistlen;
77 extern volatile int nfsrv_devidcnt;
78 extern int nfsrv_maxpnfsmirror;
/* Objects defined by this file; those without "static" have external linkage. */
79 struct vfsoptlist nfsv4root_opt, nfsv4root_newopt;
80 NFSDLOCKMUTEX;
81 NFSSTATESPINLOCK;
82 struct nfsrchash_bucket nfsrchash_table[NFSRVCACHE_HASHSIZE];
83 struct nfsrchash_bucket nfsrcahash_table[NFSRVCACHE_HASHSIZE];
84 struct mtx nfsrc_udpmtx;
85 struct mtx nfs_v4root_mutex;
86 struct mtx nfsrv_dontlistlock_mtx;
87 struct mtx nfsrv_recalllock_mtx;
88 struct nfsrvfh nfs_rootfh, nfs_pubfh;
89 int nfs_pubfhset = 0, nfs_rootfhset = 0;
90 struct proc *nfsd_master_proc = NULL;
91 int nfsd_debuglevel = 0;
/* State private to this file. */
92 static pid_t nfsd_master_pid = (pid_t)-1;
93 static char nfsd_master_comm[MAXCOMLEN + 1];
94 static struct timeval nfsd_master_start;
95 static uint32_t nfsv4_sysid = 0;
96 static fhandle_t zerofh;
97 
98 static int nfssvc_srvcall(struct thread *, struct nfssvc_args *,
99     struct ucred *);
100 
/* Read/write via sysctl vfs.nfsd.mirrormnt; consulted by nfsvno_namei(). */
101 int nfsrv_enable_crossmntpt = 1;
/* Read/write via sysctls vfs.nfsd.commit_blks and vfs.nfsd.commit_miss. */
102 static int nfs_commit_blks;
103 static int nfs_commit_miss;
104 extern int nfsrv_issuedelegs;
105 extern int nfsrv_dolocallocks;
106 extern int nfsd_enable_stringtouid;
107 extern struct nfsdevicehead nfsrv_devidhead;
108 
/* Prototypes for functions that are private (static) to this file. */
109 static int nfsrv_createiovec(int, struct mbuf **, struct mbuf **,
110     struct iovec **);
111 static int nfsrv_createiovecw(int, struct mbuf *, char *, struct iovec **,
112     int *);
113 static void nfsrv_pnfscreate(struct vnode *, struct vattr *, struct ucred *,
114     NFSPROC_T *);
115 static void nfsrv_pnfsremovesetup(struct vnode *, NFSPROC_T *, struct vnode **,
116     int *, char *, fhandle_t *);
117 static void nfsrv_pnfsremove(struct vnode **, int, char *, fhandle_t *,
118     NFSPROC_T *);
119 static int nfsrv_proxyds(struct vnode *, off_t, int, struct ucred *,
120     struct thread *, int, struct mbuf **, char *, struct mbuf **,
121     struct nfsvattr *, struct acl *, off_t *, int, bool *);
122 static int nfsrv_setextattr(struct vnode *, struct nfsvattr *, NFSPROC_T *);
123 static int nfsrv_readdsrpc(fhandle_t *, off_t, int, struct ucred *,
124     NFSPROC_T *, struct nfsmount *, struct mbuf **, struct mbuf **);
125 static int nfsrv_writedsrpc(fhandle_t *, off_t, int, struct ucred *,
126     NFSPROC_T *, struct vnode *, struct nfsmount **, int, struct mbuf **,
127     char *, int *);
128 static int nfsrv_allocatedsrpc(fhandle_t *, off_t, off_t, struct ucred *,
129     NFSPROC_T *, struct vnode *, struct nfsmount **, int, int *);
130 static int nfsrv_setacldsrpc(fhandle_t *, struct ucred *, NFSPROC_T *,
131     struct vnode *, struct nfsmount **, int, struct acl *, int *);
132 static int nfsrv_setattrdsrpc(fhandle_t *, struct ucred *, NFSPROC_T *,
133     struct vnode *, struct nfsmount **, int, struct nfsvattr *, int *);
134 static int nfsrv_getattrdsrpc(fhandle_t *, struct ucred *, NFSPROC_T *,
135     struct vnode *, struct nfsmount *, struct nfsvattr *);
136 static int nfsrv_seekdsrpc(fhandle_t *, off_t *, int, bool *, struct ucred *,
137     NFSPROC_T *, struct nfsmount *);
138 static int nfsrv_putfhname(fhandle_t *, char *);
139 static int nfsrv_pnfslookupds(struct vnode *, struct vnode *,
140     struct pnfsdsfile *, struct vnode **, NFSPROC_T *);
141 static void nfsrv_pnfssetfh(struct vnode *, struct pnfsdsfile *, char *, char *,
142     struct vnode *, NFSPROC_T *);
143 static int nfsrv_dsremove(struct vnode *, char *, struct ucred *, NFSPROC_T *);
144 static int nfsrv_dssetacl(struct vnode *, struct acl *, struct ucred *,
145     NFSPROC_T *);
146 static int nfsrv_pnfsstatfs(struct statfs *, struct mount *);
147 
/* Non-static prototype; the definition is not in this part of the file. */
148 int nfs_pnfsio(task_fn_t *, void *);
149 
/*
 * The vfs.nfsd sysctl node and the read/write integer knobs beneath it.
 * Each SYSCTL_INT ties a leaf name to one int variable.  Note that the
 * leaf controlling nfsrv_enable_crossmntpt is named "mirrormnt".
 */
150 SYSCTL_NODE(_vfs, OID_AUTO, nfsd, CTLFLAG_RW, 0, "NFS server");
151 SYSCTL_INT(_vfs_nfsd, OID_AUTO, mirrormnt, CTLFLAG_RW,
152     &nfsrv_enable_crossmntpt, 0, "Enable nfsd to cross mount points");
/* commit_blks and commit_miss are registered with empty description strings. */
153 SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_blks, CTLFLAG_RW, &nfs_commit_blks,
154     0, "");
155 SYSCTL_INT(_vfs_nfsd, OID_AUTO, commit_miss, CTLFLAG_RW, &nfs_commit_miss,
156     0, "");
157 SYSCTL_INT(_vfs_nfsd, OID_AUTO, issue_delegations, CTLFLAG_RW,
158     &nfsrv_issuedelegs, 0, "Enable nfsd to issue delegations");
159 SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_locallocks, CTLFLAG_RW,
160     &nfsrv_dolocallocks, 0, "Enable nfsd to acquire local locks on files");
/* nfsd_debuglevel is the level that the NFSD_DEBUG() calls are given to compare. */
161 SYSCTL_INT(_vfs_nfsd, OID_AUTO, debuglevel, CTLFLAG_RW, &nfsd_debuglevel,
162     0, "Debug level for NFS server");
163 SYSCTL_INT(_vfs_nfsd, OID_AUTO, enable_stringtouid, CTLFLAG_RW,
164     &nfsd_enable_stringtouid, 0, "Enable nfsd to accept numeric owner_names");
/* Defaults to 1 (enabled). */
165 static int nfsrv_pnfsgetdsattr = 1;
166 SYSCTL_INT(_vfs_nfsd, OID_AUTO, pnfsgetdsattr, CTLFLAG_RW,
167     &nfsrv_pnfsgetdsattr, 0, "When set getattr gets DS attributes via RPC");
168 
169 /*
170  * nfsrv_dsdirsize can only be increased and only when the nfsd threads are
171  * not running.
172  * The dsN subdirectories for the increased values must have been created
173  * on all DS servers before this increase is done.
174  */
175 u_int	nfsrv_dsdirsize = 20;
176 static int
177 sysctl_dsdirsize(SYSCTL_HANDLER_ARGS)
178 {
179 	int error, newdsdirsize;
180 
181 	newdsdirsize = nfsrv_dsdirsize;
182 	error = sysctl_handle_int(oidp, &newdsdirsize, 0, req);
183 	if (error != 0 || req->newptr == NULL)
184 		return (error);
185 	if (newdsdirsize <= nfsrv_dsdirsize || newdsdirsize > 10000 ||
186 	    newnfs_numnfsd != 0)
187 		return (EINVAL);
188 	nfsrv_dsdirsize = newdsdirsize;
189 	return (0);
190 }
191 SYSCTL_PROC(_vfs_nfsd, OID_AUTO, dsdirsize,
192     CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(nfsrv_dsdirsize),
193     sysctl_dsdirsize, "IU", "Number of dsN subdirs on the DS servers");
194 
/*
 * Tunables for nfsrv_sequential_heuristic():
 * MAX_REORDERED_RPC - multiplier applied to the larger of the mount's
 *	f_iosize and the request size; an offset within that distance of
 *	the expected one leaves the sequential count unchanged.
 * NUM_HEURISTIC - number of entries in the nfsheur[] table.
 * NHUSE_INIT - nh_use given to an entry when it is claimed for a vnode.
 * NHUSE_INC - added to nh_use on every call for the entry.
 * NHUSE_MAX - upper bound for nh_use.
 */
195 #define	MAX_REORDERED_RPC	16
196 #define	NUM_HEURISTIC		1031
197 #define	NHUSE_INIT		64
198 #define	NHUSE_INC		16
199 #define	NHUSE_MAX		2048
200 
/* One table entry per tracked vnode; indexed by a hash of the vnode pointer. */
201 static struct nfsheur {
202 	struct vnode *nh_vp;	/* vp to match (unreferenced pointer) */
203 	off_t nh_nextoff;	/* next offset for sequential detection */
204 	int nh_use;		/* use count for selection */
205 	int nh_seqcount;	/* heuristic */
206 } nfsheur[NUM_HEURISTIC];
207 
208 
209 /*
210  * Heuristic to detect sequential operation.
211  */
212 static struct nfsheur *
213 nfsrv_sequential_heuristic(struct uio *uio, struct vnode *vp)
214 {
215 	struct nfsheur *nh;
216 	int hi, try;
217 
218 	/* Locate best candidate. */
219 	try = 32;
220 	hi = ((int)(vm_offset_t)vp / sizeof(struct vnode)) % NUM_HEURISTIC;
221 	nh = &nfsheur[hi];
222 	while (try--) {
223 		if (nfsheur[hi].nh_vp == vp) {
224 			nh = &nfsheur[hi];
225 			break;
226 		}
227 		if (nfsheur[hi].nh_use > 0)
228 			--nfsheur[hi].nh_use;
229 		hi = (hi + 1) % NUM_HEURISTIC;
230 		if (nfsheur[hi].nh_use < nh->nh_use)
231 			nh = &nfsheur[hi];
232 	}
233 
234 	/* Initialize hint if this is a new file. */
235 	if (nh->nh_vp != vp) {
236 		nh->nh_vp = vp;
237 		nh->nh_nextoff = uio->uio_offset;
238 		nh->nh_use = NHUSE_INIT;
239 		if (uio->uio_offset == 0)
240 			nh->nh_seqcount = 4;
241 		else
242 			nh->nh_seqcount = 1;
243 	}
244 
245 	/* Calculate heuristic. */
246 	if ((uio->uio_offset == 0 && nh->nh_seqcount > 0) ||
247 	    uio->uio_offset == nh->nh_nextoff) {
248 		/* See comments in vfs_vnops.c:sequential_heuristic(). */
249 		nh->nh_seqcount += howmany(uio->uio_resid, 16384);
250 		if (nh->nh_seqcount > IO_SEQMAX)
251 			nh->nh_seqcount = IO_SEQMAX;
252 	} else if (qabs(uio->uio_offset - nh->nh_nextoff) <= MAX_REORDERED_RPC *
253 	    imax(vp->v_mount->mnt_stat.f_iosize, uio->uio_resid)) {
254 		/* Probably a reordered RPC, leave seqcount alone. */
255 	} else if (nh->nh_seqcount > 1) {
256 		nh->nh_seqcount /= 2;
257 	} else {
258 		nh->nh_seqcount = 0;
259 	}
260 	nh->nh_use += NHUSE_INC;
261 	if (nh->nh_use > NHUSE_MAX)
262 		nh->nh_use = NHUSE_MAX;
263 	return (nh);
264 }
265 
266 /*
267  * Get attributes into nfsvattr structure.
268  */
269 int
270 nfsvno_getattr(struct vnode *vp, struct nfsvattr *nvap,
271     struct nfsrv_descript *nd, struct thread *p, int vpislocked,
272     nfsattrbit_t *attrbitp)
273 {
274 	int error, gotattr, lockedit = 0;
275 	struct nfsvattr na;
276 
277 	if (vpislocked == 0) {
278 		/*
279 		 * When vpislocked == 0, the vnode is either exclusively
280 		 * locked by this thread or not locked by this thread.
281 		 * As such, shared lock it, if not exclusively locked.
282 		 */
283 		if (NFSVOPISLOCKED(vp) != LK_EXCLUSIVE) {
284 			lockedit = 1;
285 			NFSVOPLOCK(vp, LK_SHARED | LK_RETRY);
286 		}
287 	}
288 
289 	/*
290 	 * Acquire the Change, Size, TimeAccess, TimeModify and SpaceUsed
291 	 * attributes, as required.
292 	 * This needs to be done for regular files if:
293 	 * - non-NFSv4 RPCs or
294 	 * - when attrbitp == NULL or
295 	 * - an NFSv4 RPC with any of the above attributes in attrbitp.
296 	 * A return of 0 for nfsrv_proxyds() indicates that it has acquired
297 	 * these attributes.  nfsrv_proxyds() will return an error if the
298 	 * server is not a pNFS one.
299 	 */
300 	gotattr = 0;
301 	if (vp->v_type == VREG && nfsrv_devidcnt > 0 && (attrbitp == NULL ||
302 	    (nd->nd_flag & ND_NFSV4) == 0 ||
303 	    NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_CHANGE) ||
304 	    NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SIZE) ||
305 	    NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_TIMEACCESS) ||
306 	    NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_TIMEMODIFY) ||
307 	    NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SPACEUSED))) {
308 		error = nfsrv_proxyds(vp, 0, 0, nd->nd_cred, p,
309 		    NFSPROC_GETATTR, NULL, NULL, NULL, &na, NULL, NULL, 0,
310 		    NULL);
311 		if (error == 0)
312 			gotattr = 1;
313 	}
314 
315 	error = VOP_GETATTR(vp, &nvap->na_vattr, nd->nd_cred);
316 	if (lockedit != 0)
317 		NFSVOPUNLOCK(vp);
318 
319 	/*
320 	 * If we got the Change, Size and Modify Time from the DS,
321 	 * replace them.
322 	 */
323 	if (gotattr != 0) {
324 		nvap->na_atime = na.na_atime;
325 		nvap->na_mtime = na.na_mtime;
326 		nvap->na_filerev = na.na_filerev;
327 		nvap->na_size = na.na_size;
328 		nvap->na_bytes = na.na_bytes;
329 	}
330 	NFSD_DEBUG(4, "nfsvno_getattr: gotattr=%d err=%d chg=%ju\n", gotattr,
331 	    error, (uintmax_t)na.na_filerev);
332 
333 	NFSEXITCODE(error);
334 	return (error);
335 }
336 
337 /*
338  * Get a file handle for a vnode.
339  */
340 int
341 nfsvno_getfh(struct vnode *vp, fhandle_t *fhp, struct thread *p)
342 {
343 	int error;
344 
345 	NFSBZERO((caddr_t)fhp, sizeof(fhandle_t));
346 	fhp->fh_fsid = vp->v_mount->mnt_stat.f_fsid;
347 	error = VOP_VPTOFH(vp, &fhp->fh_fid);
348 
349 	NFSEXITCODE(error);
350 	return (error);
351 }
352 
353 /*
354  * Perform access checking for vnodes obtained from file handles that would
355  * refer to files already opened by a Unix client. You cannot just use
356  * vn_writechk() and VOP_ACCESSX() for two reasons.
357  * 1 - You must check for exported rdonly as well as MNT_RDONLY for the write
358  *     case.
359  * 2 - The owner is to be given access irrespective of mode bits for some
360  *     operations, so that processes that chmod after opening a file don't
361  *     break.
362  */
363 int
364 nfsvno_accchk(struct vnode *vp, accmode_t accmode, struct ucred *cred,
365     struct nfsexstuff *exp, struct thread *p, int override, int vpislocked,
366     u_int32_t *supportedtypep)
367 {
368 	struct vattr vattr;
369 	int error = 0, getret = 0;
370 
371 	if (vpislocked == 0) {
372 		if (NFSVOPLOCK(vp, LK_SHARED) != 0) {
373 			error = EPERM;
374 			goto out;
375 		}
376 	}
377 	if (accmode & VWRITE) {
378 		/* Just vn_writechk() changed to check rdonly */
379 		/*
380 		 * Disallow write attempts on read-only file systems;
381 		 * unless the file is a socket or a block or character
382 		 * device resident on the file system.
383 		 */
384 		if (NFSVNO_EXRDONLY(exp) ||
385 		    (vp->v_mount->mnt_flag & MNT_RDONLY)) {
386 			switch (vp->v_type) {
387 			case VREG:
388 			case VDIR:
389 			case VLNK:
390 				error = EROFS;
391 			default:
392 				break;
393 			}
394 		}
395 		/*
396 		 * If there's shared text associated with
397 		 * the inode, try to free it up once.  If
398 		 * we fail, we can't allow writing.
399 		 */
400 		if (VOP_IS_TEXT(vp) && error == 0)
401 			error = ETXTBSY;
402 	}
403 	if (error != 0) {
404 		if (vpislocked == 0)
405 			NFSVOPUNLOCK(vp);
406 		goto out;
407 	}
408 
409 	/*
410 	 * Should the override still be applied when ACLs are enabled?
411 	 */
412 	error = VOP_ACCESSX(vp, accmode, cred, p);
413 	if (error != 0 && (accmode & (VDELETE | VDELETE_CHILD))) {
414 		/*
415 		 * Try again with VEXPLICIT_DENY, to see if the test for
416 		 * deletion is supported.
417 		 */
418 		error = VOP_ACCESSX(vp, accmode | VEXPLICIT_DENY, cred, p);
419 		if (error == 0) {
420 			if (vp->v_type == VDIR) {
421 				accmode &= ~(VDELETE | VDELETE_CHILD);
422 				accmode |= VWRITE;
423 				error = VOP_ACCESSX(vp, accmode, cred, p);
424 			} else if (supportedtypep != NULL) {
425 				*supportedtypep &= ~NFSACCESS_DELETE;
426 			}
427 		}
428 	}
429 
430 	/*
431 	 * Allow certain operations for the owner (reads and writes
432 	 * on files that are already open).
433 	 */
434 	if (override != NFSACCCHK_NOOVERRIDE &&
435 	    (error == EPERM || error == EACCES)) {
436 		if (cred->cr_uid == 0 && (override & NFSACCCHK_ALLOWROOT))
437 			error = 0;
438 		else if (override & NFSACCCHK_ALLOWOWNER) {
439 			getret = VOP_GETATTR(vp, &vattr, cred);
440 			if (getret == 0 && cred->cr_uid == vattr.va_uid)
441 				error = 0;
442 		}
443 	}
444 	if (vpislocked == 0)
445 		NFSVOPUNLOCK(vp);
446 
447 out:
448 	NFSEXITCODE(error);
449 	return (error);
450 }
451 
452 /*
453  * Set attribute(s) vnop.
454  */
455 int
456 nfsvno_setattr(struct vnode *vp, struct nfsvattr *nvap, struct ucred *cred,
457     struct thread *p, struct nfsexstuff *exp)
458 {
459 	u_quad_t savsize = 0;
460 	int error, savedit;
461 
462 	/*
463 	 * If this is an exported file system and a pNFS service is running,
464 	 * don't VOP_SETATTR() of size for the MDS file system.
465 	 */
466 	savedit = 0;
467 	error = 0;
468 	if (vp->v_type == VREG && (vp->v_mount->mnt_flag & MNT_EXPORTED) != 0 &&
469 	    nfsrv_devidcnt != 0 && nvap->na_vattr.va_size != VNOVAL &&
470 	    nvap->na_vattr.va_size > 0) {
471 		savsize = nvap->na_vattr.va_size;
472 		nvap->na_vattr.va_size = VNOVAL;
473 		if (nvap->na_vattr.va_uid != (uid_t)VNOVAL ||
474 		    nvap->na_vattr.va_gid != (gid_t)VNOVAL ||
475 		    nvap->na_vattr.va_mode != (mode_t)VNOVAL ||
476 		    nvap->na_vattr.va_atime.tv_sec != VNOVAL ||
477 		    nvap->na_vattr.va_mtime.tv_sec != VNOVAL)
478 			savedit = 1;
479 		else
480 			savedit = 2;
481 	}
482 	if (savedit != 2)
483 		error = VOP_SETATTR(vp, &nvap->na_vattr, cred);
484 	if (savedit != 0)
485 		nvap->na_vattr.va_size = savsize;
486 	if (error == 0 && (nvap->na_vattr.va_uid != (uid_t)VNOVAL ||
487 	    nvap->na_vattr.va_gid != (gid_t)VNOVAL ||
488 	    nvap->na_vattr.va_size != VNOVAL ||
489 	    nvap->na_vattr.va_mode != (mode_t)VNOVAL ||
490 	    nvap->na_vattr.va_atime.tv_sec != VNOVAL ||
491 	    nvap->na_vattr.va_mtime.tv_sec != VNOVAL)) {
492 		/* For a pNFS server, set the attributes on the DS file. */
493 		error = nfsrv_proxyds(vp, 0, 0, cred, p, NFSPROC_SETATTR,
494 		    NULL, NULL, NULL, nvap, NULL, NULL, 0, NULL);
495 		if (error == ENOENT)
496 			error = 0;
497 	}
498 	NFSEXITCODE(error);
499 	return (error);
500 }
501 
502 /*
 * Set up nameidata for a lookup() call and do it.
 * dp is the starting directory; if it is not a VDIR it is released
 * (vput() when islocked is set, vrele() otherwise), the path buffer is
 * released and ENOTDIR is returned with *retdirp left NULL.
 * Otherwise dp is unlocked if islocked is set, VREF() is done on it and
 * it is stored in *retdirp.
 * Symbolic links are only followed when ND_PUBLOOKUP is set in nd_flag;
 * otherwise finding one results in EINVAL.
 * For an error detected after the directory check, the path buffer is
 * released and ni_vp, ni_dvp and ni_startdir are set to NULL.  On success
 * ni_dvp is set to NULL unless WANTPARENT or LOCKPARENT was requested.
 * Returns 0 or an error value.
 */
505 int
506 nfsvno_namei(struct nfsrv_descript *nd, struct nameidata *ndp,
507     struct vnode *dp, int islocked, struct nfsexstuff *exp, struct thread *p,
508     struct vnode **retdirp)
509 {
510 	struct componentname *cnp = &ndp->ni_cnd;
511 	int i;
512 	struct iovec aiov;
513 	struct uio auio;
	/* Remember if the caller asked for the leaf to be returned locked. */
514 	int lockleaf = (cnp->cn_flags & LOCKLEAF) != 0, linklen;
515 	int error = 0;
516 	char *cp;
517 
518 	*retdirp = NULL;
519 	cnp->cn_nameptr = cnp->cn_pnbuf;
520 	ndp->ni_lcf = 0;
521 	/*
522 	 * Extract and set starting directory.
523 	 */
524 	if (dp->v_type != VDIR) {
525 		if (islocked)
526 			vput(dp);
527 		else
528 			vrele(dp);
529 		nfsvno_relpathbuf(ndp);
530 		error = ENOTDIR;
531 		goto out1;
532 	}
533 	if (islocked)
534 		NFSVOPUNLOCK(dp);
535 	VREF(dp);
536 	*retdirp = dp;
537 	if (NFSVNO_EXRDONLY(exp))
538 		cnp->cn_flags |= RDONLY;
539 	ndp->ni_segflg = UIO_SYSSPACE;
540 
541 	if (nd->nd_flag & ND_PUBLOOKUP) {
542 		ndp->ni_loopcnt = 0;
		/* An absolute path starts the lookup at rootvnode instead of dp. */
543 		if (cnp->cn_pnbuf[0] == '/') {
544 			vrele(dp);
545 			/*
546 			 * Check for degenerate pathnames here, since lookup()
547 			 * panics on them.
548 			 */
549 			for (i = 1; i < ndp->ni_pathlen; i++)
550 				if (cnp->cn_pnbuf[i] != '/')
551 					break;
552 			if (i == ndp->ni_pathlen) {
553 				error = NFSERR_ACCES;
554 				goto out;
555 			}
556 			dp = rootvnode;
557 			VREF(dp);
558 		}
559 	} else if ((nfsrv_enable_crossmntpt == 0 && NFSVNO_EXPORTED(exp)) ||
560 	    (nd->nd_flag & ND_NFSV4) == 0) {
561 		/*
562 		 * Only cross mount points for NFSv4 when doing a
563 		 * mount while traversing the file system above
564 		 * the mount point, unless nfsrv_enable_crossmntpt is set.
565 		 */
566 		cnp->cn_flags |= NOCROSSMOUNT;
567 	}
568 
569 	/*
570 	 * Initialize for scan, set ni_startdir and bump ref on dp again
571 	 * because lookup() will dereference ni_startdir.
572 	 */
573 
574 	cnp->cn_thread = p;
575 	ndp->ni_startdir = dp;
576 	ndp->ni_rootdir = rootvnode;
577 	ndp->ni_topdir = NULL;
578 
	/*
	 * LOCKLEAF is always set for the lookup() calls; if the caller did
	 * not ask for it, the leaf is unlocked and the flag cleared below.
	 */
579 	if (!lockleaf)
580 		cnp->cn_flags |= LOCKLEAF;
581 	for (;;) {
582 		cnp->cn_nameptr = cnp->cn_pnbuf;
583 		/*
584 		 * Call lookup() to do the real work.  If an error occurs,
585 		 * ndp->ni_vp and ni_dvp are left uninitialized or NULL and
586 		 * we do not have to dereference anything before returning.
587 		 * In either case ni_startdir will be dereferenced and NULLed
588 		 * out.
589 		 */
590 		error = lookup(ndp);
591 		if (error)
592 			break;
593 
594 		/*
595 		 * Check for encountering a symbolic link.  Trivial
596 		 * termination occurs if no symlink encountered.
597 		 */
598 		if ((cnp->cn_flags & ISSYMLINK) == 0) {
599 			if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0)
600 				nfsvno_relpathbuf(ndp);
601 			if (ndp->ni_vp && !lockleaf)
602 				NFSVOPUNLOCK(ndp->ni_vp);
603 			break;
604 		}
605 
606 		/*
607 		 * Validate symlink
608 		 */
609 		if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1)
610 			NFSVOPUNLOCK(ndp->ni_dvp);
		/* Links are only followed for public file handle lookups. */
611 		if (!(nd->nd_flag & ND_PUBLOOKUP)) {
612 			error = EINVAL;
613 			goto badlink2;
614 		}
615 
616 		if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
617 			error = ELOOP;
618 			goto badlink2;
619 		}
		/*
		 * When more of the path follows the link, read the link into
		 * a new buffer so that the remainder can be appended to it;
		 * otherwise read it directly into the existing path buffer.
		 */
620 		if (ndp->ni_pathlen > 1)
621 			cp = uma_zalloc(namei_zone, M_WAITOK);
622 		else
623 			cp = cnp->cn_pnbuf;
624 		aiov.iov_base = cp;
625 		aiov.iov_len = MAXPATHLEN;
626 		auio.uio_iov = &aiov;
627 		auio.uio_iovcnt = 1;
628 		auio.uio_offset = 0;
629 		auio.uio_rw = UIO_READ;
630 		auio.uio_segflg = UIO_SYSSPACE;
631 		auio.uio_td = NULL;
632 		auio.uio_resid = MAXPATHLEN;
633 		error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
634 		if (error) {
		/*
		 * badlink1 frees the separately allocated link buffer, if any;
		 * badlink2 releases ni_dvp and ni_vp and leaves the loop.
		 * Both labels are also jumped to from outside this block.
		 */
635 		badlink1:
636 			if (ndp->ni_pathlen > 1)
637 				uma_zfree(namei_zone, cp);
638 		badlink2:
639 			vrele(ndp->ni_dvp);
640 			vput(ndp->ni_vp);
641 			break;
642 		}
643 		linklen = MAXPATHLEN - auio.uio_resid;
644 		if (linklen == 0) {
645 			error = ENOENT;
646 			goto badlink1;
647 		}
648 		if (linklen + ndp->ni_pathlen >= MAXPATHLEN) {
649 			error = ENAMETOOLONG;
650 			goto badlink1;
651 		}
652 
653 		/*
654 		 * Adjust or replace path
655 		 */
656 		if (ndp->ni_pathlen > 1) {
657 			NFSBCOPY(ndp->ni_next, cp + linklen, ndp->ni_pathlen);
658 			uma_zfree(namei_zone, cnp->cn_pnbuf);
659 			cnp->cn_pnbuf = cp;
660 		} else
661 			cnp->cn_pnbuf[linklen] = '\0';
662 		ndp->ni_pathlen += linklen;
663 
664 		/*
665 		 * Cleanup refs for next loop and check if root directory
666 		 * should replace current directory.  Normally ni_dvp
667 		 * becomes the new base directory and is cleaned up when
668 		 * we loop.  Explicitly null pointers after invalidation
669 		 * to clarify operation.
670 		 */
671 		vput(ndp->ni_vp);
672 		ndp->ni_vp = NULL;
673 
674 		if (cnp->cn_pnbuf[0] == '/') {
675 			vrele(ndp->ni_dvp);
676 			ndp->ni_dvp = ndp->ni_rootdir;
677 			VREF(ndp->ni_dvp);
678 		}
679 		ndp->ni_startdir = ndp->ni_dvp;
680 		ndp->ni_dvp = NULL;
681 	}
	/* Restore the caller's LOCKLEAF setting. */
682 	if (!lockleaf)
683 		cnp->cn_flags &= ~LOCKLEAF;
684 
685 out:
686 	if (error) {
687 		nfsvno_relpathbuf(ndp);
688 		ndp->ni_vp = NULL;
689 		ndp->ni_dvp = NULL;
690 		ndp->ni_startdir = NULL;
691 	} else if ((ndp->ni_cnd.cn_flags & (WANTPARENT|LOCKPARENT)) == 0) {
692 		ndp->ni_dvp = NULL;
693 	}
694 
695 out1:
696 	NFSEXITCODE2(error, nd);
697 	return (error);
698 }
699 
700 /*
701  * Set up a pathname buffer and return a pointer to it and, optionally
702  * set a hash pointer.
703  */
704 void
705 nfsvno_setpathbuf(struct nameidata *ndp, char **bufpp, u_long **hashpp)
706 {
707 	struct componentname *cnp = &ndp->ni_cnd;
708 
709 	cnp->cn_flags |= (NOMACCHECK | HASBUF);
710 	cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
711 	if (hashpp != NULL)
712 		*hashpp = NULL;
713 	*bufpp = cnp->cn_pnbuf;
714 }
715 
716 /*
717  * Release the above path buffer, if not released by nfsvno_namei().
718  */
719 void
720 nfsvno_relpathbuf(struct nameidata *ndp)
721 {
722 
723 	if ((ndp->ni_cnd.cn_flags & HASBUF) == 0)
724 		panic("nfsrelpath");
725 	uma_zfree(namei_zone, ndp->ni_cnd.cn_pnbuf);
726 	ndp->ni_cnd.cn_flags &= ~HASBUF;
727 }
728 
729 /*
730  * Readlink vnode op into an mbuf list.
731  */
732 int
733 nfsvno_readlink(struct vnode *vp, struct ucred *cred, struct thread *p,
734     struct mbuf **mpp, struct mbuf **mpendp, int *lenp)
735 {
736 	struct iovec *iv;
737 	struct uio io, *uiop = &io;
738 	struct mbuf *mp, *mp3;
739 	int len, tlen, error = 0;
740 
741 	len = NFS_MAXPATHLEN;
742 	uiop->uio_iovcnt = nfsrv_createiovec(len, &mp3, &mp, &iv);
743 	uiop->uio_iov = iv;
744 	uiop->uio_offset = 0;
745 	uiop->uio_resid = len;
746 	uiop->uio_rw = UIO_READ;
747 	uiop->uio_segflg = UIO_SYSSPACE;
748 	uiop->uio_td = NULL;
749 	error = VOP_READLINK(vp, uiop, cred);
750 	free(iv, M_TEMP);
751 	if (error) {
752 		m_freem(mp3);
753 		*lenp = 0;
754 		goto out;
755 	}
756 	if (uiop->uio_resid > 0) {
757 		len -= uiop->uio_resid;
758 		tlen = NFSM_RNDUP(len);
759 		nfsrv_adj(mp3, NFS_MAXPATHLEN - tlen, tlen - len);
760 	}
761 	*lenp = len;
762 	*mpp = mp3;
763 	*mpendp = mp;
764 
765 out:
766 	NFSEXITCODE(error);
767 	return (error);
768 }
769 
770 /*
771  * Create an mbuf chain and an associated iovec that can be used to Read
772  * or Getextattr of data.
773  * Upon success, return pointers to the first and last mbufs in the chain
774  * plus the malloc'd iovec and its iovlen.
775  */
776 static int
777 nfsrv_createiovec(int len, struct mbuf **mpp, struct mbuf **mpendp,
778     struct iovec **ivp)
779 {
780 	struct mbuf *m, *m2 = NULL, *m3;
781 	struct iovec *iv;
782 	int i, left, siz;
783 
784 	left = len;
785 	m3 = NULL;
786 	/*
787 	 * Generate the mbuf list with the uio_iov ref. to it.
788 	 */
789 	i = 0;
790 	while (left > 0) {
791 		NFSMGET(m);
792 		MCLGET(m, M_WAITOK);
793 		m->m_len = 0;
794 		siz = min(M_TRAILINGSPACE(m), left);
795 		left -= siz;
796 		i++;
797 		if (m3)
798 			m2->m_next = m;
799 		else
800 			m3 = m;
801 		m2 = m;
802 	}
803 	*ivp = iv = malloc(i * sizeof (struct iovec), M_TEMP, M_WAITOK);
804 	m = m3;
805 	left = len;
806 	i = 0;
807 	while (left > 0) {
808 		if (m == NULL)
809 			panic("nfsvno_read iov");
810 		siz = min(M_TRAILINGSPACE(m), left);
811 		if (siz > 0) {
812 			iv->iov_base = mtod(m, caddr_t) + m->m_len;
813 			iv->iov_len = siz;
814 			m->m_len += siz;
815 			left -= siz;
816 			iv++;
817 			i++;
818 		}
819 		m = m->m_next;
820 	}
821 	*mpp = m3;
822 	*mpendp = m2;
823 	return (i);
824 }
825 
826 /*
827  * Read vnode op call into mbuf list.
828  */
829 int
830 nfsvno_read(struct vnode *vp, off_t off, int cnt, struct ucred *cred,
831     struct thread *p, struct mbuf **mpp, struct mbuf **mpendp)
832 {
833 	struct mbuf *m;
834 	struct iovec *iv;
835 	int error = 0, len, tlen, ioflag = 0;
836 	struct mbuf *m3;
837 	struct uio io, *uiop = &io;
838 	struct nfsheur *nh;
839 
840 	/*
841 	 * Attempt to read from a DS file. A return of ENOENT implies
842 	 * there is no DS file to read.
843 	 */
844 	error = nfsrv_proxyds(vp, off, cnt, cred, p, NFSPROC_READDS, mpp,
845 	    NULL, mpendp, NULL, NULL, NULL, 0, NULL);
846 	if (error != ENOENT)
847 		return (error);
848 
849 	len = NFSM_RNDUP(cnt);
850 	uiop->uio_iovcnt = nfsrv_createiovec(len, &m3, &m, &iv);
851 	uiop->uio_iov = iv;
852 	uiop->uio_offset = off;
853 	uiop->uio_resid = len;
854 	uiop->uio_rw = UIO_READ;
855 	uiop->uio_segflg = UIO_SYSSPACE;
856 	uiop->uio_td = NULL;
857 	nh = nfsrv_sequential_heuristic(uiop, vp);
858 	ioflag |= nh->nh_seqcount << IO_SEQSHIFT;
859 	/* XXX KDM make this more systematic? */
860 	nfsstatsv1.srvbytes[NFSV4OP_READ] += uiop->uio_resid;
861 	error = VOP_READ(vp, uiop, IO_NODELOCKED | ioflag, cred);
862 	free(iv, M_TEMP);
863 	if (error) {
864 		m_freem(m3);
865 		*mpp = NULL;
866 		goto out;
867 	}
868 	nh->nh_nextoff = uiop->uio_offset;
869 	tlen = len - uiop->uio_resid;
870 	cnt = cnt < tlen ? cnt : tlen;
871 	tlen = NFSM_RNDUP(cnt);
872 	if (tlen == 0) {
873 		m_freem(m3);
874 		m3 = NULL;
875 	} else if (len != tlen || tlen != cnt)
876 		nfsrv_adj(m3, len - tlen, tlen - cnt);
877 	*mpp = m3;
878 	*mpendp = m;
879 
880 out:
881 	NFSEXITCODE(error);
882 	return (error);
883 }
884 
885 /*
886  * Create the iovec for the mbuf chain passed in as an argument.
887  * The "cp" argument is where the data starts within the first mbuf in
888  * the chain. It returns the iovec and the iovcnt.
889  */
890 static int
891 nfsrv_createiovecw(int retlen, struct mbuf *m, char *cp, struct iovec **ivpp,
892     int *iovcntp)
893 {
894 	struct mbuf *mp;
895 	struct iovec *ivp;
896 	int cnt, i, len;
897 
898 	/*
899 	 * Loop through the mbuf chain, counting how many mbufs are a
900 	 * part of this write operation, so the iovec size is known.
901 	 */
902 	cnt = 0;
903 	len = retlen;
904 	mp = m;
905 	i = mtod(mp, caddr_t) + mbuf_len(mp) - cp;
906 	while (len > 0) {
907 		if (i > 0) {
908 			len -= i;
909 			cnt++;
910 		}
911 		mp = mbuf_next(mp);
912 		if (!mp) {
913 			if (len > 0)
914 				return (EBADRPC);
915 		} else
916 			i = mbuf_len(mp);
917 	}
918 
919 	/* Now, create the iovec. */
920 	mp = m;
921 	*ivpp = ivp = malloc(cnt * sizeof (struct iovec), M_TEMP,
922 	    M_WAITOK);
923 	*iovcntp = cnt;
924 	i = mtod(mp, caddr_t) + mp->m_len - cp;
925 	len = retlen;
926 	while (len > 0) {
927 		if (mp == NULL)
928 			panic("nfsvno_write");
929 		if (i > 0) {
930 			i = min(i, len);
931 			ivp->iov_base = cp;
932 			ivp->iov_len = i;
933 			ivp++;
934 			len -= i;
935 		}
936 		mp = mp->m_next;
937 		if (mp) {
938 			i = mp->m_len;
939 			cp = mtod(mp, caddr_t);
940 		}
941 	}
942 	return (0);
943 }
944 
945 /*
946  * Write vnode op from an mbuf list.
947  */
948 int
949 nfsvno_write(struct vnode *vp, off_t off, int retlen, int *stable,
950     struct mbuf *mp, char *cp, struct ucred *cred, struct thread *p)
951 {
952 	struct iovec *iv;
953 	int cnt, ioflags, error;
954 	struct uio io, *uiop = &io;
955 	struct nfsheur *nh;
956 
957 	/*
958 	 * Attempt to write to a DS file. A return of ENOENT implies
959 	 * there is no DS file to write.
960 	 */
961 	error = nfsrv_proxyds(vp, off, retlen, cred, p, NFSPROC_WRITEDS,
962 	    &mp, cp, NULL, NULL, NULL, NULL, 0, NULL);
963 	if (error != ENOENT) {
964 		*stable = NFSWRITE_FILESYNC;
965 		return (error);
966 	}
967 
968 
969 	if (*stable == NFSWRITE_UNSTABLE)
970 		ioflags = IO_NODELOCKED;
971 	else
972 		ioflags = (IO_SYNC | IO_NODELOCKED);
973 	error = nfsrv_createiovecw(retlen, mp, cp, &iv, &cnt);
974 	if (error != 0)
975 		return (error);
976 	uiop->uio_iov = iv;
977 	uiop->uio_iovcnt = cnt;
978 	uiop->uio_resid = retlen;
979 	uiop->uio_rw = UIO_WRITE;
980 	uiop->uio_segflg = UIO_SYSSPACE;
981 	NFSUIOPROC(uiop, p);
982 	uiop->uio_offset = off;
983 	nh = nfsrv_sequential_heuristic(uiop, vp);
984 	ioflags |= nh->nh_seqcount << IO_SEQSHIFT;
985 	/* XXX KDM make this more systematic? */
986 	nfsstatsv1.srvbytes[NFSV4OP_WRITE] += uiop->uio_resid;
987 	error = VOP_WRITE(vp, uiop, ioflags, cred);
988 	if (error == 0)
989 		nh->nh_nextoff = uiop->uio_offset;
990 	free(iv, M_TEMP);
991 
992 	NFSEXITCODE(error);
993 	return (error);
994 }
995 
996 /*
997  * Common code for creating a regular file (plus special files for V2).
998  */
999 int
1000 nfsvno_createsub(struct nfsrv_descript *nd, struct nameidata *ndp,
1001     struct vnode **vpp, struct nfsvattr *nvap, int *exclusive_flagp,
1002     int32_t *cverf, NFSDEV_T rdev, struct nfsexstuff *exp)
1003 {
1004 	u_quad_t tempsize;
1005 	int error;
1006 	struct thread *p = curthread;
1007 
1008 	error = nd->nd_repstat;
1009 	if (!error && ndp->ni_vp == NULL) {
1010 		if (nvap->na_type == VREG || nvap->na_type == VSOCK) {
1011 			vrele(ndp->ni_startdir);
1012 			error = VOP_CREATE(ndp->ni_dvp,
1013 			    &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr);
1014 			/* For a pNFS server, create the data file on a DS. */
1015 			if (error == 0 && nvap->na_type == VREG) {
1016 				/*
1017 				 * Create a data file on a DS for a pNFS server.
1018 				 * This function just returns if not
1019 				 * running a pNFS DS or the creation fails.
1020 				 */
1021 				nfsrv_pnfscreate(ndp->ni_vp, &nvap->na_vattr,
1022 				    nd->nd_cred, p);
1023 			}
1024 			vput(ndp->ni_dvp);
1025 			nfsvno_relpathbuf(ndp);
1026 			if (!error) {
1027 				if (*exclusive_flagp) {
1028 					*exclusive_flagp = 0;
1029 					NFSVNO_ATTRINIT(nvap);
1030 					nvap->na_atime.tv_sec = cverf[0];
1031 					nvap->na_atime.tv_nsec = cverf[1];
1032 					error = VOP_SETATTR(ndp->ni_vp,
1033 					    &nvap->na_vattr, nd->nd_cred);
1034 					if (error != 0) {
1035 						vput(ndp->ni_vp);
1036 						ndp->ni_vp = NULL;
1037 						error = NFSERR_NOTSUPP;
1038 					}
1039 				}
1040 			}
1041 		/*
1042 		 * NFS V2 Only. nfsrvd_mknod() does this for V3.
1043 		 * (This implies, just get out on an error.)
1044 		 */
1045 		} else if (nvap->na_type == VCHR || nvap->na_type == VBLK ||
1046 			nvap->na_type == VFIFO) {
1047 			if (nvap->na_type == VCHR && rdev == 0xffffffff)
1048 				nvap->na_type = VFIFO;
1049                         if (nvap->na_type != VFIFO &&
1050 			    (error = priv_check_cred(nd->nd_cred, PRIV_VFS_MKNOD_DEV))) {
1051 				vrele(ndp->ni_startdir);
1052 				nfsvno_relpathbuf(ndp);
1053 				vput(ndp->ni_dvp);
1054 				goto out;
1055 			}
1056 			nvap->na_rdev = rdev;
1057 			error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp,
1058 			    &ndp->ni_cnd, &nvap->na_vattr);
1059 			vput(ndp->ni_dvp);
1060 			nfsvno_relpathbuf(ndp);
1061 			vrele(ndp->ni_startdir);
1062 			if (error)
1063 				goto out;
1064 		} else {
1065 			vrele(ndp->ni_startdir);
1066 			nfsvno_relpathbuf(ndp);
1067 			vput(ndp->ni_dvp);
1068 			error = ENXIO;
1069 			goto out;
1070 		}
1071 		*vpp = ndp->ni_vp;
1072 	} else {
1073 		/*
1074 		 * Handle cases where error is already set and/or
1075 		 * the file exists.
1076 		 * 1 - clean up the lookup
1077 		 * 2 - iff !error and na_size set, truncate it
1078 		 */
1079 		vrele(ndp->ni_startdir);
1080 		nfsvno_relpathbuf(ndp);
1081 		*vpp = ndp->ni_vp;
1082 		if (ndp->ni_dvp == *vpp)
1083 			vrele(ndp->ni_dvp);
1084 		else
1085 			vput(ndp->ni_dvp);
1086 		if (!error && nvap->na_size != VNOVAL) {
1087 			error = nfsvno_accchk(*vpp, VWRITE,
1088 			    nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
1089 			    NFSACCCHK_VPISLOCKED, NULL);
1090 			if (!error) {
1091 				tempsize = nvap->na_size;
1092 				NFSVNO_ATTRINIT(nvap);
1093 				nvap->na_size = tempsize;
1094 				error = VOP_SETATTR(*vpp,
1095 				    &nvap->na_vattr, nd->nd_cred);
1096 			}
1097 		}
1098 		if (error)
1099 			vput(*vpp);
1100 	}
1101 
1102 out:
1103 	NFSEXITCODE(error);
1104 	return (error);
1105 }
1106 
1107 /*
1108  * Do a mknod vnode op.
1109  */
1110 int
1111 nfsvno_mknod(struct nameidata *ndp, struct nfsvattr *nvap, struct ucred *cred,
1112     struct thread *p)
1113 {
1114 	int error = 0;
1115 	enum vtype vtyp;
1116 
1117 	vtyp = nvap->na_type;
1118 	/*
1119 	 * Iff doesn't exist, create it.
1120 	 */
1121 	if (ndp->ni_vp) {
1122 		vrele(ndp->ni_startdir);
1123 		nfsvno_relpathbuf(ndp);
1124 		vput(ndp->ni_dvp);
1125 		vrele(ndp->ni_vp);
1126 		error = EEXIST;
1127 		goto out;
1128 	}
1129 	if (vtyp != VCHR && vtyp != VBLK && vtyp != VSOCK && vtyp != VFIFO) {
1130 		vrele(ndp->ni_startdir);
1131 		nfsvno_relpathbuf(ndp);
1132 		vput(ndp->ni_dvp);
1133 		error = NFSERR_BADTYPE;
1134 		goto out;
1135 	}
1136 	if (vtyp == VSOCK) {
1137 		vrele(ndp->ni_startdir);
1138 		error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp,
1139 		    &ndp->ni_cnd, &nvap->na_vattr);
1140 		vput(ndp->ni_dvp);
1141 		nfsvno_relpathbuf(ndp);
1142 	} else {
1143 		if (nvap->na_type != VFIFO &&
1144 		    (error = priv_check_cred(cred, PRIV_VFS_MKNOD_DEV))) {
1145 			vrele(ndp->ni_startdir);
1146 			nfsvno_relpathbuf(ndp);
1147 			vput(ndp->ni_dvp);
1148 			goto out;
1149 		}
1150 		error = VOP_MKNOD(ndp->ni_dvp, &ndp->ni_vp,
1151 		    &ndp->ni_cnd, &nvap->na_vattr);
1152 		vput(ndp->ni_dvp);
1153 		nfsvno_relpathbuf(ndp);
1154 		vrele(ndp->ni_startdir);
1155 		/*
1156 		 * Since VOP_MKNOD returns the ni_vp, I can't
1157 		 * see any reason to do the lookup.
1158 		 */
1159 	}
1160 
1161 out:
1162 	NFSEXITCODE(error);
1163 	return (error);
1164 }
1165 
1166 /*
1167  * Mkdir vnode op.
1168  */
1169 int
1170 nfsvno_mkdir(struct nameidata *ndp, struct nfsvattr *nvap, uid_t saved_uid,
1171     struct ucred *cred, struct thread *p, struct nfsexstuff *exp)
1172 {
1173 	int error = 0;
1174 
1175 	if (ndp->ni_vp != NULL) {
1176 		if (ndp->ni_dvp == ndp->ni_vp)
1177 			vrele(ndp->ni_dvp);
1178 		else
1179 			vput(ndp->ni_dvp);
1180 		vrele(ndp->ni_vp);
1181 		nfsvno_relpathbuf(ndp);
1182 		error = EEXIST;
1183 		goto out;
1184 	}
1185 	error = VOP_MKDIR(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd,
1186 	    &nvap->na_vattr);
1187 	vput(ndp->ni_dvp);
1188 	nfsvno_relpathbuf(ndp);
1189 
1190 out:
1191 	NFSEXITCODE(error);
1192 	return (error);
1193 }
1194 
1195 /*
1196  * symlink vnode op.
1197  */
1198 int
1199 nfsvno_symlink(struct nameidata *ndp, struct nfsvattr *nvap, char *pathcp,
1200     int pathlen, int not_v2, uid_t saved_uid, struct ucred *cred, struct thread *p,
1201     struct nfsexstuff *exp)
1202 {
1203 	int error = 0;
1204 
1205 	if (ndp->ni_vp) {
1206 		vrele(ndp->ni_startdir);
1207 		nfsvno_relpathbuf(ndp);
1208 		if (ndp->ni_dvp == ndp->ni_vp)
1209 			vrele(ndp->ni_dvp);
1210 		else
1211 			vput(ndp->ni_dvp);
1212 		vrele(ndp->ni_vp);
1213 		error = EEXIST;
1214 		goto out;
1215 	}
1216 
1217 	error = VOP_SYMLINK(ndp->ni_dvp, &ndp->ni_vp, &ndp->ni_cnd,
1218 	    &nvap->na_vattr, pathcp);
1219 	vput(ndp->ni_dvp);
1220 	vrele(ndp->ni_startdir);
1221 	nfsvno_relpathbuf(ndp);
1222 	/*
1223 	 * Although FreeBSD still had the lookup code in
1224 	 * it for 7/current, there doesn't seem to be any
1225 	 * point, since VOP_SYMLINK() returns the ni_vp.
1226 	 * Just vput it for v2.
1227 	 */
1228 	if (!not_v2 && !error)
1229 		vput(ndp->ni_vp);
1230 
1231 out:
1232 	NFSEXITCODE(error);
1233 	return (error);
1234 }
1235 
1236 /*
1237  * Parse symbolic link arguments.
1238  * This function has an ugly side effect. It will malloc() an area for
1239  * the symlink and set iov_base to point to it, only if it succeeds.
1240  * So, if it returns with uiop->uio_iov->iov_base != NULL, that must
1241  * be FREE'd later.
1242  */
1243 int
1244 nfsvno_getsymlink(struct nfsrv_descript *nd, struct nfsvattr *nvap,
1245     struct thread *p, char **pathcpp, int *lenp)
1246 {
1247 	u_int32_t *tl;
1248 	char *pathcp = NULL;
1249 	int error = 0, len;
1250 	struct nfsv2_sattr *sp;
1251 
1252 	*pathcpp = NULL;
1253 	*lenp = 0;
1254 	if ((nd->nd_flag & ND_NFSV3) &&
1255 	    (error = nfsrv_sattr(nd, NULL, nvap, NULL, NULL, p)))
1256 		goto nfsmout;
1257 	NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
1258 	len = fxdr_unsigned(int, *tl);
1259 	if (len > NFS_MAXPATHLEN || len <= 0) {
1260 		error = EBADRPC;
1261 		goto nfsmout;
1262 	}
1263 	pathcp = malloc(len + 1, M_TEMP, M_WAITOK);
1264 	error = nfsrv_mtostr(nd, pathcp, len);
1265 	if (error)
1266 		goto nfsmout;
1267 	if (nd->nd_flag & ND_NFSV2) {
1268 		NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
1269 		nvap->na_mode = fxdr_unsigned(u_int16_t, sp->sa_mode);
1270 	}
1271 	*pathcpp = pathcp;
1272 	*lenp = len;
1273 	NFSEXITCODE2(0, nd);
1274 	return (0);
1275 nfsmout:
1276 	if (pathcp)
1277 		free(pathcp, M_TEMP);
1278 	NFSEXITCODE2(error, nd);
1279 	return (error);
1280 }
1281 
1282 /*
1283  * Remove a non-directory object.
1284  */
1285 int
1286 nfsvno_removesub(struct nameidata *ndp, int is_v4, struct ucred *cred,
1287     struct thread *p, struct nfsexstuff *exp)
1288 {
1289 	struct vnode *vp, *dsdvp[NFSDEV_MAXMIRRORS];
1290 	int error = 0, mirrorcnt;
1291 	char fname[PNFS_FILENAME_LEN + 1];
1292 	fhandle_t fh;
1293 
1294 	vp = ndp->ni_vp;
1295 	dsdvp[0] = NULL;
1296 	if (vp->v_type == VDIR)
1297 		error = NFSERR_ISDIR;
1298 	else if (is_v4)
1299 		error = nfsrv_checkremove(vp, 1, NULL, (nfsquad_t)((u_quad_t)0),
1300 		    p);
1301 	if (error == 0)
1302 		nfsrv_pnfsremovesetup(vp, p, dsdvp, &mirrorcnt, fname, &fh);
1303 	if (!error)
1304 		error = VOP_REMOVE(ndp->ni_dvp, vp, &ndp->ni_cnd);
1305 	if (error == 0 && dsdvp[0] != NULL)
1306 		nfsrv_pnfsremove(dsdvp, mirrorcnt, fname, &fh, p);
1307 	if (ndp->ni_dvp == vp)
1308 		vrele(ndp->ni_dvp);
1309 	else
1310 		vput(ndp->ni_dvp);
1311 	vput(vp);
1312 	if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0)
1313 		nfsvno_relpathbuf(ndp);
1314 	NFSEXITCODE(error);
1315 	return (error);
1316 }
1317 
1318 /*
1319  * Remove a directory.
1320  */
1321 int
1322 nfsvno_rmdirsub(struct nameidata *ndp, int is_v4, struct ucred *cred,
1323     struct thread *p, struct nfsexstuff *exp)
1324 {
1325 	struct vnode *vp;
1326 	int error = 0;
1327 
1328 	vp = ndp->ni_vp;
1329 	if (vp->v_type != VDIR) {
1330 		error = ENOTDIR;
1331 		goto out;
1332 	}
1333 	/*
1334 	 * No rmdir "." please.
1335 	 */
1336 	if (ndp->ni_dvp == vp) {
1337 		error = EINVAL;
1338 		goto out;
1339 	}
1340 	/*
1341 	 * The root of a mounted filesystem cannot be deleted.
1342 	 */
1343 	if (vp->v_vflag & VV_ROOT)
1344 		error = EBUSY;
1345 out:
1346 	if (!error)
1347 		error = VOP_RMDIR(ndp->ni_dvp, vp, &ndp->ni_cnd);
1348 	if (ndp->ni_dvp == vp)
1349 		vrele(ndp->ni_dvp);
1350 	else
1351 		vput(ndp->ni_dvp);
1352 	vput(vp);
1353 	if ((ndp->ni_cnd.cn_flags & SAVENAME) != 0)
1354 		nfsvno_relpathbuf(ndp);
1355 	NFSEXITCODE(error);
1356 	return (error);
1357 }
1358 
1359 /*
1360  * Rename vnode op.
1361  */
1362 int
1363 nfsvno_rename(struct nameidata *fromndp, struct nameidata *tondp,
1364     u_int32_t ndstat, u_int32_t ndflag, struct ucred *cred, struct thread *p)
1365 {
1366 	struct vnode *fvp, *tvp, *tdvp, *dsdvp[NFSDEV_MAXMIRRORS];
1367 	int error = 0, mirrorcnt;
1368 	char fname[PNFS_FILENAME_LEN + 1];
1369 	fhandle_t fh;
1370 
1371 	dsdvp[0] = NULL;
1372 	fvp = fromndp->ni_vp;
1373 	if (ndstat) {
1374 		vrele(fromndp->ni_dvp);
1375 		vrele(fvp);
1376 		error = ndstat;
1377 		goto out1;
1378 	}
1379 	tdvp = tondp->ni_dvp;
1380 	tvp = tondp->ni_vp;
1381 	if (tvp != NULL) {
1382 		if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
1383 			error = (ndflag & ND_NFSV2) ? EISDIR : EEXIST;
1384 			goto out;
1385 		} else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
1386 			error = (ndflag & ND_NFSV2) ? ENOTDIR : EEXIST;
1387 			goto out;
1388 		}
1389 		if (tvp->v_type == VDIR && tvp->v_mountedhere) {
1390 			error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
1391 			goto out;
1392 		}
1393 
1394 		/*
1395 		 * A rename to '.' or '..' results in a prematurely
1396 		 * unlocked vnode on FreeBSD5, so I'm just going to fail that
1397 		 * here.
1398 		 */
1399 		if ((tondp->ni_cnd.cn_namelen == 1 &&
1400 		     tondp->ni_cnd.cn_nameptr[0] == '.') ||
1401 		    (tondp->ni_cnd.cn_namelen == 2 &&
1402 		     tondp->ni_cnd.cn_nameptr[0] == '.' &&
1403 		     tondp->ni_cnd.cn_nameptr[1] == '.')) {
1404 			error = EINVAL;
1405 			goto out;
1406 		}
1407 	}
1408 	if (fvp->v_type == VDIR && fvp->v_mountedhere) {
1409 		error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
1410 		goto out;
1411 	}
1412 	if (fvp->v_mount != tdvp->v_mount) {
1413 		error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EXDEV;
1414 		goto out;
1415 	}
1416 	if (fvp == tdvp) {
1417 		error = (ndflag & ND_NFSV2) ? ENOTEMPTY : EINVAL;
1418 		goto out;
1419 	}
1420 	if (fvp == tvp) {
1421 		/*
1422 		 * If source and destination are the same, there is nothing to
1423 		 * do. Set error to -1 to indicate this.
1424 		 */
1425 		error = -1;
1426 		goto out;
1427 	}
1428 	if (ndflag & ND_NFSV4) {
1429 		if (NFSVOPLOCK(fvp, LK_EXCLUSIVE) == 0) {
1430 			error = nfsrv_checkremove(fvp, 0, NULL,
1431 			    (nfsquad_t)((u_quad_t)0), p);
1432 			NFSVOPUNLOCK(fvp);
1433 		} else
1434 			error = EPERM;
1435 		if (tvp && !error)
1436 			error = nfsrv_checkremove(tvp, 1, NULL,
1437 			    (nfsquad_t)((u_quad_t)0), p);
1438 	} else {
1439 		/*
1440 		 * For NFSv2 and NFSv3, try to get rid of the delegation, so
1441 		 * that the NFSv4 client won't be confused by the rename.
1442 		 * Since nfsd_recalldelegation() can only be called on an
1443 		 * unlocked vnode at this point and fvp is the file that will
1444 		 * still exist after the rename, just do fvp.
1445 		 */
1446 		nfsd_recalldelegation(fvp, p);
1447 	}
1448 	if (error == 0 && tvp != NULL) {
1449 		nfsrv_pnfsremovesetup(tvp, p, dsdvp, &mirrorcnt, fname, &fh);
1450 		NFSD_DEBUG(4, "nfsvno_rename: pnfsremovesetup"
1451 		    " dsdvp=%p\n", dsdvp[0]);
1452 	}
1453 out:
1454 	if (!error) {
1455 		error = VOP_RENAME(fromndp->ni_dvp, fromndp->ni_vp,
1456 		    &fromndp->ni_cnd, tondp->ni_dvp, tondp->ni_vp,
1457 		    &tondp->ni_cnd);
1458 	} else {
1459 		if (tdvp == tvp)
1460 			vrele(tdvp);
1461 		else
1462 			vput(tdvp);
1463 		if (tvp)
1464 			vput(tvp);
1465 		vrele(fromndp->ni_dvp);
1466 		vrele(fvp);
1467 		if (error == -1)
1468 			error = 0;
1469 	}
1470 
1471 	/*
1472 	 * If dsdvp[0] != NULL, it was set up by nfsrv_pnfsremovesetup() and
1473 	 * if the rename succeeded, the DS file for the tvp needs to be
1474 	 * removed.
1475 	 */
1476 	if (error == 0 && dsdvp[0] != NULL) {
1477 		nfsrv_pnfsremove(dsdvp, mirrorcnt, fname, &fh, p);
1478 		NFSD_DEBUG(4, "nfsvno_rename: pnfsremove\n");
1479 	}
1480 
1481 	vrele(tondp->ni_startdir);
1482 	nfsvno_relpathbuf(tondp);
1483 out1:
1484 	vrele(fromndp->ni_startdir);
1485 	nfsvno_relpathbuf(fromndp);
1486 	NFSEXITCODE(error);
1487 	return (error);
1488 }
1489 
1490 /*
1491  * Link vnode op.
1492  */
1493 int
1494 nfsvno_link(struct nameidata *ndp, struct vnode *vp, struct ucred *cred,
1495     struct thread *p, struct nfsexstuff *exp)
1496 {
1497 	struct vnode *xp;
1498 	int error = 0;
1499 
1500 	xp = ndp->ni_vp;
1501 	if (xp != NULL) {
1502 		error = EEXIST;
1503 	} else {
1504 		xp = ndp->ni_dvp;
1505 		if (vp->v_mount != xp->v_mount)
1506 			error = EXDEV;
1507 	}
1508 	if (!error) {
1509 		NFSVOPLOCK(vp, LK_EXCLUSIVE | LK_RETRY);
1510 		if (!VN_IS_DOOMED(vp))
1511 			error = VOP_LINK(ndp->ni_dvp, vp, &ndp->ni_cnd);
1512 		else
1513 			error = EPERM;
1514 		if (ndp->ni_dvp == vp)
1515 			vrele(ndp->ni_dvp);
1516 		else
1517 			vput(ndp->ni_dvp);
1518 		NFSVOPUNLOCK(vp);
1519 	} else {
1520 		if (ndp->ni_dvp == ndp->ni_vp)
1521 			vrele(ndp->ni_dvp);
1522 		else
1523 			vput(ndp->ni_dvp);
1524 		if (ndp->ni_vp)
1525 			vrele(ndp->ni_vp);
1526 	}
1527 	nfsvno_relpathbuf(ndp);
1528 	NFSEXITCODE(error);
1529 	return (error);
1530 }
1531 
1532 /*
1533  * Do the fsync() appropriate for the commit.
1534  */
1535 int
1536 nfsvno_fsync(struct vnode *vp, u_int64_t off, int cnt, struct ucred *cred,
1537     struct thread *td)
1538 {
1539 	int error = 0;
1540 
1541 	/*
1542 	 * RFC 1813 3.3.21: if count is 0, a flush from offset to the end of
1543 	 * file is done.  At this time VOP_FSYNC does not accept offset and
1544 	 * byte count parameters so call VOP_FSYNC the whole file for now.
1545 	 * The same is true for NFSv4: RFC 3530 Sec. 14.2.3.
1546 	 * File systems that do not use the buffer cache (as indicated
1547 	 * by MNTK_USES_BCACHE not being set) must use VOP_FSYNC().
1548 	 */
1549 	if (cnt == 0 || cnt > MAX_COMMIT_COUNT ||
1550 	    (vp->v_mount->mnt_kern_flag & MNTK_USES_BCACHE) == 0) {
1551 		/*
1552 		 * Give up and do the whole thing
1553 		 */
1554 		if (vp->v_object && vm_object_mightbedirty(vp->v_object)) {
1555 			VM_OBJECT_WLOCK(vp->v_object);
1556 			vm_object_page_clean(vp->v_object, 0, 0, OBJPC_SYNC);
1557 			VM_OBJECT_WUNLOCK(vp->v_object);
1558 		}
1559 		error = VOP_FSYNC(vp, MNT_WAIT, td);
1560 	} else {
1561 		/*
1562 		 * Locate and synchronously write any buffers that fall
1563 		 * into the requested range.  Note:  we are assuming that
1564 		 * f_iosize is a power of 2.
1565 		 */
1566 		int iosize = vp->v_mount->mnt_stat.f_iosize;
1567 		int iomask = iosize - 1;
1568 		struct bufobj *bo;
1569 		daddr_t lblkno;
1570 
1571 		/*
1572 		 * Align to iosize boundary, super-align to page boundary.
1573 		 */
1574 		if (off & iomask) {
1575 			cnt += off & iomask;
1576 			off &= ~(u_quad_t)iomask;
1577 		}
1578 		if (off & PAGE_MASK) {
1579 			cnt += off & PAGE_MASK;
1580 			off &= ~(u_quad_t)PAGE_MASK;
1581 		}
1582 		lblkno = off / iosize;
1583 
1584 		if (vp->v_object && vm_object_mightbedirty(vp->v_object)) {
1585 			VM_OBJECT_WLOCK(vp->v_object);
1586 			vm_object_page_clean(vp->v_object, off, off + cnt,
1587 			    OBJPC_SYNC);
1588 			VM_OBJECT_WUNLOCK(vp->v_object);
1589 		}
1590 
1591 		bo = &vp->v_bufobj;
1592 		BO_LOCK(bo);
1593 		while (cnt > 0) {
1594 			struct buf *bp;
1595 
1596 			/*
1597 			 * If we have a buffer and it is marked B_DELWRI we
1598 			 * have to lock and write it.  Otherwise the prior
1599 			 * write is assumed to have already been committed.
1600 			 *
1601 			 * gbincore() can return invalid buffers now so we
1602 			 * have to check that bit as well (though B_DELWRI
1603 			 * should not be set if B_INVAL is set there could be
1604 			 * a race here since we haven't locked the buffer).
1605 			 */
1606 			if ((bp = gbincore(&vp->v_bufobj, lblkno)) != NULL) {
1607 				if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL |
1608 				    LK_INTERLOCK, BO_LOCKPTR(bo)) == ENOLCK) {
1609 					BO_LOCK(bo);
1610 					continue; /* retry */
1611 				}
1612 			    	if ((bp->b_flags & (B_DELWRI|B_INVAL)) ==
1613 				    B_DELWRI) {
1614 					bremfree(bp);
1615 					bp->b_flags &= ~B_ASYNC;
1616 					bwrite(bp);
1617 					++nfs_commit_miss;
1618 				} else
1619 					BUF_UNLOCK(bp);
1620 				BO_LOCK(bo);
1621 			}
1622 			++nfs_commit_blks;
1623 			if (cnt < iosize)
1624 				break;
1625 			cnt -= iosize;
1626 			++lblkno;
1627 		}
1628 		BO_UNLOCK(bo);
1629 	}
1630 	NFSEXITCODE(error);
1631 	return (error);
1632 }
1633 
1634 /*
1635  * Statfs vnode op.
1636  */
1637 int
1638 nfsvno_statfs(struct vnode *vp, struct statfs *sf)
1639 {
1640 	struct statfs *tsf;
1641 	int error;
1642 
1643 	tsf = NULL;
1644 	if (nfsrv_devidcnt > 0) {
1645 		/* For a pNFS service, get the DS numbers. */
1646 		tsf = malloc(sizeof(*tsf), M_TEMP, M_WAITOK | M_ZERO);
1647 		error = nfsrv_pnfsstatfs(tsf, vp->v_mount);
1648 		if (error != 0) {
1649 			free(tsf, M_TEMP);
1650 			tsf = NULL;
1651 		}
1652 	}
1653 	error = VFS_STATFS(vp->v_mount, sf);
1654 	if (error == 0) {
1655 		if (tsf != NULL) {
1656 			sf->f_blocks = tsf->f_blocks;
1657 			sf->f_bavail = tsf->f_bavail;
1658 			sf->f_bfree = tsf->f_bfree;
1659 			sf->f_bsize = tsf->f_bsize;
1660 		}
1661 		/*
1662 		 * Since NFS handles these values as unsigned on the
1663 		 * wire, there is no way to represent negative values,
1664 		 * so set them to 0. Without this, they will appear
1665 		 * to be very large positive values for clients like
1666 		 * Solaris10.
1667 		 */
1668 		if (sf->f_bavail < 0)
1669 			sf->f_bavail = 0;
1670 		if (sf->f_ffree < 0)
1671 			sf->f_ffree = 0;
1672 	}
1673 	free(tsf, M_TEMP);
1674 	NFSEXITCODE(error);
1675 	return (error);
1676 }
1677 
1678 /*
1679  * Do the vnode op stuff for Open. Similar to nfsvno_createsub(), but
1680  * must handle nfsrv_opencheck() calls after any other access checks.
1681  */
1682 void
1683 nfsvno_open(struct nfsrv_descript *nd, struct nameidata *ndp,
1684     nfsquad_t clientid, nfsv4stateid_t *stateidp, struct nfsstate *stp,
1685     int *exclusive_flagp, struct nfsvattr *nvap, int32_t *cverf, int create,
1686     NFSACL_T *aclp, nfsattrbit_t *attrbitp, struct ucred *cred,
1687     struct nfsexstuff *exp, struct vnode **vpp)
1688 {
1689 	struct vnode *vp = NULL;
1690 	u_quad_t tempsize;
1691 	struct nfsexstuff nes;
1692 	struct thread *p = curthread;
1693 
1694 	if (ndp->ni_vp == NULL)
1695 		nd->nd_repstat = nfsrv_opencheck(clientid,
1696 		    stateidp, stp, NULL, nd, p, nd->nd_repstat);
1697 	if (!nd->nd_repstat) {
1698 		if (ndp->ni_vp == NULL) {
1699 			vrele(ndp->ni_startdir);
1700 			nd->nd_repstat = VOP_CREATE(ndp->ni_dvp,
1701 			    &ndp->ni_vp, &ndp->ni_cnd, &nvap->na_vattr);
1702 			/* For a pNFS server, create the data file on a DS. */
1703 			if (nd->nd_repstat == 0) {
1704 				/*
1705 				 * Create a data file on a DS for a pNFS server.
1706 				 * This function just returns if not
1707 				 * running a pNFS DS or the creation fails.
1708 				 */
1709 				nfsrv_pnfscreate(ndp->ni_vp, &nvap->na_vattr,
1710 				    cred, p);
1711 			}
1712 			vput(ndp->ni_dvp);
1713 			nfsvno_relpathbuf(ndp);
1714 			if (!nd->nd_repstat) {
1715 				if (*exclusive_flagp) {
1716 					*exclusive_flagp = 0;
1717 					NFSVNO_ATTRINIT(nvap);
1718 					nvap->na_atime.tv_sec = cverf[0];
1719 					nvap->na_atime.tv_nsec = cverf[1];
1720 					nd->nd_repstat = VOP_SETATTR(ndp->ni_vp,
1721 					    &nvap->na_vattr, cred);
1722 					if (nd->nd_repstat != 0) {
1723 						vput(ndp->ni_vp);
1724 						ndp->ni_vp = NULL;
1725 						nd->nd_repstat = NFSERR_NOTSUPP;
1726 					} else
1727 						NFSSETBIT_ATTRBIT(attrbitp,
1728 						    NFSATTRBIT_TIMEACCESS);
1729 				} else {
1730 					nfsrv_fixattr(nd, ndp->ni_vp, nvap,
1731 					    aclp, p, attrbitp, exp);
1732 				}
1733 			}
1734 			vp = ndp->ni_vp;
1735 		} else {
1736 			if (ndp->ni_startdir)
1737 				vrele(ndp->ni_startdir);
1738 			nfsvno_relpathbuf(ndp);
1739 			vp = ndp->ni_vp;
1740 			if (create == NFSV4OPEN_CREATE) {
1741 				if (ndp->ni_dvp == vp)
1742 					vrele(ndp->ni_dvp);
1743 				else
1744 					vput(ndp->ni_dvp);
1745 			}
1746 			if (NFSVNO_ISSETSIZE(nvap) && vp->v_type == VREG) {
1747 				if (ndp->ni_cnd.cn_flags & RDONLY)
1748 					NFSVNO_SETEXRDONLY(&nes);
1749 				else
1750 					NFSVNO_EXINIT(&nes);
1751 				nd->nd_repstat = nfsvno_accchk(vp,
1752 				    VWRITE, cred, &nes, p,
1753 				    NFSACCCHK_NOOVERRIDE,
1754 				    NFSACCCHK_VPISLOCKED, NULL);
1755 				nd->nd_repstat = nfsrv_opencheck(clientid,
1756 				    stateidp, stp, vp, nd, p, nd->nd_repstat);
1757 				if (!nd->nd_repstat) {
1758 					tempsize = nvap->na_size;
1759 					NFSVNO_ATTRINIT(nvap);
1760 					nvap->na_size = tempsize;
1761 					nd->nd_repstat = VOP_SETATTR(vp,
1762 					    &nvap->na_vattr, cred);
1763 				}
1764 			} else if (vp->v_type == VREG) {
1765 				nd->nd_repstat = nfsrv_opencheck(clientid,
1766 				    stateidp, stp, vp, nd, p, nd->nd_repstat);
1767 			}
1768 		}
1769 	} else {
1770 		if (ndp->ni_cnd.cn_flags & HASBUF)
1771 			nfsvno_relpathbuf(ndp);
1772 		if (ndp->ni_startdir && create == NFSV4OPEN_CREATE) {
1773 			vrele(ndp->ni_startdir);
1774 			if (ndp->ni_dvp == ndp->ni_vp)
1775 				vrele(ndp->ni_dvp);
1776 			else
1777 				vput(ndp->ni_dvp);
1778 			if (ndp->ni_vp)
1779 				vput(ndp->ni_vp);
1780 		}
1781 	}
1782 	*vpp = vp;
1783 
1784 	NFSEXITCODE2(0, nd);
1785 }
1786 
1787 /*
1788  * Updates the file rev and sets the mtime and ctime
1789  * to the current clock time, returning the va_filerev and va_Xtime
1790  * values.
1791  * Return ESTALE to indicate the vnode is VIRF_DOOMED.
1792  */
1793 int
1794 nfsvno_updfilerev(struct vnode *vp, struct nfsvattr *nvap,
1795     struct nfsrv_descript *nd, struct thread *p)
1796 {
1797 	struct vattr va;
1798 
1799 	VATTR_NULL(&va);
1800 	vfs_timestamp(&va.va_mtime);
1801 	if (NFSVOPISLOCKED(vp) != LK_EXCLUSIVE) {
1802 		NFSVOPLOCK(vp, LK_UPGRADE | LK_RETRY);
1803 		if (VN_IS_DOOMED(vp))
1804 			return (ESTALE);
1805 	}
1806 	(void) VOP_SETATTR(vp, &va, nd->nd_cred);
1807 	(void) nfsvno_getattr(vp, nvap, nd, p, 1, NULL);
1808 	return (0);
1809 }
1810 
1811 /*
1812  * Glue routine to nfsv4_fillattr().
1813  */
1814 int
1815 nfsvno_fillattr(struct nfsrv_descript *nd, struct mount *mp, struct vnode *vp,
1816     struct nfsvattr *nvap, fhandle_t *fhp, int rderror, nfsattrbit_t *attrbitp,
1817     struct ucred *cred, struct thread *p, int isdgram, int reterr,
1818     int supports_nfsv4acls, int at_root, uint64_t mounted_on_fileno)
1819 {
1820 	struct statfs *sf;
1821 	int error;
1822 
1823 	sf = NULL;
1824 	if (nfsrv_devidcnt > 0 &&
1825 	    (NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SPACEAVAIL) ||
1826 	     NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SPACEFREE) ||
1827 	     NFSISSET_ATTRBIT(attrbitp, NFSATTRBIT_SPACETOTAL))) {
1828 		sf = malloc(sizeof(*sf), M_TEMP, M_WAITOK | M_ZERO);
1829 		error = nfsrv_pnfsstatfs(sf, mp);
1830 		if (error != 0) {
1831 			free(sf, M_TEMP);
1832 			sf = NULL;
1833 		}
1834 	}
1835 	error = nfsv4_fillattr(nd, mp, vp, NULL, &nvap->na_vattr, fhp, rderror,
1836 	    attrbitp, cred, p, isdgram, reterr, supports_nfsv4acls, at_root,
1837 	    mounted_on_fileno, sf);
1838 	free(sf, M_TEMP);
1839 	NFSEXITCODE2(0, nd);
1840 	return (error);
1841 }
1842 
1843 /* Since the Readdir vnode ops vary, put the entire functions in here. */
1844 /*
1845  * nfs readdir service
1846  * - mallocs what it thinks is enough to read
1847  *	count rounded up to a multiple of DIRBLKSIZ <= NFS_MAXREADDIR
1848  * - calls VOP_READDIR()
1849  * - loops around building the reply
1850  *	if the output generated exceeds count break out of loop
1851  *	The NFSM_CLGET macro is used here so that the reply will be packed
1852  *	tightly in mbuf clusters.
1853  * - it trims out records with d_fileno == 0
1854  *	this doesn't matter for Unix clients, but they might confuse clients
1855  *	for other os'.
1856  * - it trims out records with d_type == DT_WHT
1857  *	these cannot be seen through NFS (unless we extend the protocol)
1858  *     The alternate call nfsrvd_readdirplus() does lookups as well.
1859  * PS: The NFS protocol spec. does not clarify what the "count" byte
1860  *	argument is a count of.. just name strings and file id's or the
1861  *	entire reply rpc or ...
1862  *	I tried just file name and id sizes and it confused the Sun client,
1863  *	so I am using the full rpc size now. The "paranoia.." comment refers
1864  *	to including the status longwords that are not a part of the dir.
1865  *	"entry" structures, but are in the rpc.
1866  */
1867 int
1868 nfsrvd_readdir(struct nfsrv_descript *nd, int isdgram,
1869     struct vnode *vp, struct nfsexstuff *exp)
1870 {
1871 	struct dirent *dp;
1872 	u_int32_t *tl;
1873 	int dirlen;
1874 	char *cpos, *cend, *rbuf;
1875 	struct nfsvattr at;
1876 	int nlen, error = 0, getret = 1;
1877 	int siz, cnt, fullsiz, eofflag, ncookies;
1878 	u_int64_t off, toff, verf __unused;
1879 	u_long *cookies = NULL, *cookiep;
1880 	struct uio io;
1881 	struct iovec iv;
1882 	int is_ufs;
1883 	struct thread *p = curthread;
1884 
1885 	if (nd->nd_repstat) {
1886 		nfsrv_postopattr(nd, getret, &at);
1887 		goto out;
1888 	}
1889 	if (nd->nd_flag & ND_NFSV2) {
1890 		NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
1891 		off = fxdr_unsigned(u_quad_t, *tl++);
1892 	} else {
1893 		NFSM_DISSECT(tl, u_int32_t *, 5 * NFSX_UNSIGNED);
1894 		off = fxdr_hyper(tl);
1895 		tl += 2;
1896 		verf = fxdr_hyper(tl);
1897 		tl += 2;
1898 	}
1899 	toff = off;
1900 	cnt = fxdr_unsigned(int, *tl);
1901 	if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0)
1902 		cnt = NFS_SRVMAXDATA(nd);
1903 	siz = ((cnt + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1));
1904 	fullsiz = siz;
1905 	if (nd->nd_flag & ND_NFSV3) {
1906 		nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd, p, 1,
1907 		    NULL);
1908 #if 0
1909 		/*
1910 		 * va_filerev is not sufficient as a cookie verifier,
1911 		 * since it is not supposed to change when entries are
1912 		 * removed/added unless that offset cookies returned to
1913 		 * the client are no longer valid.
1914 		 */
1915 		if (!nd->nd_repstat && toff && verf != at.na_filerev)
1916 			nd->nd_repstat = NFSERR_BAD_COOKIE;
1917 #endif
1918 	}
1919 	if (!nd->nd_repstat && vp->v_type != VDIR)
1920 		nd->nd_repstat = NFSERR_NOTDIR;
1921 	if (nd->nd_repstat == 0 && cnt == 0) {
1922 		if (nd->nd_flag & ND_NFSV2)
1923 			/* NFSv2 does not have NFSERR_TOOSMALL */
1924 			nd->nd_repstat = EPERM;
1925 		else
1926 			nd->nd_repstat = NFSERR_TOOSMALL;
1927 	}
1928 	if (!nd->nd_repstat)
1929 		nd->nd_repstat = nfsvno_accchk(vp, VEXEC,
1930 		    nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
1931 		    NFSACCCHK_VPISLOCKED, NULL);
1932 	if (nd->nd_repstat) {
1933 		vput(vp);
1934 		if (nd->nd_flag & ND_NFSV3)
1935 			nfsrv_postopattr(nd, getret, &at);
1936 		goto out;
1937 	}
1938 	is_ufs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "ufs") == 0;
1939 	rbuf = malloc(siz, M_TEMP, M_WAITOK);
1940 again:
1941 	eofflag = 0;
1942 	if (cookies) {
1943 		free(cookies, M_TEMP);
1944 		cookies = NULL;
1945 	}
1946 
1947 	iv.iov_base = rbuf;
1948 	iv.iov_len = siz;
1949 	io.uio_iov = &iv;
1950 	io.uio_iovcnt = 1;
1951 	io.uio_offset = (off_t)off;
1952 	io.uio_resid = siz;
1953 	io.uio_segflg = UIO_SYSSPACE;
1954 	io.uio_rw = UIO_READ;
1955 	io.uio_td = NULL;
1956 	nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies,
1957 	    &cookies);
1958 	off = (u_int64_t)io.uio_offset;
1959 	if (io.uio_resid)
1960 		siz -= io.uio_resid;
1961 
1962 	if (!cookies && !nd->nd_repstat)
1963 		nd->nd_repstat = NFSERR_PERM;
1964 	if (nd->nd_flag & ND_NFSV3) {
1965 		getret = nfsvno_getattr(vp, &at, nd, p, 1, NULL);
1966 		if (!nd->nd_repstat)
1967 			nd->nd_repstat = getret;
1968 	}
1969 
1970 	/*
1971 	 * Handles the failed cases. nd->nd_repstat == 0 past here.
1972 	 */
1973 	if (nd->nd_repstat) {
1974 		vput(vp);
1975 		free(rbuf, M_TEMP);
1976 		if (cookies)
1977 			free(cookies, M_TEMP);
1978 		if (nd->nd_flag & ND_NFSV3)
1979 			nfsrv_postopattr(nd, getret, &at);
1980 		goto out;
1981 	}
1982 	/*
1983 	 * If nothing read, return eof
1984 	 * rpc reply
1985 	 */
1986 	if (siz == 0) {
1987 		vput(vp);
1988 		if (nd->nd_flag & ND_NFSV2) {
1989 			NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
1990 		} else {
1991 			nfsrv_postopattr(nd, getret, &at);
1992 			NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
1993 			txdr_hyper(at.na_filerev, tl);
1994 			tl += 2;
1995 		}
1996 		*tl++ = newnfs_false;
1997 		*tl = newnfs_true;
1998 		free(rbuf, M_TEMP);
1999 		free(cookies, M_TEMP);
2000 		goto out;
2001 	}
2002 
2003 	/*
2004 	 * Check for degenerate cases of nothing useful read.
2005 	 * If so go try again
2006 	 */
2007 	cpos = rbuf;
2008 	cend = rbuf + siz;
2009 	dp = (struct dirent *)cpos;
2010 	cookiep = cookies;
2011 
2012 	/*
2013 	 * For some reason FreeBSD's ufs_readdir() chooses to back the
2014 	 * directory offset up to a block boundary, so it is necessary to
2015 	 * skip over the records that precede the requested offset. This
2016 	 * requires the assumption that file offset cookies monotonically
2017 	 * increase.
2018 	 */
2019 	while (cpos < cend && ncookies > 0 &&
2020 	    (dp->d_fileno == 0 || dp->d_type == DT_WHT ||
2021 	     (is_ufs == 1 && ((u_quad_t)(*cookiep)) <= toff))) {
2022 		cpos += dp->d_reclen;
2023 		dp = (struct dirent *)cpos;
2024 		cookiep++;
2025 		ncookies--;
2026 	}
2027 	if (cpos >= cend || ncookies == 0) {
2028 		siz = fullsiz;
2029 		toff = off;
2030 		goto again;
2031 	}
2032 	vput(vp);
2033 
2034 	/*
2035 	 * dirlen is the size of the reply, including all XDR and must
2036 	 * not exceed cnt. For NFSv2, RFC1094 didn't clearly indicate
2037 	 * if the XDR should be included in "count", but to be safe, we do.
2038 	 * (Include the two booleans at the end of the reply in dirlen now.)
2039 	 */
2040 	if (nd->nd_flag & ND_NFSV3) {
2041 		nfsrv_postopattr(nd, getret, &at);
2042 		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
2043 		txdr_hyper(at.na_filerev, tl);
2044 		dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED;
2045 	} else {
2046 		dirlen = 2 * NFSX_UNSIGNED;
2047 	}
2048 
2049 	/* Loop through the records and build reply */
2050 	while (cpos < cend && ncookies > 0) {
2051 		nlen = dp->d_namlen;
2052 		if (dp->d_fileno != 0 && dp->d_type != DT_WHT &&
2053 			nlen <= NFS_MAXNAMLEN) {
2054 			if (nd->nd_flag & ND_NFSV3)
2055 				dirlen += (6*NFSX_UNSIGNED + NFSM_RNDUP(nlen));
2056 			else
2057 				dirlen += (4*NFSX_UNSIGNED + NFSM_RNDUP(nlen));
2058 			if (dirlen > cnt) {
2059 				eofflag = 0;
2060 				break;
2061 			}
2062 
2063 			/*
2064 			 * Build the directory record xdr from
2065 			 * the dirent entry.
2066 			 */
2067 			if (nd->nd_flag & ND_NFSV3) {
2068 				NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
2069 				*tl++ = newnfs_true;
2070 				*tl++ = 0;
2071 			} else {
2072 				NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
2073 				*tl++ = newnfs_true;
2074 			}
2075 			*tl = txdr_unsigned(dp->d_fileno);
2076 			(void) nfsm_strtom(nd, dp->d_name, nlen);
2077 			if (nd->nd_flag & ND_NFSV3) {
2078 				NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
2079 				*tl++ = 0;
2080 			} else
2081 				NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
2082 			*tl = txdr_unsigned(*cookiep);
2083 		}
2084 		cpos += dp->d_reclen;
2085 		dp = (struct dirent *)cpos;
2086 		cookiep++;
2087 		ncookies--;
2088 	}
2089 	if (cpos < cend)
2090 		eofflag = 0;
2091 	NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
2092 	*tl++ = newnfs_false;
2093 	if (eofflag)
2094 		*tl = newnfs_true;
2095 	else
2096 		*tl = newnfs_false;
2097 	free(rbuf, M_TEMP);
2098 	free(cookies, M_TEMP);
2099 
2100 out:
2101 	NFSEXITCODE2(0, nd);
2102 	return (0);
2103 nfsmout:
2104 	vput(vp);
2105 	NFSEXITCODE2(error, nd);
2106 	return (error);
2107 }
2108 
2109 /*
2110  * Readdirplus for V3 and Readdir for V4.
2111  */
2112 int
2113 nfsrvd_readdirplus(struct nfsrv_descript *nd, int isdgram,
2114     struct vnode *vp, struct nfsexstuff *exp)
2115 {
2116 	struct dirent *dp;
2117 	u_int32_t *tl;
2118 	int dirlen;
2119 	char *cpos, *cend, *rbuf;
2120 	struct vnode *nvp;
2121 	fhandle_t nfh;
2122 	struct nfsvattr nva, at, *nvap = &nva;
2123 	struct mbuf *mb0, *mb1;
2124 	struct nfsreferral *refp;
2125 	int nlen, r, error = 0, getret = 1, usevget = 1;
2126 	int siz, cnt, fullsiz, eofflag, ncookies, entrycnt;
2127 	caddr_t bpos0, bpos1;
2128 	u_int64_t off, toff, verf;
2129 	u_long *cookies = NULL, *cookiep;
2130 	nfsattrbit_t attrbits, rderrbits, savbits;
2131 	struct uio io;
2132 	struct iovec iv;
2133 	struct componentname cn;
2134 	int at_root, is_ufs, is_zfs, needs_unbusy, supports_nfsv4acls;
2135 	struct mount *mp, *new_mp;
2136 	uint64_t mounted_on_fileno;
2137 	struct thread *p = curthread;
2138 
2139 	if (nd->nd_repstat) {
2140 		nfsrv_postopattr(nd, getret, &at);
2141 		goto out;
2142 	}
2143 	NFSM_DISSECT(tl, u_int32_t *, 6 * NFSX_UNSIGNED);
2144 	off = fxdr_hyper(tl);
2145 	toff = off;
2146 	tl += 2;
2147 	verf = fxdr_hyper(tl);
2148 	tl += 2;
2149 	siz = fxdr_unsigned(int, *tl++);
2150 	cnt = fxdr_unsigned(int, *tl);
2151 
2152 	/*
2153 	 * Use the server's maximum data transfer size as the upper bound
2154 	 * on reply datalen.
2155 	 */
2156 	if (cnt > NFS_SRVMAXDATA(nd) || cnt < 0)
2157 		cnt = NFS_SRVMAXDATA(nd);
2158 
2159 	/*
2160 	 * siz is a "hint" of how much directory information (name, fileid,
2161 	 * cookie) should be in the reply. At least one client "hints" 0,
2162 	 * so I set it to cnt for that case. I also round it up to the
2163 	 * next multiple of DIRBLKSIZ.
2164 	 * Since the size of a Readdirplus directory entry reply will always
2165 	 * be greater than a directory entry returned by VOP_READDIR(), it
2166 	 * does not make sense to read more than NFS_SRVMAXDATA() via
2167 	 * VOP_READDIR().
2168 	 */
2169 	if (siz <= 0)
2170 		siz = cnt;
2171 	else if (siz > NFS_SRVMAXDATA(nd))
2172 		siz = NFS_SRVMAXDATA(nd);
2173 	siz = ((siz + DIRBLKSIZ - 1) & ~(DIRBLKSIZ - 1));
2174 
2175 	if (nd->nd_flag & ND_NFSV4) {
2176 		error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
2177 		if (error)
2178 			goto nfsmout;
2179 		NFSSET_ATTRBIT(&savbits, &attrbits);
2180 		NFSCLRNOTFILLABLE_ATTRBIT(&attrbits, nd);
2181 		NFSZERO_ATTRBIT(&rderrbits);
2182 		NFSSETBIT_ATTRBIT(&rderrbits, NFSATTRBIT_RDATTRERROR);
2183 	} else {
2184 		NFSZERO_ATTRBIT(&attrbits);
2185 	}
2186 	fullsiz = siz;
2187 	nd->nd_repstat = getret = nfsvno_getattr(vp, &at, nd, p, 1, NULL);
2188 #if 0
2189 	if (!nd->nd_repstat) {
2190 	    if (off && verf != at.na_filerev) {
2191 		/*
2192 		 * va_filerev is not sufficient as a cookie verifier,
2193 		 * since it is not supposed to change when entries are
2194 		 * removed/added unless that offset cookies returned to
2195 		 * the client are no longer valid.
2196 		 */
2197 		if (nd->nd_flag & ND_NFSV4) {
2198 			nd->nd_repstat = NFSERR_NOTSAME;
2199 		} else {
2200 			nd->nd_repstat = NFSERR_BAD_COOKIE;
2201 		}
2202 	    }
2203 	}
2204 #endif
2205 	if (!nd->nd_repstat && vp->v_type != VDIR)
2206 		nd->nd_repstat = NFSERR_NOTDIR;
2207 	if (!nd->nd_repstat && cnt == 0)
2208 		nd->nd_repstat = NFSERR_TOOSMALL;
2209 	if (!nd->nd_repstat)
2210 		nd->nd_repstat = nfsvno_accchk(vp, VEXEC,
2211 		    nd->nd_cred, exp, p, NFSACCCHK_NOOVERRIDE,
2212 		    NFSACCCHK_VPISLOCKED, NULL);
2213 	if (nd->nd_repstat) {
2214 		vput(vp);
2215 		if (nd->nd_flag & ND_NFSV3)
2216 			nfsrv_postopattr(nd, getret, &at);
2217 		goto out;
2218 	}
2219 	is_ufs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "ufs") == 0;
2220 	is_zfs = strcmp(vp->v_mount->mnt_vfc->vfc_name, "zfs") == 0;
2221 
2222 	rbuf = malloc(siz, M_TEMP, M_WAITOK);
2223 again:
2224 	eofflag = 0;
2225 	if (cookies) {
2226 		free(cookies, M_TEMP);
2227 		cookies = NULL;
2228 	}
2229 
2230 	iv.iov_base = rbuf;
2231 	iv.iov_len = siz;
2232 	io.uio_iov = &iv;
2233 	io.uio_iovcnt = 1;
2234 	io.uio_offset = (off_t)off;
2235 	io.uio_resid = siz;
2236 	io.uio_segflg = UIO_SYSSPACE;
2237 	io.uio_rw = UIO_READ;
2238 	io.uio_td = NULL;
2239 	nd->nd_repstat = VOP_READDIR(vp, &io, nd->nd_cred, &eofflag, &ncookies,
2240 	    &cookies);
2241 	off = (u_int64_t)io.uio_offset;
2242 	if (io.uio_resid)
2243 		siz -= io.uio_resid;
2244 
2245 	getret = nfsvno_getattr(vp, &at, nd, p, 1, NULL);
2246 
2247 	if (!cookies && !nd->nd_repstat)
2248 		nd->nd_repstat = NFSERR_PERM;
2249 	if (!nd->nd_repstat)
2250 		nd->nd_repstat = getret;
2251 	if (nd->nd_repstat) {
2252 		vput(vp);
2253 		if (cookies)
2254 			free(cookies, M_TEMP);
2255 		free(rbuf, M_TEMP);
2256 		if (nd->nd_flag & ND_NFSV3)
2257 			nfsrv_postopattr(nd, getret, &at);
2258 		goto out;
2259 	}
2260 	/*
2261 	 * If nothing read, return eof
2262 	 * rpc reply
2263 	 */
2264 	if (siz == 0) {
2265 		vput(vp);
2266 		if (nd->nd_flag & ND_NFSV3)
2267 			nfsrv_postopattr(nd, getret, &at);
2268 		NFSM_BUILD(tl, u_int32_t *, 4 * NFSX_UNSIGNED);
2269 		txdr_hyper(at.na_filerev, tl);
2270 		tl += 2;
2271 		*tl++ = newnfs_false;
2272 		*tl = newnfs_true;
2273 		free(cookies, M_TEMP);
2274 		free(rbuf, M_TEMP);
2275 		goto out;
2276 	}
2277 
2278 	/*
2279 	 * Check for degenerate cases of nothing useful read.
2280 	 * If so go try again
2281 	 */
2282 	cpos = rbuf;
2283 	cend = rbuf + siz;
2284 	dp = (struct dirent *)cpos;
2285 	cookiep = cookies;
2286 
2287 	/*
2288 	 * For some reason FreeBSD's ufs_readdir() chooses to back the
2289 	 * directory offset up to a block boundary, so it is necessary to
2290 	 * skip over the records that precede the requested offset. This
2291 	 * requires the assumption that file offset cookies monotonically
2292 	 * increase.
2293 	 */
2294 	while (cpos < cend && ncookies > 0 &&
2295 	  (dp->d_fileno == 0 || dp->d_type == DT_WHT ||
2296 	   (is_ufs == 1 && ((u_quad_t)(*cookiep)) <= toff) ||
2297 	   ((nd->nd_flag & ND_NFSV4) &&
2298 	    ((dp->d_namlen == 1 && dp->d_name[0] == '.') ||
2299 	     (dp->d_namlen==2 && dp->d_name[0]=='.' && dp->d_name[1]=='.'))))) {
2300 		cpos += dp->d_reclen;
2301 		dp = (struct dirent *)cpos;
2302 		cookiep++;
2303 		ncookies--;
2304 	}
2305 	if (cpos >= cend || ncookies == 0) {
2306 		siz = fullsiz;
2307 		toff = off;
2308 		goto again;
2309 	}
2310 
2311 	/*
2312 	 * Busy the file system so that the mount point won't go away
2313 	 * and, as such, VFS_VGET() can be used safely.
2314 	 */
2315 	mp = vp->v_mount;
2316 	vfs_ref(mp);
2317 	NFSVOPUNLOCK(vp);
2318 	nd->nd_repstat = vfs_busy(mp, 0);
2319 	vfs_rel(mp);
2320 	if (nd->nd_repstat != 0) {
2321 		vrele(vp);
2322 		free(cookies, M_TEMP);
2323 		free(rbuf, M_TEMP);
2324 		if (nd->nd_flag & ND_NFSV3)
2325 			nfsrv_postopattr(nd, getret, &at);
2326 		goto out;
2327 	}
2328 
2329 	/*
2330 	 * Check to see if entries in this directory can be safely acquired
2331 	 * via VFS_VGET() or if a switch to VOP_LOOKUP() is required.
2332 	 * ZFS snapshot directories need VOP_LOOKUP(), so that any
2333 	 * automount of the snapshot directory that is required will
2334 	 * be done.
2335 	 * This needs to be done here for NFSv4, since NFSv4 never does
2336 	 * a VFS_VGET() for "." or "..".
2337 	 */
2338 	if (is_zfs == 1) {
2339 		r = VFS_VGET(mp, at.na_fileid, LK_SHARED, &nvp);
2340 		if (r == EOPNOTSUPP) {
2341 			usevget = 0;
2342 			cn.cn_nameiop = LOOKUP;
2343 			cn.cn_lkflags = LK_SHARED | LK_RETRY;
2344 			cn.cn_cred = nd->nd_cred;
2345 			cn.cn_thread = p;
2346 		} else if (r == 0)
2347 			vput(nvp);
2348 	}
2349 
2350 	/*
2351 	 * Save this position, in case there is an error before one entry
2352 	 * is created.
2353 	 */
2354 	mb0 = nd->nd_mb;
2355 	bpos0 = nd->nd_bpos;
2356 
2357 	/*
2358 	 * Fill in the first part of the reply.
2359 	 * dirlen is the reply length in bytes and cannot exceed cnt.
2360 	 * (Include the two booleans at the end of the reply in dirlen now,
2361 	 *  so we recognize when we have exceeded cnt.)
2362 	 */
2363 	if (nd->nd_flag & ND_NFSV3) {
2364 		dirlen = NFSX_V3POSTOPATTR + NFSX_VERF + 2 * NFSX_UNSIGNED;
2365 		nfsrv_postopattr(nd, getret, &at);
2366 	} else {
2367 		dirlen = NFSX_VERF + 2 * NFSX_UNSIGNED;
2368 	}
2369 	NFSM_BUILD(tl, u_int32_t *, NFSX_VERF);
2370 	txdr_hyper(at.na_filerev, tl);
2371 
2372 	/*
2373 	 * Save this position, in case there is an empty reply needed.
2374 	 */
2375 	mb1 = nd->nd_mb;
2376 	bpos1 = nd->nd_bpos;
2377 
2378 	/* Loop through the records and build reply */
2379 	entrycnt = 0;
2380 	while (cpos < cend && ncookies > 0 && dirlen < cnt) {
2381 		nlen = dp->d_namlen;
2382 		if (dp->d_fileno != 0 && dp->d_type != DT_WHT &&
2383 		    nlen <= NFS_MAXNAMLEN &&
2384 		    ((nd->nd_flag & ND_NFSV3) || nlen > 2 ||
2385 		     (nlen==2 && (dp->d_name[0]!='.' || dp->d_name[1]!='.'))
2386 		      || (nlen == 1 && dp->d_name[0] != '.'))) {
2387 			/*
2388 			 * Save the current position in the reply, in case
2389 			 * this entry exceeds cnt.
2390 			 */
2391 			mb1 = nd->nd_mb;
2392 			bpos1 = nd->nd_bpos;
2393 
2394 			/*
2395 			 * For readdir_and_lookup get the vnode using
2396 			 * the file number.
2397 			 */
2398 			nvp = NULL;
2399 			refp = NULL;
2400 			r = 0;
2401 			at_root = 0;
2402 			needs_unbusy = 0;
2403 			new_mp = mp;
2404 			mounted_on_fileno = (uint64_t)dp->d_fileno;
2405 			if ((nd->nd_flag & ND_NFSV3) ||
2406 			    NFSNONZERO_ATTRBIT(&savbits)) {
2407 				if (nd->nd_flag & ND_NFSV4)
2408 					refp = nfsv4root_getreferral(NULL,
2409 					    vp, dp->d_fileno);
2410 				if (refp == NULL) {
2411 					if (usevget)
2412 						r = VFS_VGET(mp, dp->d_fileno,
2413 						    LK_SHARED, &nvp);
2414 					else
2415 						r = EOPNOTSUPP;
2416 					if (r == EOPNOTSUPP) {
2417 						if (usevget) {
2418 							usevget = 0;
2419 							cn.cn_nameiop = LOOKUP;
2420 							cn.cn_lkflags =
2421 							    LK_SHARED |
2422 							    LK_RETRY;
2423 							cn.cn_cred =
2424 							    nd->nd_cred;
2425 							cn.cn_thread = p;
2426 						}
2427 						cn.cn_nameptr = dp->d_name;
2428 						cn.cn_namelen = nlen;
2429 						cn.cn_flags = ISLASTCN |
2430 						    NOFOLLOW | LOCKLEAF;
2431 						if (nlen == 2 &&
2432 						    dp->d_name[0] == '.' &&
2433 						    dp->d_name[1] == '.')
2434 							cn.cn_flags |=
2435 							    ISDOTDOT;
2436 						if (NFSVOPLOCK(vp, LK_SHARED)
2437 						    != 0) {
2438 							nd->nd_repstat = EPERM;
2439 							break;
2440 						}
2441 						if ((vp->v_vflag & VV_ROOT) != 0
2442 						    && (cn.cn_flags & ISDOTDOT)
2443 						    != 0) {
2444 							vref(vp);
2445 							nvp = vp;
2446 							r = 0;
2447 						} else {
2448 							r = VOP_LOOKUP(vp, &nvp,
2449 							    &cn);
2450 							if (vp != nvp)
2451 								NFSVOPUNLOCK(vp);
2452 						}
2453 					}
2454 
2455 					/*
2456 					 * For NFSv4, check to see if nvp is
2457 					 * a mount point and get the mount
2458 					 * point vnode, as required.
2459 					 */
2460 					if (r == 0 &&
2461 					    nfsrv_enable_crossmntpt != 0 &&
2462 					    (nd->nd_flag & ND_NFSV4) != 0 &&
2463 					    nvp->v_type == VDIR &&
2464 					    nvp->v_mountedhere != NULL) {
2465 						new_mp = nvp->v_mountedhere;
2466 						r = vfs_busy(new_mp, 0);
2467 						vput(nvp);
2468 						nvp = NULL;
2469 						if (r == 0) {
2470 							r = VFS_ROOT(new_mp,
2471 							    LK_SHARED, &nvp);
2472 							needs_unbusy = 1;
2473 							if (r == 0)
2474 								at_root = 1;
2475 						}
2476 					}
2477 				}
2478 
2479 				/*
2480 				 * If we failed to look up the entry, then it
2481 				 * has become invalid, most likely removed.
2482 				 */
2483 				if (r != 0) {
2484 					if (needs_unbusy)
2485 						vfs_unbusy(new_mp);
2486 					goto invalid;
2487 				}
2488 				KASSERT(refp != NULL || nvp != NULL,
2489 				    ("%s: undetected lookup error", __func__));
2490 
2491 				if (refp == NULL &&
2492 				    ((nd->nd_flag & ND_NFSV3) ||
2493 				     NFSNONZERO_ATTRBIT(&attrbits))) {
2494 					r = nfsvno_getfh(nvp, &nfh, p);
2495 					if (!r)
2496 					    r = nfsvno_getattr(nvp, nvap, nd, p,
2497 						1, &attrbits);
2498 					if (r == 0 && is_zfs == 1 &&
2499 					    nfsrv_enable_crossmntpt != 0 &&
2500 					    (nd->nd_flag & ND_NFSV4) != 0 &&
2501 					    nvp->v_type == VDIR &&
2502 					    vp->v_mount != nvp->v_mount) {
2503 					    /*
2504 					     * For a ZFS snapshot, there is a
2505 					     * pseudo mount that does not set
2506 					     * v_mountedhere, so it needs to
2507 					     * be detected via a different
2508 					     * mount structure.
2509 					     */
2510 					    at_root = 1;
2511 					    if (new_mp == mp)
2512 						new_mp = nvp->v_mount;
2513 					}
2514 				}
2515 
2516 				/*
2517 				 * If we failed to get attributes of the entry,
2518 				 * then just skip it for NFSv3 (the traditional
2519 				 * behavior in the old NFS server).
2520 				 * For NFSv4 the behavior is controlled by
2521 				 * RDATTRERROR: we either ignore the error or
2522 				 * fail the request.
2523 				 * Note that RDATTRERROR is never set for NFSv3.
2524 				 */
2525 				if (r != 0) {
2526 					if (!NFSISSET_ATTRBIT(&attrbits,
2527 					    NFSATTRBIT_RDATTRERROR)) {
2528 						vput(nvp);
2529 						if (needs_unbusy != 0)
2530 							vfs_unbusy(new_mp);
2531 						if ((nd->nd_flag & ND_NFSV3))
2532 							goto invalid;
2533 						nd->nd_repstat = r;
2534 						break;
2535 					}
2536 				}
2537 			}
2538 
2539 			/*
2540 			 * Build the directory record xdr
2541 			 */
2542 			if (nd->nd_flag & ND_NFSV3) {
2543 				NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
2544 				*tl++ = newnfs_true;
2545 				*tl++ = 0;
2546 				*tl = txdr_unsigned(dp->d_fileno);
2547 				dirlen += nfsm_strtom(nd, dp->d_name, nlen);
2548 				NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
2549 				*tl++ = 0;
2550 				*tl = txdr_unsigned(*cookiep);
2551 				nfsrv_postopattr(nd, 0, nvap);
2552 				dirlen += nfsm_fhtom(nd,(u_int8_t *)&nfh,0,1);
2553 				dirlen += (5*NFSX_UNSIGNED+NFSX_V3POSTOPATTR);
2554 				if (nvp != NULL)
2555 					vput(nvp);
2556 			} else {
2557 				NFSM_BUILD(tl, u_int32_t *, 3 * NFSX_UNSIGNED);
2558 				*tl++ = newnfs_true;
2559 				*tl++ = 0;
2560 				*tl = txdr_unsigned(*cookiep);
2561 				dirlen += nfsm_strtom(nd, dp->d_name, nlen);
2562 				if (nvp != NULL) {
2563 					supports_nfsv4acls =
2564 					    nfs_supportsnfsv4acls(nvp);
2565 					NFSVOPUNLOCK(nvp);
2566 				} else
2567 					supports_nfsv4acls = 0;
2568 				if (refp != NULL) {
2569 					dirlen += nfsrv_putreferralattr(nd,
2570 					    &savbits, refp, 0,
2571 					    &nd->nd_repstat);
2572 					if (nd->nd_repstat) {
2573 						if (nvp != NULL)
2574 							vrele(nvp);
2575 						if (needs_unbusy != 0)
2576 							vfs_unbusy(new_mp);
2577 						break;
2578 					}
2579 				} else if (r) {
2580 					dirlen += nfsvno_fillattr(nd, new_mp,
2581 					    nvp, nvap, &nfh, r, &rderrbits,
2582 					    nd->nd_cred, p, isdgram, 0,
2583 					    supports_nfsv4acls, at_root,
2584 					    mounted_on_fileno);
2585 				} else {
2586 					dirlen += nfsvno_fillattr(nd, new_mp,
2587 					    nvp, nvap, &nfh, r, &attrbits,
2588 					    nd->nd_cred, p, isdgram, 0,
2589 					    supports_nfsv4acls, at_root,
2590 					    mounted_on_fileno);
2591 				}
2592 				if (nvp != NULL)
2593 					vrele(nvp);
2594 				dirlen += (3 * NFSX_UNSIGNED);
2595 			}
2596 			if (needs_unbusy != 0)
2597 				vfs_unbusy(new_mp);
2598 			if (dirlen <= cnt)
2599 				entrycnt++;
2600 		}
2601 invalid:
2602 		cpos += dp->d_reclen;
2603 		dp = (struct dirent *)cpos;
2604 		cookiep++;
2605 		ncookies--;
2606 	}
2607 	vrele(vp);
2608 	vfs_unbusy(mp);
2609 
2610 	/*
2611 	 * If dirlen > cnt, we must strip off the last entry. If that
2612 	 * results in an empty reply, report NFSERR_TOOSMALL.
2613 	 */
2614 	if (dirlen > cnt || nd->nd_repstat) {
2615 		if (!nd->nd_repstat && entrycnt == 0)
2616 			nd->nd_repstat = NFSERR_TOOSMALL;
2617 		if (nd->nd_repstat) {
2618 			newnfs_trimtrailing(nd, mb0, bpos0);
2619 			if (nd->nd_flag & ND_NFSV3)
2620 				nfsrv_postopattr(nd, getret, &at);
2621 		} else
2622 			newnfs_trimtrailing(nd, mb1, bpos1);
2623 		eofflag = 0;
2624 	} else if (cpos < cend)
2625 		eofflag = 0;
2626 	if (!nd->nd_repstat) {
2627 		NFSM_BUILD(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
2628 		*tl++ = newnfs_false;
2629 		if (eofflag)
2630 			*tl = newnfs_true;
2631 		else
2632 			*tl = newnfs_false;
2633 	}
2634 	free(cookies, M_TEMP);
2635 	free(rbuf, M_TEMP);
2636 
2637 out:
2638 	NFSEXITCODE2(0, nd);
2639 	return (0);
2640 nfsmout:
2641 	vput(vp);
2642 	NFSEXITCODE2(error, nd);
2643 	return (error);
2644 }
2645 
2646 /*
2647  * Get the settable attributes out of the mbuf list.
2648  * (Return 0 or EBADRPC)
2649  */
2650 int
2651 nfsrv_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap,
2652     nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p)
2653 {
2654 	u_int32_t *tl;
2655 	struct nfsv2_sattr *sp;
2656 	int error = 0, toclient = 0;
2657 
2658 	switch (nd->nd_flag & (ND_NFSV2 | ND_NFSV3 | ND_NFSV4)) {
2659 	case ND_NFSV2:
2660 		NFSM_DISSECT(sp, struct nfsv2_sattr *, NFSX_V2SATTR);
2661 		/*
2662 		 * Some old clients didn't fill in the high order 16bits.
2663 		 * --> check the low order 2 bytes for 0xffff
2664 		 */
2665 		if ((fxdr_unsigned(int, sp->sa_mode) & 0xffff) != 0xffff)
2666 			nvap->na_mode = nfstov_mode(sp->sa_mode);
2667 		if (sp->sa_uid != newnfs_xdrneg1)
2668 			nvap->na_uid = fxdr_unsigned(uid_t, sp->sa_uid);
2669 		if (sp->sa_gid != newnfs_xdrneg1)
2670 			nvap->na_gid = fxdr_unsigned(gid_t, sp->sa_gid);
2671 		if (sp->sa_size != newnfs_xdrneg1)
2672 			nvap->na_size = fxdr_unsigned(u_quad_t, sp->sa_size);
2673 		if (sp->sa_atime.nfsv2_sec != newnfs_xdrneg1) {
2674 #ifdef notyet
2675 			fxdr_nfsv2time(&sp->sa_atime, &nvap->na_atime);
2676 #else
2677 			nvap->na_atime.tv_sec =
2678 				fxdr_unsigned(u_int32_t,sp->sa_atime.nfsv2_sec);
2679 			nvap->na_atime.tv_nsec = 0;
2680 #endif
2681 		}
2682 		if (sp->sa_mtime.nfsv2_sec != newnfs_xdrneg1)
2683 			fxdr_nfsv2time(&sp->sa_mtime, &nvap->na_mtime);
2684 		break;
2685 	case ND_NFSV3:
2686 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2687 		if (*tl == newnfs_true) {
2688 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2689 			nvap->na_mode = nfstov_mode(*tl);
2690 		}
2691 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2692 		if (*tl == newnfs_true) {
2693 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2694 			nvap->na_uid = fxdr_unsigned(uid_t, *tl);
2695 		}
2696 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2697 		if (*tl == newnfs_true) {
2698 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2699 			nvap->na_gid = fxdr_unsigned(gid_t, *tl);
2700 		}
2701 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2702 		if (*tl == newnfs_true) {
2703 			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
2704 			nvap->na_size = fxdr_hyper(tl);
2705 		}
2706 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2707 		switch (fxdr_unsigned(int, *tl)) {
2708 		case NFSV3SATTRTIME_TOCLIENT:
2709 			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
2710 			fxdr_nfsv3time(tl, &nvap->na_atime);
2711 			toclient = 1;
2712 			break;
2713 		case NFSV3SATTRTIME_TOSERVER:
2714 			vfs_timestamp(&nvap->na_atime);
2715 			nvap->na_vaflags |= VA_UTIMES_NULL;
2716 			break;
2717 		}
2718 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2719 		switch (fxdr_unsigned(int, *tl)) {
2720 		case NFSV3SATTRTIME_TOCLIENT:
2721 			NFSM_DISSECT(tl, u_int32_t *, 2 * NFSX_UNSIGNED);
2722 			fxdr_nfsv3time(tl, &nvap->na_mtime);
2723 			nvap->na_vaflags &= ~VA_UTIMES_NULL;
2724 			break;
2725 		case NFSV3SATTRTIME_TOSERVER:
2726 			vfs_timestamp(&nvap->na_mtime);
2727 			if (!toclient)
2728 				nvap->na_vaflags |= VA_UTIMES_NULL;
2729 			break;
2730 		}
2731 		break;
2732 	case ND_NFSV4:
2733 		error = nfsv4_sattr(nd, vp, nvap, attrbitp, aclp, p);
2734 	}
2735 nfsmout:
2736 	NFSEXITCODE2(error, nd);
2737 	return (error);
2738 }
2739 
2740 /*
2741  * Handle the setable attributes for V4.
2742  * Returns NFSERR_BADXDR if it can't be parsed, 0 otherwise.
2743  */
2744 int
2745 nfsv4_sattr(struct nfsrv_descript *nd, vnode_t vp, struct nfsvattr *nvap,
2746     nfsattrbit_t *attrbitp, NFSACL_T *aclp, struct thread *p)
2747 {
2748 	u_int32_t *tl;
2749 	int attrsum = 0;
2750 	int i, j;
2751 	int error, attrsize, bitpos, aclsize, aceerr, retnotsup = 0;
2752 	int moderet, toclient = 0;
2753 	u_char *cp, namestr[NFSV4_SMALLSTR + 1];
2754 	uid_t uid;
2755 	gid_t gid;
2756 	u_short mode, mask;		/* Same type as va_mode. */
2757 	struct vattr va;
2758 
2759 	error = nfsrv_getattrbits(nd, attrbitp, NULL, &retnotsup);
2760 	if (error)
2761 		goto nfsmout;
2762 	NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2763 	attrsize = fxdr_unsigned(int, *tl);
2764 
2765 	/*
2766 	 * Loop around getting the setable attributes. If an unsupported
2767 	 * one is found, set nd_repstat == NFSERR_ATTRNOTSUPP and return.
2768 	 */
2769 	if (retnotsup) {
2770 		nd->nd_repstat = NFSERR_ATTRNOTSUPP;
2771 		bitpos = NFSATTRBIT_MAX;
2772 	} else {
2773 		bitpos = 0;
2774 	}
2775 	moderet = 0;
2776 	for (; bitpos < NFSATTRBIT_MAX; bitpos++) {
2777 	    if (attrsum > attrsize) {
2778 		error = NFSERR_BADXDR;
2779 		goto nfsmout;
2780 	    }
2781 	    if (NFSISSET_ATTRBIT(attrbitp, bitpos))
2782 		switch (bitpos) {
2783 		case NFSATTRBIT_SIZE:
2784 			NFSM_DISSECT(tl, u_int32_t *, NFSX_HYPER);
2785                      if (vp != NULL && vp->v_type != VREG) {
2786                             error = (vp->v_type == VDIR) ? NFSERR_ISDIR :
2787                                 NFSERR_INVAL;
2788                             goto nfsmout;
2789 			}
2790 			nvap->na_size = fxdr_hyper(tl);
2791 			attrsum += NFSX_HYPER;
2792 			break;
2793 		case NFSATTRBIT_ACL:
2794 			error = nfsrv_dissectacl(nd, aclp, &aceerr, &aclsize,
2795 			    p);
2796 			if (error)
2797 				goto nfsmout;
2798 			if (aceerr && !nd->nd_repstat)
2799 				nd->nd_repstat = aceerr;
2800 			attrsum += aclsize;
2801 			break;
2802 		case NFSATTRBIT_ARCHIVE:
2803 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2804 			if (!nd->nd_repstat)
2805 				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
2806 			attrsum += NFSX_UNSIGNED;
2807 			break;
2808 		case NFSATTRBIT_HIDDEN:
2809 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2810 			if (!nd->nd_repstat)
2811 				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
2812 			attrsum += NFSX_UNSIGNED;
2813 			break;
2814 		case NFSATTRBIT_MIMETYPE:
2815 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2816 			i = fxdr_unsigned(int, *tl);
2817 			error = nfsm_advance(nd, NFSM_RNDUP(i), -1);
2818 			if (error)
2819 				goto nfsmout;
2820 			if (!nd->nd_repstat)
2821 				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
2822 			attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(i));
2823 			break;
2824 		case NFSATTRBIT_MODE:
2825 			moderet = NFSERR_INVAL;	/* Can't do MODESETMASKED. */
2826 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2827 			nvap->na_mode = nfstov_mode(*tl);
2828 			attrsum += NFSX_UNSIGNED;
2829 			break;
2830 		case NFSATTRBIT_OWNER:
2831 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2832 			j = fxdr_unsigned(int, *tl);
2833 			if (j < 0) {
2834 				error = NFSERR_BADXDR;
2835 				goto nfsmout;
2836 			}
2837 			if (j > NFSV4_SMALLSTR)
2838 				cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
2839 			else
2840 				cp = namestr;
2841 			error = nfsrv_mtostr(nd, cp, j);
2842 			if (error) {
2843 				if (j > NFSV4_SMALLSTR)
2844 					free(cp, M_NFSSTRING);
2845 				goto nfsmout;
2846 			}
2847 			if (!nd->nd_repstat) {
2848 				nd->nd_repstat = nfsv4_strtouid(nd, cp, j,
2849 				    &uid);
2850 				if (!nd->nd_repstat)
2851 					nvap->na_uid = uid;
2852 			}
2853 			if (j > NFSV4_SMALLSTR)
2854 				free(cp, M_NFSSTRING);
2855 			attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
2856 			break;
2857 		case NFSATTRBIT_OWNERGROUP:
2858 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2859 			j = fxdr_unsigned(int, *tl);
2860 			if (j < 0) {
2861 				error = NFSERR_BADXDR;
2862 				goto nfsmout;
2863 			}
2864 			if (j > NFSV4_SMALLSTR)
2865 				cp = malloc(j + 1, M_NFSSTRING, M_WAITOK);
2866 			else
2867 				cp = namestr;
2868 			error = nfsrv_mtostr(nd, cp, j);
2869 			if (error) {
2870 				if (j > NFSV4_SMALLSTR)
2871 					free(cp, M_NFSSTRING);
2872 				goto nfsmout;
2873 			}
2874 			if (!nd->nd_repstat) {
2875 				nd->nd_repstat = nfsv4_strtogid(nd, cp, j,
2876 				    &gid);
2877 				if (!nd->nd_repstat)
2878 					nvap->na_gid = gid;
2879 			}
2880 			if (j > NFSV4_SMALLSTR)
2881 				free(cp, M_NFSSTRING);
2882 			attrsum += (NFSX_UNSIGNED + NFSM_RNDUP(j));
2883 			break;
2884 		case NFSATTRBIT_SYSTEM:
2885 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2886 			if (!nd->nd_repstat)
2887 				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
2888 			attrsum += NFSX_UNSIGNED;
2889 			break;
2890 		case NFSATTRBIT_TIMEACCESSSET:
2891 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2892 			attrsum += NFSX_UNSIGNED;
2893 			if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) {
2894 			    NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
2895 			    fxdr_nfsv4time(tl, &nvap->na_atime);
2896 			    toclient = 1;
2897 			    attrsum += NFSX_V4TIME;
2898 			} else {
2899 			    vfs_timestamp(&nvap->na_atime);
2900 			    nvap->na_vaflags |= VA_UTIMES_NULL;
2901 			}
2902 			break;
2903 		case NFSATTRBIT_TIMEBACKUP:
2904 			NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
2905 			if (!nd->nd_repstat)
2906 				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
2907 			attrsum += NFSX_V4TIME;
2908 			break;
2909 		case NFSATTRBIT_TIMECREATE:
2910 			NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
2911 			if (!nd->nd_repstat)
2912 				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
2913 			attrsum += NFSX_V4TIME;
2914 			break;
2915 		case NFSATTRBIT_TIMEMODIFYSET:
2916 			NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
2917 			attrsum += NFSX_UNSIGNED;
2918 			if (fxdr_unsigned(int, *tl)==NFSV4SATTRTIME_TOCLIENT) {
2919 			    NFSM_DISSECT(tl, u_int32_t *, NFSX_V4TIME);
2920 			    fxdr_nfsv4time(tl, &nvap->na_mtime);
2921 			    nvap->na_vaflags &= ~VA_UTIMES_NULL;
2922 			    attrsum += NFSX_V4TIME;
2923 			} else {
2924 			    vfs_timestamp(&nvap->na_mtime);
2925 			    if (!toclient)
2926 				nvap->na_vaflags |= VA_UTIMES_NULL;
2927 			}
2928 			break;
2929 		case NFSATTRBIT_MODESETMASKED:
2930 			NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
2931 			mode = fxdr_unsigned(u_short, *tl++);
2932 			mask = fxdr_unsigned(u_short, *tl);
2933 			/*
2934 			 * vp == NULL implies an Open/Create operation.
2935 			 * This attribute can only be used for Setattr and
2936 			 * only for NFSv4.1 or higher.
2937 			 * If moderet != 0, a mode attribute has also been
2938 			 * specified and this attribute cannot be done in the
2939 			 * same Setattr operation.
2940 			 */
2941 			if ((nd->nd_flag & ND_NFSV41) == 0)
2942 				nd->nd_repstat = NFSERR_ATTRNOTSUPP;
2943 			else if ((mode & ~07777) != 0 || (mask & ~07777) != 0 ||
2944 			    vp == NULL)
2945 				nd->nd_repstat = NFSERR_INVAL;
2946 			else if (moderet == 0)
2947 				moderet = VOP_GETATTR(vp, &va, nd->nd_cred);
2948 			if (moderet == 0)
2949 				nvap->na_mode = (mode & mask) |
2950 				    (va.va_mode & ~mask);
2951 			else
2952 				nd->nd_repstat = moderet;
2953 			attrsum += 2 * NFSX_UNSIGNED;
2954 			break;
2955 		default:
2956 			nd->nd_repstat = NFSERR_ATTRNOTSUPP;
2957 			/*
2958 			 * set bitpos so we drop out of the loop.
2959 			 */
2960 			bitpos = NFSATTRBIT_MAX;
2961 			break;
2962 		}
2963 	}
2964 
2965 	/*
2966 	 * some clients pad the attrlist, so we need to skip over the
2967 	 * padding.
2968 	 */
2969 	if (attrsum > attrsize) {
2970 		error = NFSERR_BADXDR;
2971 	} else {
2972 		attrsize = NFSM_RNDUP(attrsize);
2973 		if (attrsum < attrsize)
2974 			error = nfsm_advance(nd, attrsize - attrsum, -1);
2975 	}
2976 nfsmout:
2977 	NFSEXITCODE2(error, nd);
2978 	return (error);
2979 }
2980 
2981 /*
2982  * Check/setup export credentials.
2983  */
2984 int
2985 nfsd_excred(struct nfsrv_descript *nd, struct nfsexstuff *exp,
2986     struct ucred *credanon)
2987 {
2988 	int error = 0;
2989 
2990 	/*
2991 	 * Check/setup credentials.
2992 	 */
2993 	if (nd->nd_flag & ND_GSS)
2994 		exp->nes_exflag &= ~MNT_EXPORTANON;
2995 
2996 	/*
2997 	 * Check to see if the operation is allowed for this security flavor.
2998 	 * RFC2623 suggests that the NFSv3 Fsinfo RPC be allowed to
2999 	 * AUTH_NONE or AUTH_SYS for file systems requiring RPCSEC_GSS.
3000 	 * Also, allow Secinfo, so that it can acquire the correct flavor(s).
3001 	 */
3002 	if (nfsvno_testexp(nd, exp) &&
3003 	    nd->nd_procnum != NFSV4OP_SECINFO &&
3004 	    nd->nd_procnum != NFSPROC_FSINFO) {
3005 		if (nd->nd_flag & ND_NFSV4)
3006 			error = NFSERR_WRONGSEC;
3007 		else
3008 			error = (NFSERR_AUTHERR | AUTH_TOOWEAK);
3009 		goto out;
3010 	}
3011 
3012 	/*
3013 	 * Check to see if the file system is exported V4 only.
3014 	 */
3015 	if (NFSVNO_EXV4ONLY(exp) && !(nd->nd_flag & ND_NFSV4)) {
3016 		error = NFSERR_PROGNOTV4;
3017 		goto out;
3018 	}
3019 
3020 	/*
3021 	 * Now, map the user credentials.
3022 	 * (Note that ND_AUTHNONE will only be set for an NFSv3
3023 	 *  Fsinfo RPC. If set for anything else, this code might need
3024 	 *  to change.)
3025 	 */
3026 	if (NFSVNO_EXPORTED(exp)) {
3027 		if (((nd->nd_flag & ND_GSS) == 0 && nd->nd_cred->cr_uid == 0) ||
3028 		     NFSVNO_EXPORTANON(exp) ||
3029 		     (nd->nd_flag & ND_AUTHNONE) != 0) {
3030 			nd->nd_cred->cr_uid = credanon->cr_uid;
3031 			nd->nd_cred->cr_gid = credanon->cr_gid;
3032 			crsetgroups(nd->nd_cred, credanon->cr_ngroups,
3033 			    credanon->cr_groups);
3034 		} else if ((nd->nd_flag & ND_GSS) == 0) {
3035 			/*
3036 			 * If using AUTH_SYS, call nfsrv_getgrpscred() to see
3037 			 * if there is a replacement credential with a group
3038 			 * list set up by "nfsuserd -manage-gids".
3039 			 * If there is no replacement, nfsrv_getgrpscred()
3040 			 * simply returns its argument.
3041 			 */
3042 			nd->nd_cred = nfsrv_getgrpscred(nd->nd_cred);
3043 		}
3044 	}
3045 
3046 out:
3047 	NFSEXITCODE2(error, nd);
3048 	return (error);
3049 }
3050 
3051 /*
3052  * Check exports.
3053  */
3054 int
3055 nfsvno_checkexp(struct mount *mp, struct sockaddr *nam, struct nfsexstuff *exp,
3056     struct ucred **credp)
3057 {
3058 	int i, error, *secflavors;
3059 
3060 	error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
3061 	    &exp->nes_numsecflavor, &secflavors);
3062 	if (error) {
3063 		if (nfs_rootfhset) {
3064 			exp->nes_exflag = 0;
3065 			exp->nes_numsecflavor = 0;
3066 			error = 0;
3067 		}
3068 	} else {
3069 		/* Copy the security flavors. */
3070 		for (i = 0; i < exp->nes_numsecflavor; i++)
3071 			exp->nes_secflavors[i] = secflavors[i];
3072 	}
3073 	NFSEXITCODE(error);
3074 	return (error);
3075 }
3076 
3077 /*
3078  * Get a vnode for a file handle and export stuff.
3079  */
3080 int
3081 nfsvno_fhtovp(struct mount *mp, fhandle_t *fhp, struct sockaddr *nam,
3082     int lktype, struct vnode **vpp, struct nfsexstuff *exp,
3083     struct ucred **credp)
3084 {
3085 	int i, error, *secflavors;
3086 
3087 	*credp = NULL;
3088 	exp->nes_numsecflavor = 0;
3089 	error = VFS_FHTOVP(mp, &fhp->fh_fid, lktype, vpp);
3090 	if (error != 0)
3091 		/* Make sure the server replies ESTALE to the client. */
3092 		error = ESTALE;
3093 	if (nam && !error) {
3094 		error = VFS_CHECKEXP(mp, nam, &exp->nes_exflag, credp,
3095 		    &exp->nes_numsecflavor, &secflavors);
3096 		if (error) {
3097 			if (nfs_rootfhset) {
3098 				exp->nes_exflag = 0;
3099 				exp->nes_numsecflavor = 0;
3100 				error = 0;
3101 			} else {
3102 				vput(*vpp);
3103 			}
3104 		} else {
3105 			/* Copy the security flavors. */
3106 			for (i = 0; i < exp->nes_numsecflavor; i++)
3107 				exp->nes_secflavors[i] = secflavors[i];
3108 		}
3109 	}
3110 	NFSEXITCODE(error);
3111 	return (error);
3112 }
3113 
3114 /*
3115  * nfsd_fhtovp() - convert a fh to a vnode ptr
3116  * 	- look up fsid in mount list (if not found ret error)
3117  *	- get vp and export rights by calling nfsvno_fhtovp()
3118  *	- if cred->cr_uid == 0 or MNT_EXPORTANON set it to credanon
3119  *	  for AUTH_SYS
3120  *	- if mpp != NULL, return the mount point so that it can
3121  *	  be used for vn_finished_write() by the caller
3122  */
3123 void
3124 nfsd_fhtovp(struct nfsrv_descript *nd, struct nfsrvfh *nfp, int lktype,
3125     struct vnode **vpp, struct nfsexstuff *exp,
3126     struct mount **mpp, int startwrite)
3127 {
3128 	struct mount *mp;
3129 	struct ucred *credanon;
3130 	fhandle_t *fhp;
3131 
3132 	fhp = (fhandle_t *)nfp->nfsrvfh_data;
3133 	/*
3134 	 * Check for the special case of the nfsv4root_fh.
3135 	 */
3136 	mp = vfs_busyfs(&fhp->fh_fsid);
3137 	if (mpp != NULL)
3138 		*mpp = mp;
3139 	if (mp == NULL) {
3140 		*vpp = NULL;
3141 		nd->nd_repstat = ESTALE;
3142 		goto out;
3143 	}
3144 
3145 	if (startwrite) {
3146 		vn_start_write(NULL, mpp, V_WAIT);
3147 		if (lktype == LK_SHARED && !(MNT_SHARED_WRITES(mp)))
3148 			lktype = LK_EXCLUSIVE;
3149 	}
3150 	nd->nd_repstat = nfsvno_fhtovp(mp, fhp, nd->nd_nam, lktype, vpp, exp,
3151 	    &credanon);
3152 	vfs_unbusy(mp);
3153 
3154 	/*
3155 	 * For NFSv4 without a pseudo root fs, unexported file handles
3156 	 * can be returned, so that Lookup works everywhere.
3157 	 */
3158 	if (!nd->nd_repstat && exp->nes_exflag == 0 &&
3159 	    !(nd->nd_flag & ND_NFSV4)) {
3160 		vput(*vpp);
3161 		nd->nd_repstat = EACCES;
3162 	}
3163 
3164 	/*
3165 	 * Personally, I've never seen any point in requiring a
3166 	 * reserved port#, since only in the rare case where the
3167 	 * clients are all boxes with secure system privileges,
3168 	 * does it provide any enhanced security, but... some people
3169 	 * believe it to be useful and keep putting this code back in.
3170 	 * (There is also some "security checker" out there that
3171 	 *  complains if the nfs server doesn't enforce this.)
3172 	 * However, note the following:
3173 	 * RFC3530 (NFSv4) specifies that a reserved port# not be
3174 	 *	required.
3175 	 * RFC2623 recommends that, if a reserved port# is checked for,
3176 	 *	that there be a way to turn that off--> ifdef'd.
3177 	 */
3178 #ifdef NFS_REQRSVPORT
3179 	if (!nd->nd_repstat) {
3180 		struct sockaddr_in *saddr;
3181 		struct sockaddr_in6 *saddr6;
3182 
3183 		saddr = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in *);
3184 		saddr6 = NFSSOCKADDR(nd->nd_nam, struct sockaddr_in6 *);
3185 		if (!(nd->nd_flag & ND_NFSV4) &&
3186 		    ((saddr->sin_family == AF_INET &&
3187 		      ntohs(saddr->sin_port) >= IPPORT_RESERVED) ||
3188 		     (saddr6->sin6_family == AF_INET6 &&
3189 		      ntohs(saddr6->sin6_port) >= IPPORT_RESERVED))) {
3190 			vput(*vpp);
3191 			nd->nd_repstat = (NFSERR_AUTHERR | AUTH_TOOWEAK);
3192 		}
3193 	}
3194 #endif	/* NFS_REQRSVPORT */
3195 
3196 	/*
3197 	 * Check/setup credentials.
3198 	 */
3199 	if (!nd->nd_repstat) {
3200 		nd->nd_saveduid = nd->nd_cred->cr_uid;
3201 		nd->nd_repstat = nfsd_excred(nd, exp, credanon);
3202 		if (nd->nd_repstat)
3203 			vput(*vpp);
3204 	}
3205 	if (credanon != NULL)
3206 		crfree(credanon);
3207 	if (nd->nd_repstat) {
3208 		if (startwrite)
3209 			vn_finished_write(mp);
3210 		*vpp = NULL;
3211 		if (mpp != NULL)
3212 			*mpp = NULL;
3213 	}
3214 
3215 out:
3216 	NFSEXITCODE2(0, nd);
3217 }
3218 
3219 /*
3220  * glue for fp.
3221  */
3222 static int
3223 fp_getfvp(struct thread *p, int fd, struct file **fpp, struct vnode **vpp)
3224 {
3225 	struct filedesc *fdp;
3226 	struct file *fp;
3227 	int error = 0;
3228 
3229 	fdp = p->td_proc->p_fd;
3230 	if (fd < 0 || fd >= fdp->fd_nfiles ||
3231 	    (fp = fdp->fd_ofiles[fd].fde_file) == NULL) {
3232 		error = EBADF;
3233 		goto out;
3234 	}
3235 	*fpp = fp;
3236 
3237 out:
3238 	NFSEXITCODE(error);
3239 	return (error);
3240 }
3241 
3242 /*
3243  * Called from nfssvc() to update the exports list. Just call
3244  * vfs_export(). This has to be done, since the v4 root fake fs isn't
3245  * in the mount list.
3246  */
3247 int
3248 nfsrv_v4rootexport(void *argp, struct ucred *cred, struct thread *p)
3249 {
3250 	struct nfsex_args *nfsexargp = (struct nfsex_args *)argp;
3251 	int error = 0;
3252 	struct nameidata nd;
3253 	fhandle_t fh;
3254 
3255 	error = vfs_export(&nfsv4root_mnt, &nfsexargp->export);
3256 	if ((nfsexargp->export.ex_flags & MNT_DELEXPORT) != 0)
3257 		nfs_rootfhset = 0;
3258 	else if (error == 0) {
3259 		if (nfsexargp->fspec == NULL) {
3260 			error = EPERM;
3261 			goto out;
3262 		}
3263 		/*
3264 		 * If fspec != NULL, this is the v4root path.
3265 		 */
3266 		NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE,
3267 		    nfsexargp->fspec, p);
3268 		if ((error = namei(&nd)) != 0)
3269 			goto out;
3270 		error = nfsvno_getfh(nd.ni_vp, &fh, p);
3271 		vrele(nd.ni_vp);
3272 		if (!error) {
3273 			nfs_rootfh.nfsrvfh_len = NFSX_MYFH;
3274 			NFSBCOPY((caddr_t)&fh,
3275 			    nfs_rootfh.nfsrvfh_data,
3276 			    sizeof (fhandle_t));
3277 			nfs_rootfhset = 1;
3278 		}
3279 	}
3280 
3281 out:
3282 	NFSEXITCODE(error);
3283 	return (error);
3284 }
3285 
/*
 * This function is meant to report whether the system is near its limit
 * for memory allocation via malloc() or mget().
 * XXX For now it is only a stub: nothing is measured.  When the
 * nfsrv_testmalloclimit knob is non-zero, every 1000th call returns 1
 * (and every 100th of those also prints a message); all other calls
 * return 0.  With the knob at zero the counters do not advance.
 */
int nfsrv_testmalloclimit = 0;
int
nfsrv_mallocmget_limit(void)
{
	static int printmesg = 0;
	static int testval = 1;
	int triggered, announce;

	if (nfsrv_testmalloclimit == 0)
		return (0);

	triggered = ((testval % 1000) == 0);
	testval++;
	if (!triggered)
		return (0);

	announce = ((printmesg % 100) == 0);
	printmesg++;
	if (announce)
		printf("nfsd: malloc/mget near limit\n");
	return (1);
}
3306 
3307 /*
3308  * BSD specific initialization of a mount point.
3309  */
3310 void
3311 nfsd_mntinit(void)
3312 {
3313 	static int inited = 0;
3314 
3315 	if (inited)
3316 		return;
3317 	inited = 1;
3318 	nfsv4root_mnt.mnt_flag = (MNT_RDONLY | MNT_EXPORTED);
3319 	TAILQ_INIT(&nfsv4root_mnt.mnt_nvnodelist);
3320 	TAILQ_INIT(&nfsv4root_mnt.mnt_lazyvnodelist);
3321 	nfsv4root_mnt.mnt_export = NULL;
3322 	TAILQ_INIT(&nfsv4root_opt);
3323 	TAILQ_INIT(&nfsv4root_newopt);
3324 	nfsv4root_mnt.mnt_opt = &nfsv4root_opt;
3325 	nfsv4root_mnt.mnt_optnew = &nfsv4root_newopt;
3326 	nfsv4root_mnt.mnt_nvnodelistsize = 0;
3327 	nfsv4root_mnt.mnt_lazyvnodelistsize = 0;
3328 }
3329 
3330 /*
3331  * Get a vnode for a file handle, without checking exports, etc.
3332  */
3333 struct vnode *
3334 nfsvno_getvp(fhandle_t *fhp)
3335 {
3336 	struct mount *mp;
3337 	struct vnode *vp;
3338 	int error;
3339 
3340 	mp = vfs_busyfs(&fhp->fh_fsid);
3341 	if (mp == NULL)
3342 		return (NULL);
3343 	error = VFS_FHTOVP(mp, &fhp->fh_fid, LK_EXCLUSIVE, &vp);
3344 	vfs_unbusy(mp);
3345 	if (error)
3346 		return (NULL);
3347 	return (vp);
3348 }
3349 
3350 /*
3351  * Do a local VOP_ADVLOCK().
3352  */
3353 int
3354 nfsvno_advlock(struct vnode *vp, int ftype, u_int64_t first,
3355     u_int64_t end, struct thread *td)
3356 {
3357 	int error = 0;
3358 	struct flock fl;
3359 	u_int64_t tlen;
3360 
3361 	if (nfsrv_dolocallocks == 0)
3362 		goto out;
3363 	ASSERT_VOP_UNLOCKED(vp, "nfsvno_advlock: vp locked");
3364 
3365 	fl.l_whence = SEEK_SET;
3366 	fl.l_type = ftype;
3367 	fl.l_start = (off_t)first;
3368 	if (end == NFS64BITSSET) {
3369 		fl.l_len = 0;
3370 	} else {
3371 		tlen = end - first;
3372 		fl.l_len = (off_t)tlen;
3373 	}
3374 	/*
3375 	 * For FreeBSD8, the l_pid and l_sysid must be set to the same
3376 	 * values for all calls, so that all locks will be held by the
3377 	 * nfsd server. (The nfsd server handles conflicts between the
3378 	 * various clients.)
3379 	 * Since an NFSv4 lockowner is a ClientID plus an array of up to 1024
3380 	 * bytes, so it can't be put in l_sysid.
3381 	 */
3382 	if (nfsv4_sysid == 0)
3383 		nfsv4_sysid = nlm_acquire_next_sysid();
3384 	fl.l_pid = (pid_t)0;
3385 	fl.l_sysid = (int)nfsv4_sysid;
3386 
3387 	if (ftype == F_UNLCK)
3388 		error = VOP_ADVLOCK(vp, (caddr_t)td->td_proc, F_UNLCK, &fl,
3389 		    (F_POSIX | F_REMOTE));
3390 	else
3391 		error = VOP_ADVLOCK(vp, (caddr_t)td->td_proc, F_SETLK, &fl,
3392 		    (F_POSIX | F_REMOTE));
3393 
3394 out:
3395 	NFSEXITCODE(error);
3396 	return (error);
3397 }
3398 
3399 /*
3400  * Check the nfsv4 root exports.
3401  */
3402 int
3403 nfsvno_v4rootexport(struct nfsrv_descript *nd)
3404 {
3405 	struct ucred *credanon;
3406 	int exflags, error = 0, numsecflavor, *secflavors, i;
3407 
3408 	error = vfs_stdcheckexp(&nfsv4root_mnt, nd->nd_nam, &exflags,
3409 	    &credanon, &numsecflavor, &secflavors);
3410 	if (error) {
3411 		error = NFSERR_PROGUNAVAIL;
3412 		goto out;
3413 	}
3414 	if (credanon != NULL)
3415 		crfree(credanon);
3416 	for (i = 0; i < numsecflavor; i++) {
3417 		if (secflavors[i] == AUTH_SYS)
3418 			nd->nd_flag |= ND_EXAUTHSYS;
3419 		else if (secflavors[i] == RPCSEC_GSS_KRB5)
3420 			nd->nd_flag |= ND_EXGSS;
3421 		else if (secflavors[i] == RPCSEC_GSS_KRB5I)
3422 			nd->nd_flag |= ND_EXGSSINTEGRITY;
3423 		else if (secflavors[i] == RPCSEC_GSS_KRB5P)
3424 			nd->nd_flag |= ND_EXGSSPRIVACY;
3425 	}
3426 
3427 out:
3428 	NFSEXITCODE(error);
3429 	return (error);
3430 }
3431 
3432 /*
3433  * Nfs server pseudo system call for the nfsd's
3434  */
3435 /*
3436  * MPSAFE
3437  */
3438 static int
3439 nfssvc_nfsd(struct thread *td, struct nfssvc_args *uap)
3440 {
3441 	struct file *fp;
3442 	struct nfsd_addsock_args sockarg;
3443 	struct nfsd_nfsd_args nfsdarg;
3444 	struct nfsd_nfsd_oargs onfsdarg;
3445 	struct nfsd_pnfsd_args pnfsdarg;
3446 	struct vnode *vp, *nvp, *curdvp;
3447 	struct pnfsdsfile *pf;
3448 	struct nfsdevice *ds, *fds;
3449 	cap_rights_t rights;
3450 	int buflen, error, ret;
3451 	char *buf, *cp, *cp2, *cp3;
3452 	char fname[PNFS_FILENAME_LEN + 1];
3453 
3454 	if (uap->flag & NFSSVC_NFSDADDSOCK) {
3455 		error = copyin(uap->argp, (caddr_t)&sockarg, sizeof (sockarg));
3456 		if (error)
3457 			goto out;
3458 		/*
3459 		 * Since we don't know what rights might be required,
3460 		 * pretend that we need them all. It is better to be too
3461 		 * careful than too reckless.
3462 		 */
3463 		error = fget(td, sockarg.sock,
3464 		    cap_rights_init(&rights, CAP_SOCK_SERVER), &fp);
3465 		if (error != 0)
3466 			goto out;
3467 		if (fp->f_type != DTYPE_SOCKET) {
3468 			fdrop(fp, td);
3469 			error = EPERM;
3470 			goto out;
3471 		}
3472 		error = nfsrvd_addsock(fp);
3473 		fdrop(fp, td);
3474 	} else if (uap->flag & NFSSVC_NFSDNFSD) {
3475 		if (uap->argp == NULL) {
3476 			error = EINVAL;
3477 			goto out;
3478 		}
3479 		if ((uap->flag & NFSSVC_NEWSTRUCT) == 0) {
3480 			error = copyin(uap->argp, &onfsdarg, sizeof(onfsdarg));
3481 			if (error == 0) {
3482 				nfsdarg.principal = onfsdarg.principal;
3483 				nfsdarg.minthreads = onfsdarg.minthreads;
3484 				nfsdarg.maxthreads = onfsdarg.maxthreads;
3485 				nfsdarg.version = 1;
3486 				nfsdarg.addr = NULL;
3487 				nfsdarg.addrlen = 0;
3488 				nfsdarg.dnshost = NULL;
3489 				nfsdarg.dnshostlen = 0;
3490 				nfsdarg.dspath = NULL;
3491 				nfsdarg.dspathlen = 0;
3492 				nfsdarg.mdspath = NULL;
3493 				nfsdarg.mdspathlen = 0;
3494 				nfsdarg.mirrorcnt = 1;
3495 			}
3496 		} else
3497 			error = copyin(uap->argp, &nfsdarg, sizeof(nfsdarg));
3498 		if (error)
3499 			goto out;
3500 		if (nfsdarg.addrlen > 0 && nfsdarg.addrlen < 10000 &&
3501 		    nfsdarg.dnshostlen > 0 && nfsdarg.dnshostlen < 10000 &&
3502 		    nfsdarg.dspathlen > 0 && nfsdarg.dspathlen < 10000 &&
3503 		    nfsdarg.mdspathlen > 0 && nfsdarg.mdspathlen < 10000 &&
3504 		    nfsdarg.mirrorcnt >= 1 &&
3505 		    nfsdarg.mirrorcnt <= NFSDEV_MAXMIRRORS &&
3506 		    nfsdarg.addr != NULL && nfsdarg.dnshost != NULL &&
3507 		    nfsdarg.dspath != NULL && nfsdarg.mdspath != NULL) {
3508 			NFSD_DEBUG(1, "addrlen=%d dspathlen=%d dnslen=%d"
3509 			    " mdspathlen=%d mirrorcnt=%d\n", nfsdarg.addrlen,
3510 			    nfsdarg.dspathlen, nfsdarg.dnshostlen,
3511 			    nfsdarg.mdspathlen, nfsdarg.mirrorcnt);
3512 			cp = malloc(nfsdarg.addrlen + 1, M_TEMP, M_WAITOK);
3513 			error = copyin(nfsdarg.addr, cp, nfsdarg.addrlen);
3514 			if (error != 0) {
3515 				free(cp, M_TEMP);
3516 				goto out;
3517 			}
3518 			cp[nfsdarg.addrlen] = '\0';	/* Ensure nul term. */
3519 			nfsdarg.addr = cp;
3520 			cp = malloc(nfsdarg.dnshostlen + 1, M_TEMP, M_WAITOK);
3521 			error = copyin(nfsdarg.dnshost, cp, nfsdarg.dnshostlen);
3522 			if (error != 0) {
3523 				free(nfsdarg.addr, M_TEMP);
3524 				free(cp, M_TEMP);
3525 				goto out;
3526 			}
3527 			cp[nfsdarg.dnshostlen] = '\0';	/* Ensure nul term. */
3528 			nfsdarg.dnshost = cp;
3529 			cp = malloc(nfsdarg.dspathlen + 1, M_TEMP, M_WAITOK);
3530 			error = copyin(nfsdarg.dspath, cp, nfsdarg.dspathlen);
3531 			if (error != 0) {
3532 				free(nfsdarg.addr, M_TEMP);
3533 				free(nfsdarg.dnshost, M_TEMP);
3534 				free(cp, M_TEMP);
3535 				goto out;
3536 			}
3537 			cp[nfsdarg.dspathlen] = '\0';	/* Ensure nul term. */
3538 			nfsdarg.dspath = cp;
3539 			cp = malloc(nfsdarg.mdspathlen + 1, M_TEMP, M_WAITOK);
3540 			error = copyin(nfsdarg.mdspath, cp, nfsdarg.mdspathlen);
3541 			if (error != 0) {
3542 				free(nfsdarg.addr, M_TEMP);
3543 				free(nfsdarg.dnshost, M_TEMP);
3544 				free(nfsdarg.dspath, M_TEMP);
3545 				free(cp, M_TEMP);
3546 				goto out;
3547 			}
3548 			cp[nfsdarg.mdspathlen] = '\0';	/* Ensure nul term. */
3549 			nfsdarg.mdspath = cp;
3550 		} else {
3551 			nfsdarg.addr = NULL;
3552 			nfsdarg.addrlen = 0;
3553 			nfsdarg.dnshost = NULL;
3554 			nfsdarg.dnshostlen = 0;
3555 			nfsdarg.dspath = NULL;
3556 			nfsdarg.dspathlen = 0;
3557 			nfsdarg.mdspath = NULL;
3558 			nfsdarg.mdspathlen = 0;
3559 			nfsdarg.mirrorcnt = 1;
3560 		}
3561 		error = nfsrvd_nfsd(td, &nfsdarg);
3562 		free(nfsdarg.addr, M_TEMP);
3563 		free(nfsdarg.dnshost, M_TEMP);
3564 		free(nfsdarg.dspath, M_TEMP);
3565 		free(nfsdarg.mdspath, M_TEMP);
3566 	} else if (uap->flag & NFSSVC_PNFSDS) {
3567 		error = copyin(uap->argp, &pnfsdarg, sizeof(pnfsdarg));
3568 		if (error == 0 && (pnfsdarg.op == PNFSDOP_DELDSSERVER ||
3569 		    pnfsdarg.op == PNFSDOP_FORCEDELDS)) {
3570 			cp = malloc(PATH_MAX + 1, M_TEMP, M_WAITOK);
3571 			error = copyinstr(pnfsdarg.dspath, cp, PATH_MAX + 1,
3572 			    NULL);
3573 			if (error == 0)
3574 				error = nfsrv_deldsserver(pnfsdarg.op, cp, td);
3575 			free(cp, M_TEMP);
3576 		} else if (error == 0 && pnfsdarg.op == PNFSDOP_COPYMR) {
3577 			cp = malloc(PATH_MAX + 1, M_TEMP, M_WAITOK);
3578 			buflen = sizeof(*pf) * NFSDEV_MAXMIRRORS;
3579 			buf = malloc(buflen, M_TEMP, M_WAITOK);
3580 			error = copyinstr(pnfsdarg.mdspath, cp, PATH_MAX + 1,
3581 			    NULL);
3582 			NFSD_DEBUG(4, "pnfsdcopymr cp mdspath=%d\n", error);
3583 			if (error == 0 && pnfsdarg.dspath != NULL) {
3584 				cp2 = malloc(PATH_MAX + 1, M_TEMP, M_WAITOK);
3585 				error = copyinstr(pnfsdarg.dspath, cp2,
3586 				    PATH_MAX + 1, NULL);
3587 				NFSD_DEBUG(4, "pnfsdcopymr cp dspath=%d\n",
3588 				    error);
3589 			} else
3590 				cp2 = NULL;
3591 			if (error == 0 && pnfsdarg.curdspath != NULL) {
3592 				cp3 = malloc(PATH_MAX + 1, M_TEMP, M_WAITOK);
3593 				error = copyinstr(pnfsdarg.curdspath, cp3,
3594 				    PATH_MAX + 1, NULL);
3595 				NFSD_DEBUG(4, "pnfsdcopymr cp curdspath=%d\n",
3596 				    error);
3597 			} else
3598 				cp3 = NULL;
3599 			curdvp = NULL;
3600 			fds = NULL;
3601 			if (error == 0)
3602 				error = nfsrv_mdscopymr(cp, cp2, cp3, buf,
3603 				    &buflen, fname, td, &vp, &nvp, &pf, &ds,
3604 				    &fds);
3605 			NFSD_DEBUG(4, "nfsrv_mdscopymr=%d\n", error);
3606 			if (error == 0) {
3607 				if (pf->dsf_dir >= nfsrv_dsdirsize) {
3608 					printf("copymr: dsdir out of range\n");
3609 					pf->dsf_dir = 0;
3610 				}
3611 				NFSD_DEBUG(4, "copymr: buflen=%d\n", buflen);
3612 				error = nfsrv_copymr(vp, nvp,
3613 				    ds->nfsdev_dsdir[pf->dsf_dir], ds, pf,
3614 				    (struct pnfsdsfile *)buf,
3615 				    buflen / sizeof(*pf), td->td_ucred, td);
3616 				vput(vp);
3617 				vput(nvp);
3618 				if (fds != NULL && error == 0) {
3619 					curdvp = fds->nfsdev_dsdir[pf->dsf_dir];
3620 					ret = vn_lock(curdvp, LK_EXCLUSIVE);
3621 					if (ret == 0) {
3622 						nfsrv_dsremove(curdvp, fname,
3623 						    td->td_ucred, td);
3624 						NFSVOPUNLOCK(curdvp);
3625 					}
3626 				}
3627 				NFSD_DEBUG(4, "nfsrv_copymr=%d\n", error);
3628 			}
3629 			free(cp, M_TEMP);
3630 			free(cp2, M_TEMP);
3631 			free(cp3, M_TEMP);
3632 			free(buf, M_TEMP);
3633 		}
3634 	} else {
3635 		error = nfssvc_srvcall(td, uap, td->td_ucred);
3636 	}
3637 
3638 out:
3639 	NFSEXITCODE(error);
3640 	return (error);
3641 }
3642 
3643 static int
3644 nfssvc_srvcall(struct thread *p, struct nfssvc_args *uap, struct ucred *cred)
3645 {
3646 	struct nfsex_args export;
3647 	struct file *fp = NULL;
3648 	int stablefd, len;
3649 	struct nfsd_clid adminrevoke;
3650 	struct nfsd_dumplist dumplist;
3651 	struct nfsd_dumpclients *dumpclients;
3652 	struct nfsd_dumplocklist dumplocklist;
3653 	struct nfsd_dumplocks *dumplocks;
3654 	struct nameidata nd;
3655 	vnode_t vp;
3656 	int error = EINVAL, igotlock;
3657 	struct proc *procp;
3658 	static int suspend_nfsd = 0;
3659 
3660 	if (uap->flag & NFSSVC_PUBLICFH) {
3661 		NFSBZERO((caddr_t)&nfs_pubfh.nfsrvfh_data,
3662 		    sizeof (fhandle_t));
3663 		error = copyin(uap->argp,
3664 		    &nfs_pubfh.nfsrvfh_data, sizeof (fhandle_t));
3665 		if (!error)
3666 			nfs_pubfhset = 1;
3667 	} else if (uap->flag & NFSSVC_V4ROOTEXPORT) {
3668 		error = copyin(uap->argp,(caddr_t)&export,
3669 		    sizeof (struct nfsex_args));
3670 		if (!error)
3671 			error = nfsrv_v4rootexport(&export, cred, p);
3672 	} else if (uap->flag & NFSSVC_NOPUBLICFH) {
3673 		nfs_pubfhset = 0;
3674 		error = 0;
3675 	} else if (uap->flag & NFSSVC_STABLERESTART) {
3676 		error = copyin(uap->argp, (caddr_t)&stablefd,
3677 		    sizeof (int));
3678 		if (!error)
3679 			error = fp_getfvp(p, stablefd, &fp, &vp);
3680 		if (!error && (NFSFPFLAG(fp) & (FREAD | FWRITE)) != (FREAD | FWRITE))
3681 			error = EBADF;
3682 		if (!error && newnfs_numnfsd != 0)
3683 			error = EPERM;
3684 		if (!error) {
3685 			nfsrv_stablefirst.nsf_fp = fp;
3686 			nfsrv_setupstable(p);
3687 		}
3688 	} else if (uap->flag & NFSSVC_ADMINREVOKE) {
3689 		error = copyin(uap->argp, (caddr_t)&adminrevoke,
3690 		    sizeof (struct nfsd_clid));
3691 		if (!error)
3692 			error = nfsrv_adminrevoke(&adminrevoke, p);
3693 	} else if (uap->flag & NFSSVC_DUMPCLIENTS) {
3694 		error = copyin(uap->argp, (caddr_t)&dumplist,
3695 		    sizeof (struct nfsd_dumplist));
3696 		if (!error && (dumplist.ndl_size < 1 ||
3697 			dumplist.ndl_size > NFSRV_MAXDUMPLIST))
3698 			error = EPERM;
3699 		if (!error) {
3700 		    len = sizeof (struct nfsd_dumpclients) * dumplist.ndl_size;
3701 		    dumpclients = malloc(len, M_TEMP, M_WAITOK | M_ZERO);
3702 		    nfsrv_dumpclients(dumpclients, dumplist.ndl_size);
3703 		    error = copyout(dumpclients,
3704 			CAST_USER_ADDR_T(dumplist.ndl_list), len);
3705 		    free(dumpclients, M_TEMP);
3706 		}
3707 	} else if (uap->flag & NFSSVC_DUMPLOCKS) {
3708 		error = copyin(uap->argp, (caddr_t)&dumplocklist,
3709 		    sizeof (struct nfsd_dumplocklist));
3710 		if (!error && (dumplocklist.ndllck_size < 1 ||
3711 			dumplocklist.ndllck_size > NFSRV_MAXDUMPLIST))
3712 			error = EPERM;
3713 		if (!error)
3714 			error = nfsrv_lookupfilename(&nd,
3715 				dumplocklist.ndllck_fname, p);
3716 		if (!error) {
3717 			len = sizeof (struct nfsd_dumplocks) *
3718 				dumplocklist.ndllck_size;
3719 			dumplocks = malloc(len, M_TEMP, M_WAITOK | M_ZERO);
3720 			nfsrv_dumplocks(nd.ni_vp, dumplocks,
3721 			    dumplocklist.ndllck_size, p);
3722 			vput(nd.ni_vp);
3723 			error = copyout(dumplocks,
3724 			    CAST_USER_ADDR_T(dumplocklist.ndllck_list), len);
3725 			free(dumplocks, M_TEMP);
3726 		}
3727 	} else if (uap->flag & NFSSVC_BACKUPSTABLE) {
3728 		procp = p->td_proc;
3729 		PROC_LOCK(procp);
3730 		nfsd_master_pid = procp->p_pid;
3731 		bcopy(procp->p_comm, nfsd_master_comm, MAXCOMLEN + 1);
3732 		nfsd_master_start = procp->p_stats->p_start;
3733 		nfsd_master_proc = procp;
3734 		PROC_UNLOCK(procp);
3735 	} else if ((uap->flag & NFSSVC_SUSPENDNFSD) != 0) {
3736 		NFSLOCKV4ROOTMUTEX();
3737 		if (suspend_nfsd == 0) {
3738 			/* Lock out all nfsd threads */
3739 			do {
3740 				igotlock = nfsv4_lock(&nfsd_suspend_lock, 1,
3741 				    NULL, NFSV4ROOTLOCKMUTEXPTR, NULL);
3742 			} while (igotlock == 0 && suspend_nfsd == 0);
3743 			suspend_nfsd = 1;
3744 		}
3745 		NFSUNLOCKV4ROOTMUTEX();
3746 		error = 0;
3747 	} else if ((uap->flag & NFSSVC_RESUMENFSD) != 0) {
3748 		NFSLOCKV4ROOTMUTEX();
3749 		if (suspend_nfsd != 0) {
3750 			nfsv4_unlock(&nfsd_suspend_lock, 0);
3751 			suspend_nfsd = 0;
3752 		}
3753 		NFSUNLOCKV4ROOTMUTEX();
3754 		error = 0;
3755 	}
3756 
3757 	NFSEXITCODE(error);
3758 	return (error);
3759 }
3760 
3761 /*
3762  * Check exports.
3763  * Returns 0 if ok, 1 otherwise.
3764  */
3765 int
3766 nfsvno_testexp(struct nfsrv_descript *nd, struct nfsexstuff *exp)
3767 {
3768 	int i;
3769 
3770 	/*
3771 	 * This seems odd, but allow the case where the security flavor
3772 	 * list is empty. This happens when NFSv4 is traversing non-exported
3773 	 * file systems. Exported file systems should always have a non-empty
3774 	 * security flavor list.
3775 	 */
3776 	if (exp->nes_numsecflavor == 0)
3777 		return (0);
3778 
3779 	for (i = 0; i < exp->nes_numsecflavor; i++) {
3780 		/*
3781 		 * The tests for privacy and integrity must be first,
3782 		 * since ND_GSS is set for everything but AUTH_SYS.
3783 		 */
3784 		if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5P &&
3785 		    (nd->nd_flag & ND_GSSPRIVACY))
3786 			return (0);
3787 		if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5I &&
3788 		    (nd->nd_flag & ND_GSSINTEGRITY))
3789 			return (0);
3790 		if (exp->nes_secflavors[i] == RPCSEC_GSS_KRB5 &&
3791 		    (nd->nd_flag & ND_GSS))
3792 			return (0);
3793 		if (exp->nes_secflavors[i] == AUTH_SYS &&
3794 		    (nd->nd_flag & ND_GSS) == 0)
3795 			return (0);
3796 	}
3797 	return (1);
3798 }
3799 
3800 /*
3801  * Calculate a hash value for the fid in a file handle.
3802  */
3803 uint32_t
3804 nfsrv_hashfh(fhandle_t *fhp)
3805 {
3806 	uint32_t hashval;
3807 
3808 	hashval = hash32_buf(&fhp->fh_fid, sizeof(struct fid), 0);
3809 	return (hashval);
3810 }
3811 
3812 /*
3813  * Calculate a hash value for the sessionid.
3814  */
3815 uint32_t
3816 nfsrv_hashsessionid(uint8_t *sessionid)
3817 {
3818 	uint32_t hashval;
3819 
3820 	hashval = hash32_buf(sessionid, NFSX_V4SESSIONID, 0);
3821 	return (hashval);
3822 }
3823 
3824 /*
3825  * Signal the userland master nfsd to backup the stable restart file.
3826  */
3827 void
3828 nfsrv_backupstable(void)
3829 {
3830 	struct proc *procp;
3831 
3832 	if (nfsd_master_proc != NULL) {
3833 		procp = pfind(nfsd_master_pid);
3834 		/* Try to make sure it is the correct process. */
3835 		if (procp == nfsd_master_proc &&
3836 		    procp->p_stats->p_start.tv_sec ==
3837 		    nfsd_master_start.tv_sec &&
3838 		    procp->p_stats->p_start.tv_usec ==
3839 		    nfsd_master_start.tv_usec &&
3840 		    strcmp(procp->p_comm, nfsd_master_comm) == 0)
3841 			kern_psignal(procp, SIGUSR2);
3842 		else
3843 			nfsd_master_proc = NULL;
3844 
3845 		if (procp != NULL)
3846 			PROC_UNLOCK(procp);
3847 	}
3848 }
3849 
3850 /*
3851  * Create a DS data file for nfsrv_pnfscreate(). Called for each mirror.
3852  * The arguments are in a structure, so that they can be passed through
3853  * taskqueue for a kernel process to execute this function.
3854  */
3855 struct nfsrvdscreate {
3856 	int			done;
3857 	int			inprog;
3858 	struct task		tsk;
3859 	struct ucred		*tcred;
3860 	struct vnode		*dvp;
3861 	NFSPROC_T		*p;
3862 	struct pnfsdsfile	*pf;
3863 	int			err;
3864 	fhandle_t		fh;
3865 	struct vattr		va;
3866 	struct vattr		createva;
3867 };
3868 
3869 int
3870 nfsrv_dscreate(struct vnode *dvp, struct vattr *vap, struct vattr *nvap,
3871     fhandle_t *fhp, struct pnfsdsfile *pf, struct pnfsdsattr *dsa,
3872     char *fnamep, struct ucred *tcred, NFSPROC_T *p, struct vnode **nvpp)
3873 {
3874 	struct vnode *nvp;
3875 	struct nameidata named;
3876 	struct vattr va;
3877 	char *bufp;
3878 	u_long *hashp;
3879 	struct nfsnode *np;
3880 	struct nfsmount *nmp;
3881 	int error;
3882 
3883 	NFSNAMEICNDSET(&named.ni_cnd, tcred, CREATE,
3884 	    LOCKPARENT | LOCKLEAF | SAVESTART | NOCACHE);
3885 	nfsvno_setpathbuf(&named, &bufp, &hashp);
3886 	named.ni_cnd.cn_lkflags = LK_EXCLUSIVE;
3887 	named.ni_cnd.cn_thread = p;
3888 	named.ni_cnd.cn_nameptr = bufp;
3889 	if (fnamep != NULL) {
3890 		strlcpy(bufp, fnamep, PNFS_FILENAME_LEN + 1);
3891 		named.ni_cnd.cn_namelen = strlen(bufp);
3892 	} else
3893 		named.ni_cnd.cn_namelen = nfsrv_putfhname(fhp, bufp);
3894 	NFSD_DEBUG(4, "nfsrv_dscreate: dvp=%p fname=%s\n", dvp, bufp);
3895 
3896 	/* Create the date file in the DS mount. */
3897 	error = NFSVOPLOCK(dvp, LK_EXCLUSIVE);
3898 	if (error == 0) {
3899 		error = VOP_CREATE(dvp, &nvp, &named.ni_cnd, vap);
3900 		NFSVOPUNLOCK(dvp);
3901 		if (error == 0) {
3902 			/* Set the ownership of the file. */
3903 			error = VOP_SETATTR(nvp, nvap, tcred);
3904 			NFSD_DEBUG(4, "nfsrv_dscreate:"
3905 			    " setattr-uid=%d\n", error);
3906 			if (error != 0)
3907 				vput(nvp);
3908 		}
3909 		if (error != 0)
3910 			printf("pNFS: pnfscreate failed=%d\n", error);
3911 	} else
3912 		printf("pNFS: pnfscreate vnlock=%d\n", error);
3913 	if (error == 0) {
3914 		np = VTONFS(nvp);
3915 		nmp = VFSTONFS(nvp->v_mount);
3916 		if (strcmp(nvp->v_mount->mnt_vfc->vfc_name, "nfs")
3917 		    != 0 || nmp->nm_nam->sa_len > sizeof(
3918 		    struct sockaddr_in6) ||
3919 		    np->n_fhp->nfh_len != NFSX_MYFH) {
3920 			printf("Bad DS file: fstype=%s salen=%d"
3921 			    " fhlen=%d\n",
3922 			    nvp->v_mount->mnt_vfc->vfc_name,
3923 			    nmp->nm_nam->sa_len, np->n_fhp->nfh_len);
3924 			error = ENOENT;
3925 		}
3926 
3927 		/* Set extattrs for the DS on the MDS file. */
3928 		if (error == 0) {
3929 			if (dsa != NULL) {
3930 				error = VOP_GETATTR(nvp, &va, tcred);
3931 				if (error == 0) {
3932 					dsa->dsa_filerev = va.va_filerev;
3933 					dsa->dsa_size = va.va_size;
3934 					dsa->dsa_atime = va.va_atime;
3935 					dsa->dsa_mtime = va.va_mtime;
3936 					dsa->dsa_bytes = va.va_bytes;
3937 				}
3938 			}
3939 			if (error == 0) {
3940 				NFSBCOPY(np->n_fhp->nfh_fh, &pf->dsf_fh,
3941 				    NFSX_MYFH);
3942 				NFSBCOPY(nmp->nm_nam, &pf->dsf_sin,
3943 				    nmp->nm_nam->sa_len);
3944 				NFSBCOPY(named.ni_cnd.cn_nameptr,
3945 				    pf->dsf_filename,
3946 				    sizeof(pf->dsf_filename));
3947 			}
3948 		} else
3949 			printf("pNFS: pnfscreate can't get DS"
3950 			    " attr=%d\n", error);
3951 		if (nvpp != NULL && error == 0)
3952 			*nvpp = nvp;
3953 		else
3954 			vput(nvp);
3955 	}
3956 	nfsvno_relpathbuf(&named);
3957 	return (error);
3958 }
3959 
3960 /*
3961  * Start up the thread that will execute nfsrv_dscreate().
3962  */
3963 static void
3964 start_dscreate(void *arg, int pending)
3965 {
3966 	struct nfsrvdscreate *dsc;
3967 
3968 	dsc = (struct nfsrvdscreate *)arg;
3969 	dsc->err = nfsrv_dscreate(dsc->dvp, &dsc->createva, &dsc->va, &dsc->fh,
3970 	    dsc->pf, NULL, NULL, dsc->tcred, dsc->p, NULL);
3971 	dsc->done = 1;
3972 	NFSD_DEBUG(4, "start_dscreate: err=%d\n", dsc->err);
3973 }
3974 
3975 /*
3976  * Create a pNFS data file on the Data Server(s).
3977  */
3978 static void
3979 nfsrv_pnfscreate(struct vnode *vp, struct vattr *vap, struct ucred *cred,
3980     NFSPROC_T *p)
3981 {
3982 	struct nfsrvdscreate *dsc, *tdsc = NULL;
3983 	struct nfsdevice *ds, *tds, *fds;
3984 	struct mount *mp;
3985 	struct pnfsdsfile *pf, *tpf;
3986 	struct pnfsdsattr dsattr;
3987 	struct vattr va;
3988 	struct vnode *dvp[NFSDEV_MAXMIRRORS];
3989 	struct nfsmount *nmp;
3990 	fhandle_t fh;
3991 	uid_t vauid;
3992 	gid_t vagid;
3993 	u_short vamode;
3994 	struct ucred *tcred;
3995 	int dsdir[NFSDEV_MAXMIRRORS], error, i, mirrorcnt, ret;
3996 	int failpos, timo;
3997 
3998 	/* Get a DS server directory in a round-robin order. */
3999 	mirrorcnt = 1;
4000 	mp = vp->v_mount;
4001 	ds = fds = NULL;
4002 	NFSDDSLOCK();
4003 	/*
4004 	 * Search for the first entry that handles this MDS fs, but use the
4005 	 * first entry for all MDS fs's otherwise.
4006 	 */
4007 	TAILQ_FOREACH(tds, &nfsrv_devidhead, nfsdev_list) {
4008 		if (tds->nfsdev_nmp != NULL) {
4009 			if (tds->nfsdev_mdsisset == 0 && ds == NULL)
4010 				ds = tds;
4011 			else if (tds->nfsdev_mdsisset != 0 &&
4012 			    mp->mnt_stat.f_fsid.val[0] ==
4013 			    tds->nfsdev_mdsfsid.val[0] &&
4014 			    mp->mnt_stat.f_fsid.val[1] ==
4015 			    tds->nfsdev_mdsfsid.val[1]) {
4016 				ds = fds = tds;
4017 				break;
4018 			}
4019 		}
4020 	}
4021 	if (ds == NULL) {
4022 		NFSDDSUNLOCK();
4023 		NFSD_DEBUG(4, "nfsrv_pnfscreate: no srv\n");
4024 		return;
4025 	}
4026 	i = dsdir[0] = ds->nfsdev_nextdir;
4027 	ds->nfsdev_nextdir = (ds->nfsdev_nextdir + 1) % nfsrv_dsdirsize;
4028 	dvp[0] = ds->nfsdev_dsdir[i];
4029 	tds = TAILQ_NEXT(ds, nfsdev_list);
4030 	if (nfsrv_maxpnfsmirror > 1 && tds != NULL) {
4031 		TAILQ_FOREACH_FROM(tds, &nfsrv_devidhead, nfsdev_list) {
4032 			if (tds->nfsdev_nmp != NULL &&
4033 			    ((tds->nfsdev_mdsisset == 0 && fds == NULL) ||
4034 			     (tds->nfsdev_mdsisset != 0 && fds != NULL &&
4035 			      mp->mnt_stat.f_fsid.val[0] ==
4036 			      tds->nfsdev_mdsfsid.val[0] &&
4037 			      mp->mnt_stat.f_fsid.val[1] ==
4038 			      tds->nfsdev_mdsfsid.val[1]))) {
4039 				dsdir[mirrorcnt] = i;
4040 				dvp[mirrorcnt] = tds->nfsdev_dsdir[i];
4041 				mirrorcnt++;
4042 				if (mirrorcnt >= nfsrv_maxpnfsmirror)
4043 					break;
4044 			}
4045 		}
4046 	}
4047 	/* Put at end of list to implement round-robin usage. */
4048 	TAILQ_REMOVE(&nfsrv_devidhead, ds, nfsdev_list);
4049 	TAILQ_INSERT_TAIL(&nfsrv_devidhead, ds, nfsdev_list);
4050 	NFSDDSUNLOCK();
4051 	dsc = NULL;
4052 	if (mirrorcnt > 1)
4053 		tdsc = dsc = malloc(sizeof(*dsc) * (mirrorcnt - 1), M_TEMP,
4054 		    M_WAITOK | M_ZERO);
4055 	tpf = pf = malloc(sizeof(*pf) * nfsrv_maxpnfsmirror, M_TEMP, M_WAITOK |
4056 	    M_ZERO);
4057 
4058 	error = nfsvno_getfh(vp, &fh, p);
4059 	if (error == 0)
4060 		error = VOP_GETATTR(vp, &va, cred);
4061 	if (error == 0) {
4062 		/* Set the attributes for "vp" to Setattr the DS vp. */
4063 		vauid = va.va_uid;
4064 		vagid = va.va_gid;
4065 		vamode = va.va_mode;
4066 		VATTR_NULL(&va);
4067 		va.va_uid = vauid;
4068 		va.va_gid = vagid;
4069 		va.va_mode = vamode;
4070 		va.va_size = 0;
4071 	} else
4072 		printf("pNFS: pnfscreate getfh+attr=%d\n", error);
4073 
4074 	NFSD_DEBUG(4, "nfsrv_pnfscreate: cruid=%d crgid=%d\n", cred->cr_uid,
4075 	    cred->cr_gid);
4076 	/* Make data file name based on FH. */
4077 	tcred = newnfs_getcred();
4078 
4079 	/*
4080 	 * Create the file on each DS mirror, using kernel process(es) for the
4081 	 * additional mirrors.
4082 	 */
4083 	failpos = -1;
4084 	for (i = 0; i < mirrorcnt - 1 && error == 0; i++, tpf++, tdsc++) {
4085 		tpf->dsf_dir = dsdir[i];
4086 		tdsc->tcred = tcred;
4087 		tdsc->p = p;
4088 		tdsc->pf = tpf;
4089 		tdsc->createva = *vap;
4090 		NFSBCOPY(&fh, &tdsc->fh, sizeof(fh));
4091 		tdsc->va = va;
4092 		tdsc->dvp = dvp[i];
4093 		tdsc->done = 0;
4094 		tdsc->inprog = 0;
4095 		tdsc->err = 0;
4096 		ret = EIO;
4097 		if (nfs_pnfsiothreads != 0) {
4098 			ret = nfs_pnfsio(start_dscreate, tdsc);
4099 			NFSD_DEBUG(4, "nfsrv_pnfscreate: nfs_pnfsio=%d\n", ret);
4100 		}
4101 		if (ret != 0) {
4102 			ret = nfsrv_dscreate(dvp[i], vap, &va, &fh, tpf, NULL,
4103 			    NULL, tcred, p, NULL);
4104 			if (ret != 0) {
4105 				KASSERT(error == 0, ("nfsrv_dscreate err=%d",
4106 				    error));
4107 				if (failpos == -1 && nfsds_failerr(ret))
4108 					failpos = i;
4109 				else
4110 					error = ret;
4111 			}
4112 		}
4113 	}
4114 	if (error == 0) {
4115 		tpf->dsf_dir = dsdir[mirrorcnt - 1];
4116 		error = nfsrv_dscreate(dvp[mirrorcnt - 1], vap, &va, &fh, tpf,
4117 		    &dsattr, NULL, tcred, p, NULL);
4118 		if (failpos == -1 && mirrorcnt > 1 && nfsds_failerr(error)) {
4119 			failpos = mirrorcnt - 1;
4120 			error = 0;
4121 		}
4122 	}
4123 	timo = hz / 50;		/* Wait for 20msec. */
4124 	if (timo < 1)
4125 		timo = 1;
4126 	/* Wait for kernel task(s) to complete. */
4127 	for (tdsc = dsc, i = 0; i < mirrorcnt - 1; i++, tdsc++) {
4128 		while (tdsc->inprog != 0 && tdsc->done == 0)
4129 			tsleep(&tdsc->tsk, PVFS, "srvdcr", timo);
4130 		if (tdsc->err != 0) {
4131 			if (failpos == -1 && nfsds_failerr(tdsc->err))
4132 				failpos = i;
4133 			else if (error == 0)
4134 				error = tdsc->err;
4135 		}
4136 	}
4137 
4138 	/*
4139 	 * If failpos has been set, that mirror has failed, so it needs
4140 	 * to be disabled.
4141 	 */
4142 	if (failpos >= 0) {
4143 		nmp = VFSTONFS(dvp[failpos]->v_mount);
4144 		NFSLOCKMNT(nmp);
4145 		if ((nmp->nm_privflag & (NFSMNTP_FORCEDISM |
4146 		     NFSMNTP_CANCELRPCS)) == 0) {
4147 			nmp->nm_privflag |= NFSMNTP_CANCELRPCS;
4148 			NFSUNLOCKMNT(nmp);
4149 			ds = nfsrv_deldsnmp(PNFSDOP_DELDSSERVER, nmp, p);
4150 			NFSD_DEBUG(4, "dscreatfail fail=%d ds=%p\n", failpos,
4151 			    ds);
4152 			if (ds != NULL)
4153 				nfsrv_killrpcs(nmp);
4154 			NFSLOCKMNT(nmp);
4155 			nmp->nm_privflag &= ~NFSMNTP_CANCELRPCS;
4156 			wakeup(nmp);
4157 		}
4158 		NFSUNLOCKMNT(nmp);
4159 	}
4160 
4161 	NFSFREECRED(tcred);
4162 	if (error == 0) {
4163 		ASSERT_VOP_ELOCKED(vp, "nfsrv_pnfscreate vp");
4164 
4165 		NFSD_DEBUG(4, "nfsrv_pnfscreate: mirrorcnt=%d maxmirror=%d\n",
4166 		    mirrorcnt, nfsrv_maxpnfsmirror);
4167 		/*
4168 		 * For all mirrors that couldn't be created, fill in the
4169 		 * *pf structure, but with an IP address == 0.0.0.0.
4170 		 */
4171 		tpf = pf + mirrorcnt;
4172 		for (i = mirrorcnt; i < nfsrv_maxpnfsmirror; i++, tpf++) {
4173 			*tpf = *pf;
4174 			tpf->dsf_sin.sin_family = AF_INET;
4175 			tpf->dsf_sin.sin_len = sizeof(struct sockaddr_in);
4176 			tpf->dsf_sin.sin_addr.s_addr = 0;
4177 			tpf->dsf_sin.sin_port = 0;
4178 		}
4179 
4180 		error = vn_extattr_set(vp, IO_NODELOCKED,
4181 		    EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsfile",
4182 		    sizeof(*pf) * nfsrv_maxpnfsmirror, (char *)pf, p);
4183 		if (error == 0)
4184 			error = vn_extattr_set(vp, IO_NODELOCKED,
4185 			    EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsattr",
4186 			    sizeof(dsattr), (char *)&dsattr, p);
4187 		if (error != 0)
4188 			printf("pNFS: pnfscreate setextattr=%d\n",
4189 			    error);
4190 	} else
4191 		printf("pNFS: pnfscreate=%d\n", error);
4192 	free(pf, M_TEMP);
4193 	free(dsc, M_TEMP);
4194 }
4195 
4196 /*
4197  * Get the information needed to remove the pNFS Data Server file from the
4198  * Metadata file.  Upon success, ddvp is set non-NULL to the locked
4199  * DS directory vnode.  The caller must unlock *ddvp when done with it.
4200  */
4201 static void
4202 nfsrv_pnfsremovesetup(struct vnode *vp, NFSPROC_T *p, struct vnode **dvpp,
4203     int *mirrorcntp, char *fname, fhandle_t *fhp)
4204 {
4205 	struct vattr va;
4206 	struct ucred *tcred;
4207 	char *buf;
4208 	int buflen, error;
4209 
4210 	dvpp[0] = NULL;
4211 	/* If not an exported regular file or not a pNFS server, just return. */
4212 	if (vp->v_type != VREG || (vp->v_mount->mnt_flag & MNT_EXPORTED) == 0 ||
4213 	    nfsrv_devidcnt == 0)
4214 		return;
4215 
4216 	/* Check to see if this is the last hard link. */
4217 	tcred = newnfs_getcred();
4218 	error = VOP_GETATTR(vp, &va, tcred);
4219 	NFSFREECRED(tcred);
4220 	if (error != 0) {
4221 		printf("pNFS: nfsrv_pnfsremovesetup getattr=%d\n", error);
4222 		return;
4223 	}
4224 	if (va.va_nlink > 1)
4225 		return;
4226 
4227 	error = nfsvno_getfh(vp, fhp, p);
4228 	if (error != 0) {
4229 		printf("pNFS: nfsrv_pnfsremovesetup getfh=%d\n", error);
4230 		return;
4231 	}
4232 
4233 	buflen = 1024;
4234 	buf = malloc(buflen, M_TEMP, M_WAITOK);
4235 	/* Get the directory vnode for the DS mount and the file handle. */
4236 	error = nfsrv_dsgetsockmnt(vp, 0, buf, &buflen, mirrorcntp, p, dvpp,
4237 	    NULL, NULL, fname, NULL, NULL, NULL, NULL, NULL);
4238 	free(buf, M_TEMP);
4239 	if (error != 0)
4240 		printf("pNFS: nfsrv_pnfsremovesetup getsockmnt=%d\n", error);
4241 }
4242 
4243 /*
4244  * Remove a DS data file for nfsrv_pnfsremove(). Called for each mirror.
4245  * The arguments are in a structure, so that they can be passed through
4246  * taskqueue for a kernel process to execute this function.
 *
 * nfsrv_pnfsremove() fills in one of these per additional mirror and hands
 * it to nfs_pnfsio() with start_dsremove() as the handler.
4247  */
4248 struct nfsrvdsremove {
4249 	int			done;	/* set to 1 by start_dsremove() once the remove has run */
4250 	int			inprog;	/* zeroed by the caller; presumably set by nfs_pnfsio() when queued - TODO confirm */
4251 	struct task		tsk;	/* its address is the tsleep() channel of the waiter; presumably the queued task - TODO confirm */
4252 	struct ucred		*tcred;	/* credential handed to nfsrv_dsremove() */
4253 	struct vnode		*dvp;	/* DS directory vnode to remove the file from */
4254 	NFSPROC_T		*p;	/* thread argument handed to nfsrv_dsremove() */
4255 	int			err;	/* return value of nfsrv_dsremove() */
4256 	char			fname[PNFS_FILENAME_LEN + 1];	/* copy of the DS file name */
4257 };
4258 
4259 static int
4260 nfsrv_dsremove(struct vnode *dvp, char *fname, struct ucred *tcred,
4261     NFSPROC_T *p)
4262 {
4263 	struct nameidata named;
4264 	struct vnode *nvp;
4265 	char *bufp;
4266 	u_long *hashp;
4267 	int error;
4268 
4269 	error = NFSVOPLOCK(dvp, LK_EXCLUSIVE);
4270 	if (error != 0)
4271 		return (error);
4272 	named.ni_cnd.cn_nameiop = DELETE;
4273 	named.ni_cnd.cn_lkflags = LK_EXCLUSIVE | LK_RETRY;
4274 	named.ni_cnd.cn_cred = tcred;
4275 	named.ni_cnd.cn_thread = p;
4276 	named.ni_cnd.cn_flags = ISLASTCN | LOCKPARENT | LOCKLEAF | SAVENAME;
4277 	nfsvno_setpathbuf(&named, &bufp, &hashp);
4278 	named.ni_cnd.cn_nameptr = bufp;
4279 	named.ni_cnd.cn_namelen = strlen(fname);
4280 	strlcpy(bufp, fname, NAME_MAX);
4281 	NFSD_DEBUG(4, "nfsrv_pnfsremove: filename=%s\n", bufp);
4282 	error = VOP_LOOKUP(dvp, &nvp, &named.ni_cnd);
4283 	NFSD_DEBUG(4, "nfsrv_pnfsremove: aft LOOKUP=%d\n", error);
4284 	if (error == 0) {
4285 		error = VOP_REMOVE(dvp, nvp, &named.ni_cnd);
4286 		vput(nvp);
4287 	}
4288 	NFSVOPUNLOCK(dvp);
4289 	nfsvno_relpathbuf(&named);
4290 	if (error != 0)
4291 		printf("pNFS: nfsrv_pnfsremove failed=%d\n", error);
4292 	return (error);
4293 }
4294 
4295 /*
4296  * Start up the thread that will execute nfsrv_dsremove().
4297  */
4298 static void
4299 start_dsremove(void *arg, int pending)
4300 {
4301 	struct nfsrvdsremove *dsrm;
4302 
4303 	dsrm = (struct nfsrvdsremove *)arg;
4304 	dsrm->err = nfsrv_dsremove(dsrm->dvp, dsrm->fname, dsrm->tcred,
4305 	    dsrm->p);
4306 	dsrm->done = 1;
4307 	NFSD_DEBUG(4, "start_dsremove: err=%d\n", dsrm->err);
4308 }
4309 
4310 /*
4311  * Remove a pNFS data file from a Data Server.
4312  * nfsrv_pnfsremovesetup() must have been called before the MDS file was
4313  * removed to set up the dvp and fill in the FH.
4314  */
4315 static void
4316 nfsrv_pnfsremove(struct vnode **dvp, int mirrorcnt, char *fname, fhandle_t *fhp,
4317     NFSPROC_T *p)
4318 {
4319 	struct ucred *tcred;
4320 	struct nfsrvdsremove *dsrm, *tdsrm;
4321 	struct nfsdevice *ds;
4322 	struct nfsmount *nmp;
4323 	int failpos, i, ret, timo;
4324 
4325 	tcred = newnfs_getcred();
4326 	dsrm = NULL;
4327 	if (mirrorcnt > 1)
4328 		dsrm = malloc(sizeof(*dsrm) * mirrorcnt - 1, M_TEMP, M_WAITOK);
4329 	/*
4330 	 * Remove the file on each DS mirror, using kernel process(es) for the
4331 	 * additional mirrors.
4332 	 */
4333 	failpos = -1;
4334 	for (tdsrm = dsrm, i = 0; i < mirrorcnt - 1; i++, tdsrm++) {
4335 		tdsrm->tcred = tcred;
4336 		tdsrm->p = p;
4337 		tdsrm->dvp = dvp[i];
4338 		strlcpy(tdsrm->fname, fname, PNFS_FILENAME_LEN + 1);
4339 		tdsrm->inprog = 0;
4340 		tdsrm->done = 0;
4341 		tdsrm->err = 0;
4342 		ret = EIO;
4343 		if (nfs_pnfsiothreads != 0) {
4344 			ret = nfs_pnfsio(start_dsremove, tdsrm);
4345 			NFSD_DEBUG(4, "nfsrv_pnfsremove: nfs_pnfsio=%d\n", ret);
4346 		}
4347 		if (ret != 0) {
4348 			ret = nfsrv_dsremove(dvp[i], fname, tcred, p);
4349 			if (failpos == -1 && nfsds_failerr(ret))
4350 				failpos = i;
4351 		}
4352 	}
4353 	ret = nfsrv_dsremove(dvp[mirrorcnt - 1], fname, tcred, p);
4354 	if (failpos == -1 && mirrorcnt > 1 && nfsds_failerr(ret))
4355 		failpos = mirrorcnt - 1;
4356 	timo = hz / 50;		/* Wait for 20msec. */
4357 	if (timo < 1)
4358 		timo = 1;
4359 	/* Wait for kernel task(s) to complete. */
4360 	for (tdsrm = dsrm, i = 0; i < mirrorcnt - 1; i++, tdsrm++) {
4361 		while (tdsrm->inprog != 0 && tdsrm->done == 0)
4362 			tsleep(&tdsrm->tsk, PVFS, "srvdsrm", timo);
4363 		if (failpos == -1 && nfsds_failerr(tdsrm->err))
4364 			failpos = i;
4365 	}
4366 
4367 	/*
4368 	 * If failpos has been set, that mirror has failed, so it needs
4369 	 * to be disabled.
4370 	 */
4371 	if (failpos >= 0) {
4372 		nmp = VFSTONFS(dvp[failpos]->v_mount);
4373 		NFSLOCKMNT(nmp);
4374 		if ((nmp->nm_privflag & (NFSMNTP_FORCEDISM |
4375 		     NFSMNTP_CANCELRPCS)) == 0) {
4376 			nmp->nm_privflag |= NFSMNTP_CANCELRPCS;
4377 			NFSUNLOCKMNT(nmp);
4378 			ds = nfsrv_deldsnmp(PNFSDOP_DELDSSERVER, nmp, p);
4379 			NFSD_DEBUG(4, "dsremovefail fail=%d ds=%p\n", failpos,
4380 			    ds);
4381 			if (ds != NULL)
4382 				nfsrv_killrpcs(nmp);
4383 			NFSLOCKMNT(nmp);
4384 			nmp->nm_privflag &= ~NFSMNTP_CANCELRPCS;
4385 			wakeup(nmp);
4386 		}
4387 		NFSUNLOCKMNT(nmp);
4388 	}
4389 
4390 	/* Get rid all layouts for the file. */
4391 	nfsrv_freefilelayouts(fhp);
4392 
4393 	NFSFREECRED(tcred);
4394 	free(dsrm, M_TEMP);
4395 }
4396 
4397 /*
4398  * Generate a file name based on the file handle and put it in *bufp.
4399  * Return the number of bytes generated.
4400  */
4401 static int
4402 nfsrv_putfhname(fhandle_t *fhp, char *bufp)
4403 {
4404 	int i;
4405 	uint8_t *cp;
4406 	const uint8_t *hexdigits = "0123456789abcdef";
4407 
4408 	cp = (uint8_t *)fhp;
4409 	for (i = 0; i < sizeof(*fhp); i++) {
4410 		bufp[2 * i] = hexdigits[(*cp >> 4) & 0xf];
4411 		bufp[2 * i + 1] = hexdigits[*cp++ & 0xf];
4412 	}
4413 	bufp[2 * i] = '\0';
4414 	return (2 * i);
4415 }
4416 
4417 /*
4418  * Update the Metadata file's attributes from the DS file when a Read/Write
4419  * layout is returned.
4420  * Basically just call nfsrv_proxyds() with procedure == NFSPROC_LAYOUTRETURN
4421  * so that it does a nfsrv_getattrdsrpc() and nfsrv_setextattr() on the DS file.
4422  */
4423 int
4424 nfsrv_updatemdsattr(struct vnode *vp, struct nfsvattr *nap, NFSPROC_T *p)
4425 {
4426 	struct ucred *tcred;
4427 	int error;
4428 
4429 	/* Do this as root so that it won't fail with EACCES. */
4430 	tcred = newnfs_getcred();
4431 	error = nfsrv_proxyds(vp, 0, 0, tcred, p, NFSPROC_LAYOUTRETURN,
4432 	    NULL, NULL, NULL, nap, NULL, NULL, 0, NULL);
4433 	NFSFREECRED(tcred);
4434 	return (error);
4435 }
4436 
4437 /*
4438  * Set the NFSv4 ACL on the DS file to the same ACL as the MDS file.
4439  */
4440 static int
4441 nfsrv_dssetacl(struct vnode *vp, struct acl *aclp, struct ucred *cred,
4442     NFSPROC_T *p)
4443 {
4444 	int error;
4445 
4446 	error = nfsrv_proxyds(vp, 0, 0, cred, p, NFSPROC_SETACL,
4447 	    NULL, NULL, NULL, NULL, aclp, NULL, 0, NULL);
4448 	return (error);
4449 }
4450 
/*
 * Proxy an operation on the MDS file "vp" to its Data Server (DS) file(s).
 * "ioproc" selects what is done:
 * - NFSPROC_READDS: nfsrv_readdsrpc() of "cnt" bytes at "off" on the first
 *   mirror; "mpp" and "mpp2" are passed through to it.
 * - NFSPROC_WRITEDS: nfsrv_writedsrpc() on all mirrors with "off", "cnt",
 *   "mpp" and "cp".
 * - NFSPROC_SETATTR / NFSPROC_SETACL: nfsrv_setattrdsrpc() with "nap" /
 *   nfsrv_setacldsrpc() with "aclp", on all mirrors.
 * - NFSPROC_SEEKDS: nfsrv_seekdsrpc() on the first mirror with "offp",
 *   "content" and "eofp".
 * - NFSPROC_ALLOCATE: nfsrv_allocatedsrpc() on all mirrors with "off" and
 *   "*offp".
 * - NFSPROC_GETATTR: the "pnfsd.dsattr" extended attribute of vp is copied
 *   into "nap" first; an RPC is only done if that attribute is missing
 *   (ENOATTR) or nfsrv_pnfsgetdsattr/nfsrv_checkdsattr() ask for one.
 * - Any other value, and NFSPROC_GETATTR when an RPC is needed:
 *   nfsrv_getattrdsrpc() on the last mirror, with "nap".
 * "cred" and "p" are passed to the RPC functions.
 *
 * A mirror reported as failed (failpos) is disabled via nfsrv_deldsnmp().
 * For the single-mirror operations the call is then retried, at most until
 * trycnt reaches the original mirror count.
 *
 * Returns 0 on success, ENOENT if vp is not an exported regular file, no
 * DS is configured or the DS information cannot be obtained, otherwise the
 * error from the RPC function.
 */
4451 static int
4452 nfsrv_proxyds(struct vnode *vp, off_t off, int cnt, struct ucred *cred,
4453     struct thread *p, int ioproc, struct mbuf **mpp, char *cp,
4454     struct mbuf **mpp2, struct nfsvattr *nap, struct acl *aclp,
4455     off_t *offp, int content, bool *eofp)
4456 {
4457 	struct nfsmount *nmp[NFSDEV_MAXMIRRORS], *failnmp;
4458 	fhandle_t fh[NFSDEV_MAXMIRRORS];
4459 	struct vnode *dvp[NFSDEV_MAXMIRRORS];
4460 	struct nfsdevice *ds;
4461 	struct pnfsdsattr dsattr;
4462 	struct opnfsdsattr odsattr;
4463 	char *buf;
4464 	int buflen, error, failpos, i, mirrorcnt, origmircnt, trycnt;
4465 
4466 	NFSD_DEBUG(4, "in nfsrv_proxyds\n");
4467 	/*
4468 	 * If not a regular file, not exported or not a pNFS server,
4469 	 * just return ENOENT.
4470 	 */
4471 	if (vp->v_type != VREG || (vp->v_mount->mnt_flag & MNT_EXPORTED) == 0 ||
4472 	    nfsrv_devidcnt == 0)
4473 		return (ENOENT);
4474 
4475 	buflen = 1024;
4476 	buf = malloc(buflen, M_TEMP, M_WAITOK);
4477 	error = 0;
4478 
4479 	/*
4480 	 * For Getattr, get the Change attribute (va_filerev) and size (va_size)
4481 	 * from the MetaData file's extended attribute.
4482 	 */
4483 	if (ioproc == NFSPROC_GETATTR) {
4484 		error = vn_extattr_get(vp, IO_NODELOCKED,
4485 		    EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsattr", &buflen, buf,
4486 		    p);
4487 		if (error == 0) {
			/* The size of the attribute tells which layout it has. */
4488 			if (buflen == sizeof(odsattr)) {
4489 				NFSBCOPY(buf, &odsattr, buflen);
4490 				nap->na_filerev = odsattr.dsa_filerev;
4491 				nap->na_size = odsattr.dsa_size;
4492 				nap->na_atime = odsattr.dsa_atime;
4493 				nap->na_mtime = odsattr.dsa_mtime;
4494 				/*
4495 				 * Fake na_bytes by rounding up na_size.
4496 				 * Since we don't know the block size, just
4497 				 * use BLKDEV_IOSIZE.
4498 				 */
4499 				nap->na_bytes = (odsattr.dsa_size +
4500 				    BLKDEV_IOSIZE - 1) & ~(BLKDEV_IOSIZE - 1);
4501 			} else if (buflen == sizeof(dsattr)) {
4502 				NFSBCOPY(buf, &dsattr, buflen);
4503 				nap->na_filerev = dsattr.dsa_filerev;
4504 				nap->na_size = dsattr.dsa_size;
4505 				nap->na_atime = dsattr.dsa_atime;
4506 				nap->na_mtime = dsattr.dsa_mtime;
4507 				nap->na_bytes = dsattr.dsa_bytes;
4508 			} else
4509 				error = ENXIO;
4510 		}
4511 		if (error == 0) {
4512 			/*
4513 			 * If nfsrv_pnfsgetdsattr is 0 or nfsrv_checkdsattr()
4514 			 * returns 0, just return now.  nfsrv_checkdsattr()
4515 			 * returns 0 if there is no Read/Write layout
4516 			 * plus either an Open/Write_access or Write
4517 			 * delegation issued to a client for the file.
4518 			 */
4519 			if (nfsrv_pnfsgetdsattr == 0 ||
4520 			    nfsrv_checkdsattr(vp, p) == 0) {
4521 				free(buf, M_TEMP);
4522 				return (error);
4523 			}
4524 		}
4525 
4526 		/*
4527 		 * Clear ENOATTR so the code below will attempt to do a
4528 		 * nfsrv_getattrdsrpc() to get the attributes and (re)create
4529 		 * the extended attribute.
4530 		 */
4531 		if (error == ENOATTR)
4532 			error = 0;
4533 	}
4534 
	/* origmircnt records the mirror count of the first pass; it bounds the retries. */
4535 	origmircnt = -1;
4536 	trycnt = 0;
4537 tryagain:
4538 	if (error == 0) {
4539 		buflen = 1024;
4540 		if (ioproc == NFSPROC_READDS && NFSVOPISLOCKED(vp) ==
4541 		    LK_EXCLUSIVE)
4542 			printf("nfsrv_proxyds: Readds vp exclusively locked\n");
		/* On success the dvp[] entries are locked LK_SHARED; they are unlocked below. */
4543 		error = nfsrv_dsgetsockmnt(vp, LK_SHARED, buf, &buflen,
4544 		    &mirrorcnt, p, dvp, fh, NULL, NULL, NULL, NULL, NULL,
4545 		    NULL, NULL);
4546 		if (error == 0) {
4547 			for (i = 0; i < mirrorcnt; i++)
4548 				nmp[i] = VFSTONFS(dvp[i]->v_mount);
4549 		} else
4550 			printf("pNFS: proxy getextattr sockaddr=%d\n", error);
4551 	} else
4552 		printf("pNFS: nfsrv_dsgetsockmnt=%d\n", error);
4553 	if (error == 0) {
4554 		failpos = -1;
4555 		if (origmircnt == -1)
4556 			origmircnt = mirrorcnt;
4557 		/*
4558 		 * If failpos is set to a mirror#, then that mirror has
4559 		 * failed and will be disabled. For Read, Getattr and Seek, the
4560 		 * function only tries one mirror, so if that mirror has
4561 		 * failed, it will need to be retried. As such, increment
4562 		 * trycnt for these cases.
4563 		 * For Write, Setattr and Setacl, the function tries all
4564 		 * mirrors and will not return an error for the case where
4565 		 * one mirror has failed. For these cases, the functioning
4566 		 * mirror(s) will have been modified, so a retry isn't
4567 		 * necessary. These functions will set failpos for the
4568 		 * failed mirror#.
4569 		 */
4570 		if (ioproc == NFSPROC_READDS) {
4571 			error = nfsrv_readdsrpc(fh, off, cnt, cred, p, nmp[0],
4572 			    mpp, mpp2);
4573 			if (nfsds_failerr(error) && mirrorcnt > 1) {
4574 				/*
4575 				 * Setting failpos will cause the mirror
4576 				 * to be disabled and then a retry of this
4577 				 * read is required.
4578 				 */
4579 				failpos = 0;
4580 				error = 0;
4581 				trycnt++;
4582 			}
4583 		} else if (ioproc == NFSPROC_WRITEDS)
4584 			error = nfsrv_writedsrpc(fh, off, cnt, cred, p, vp,
4585 			    &nmp[0], mirrorcnt, mpp, cp, &failpos);
4586 		else if (ioproc == NFSPROC_SETATTR)
4587 			error = nfsrv_setattrdsrpc(fh, cred, p, vp, &nmp[0],
4588 			    mirrorcnt, nap, &failpos);
4589 		else if (ioproc == NFSPROC_SETACL)
4590 			error = nfsrv_setacldsrpc(fh, cred, p, vp, &nmp[0],
4591 			    mirrorcnt, aclp, &failpos);
4592 		else if (ioproc == NFSPROC_SEEKDS) {
4593 			error = nfsrv_seekdsrpc(fh, offp, content, eofp, cred,
4594 			    p, nmp[0]);
4595 			if (nfsds_failerr(error) && mirrorcnt > 1) {
4596 				/*
4597 				 * Setting failpos will cause the mirror
4598 				 * to be disabled and then a retry of this
4599 				 * seek is required.
4600 				 */
4601 				failpos = 0;
4602 				error = 0;
4603 				trycnt++;
4604 			}
4605 		} else if (ioproc == NFSPROC_ALLOCATE)
4606 			error = nfsrv_allocatedsrpc(fh, off, *offp, cred, p, vp,
4607 			    &nmp[0], mirrorcnt, &failpos);
4608 		else {
			/*
			 * Every other procedure (e.g. Getattr, or the
			 * Layoutreturn used by nfsrv_updatemdsattr()) gets
			 * the attributes from the last mirror.
			 */
4609 			error = nfsrv_getattrdsrpc(&fh[mirrorcnt - 1], cred, p,
4610 			    vp, nmp[mirrorcnt - 1], nap);
4611 			if (nfsds_failerr(error) && mirrorcnt > 1) {
4612 				/*
4613 				 * Setting failpos will cause the mirror
4614 				 * to be disabled and then a retry of this
4615 				 * getattr is required.
4616 				 */
4617 				failpos = mirrorcnt - 1;
4618 				error = 0;
4619 				trycnt++;
4620 			}
4621 		}
4622 		ds = NULL;
4623 		if (failpos >= 0) {
			/*
			 * Disable the failed mirror, unless a forced dismount
			 * or another cancel is already in progress on it.
			 */
4624 			failnmp = nmp[failpos];
4625 			NFSLOCKMNT(failnmp);
4626 			if ((failnmp->nm_privflag & (NFSMNTP_FORCEDISM |
4627 			     NFSMNTP_CANCELRPCS)) == 0) {
4628 				failnmp->nm_privflag |= NFSMNTP_CANCELRPCS;
4629 				NFSUNLOCKMNT(failnmp);
4630 				ds = nfsrv_deldsnmp(PNFSDOP_DELDSSERVER,
4631 				    failnmp, p);
4632 				NFSD_DEBUG(4, "dsldsnmp fail=%d ds=%p\n",
4633 				    failpos, ds);
4634 				if (ds != NULL)
4635 					nfsrv_killrpcs(failnmp);
4636 				NFSLOCKMNT(failnmp);
4637 				failnmp->nm_privflag &= ~NFSMNTP_CANCELRPCS;
4638 				wakeup(failnmp);
4639 			}
4640 			NFSUNLOCKMNT(failnmp);
4641 		}
4642 		for (i = 0; i < mirrorcnt; i++)
4643 			NFSVOPUNLOCK(dvp[i]);
4644 		NFSD_DEBUG(4, "nfsrv_proxyds: aft RPC=%d trya=%d\n", error,
4645 		    trycnt);
4646 		/* Try the Read/Getattr again if a mirror was deleted. */
4647 		if (ds != NULL && trycnt > 0 && trycnt < origmircnt)
4648 			goto tryagain;
4649 	} else {
4650 		/* Return ENOENT for any Extended Attribute error. */
4651 		error = ENOENT;
4652 	}
4653 	free(buf, M_TEMP);
4654 	NFSD_DEBUG(4, "nfsrv_proxyds: error=%d\n", error);
4655 	return (error);
4656 }
4657 
4658 /*
4659  * Get the DS mount point, fh and directory from the "pnfsd.dsfile" extended
4660  * attribute.
 * vp - The MDS file vnode; it must be locked.
 * lktype - If non-zero, the lock type used for each DS directory vnode,
 *          which is then left locked on success.  If 0, a DS directory is
 *          only locked around the lookup of the DS file and unlocked again.
 * buf, buflenp - Buffer the extended attribute is read into and its length.
 * mirrorcntp - Set to 1 on entry and to the number of mirrors found upon
 *          success.
 * dvpp - If non-NULL, array filled in with the DS directory vnodes.
 * fhp - If non-NULL, array filled in with the DS file handles.
 * devid - If non-NULL, filled in with NFSX_V4DEVICEID bytes per mirror.
 * fnamep - If non-NULL, filled in with the DS file name of the first
 *          mirror found.
 * nvpp - If non-NULL, set to the first DS file vnode found by
 *          nfsrv_pnfslookupds(), with *dsdirp set to its directory index.
4661  * newnmpp - If it points to a non-NULL nmp, that is the destination and needs
4662  *           to be checked.  If it points to a NULL nmp, then it returns
4663  *           a suitable destination.
4664  * curnmp - If non-NULL, it is the source mount for the copy.
 * ippos - If non-NULL, set to -1 and then to the index of the entry that
 *          matches curnmp or, when curnmp is NULL, of an entry with an
 *          AF_INET address of 0.
 * Returns 0 upon success.  Errors set here are ENOATTR for a malformed
 * attribute, ENXIO if curnmp is not in the mirror list, EEXIST if the
 * destination already is a mirror and ENOENT if no DS was found; errors
 * from vn_extattr_get(), vn_lock() and nfsrv_pnfslookupds() are passed
 * through.
4665  */
4666 int
4667 nfsrv_dsgetsockmnt(struct vnode *vp, int lktype, char *buf, int *buflenp,
4668     int *mirrorcntp, NFSPROC_T *p, struct vnode **dvpp, fhandle_t *fhp,
4669     char *devid, char *fnamep, struct vnode **nvpp, struct nfsmount **newnmpp,
4670     struct nfsmount *curnmp, int *ippos, int *dsdirp)
4671 {
4672 	struct vnode *dvp, *nvp = NULL, **tdvpp;
4673 	struct mount *mp;
4674 	struct nfsmount *nmp, *newnmp;
4675 	struct sockaddr *sad;
4676 	struct sockaddr_in *sin;
4677 	struct nfsdevice *ds, *tds, *fndds;
4678 	struct pnfsdsfile *pf;
4679 	uint32_t dsdir;
4680 	int error, fhiszero, fnd, gotone, i, mirrorcnt;
4681 
4682 	ASSERT_VOP_LOCKED(vp, "nfsrv_dsgetsockmnt vp");
4683 	*mirrorcntp = 1;
4684 	tdvpp = dvpp;
4685 	if (nvpp != NULL)
4686 		*nvpp = NULL;
4687 	if (dvpp != NULL)
4688 		*dvpp = NULL;
4689 	if (ippos != NULL)
4690 		*ippos = -1;
4691 	if (newnmpp != NULL)
4692 		newnmp = *newnmpp;
4693 	else
4694 		newnmp = NULL;
4695 	mp = vp->v_mount;
4696 	error = vn_extattr_get(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM,
4697 	    "pnfsd.dsfile", buflenp, buf, p);
	/* The attribute must be a whole number (1..NFSDEV_MAXMIRRORS) of entries. */
4698 	mirrorcnt = *buflenp / sizeof(*pf);
4699 	if (error == 0 && (mirrorcnt < 1 || mirrorcnt > NFSDEV_MAXMIRRORS ||
4700 	    *buflenp != sizeof(*pf) * mirrorcnt))
4701 		error = ENOATTR;
4702 
4703 	pf = (struct pnfsdsfile *)buf;
4704 	/* If curnmp != NULL, check for a match in the mirror list. */
4705 	if (curnmp != NULL && error == 0) {
4706 		fnd = 0;
4707 		for (i = 0; i < mirrorcnt; i++, pf++) {
4708 			sad = (struct sockaddr *)&pf->dsf_sin;
4709 			if (nfsaddr2_match(sad, curnmp->nm_nam)) {
4710 				if (ippos != NULL)
4711 					*ippos = i;
4712 				fnd = 1;
4713 				break;
4714 			}
4715 		}
4716 		if (fnd == 0)
4717 			error = ENXIO;
4718 	}
4719 
	/* Now walk all the entries; gotone counts the mirrors returned. */
4720 	gotone = 0;
4721 	pf = (struct pnfsdsfile *)buf;
4722 	NFSD_DEBUG(4, "nfsrv_dsgetsockmnt: mirrorcnt=%d err=%d\n", mirrorcnt,
4723 	    error);
4724 	for (i = 0; i < mirrorcnt && error == 0; i++, pf++) {
4725 		fhiszero = 0;
4726 		sad = (struct sockaddr *)&pf->dsf_sin;
4727 		sin = &pf->dsf_sin;
4728 		dsdir = pf->dsf_dir;
4729 		if (dsdir >= nfsrv_dsdirsize) {
4730 			printf("nfsrv_dsgetsockmnt: dsdir=%d\n", dsdir);
4731 			error = ENOATTR;
4732 		} else if (nvpp != NULL && newnmp != NULL &&
4733 		    nfsaddr2_match(sad, newnmp->nm_nam))
4734 			error = EEXIST;
4735 		if (error == 0) {
4736 			if (ippos != NULL && curnmp == NULL &&
4737 			    sad->sa_family == AF_INET &&
4738 			    sin->sin_addr.s_addr == 0)
4739 				*ippos = i;
4740 			if (NFSBCMP(&zerofh, &pf->dsf_fh, sizeof(zerofh)) == 0)
4741 				fhiszero = 1;
4742 			/* Use the socket address to find the mount point. */
4743 			fndds = NULL;
4744 			NFSDDSLOCK();
4745 			/* Find a match for the IP address. */
4746 			TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) {
4747 				if (ds->nfsdev_nmp != NULL) {
4748 					dvp = ds->nfsdev_dvp;
4749 					nmp = VFSTONFS(dvp->v_mount);
4750 					if (nmp != ds->nfsdev_nmp)
4751 						printf("different2 nmp %p %p\n",
4752 						    nmp, ds->nfsdev_nmp);
4753 					if (nfsaddr2_match(sad, nmp->nm_nam)) {
4754 						fndds = ds;
4755 						break;
4756 					}
4757 				}
4758 			}
4759 			if (fndds != NULL && newnmpp != NULL &&
4760 			    newnmp == NULL) {
4761 				/* Search for a place to make a mirror copy. */
4762 				TAILQ_FOREACH(tds, &nfsrv_devidhead,
4763 				    nfsdev_list) {
4764 					if (tds->nfsdev_nmp != NULL &&
4765 					    fndds != tds &&
4766 					    ((tds->nfsdev_mdsisset == 0 &&
4767 					      fndds->nfsdev_mdsisset == 0) ||
4768 					     (tds->nfsdev_mdsisset != 0 &&
4769 					      fndds->nfsdev_mdsisset != 0 &&
4770 					      tds->nfsdev_mdsfsid.val[0] ==
4771 					      mp->mnt_stat.f_fsid.val[0] &&
4772 					      tds->nfsdev_mdsfsid.val[1] ==
4773 					      mp->mnt_stat.f_fsid.val[1]))) {
4774 						*newnmpp = tds->nfsdev_nmp;
4775 						break;
4776 					}
4777 				}
4778 				if (tds != NULL) {
4779 					/*
4780 					 * Move this entry to the end of the
4781 					 * list, so it won't be selected as
4782 					 * easily the next time.
4783 					 */
4784 					TAILQ_REMOVE(&nfsrv_devidhead, tds,
4785 					    nfsdev_list);
4786 					TAILQ_INSERT_TAIL(&nfsrv_devidhead, tds,
4787 					    nfsdev_list);
4788 				}
4789 			}
4790 			NFSDDSUNLOCK();
4791 			if (fndds != NULL) {
4792 				dvp = fndds->nfsdev_dsdir[dsdir];
4793 				if (lktype != 0 || fhiszero != 0 ||
4794 				    (nvpp != NULL && *nvpp == NULL)) {
					/*
					 * Filling in a zero file handle takes
					 * an exclusive lock, otherwise use the
					 * caller's lock type or a shared lock.
					 */
4795 					if (fhiszero != 0)
4796 						error = vn_lock(dvp,
4797 						    LK_EXCLUSIVE);
4798 					else if (lktype != 0)
4799 						error = vn_lock(dvp, lktype);
4800 					else
4801 						error = vn_lock(dvp, LK_SHARED);
4802 					/*
4803 					 * If the file handle is all 0's, try to
4804 					 * do a Lookup against the DS to acquire
4805 					 * it.
4806 					 * If dvpp == NULL or the Lookup fails,
4807 					 * unlock dvp after the call.
4808 					 */
4809 					if (error == 0 && (fhiszero != 0 ||
4810 					    (nvpp != NULL && *nvpp == NULL))) {
4811 						error = nfsrv_pnfslookupds(vp,
4812 						    dvp, pf, &nvp, p);
4813 						if (error == 0) {
4814 							if (fhiszero != 0)
4815 								nfsrv_pnfssetfh(
4816 								    vp, pf,
4817 								    devid,
4818 								    fnamep,
4819 								    nvp, p);
							/*
							 * NOTE(review): dsdirp
							 * is dereferenced with
							 * no NULL check here;
							 * callers that pass
							 * nvpp presumably also
							 * pass dsdirp - confirm.
							 */
4820 							if (nvpp != NULL &&
4821 							    *nvpp == NULL) {
4822 								*nvpp = nvp;
4823 								*dsdirp = dsdir;
4824 							} else
4825 								vput(nvp);
4826 						}
4827 						if (error != 0 || lktype == 0)
4828 							NFSVOPUNLOCK(dvp);
4829 					}
4830 				}
4831 				if (error == 0) {
4832 					gotone++;
4833 					NFSD_DEBUG(4, "gotone=%d\n", gotone);
4834 					if (devid != NULL) {
4835 						NFSBCOPY(fndds->nfsdev_deviceid,
4836 						    devid, NFSX_V4DEVICEID);
4837 						devid += NFSX_V4DEVICEID;
4838 					}
4839 					if (dvpp != NULL)
4840 						*tdvpp++ = dvp;
4841 					if (fhp != NULL)
4842 						NFSBCOPY(&pf->dsf_fh, fhp++,
4843 						    NFSX_MYFH);
4844 					if (fnamep != NULL && gotone == 1)
4845 						strlcpy(fnamep,
4846 						    pf->dsf_filename,
4847 						    sizeof(pf->dsf_filename));
4848 				} else
4849 					NFSD_DEBUG(4, "nfsrv_dsgetsockmnt "
4850 					    "err=%d\n", error);
4851 			}
4852 		}
4853 	}
4854 	if (error == 0 && gotone == 0)
4855 		error = ENOENT;
4856 
4857 	NFSD_DEBUG(4, "eo nfsrv_dsgetsockmnt: gotone=%d err=%d\n", gotone,
4858 	    error);
4859 	if (error == 0)
4860 		*mirrorcntp = gotone;
4861 	else {
4862 		if (gotone > 0 && dvpp != NULL) {
4863 			/*
4864 			 * If the error didn't occur on the first one and
4865 			 * dvpp != NULL, the one(s) prior to the failure will
4866 			 * have locked dvp's that need to be unlocked.
			 *
			 * NOTE(review): with lktype == 0 the loop above does
			 * not appear to leave any dvp locked, yet they are
			 * unlocked here as well - confirm this is safe.
4867 			 */
4868 			for (i = 0; i < gotone; i++) {
4869 				NFSVOPUNLOCK(*dvpp);
4870 				*dvpp++ = NULL;
4871 			}
4872 		}
4873 		/*
4874 		 * If it found the vnode to be copied from before a failure,
4875 		 * it needs to be vput()'d.
4876 		 */
4877 		if (nvpp != NULL && *nvpp != NULL) {
4878 			vput(*nvpp);
4879 			*nvpp = NULL;
4880 		}
4881 	}
4882 	return (error);
4883 }
4884 
4885 /*
4886  * Set the extended attribute for the Change attribute.
4887  */
4888 static int
4889 nfsrv_setextattr(struct vnode *vp, struct nfsvattr *nap, NFSPROC_T *p)
4890 {
4891 	struct pnfsdsattr dsattr;
4892 	int error;
4893 
4894 	ASSERT_VOP_ELOCKED(vp, "nfsrv_setextattr vp");
4895 	dsattr.dsa_filerev = nap->na_filerev;
4896 	dsattr.dsa_size = nap->na_size;
4897 	dsattr.dsa_atime = nap->na_atime;
4898 	dsattr.dsa_mtime = nap->na_mtime;
4899 	dsattr.dsa_bytes = nap->na_bytes;
4900 	error = vn_extattr_set(vp, IO_NODELOCKED, EXTATTR_NAMESPACE_SYSTEM,
4901 	    "pnfsd.dsattr", sizeof(dsattr), (char *)&dsattr, p);
4902 	if (error != 0)
4903 		printf("pNFS: setextattr=%d\n", error);
4904 	return (error);
4905 }
4906 
/*
 * Do a Read RPC (NFSPROC_READDS) of "len" bytes at offset "off" against
 * the DS mount "nmp" for the DS file handle "fhp", using "cred" and "p".
 * Upon success with a non-zero amount of data, the reply mbuf chain is
 * trimmed down to the data (plus XDR padding) and returned via *mpp, with
 * *mpendp set to the last mbuf of that chain.  Otherwise *mpp is NULL and
 * *mpendp is left untouched.
 * Returns 0, the error from newnfs_request(), the NFS status of the reply
 * (nd_repstat) or ENOENT if the reply mbuf chain is not as expected.
 * NOTE(review): NFSM_DISSECT() and NFSM_STRSIZ() presumably set error and
 * jump to nfsmout on a short or invalid reply - confirm against the macros.
 */
4907 static int
4908 nfsrv_readdsrpc(fhandle_t *fhp, off_t off, int len, struct ucred *cred,
4909     NFSPROC_T *p, struct nfsmount *nmp, struct mbuf **mpp, struct mbuf **mpendp)
4910 {
4911 	uint32_t *tl;
4912 	struct nfsrv_descript *nd;
4913 	nfsv4stateid_t st;
4914 	struct mbuf *m, *m2;
4915 	int error = 0, retlen, tlen, trimlen;
4916 
4917 	NFSD_DEBUG(4, "in nfsrv_readdsrpc\n");
4918 	nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO);
4919 	*mpp = NULL;
4920 	/*
4921 	 * Use a stateid where other is an alternating 01010 pattern and
4922 	 * seqid is 0xffffffff.  This value is not defined as special by
4923 	 * the RFC and is used by the FreeBSD NFS server to indicate an
4924 	 * MDS->DS proxy operation.
4925 	 */
4926 	st.other[0] = 0x55555555;
4927 	st.other[1] = 0x55555555;
4928 	st.other[2] = 0x55555555;
4929 	st.seqid = 0xffffffff;
4930 	nfscl_reqstart(nd, NFSPROC_READDS, nmp, (u_int8_t *)fhp, sizeof(*fhp),
4931 	    NULL, NULL, 0, 0);
4932 	nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID);
	/* Arguments after the stateid: 64 bit offset followed by the length. */
4933 	NFSM_BUILD(tl, uint32_t *, NFSX_UNSIGNED * 3);
4934 	txdr_hyper(off, tl);
4935 	*(tl + 2) = txdr_unsigned(len);
4936 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
4937 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
4938 	if (error != 0) {
4939 		free(nd, M_TEMP);
4940 		return (error);
4941 	}
4942 	if (nd->nd_repstat == 0) {
4943 		NFSM_DISSECT(tl, u_int32_t *, NFSX_UNSIGNED);
4944 		NFSM_STRSIZ(retlen, len);
4945 		if (retlen > 0) {
4946 			/* Trim off the pre-data XDR from the mbuf chain. */
4947 			m = nd->nd_mrep;
4948 			while (m != NULL && m != nd->nd_md) {
4949 				if (m->m_next == nd->nd_md) {
4950 					m->m_next = NULL;
4951 					m_freem(nd->nd_mrep);
4952 					nd->nd_mrep = m = nd->nd_md;
4953 				} else
4954 					m = m->m_next;
4955 			}
4956 			if (m == NULL) {
4957 				printf("nfsrv_readdsrpc: busted mbuf list\n");
4958 				error = ENOENT;
4959 				goto nfsmout;
4960 			}
4961 
4962 			/*
4963 			 * Now, adjust first mbuf so that any XDR before the
4964 			 * read data is skipped over.
4965 			 */
4966 			trimlen = nd->nd_dpos - mtod(m, char *);
4967 			if (trimlen > 0) {
4968 				m->m_len -= trimlen;
4969 				NFSM_DATAP(m, trimlen);
4970 			}
4971 
4972 			/*
4973 			 * Truncate the mbuf chain at retlen bytes of data,
4974 			 * plus XDR padding that brings the length up to a
4975 			 * multiple of 4.
4976 			 */
4977 			tlen = NFSM_RNDUP(retlen);
4978 			do {
4979 				if (m->m_len >= tlen) {
4980 					m->m_len = tlen;
4981 					tlen = 0;
4982 					m2 = m->m_next;
4983 					m->m_next = NULL;
4984 					m_freem(m2);
4985 					break;
4986 				}
4987 				tlen -= m->m_len;
4988 				m = m->m_next;
4989 			} while (m != NULL);
			/* tlen is still > 0 if the chain ended before enough data was found. */
4990 			if (tlen > 0) {
4991 				printf("nfsrv_readdsrpc: busted mbuf list\n");
4992 				error = ENOENT;
4993 				goto nfsmout;
4994 			}
			/* Hand the trimmed chain to the caller; it is no longer freed below. */
4995 			*mpp = nd->nd_mrep;
4996 			*mpendp = m;
4997 			nd->nd_mrep = NULL;
4998 		}
4999 	} else
5000 		error = nd->nd_repstat;
5001 nfsmout:
5002 	/* If nd->nd_mrep is already NULL, this is a no-op. */
5003 	m_freem(nd->nd_mrep);
5004 	free(nd, M_TEMP);
5005 	NFSD_DEBUG(4, "nfsrv_readdsrpc error=%d\n", error);
5006 	return (error);
5007 }
5008 
5009 /*
5010  * Do a write RPC on a DS data file, using this structure for the arguments,
5011  * so that this function can be executed by a separate kernel process.
 *
 * NOTE(review): the code that fills in and consumes this structure is not
 * adjacent to it; the field notes below are inferred from the matching
 * nfsrv_writedsdorpc() arguments and from the same-named fields of
 * struct nfsrvdsremove - confirm against the users.
5012  */
5013 struct nfsrvwritedsdorpc {
5014 	int			done;	/* presumably set once the RPC has completed */
5015 	int			inprog;	/* presumably non-zero while the task is queued or running */
5016 	struct task		tsk;	/* presumably the task handed to the taskqueue */
5017 	fhandle_t		fh;	/* presumably the DS file handle to write to */
5018 	off_t			off;	/* presumably the file offset of the write */
5019 	int			len;	/* presumably the number of bytes to write */
5020 	struct nfsmount		*nmp;	/* presumably the DS mount the RPC is sent to */
5021 	struct ucred		*cred;	/* presumably the credential for the RPC */
5022 	NFSPROC_T		*p;	/* presumably the thread argument for the RPC */
5023 	struct mbuf		*m;	/* presumably the mbuf chain holding the data */
5024 	int			err;	/* presumably the result of the RPC */
5025 };
5026 
5027 static int
5028 nfsrv_writedsdorpc(struct nfsmount *nmp, fhandle_t *fhp, off_t off, int len,
5029     struct nfsvattr *nap, struct mbuf *m, struct ucred *cred, NFSPROC_T *p)
5030 {
5031 	uint32_t *tl;
5032 	struct nfsrv_descript *nd;
5033 	nfsattrbit_t attrbits;
5034 	nfsv4stateid_t st;
5035 	int commit, error, retlen;
5036 
5037 	nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO);
5038 	nfscl_reqstart(nd, NFSPROC_WRITE, nmp, (u_int8_t *)fhp,
5039 	    sizeof(fhandle_t), NULL, NULL, 0, 0);
5040 
5041 	/*
5042 	 * Use a stateid where other is an alternating 01010 pattern and
5043 	 * seqid is 0xffffffff.  This value is not defined as special by
5044 	 * the RFC and is used by the FreeBSD NFS server to indicate an
5045 	 * MDS->DS proxy operation.
5046 	 */
5047 	st.other[0] = 0x55555555;
5048 	st.other[1] = 0x55555555;
5049 	st.other[2] = 0x55555555;
5050 	st.seqid = 0xffffffff;
5051 	nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID);
5052 	NFSM_BUILD(tl, u_int32_t *, NFSX_HYPER + 2 * NFSX_UNSIGNED);
5053 	txdr_hyper(off, tl);
5054 	tl += 2;
5055 	/*
5056 	 * Do all writes FileSync, since the server doesn't hold onto dirty
5057 	 * buffers.  Since clients should be accessing the DS servers directly
5058 	 * using the pNFS layouts, this just needs to work correctly as a
5059 	 * fallback.
5060 	 */
5061 	*tl++ = txdr_unsigned(NFSWRITE_FILESYNC);
5062 	*tl = txdr_unsigned(len);
5063 	NFSD_DEBUG(4, "nfsrv_writedsdorpc: len=%d\n", len);
5064 
5065 	/* Put data in mbuf chain. */
5066 	nd->nd_mb->m_next = m;
5067 
5068 	/* Set nd_mb and nd_bpos to end of data. */
5069 	while (m->m_next != NULL)
5070 		m = m->m_next;
5071 	nd->nd_mb = m;
5072 	nd->nd_bpos = mtod(m, char *) + m->m_len;
5073 	NFSD_DEBUG(4, "nfsrv_writedsdorpc: lastmb len=%d\n", m->m_len);
5074 
5075 	/* Do a Getattr for the attributes that change upon writing. */
5076 	NFSZERO_ATTRBIT(&attrbits);
5077 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SIZE);
5078 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE);
5079 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESS);
5080 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFY);
5081 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SPACEUSED);
5082 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
5083 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
5084 	(void) nfsrv_putattrbit(nd, &attrbits);
5085 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p,
5086 	    cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
5087 	if (error != 0) {
5088 		free(nd, M_TEMP);
5089 		return (error);
5090 	}
5091 	NFSD_DEBUG(4, "nfsrv_writedsdorpc: aft writerpc=%d\n", nd->nd_repstat);
5092 	/* Get rid of weak cache consistency data for now. */
5093 	if ((nd->nd_flag & (ND_NOMOREDATA | ND_NFSV4 | ND_V4WCCATTR)) ==
5094 	    (ND_NFSV4 | ND_V4WCCATTR)) {
5095 		error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL,
5096 		    NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL);
5097 		NFSD_DEBUG(4, "nfsrv_writedsdorpc: wcc attr=%d\n", error);
5098 		if (error != 0)
5099 			goto nfsmout;
5100 		/*
5101 		 * Get rid of Op# and status for next op.
5102 		 */
5103 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
5104 		if (*++tl != 0)
5105 			nd->nd_flag |= ND_NOMOREDATA;
5106 	}
5107 	if (nd->nd_repstat == 0) {
5108 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED + NFSX_VERF);
5109 		retlen = fxdr_unsigned(int, *tl++);
5110 		commit = fxdr_unsigned(int, *tl);
5111 		if (commit != NFSWRITE_FILESYNC)
5112 			error = NFSERR_IO;
5113 		NFSD_DEBUG(4, "nfsrv_writedsdorpc:retlen=%d commit=%d err=%d\n",
5114 		    retlen, commit, error);
5115 	} else
5116 		error = nd->nd_repstat;
5117 	/* We have no use for the Write Verifier since we use FileSync. */
5118 
5119 	/*
5120 	 * Get the Change, Size, Access Time and Modify Time attributes and set
5121 	 * on the Metadata file, so its attributes will be what the file's
5122 	 * would be if it had been written.
5123 	 */
5124 	if (error == 0) {
5125 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
5126 		error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL,
5127 		    NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL);
5128 	}
5129 	NFSD_DEBUG(4, "nfsrv_writedsdorpc: aft loadattr=%d\n", error);
5130 nfsmout:
5131 	m_freem(nd->nd_mrep);
5132 	free(nd, M_TEMP);
5133 	NFSD_DEBUG(4, "nfsrv_writedsdorpc error=%d\n", error);
5134 	return (error);
5135 }
5136 
5137 /*
5138  * Start up the thread that will execute nfsrv_writedsdorpc().
5139  */
5140 static void
5141 start_writedsdorpc(void *arg, int pending)
5142 {
5143 	struct nfsrvwritedsdorpc *drpc;
5144 
5145 	drpc = (struct nfsrvwritedsdorpc *)arg;
5146 	drpc->err = nfsrv_writedsdorpc(drpc->nmp, &drpc->fh, drpc->off,
5147 	    drpc->len, NULL, drpc->m, drpc->cred, drpc->p);
5148 	drpc->done = 1;
5149 	NFSD_DEBUG(4, "start_writedsdorpc: err=%d\n", drpc->err);
5150 }
5151 
5152 static int
5153 nfsrv_writedsrpc(fhandle_t *fhp, off_t off, int len, struct ucred *cred,
5154     NFSPROC_T *p, struct vnode *vp, struct nfsmount **nmpp, int mirrorcnt,
5155     struct mbuf **mpp, char *cp, int *failposp)
5156 {
5157 	struct nfsrvwritedsdorpc *drpc, *tdrpc = NULL;
5158 	struct nfsvattr na;
5159 	struct mbuf *m;
5160 	int error, i, offs, ret, timo;
5161 
5162 	NFSD_DEBUG(4, "in nfsrv_writedsrpc\n");
5163 	KASSERT(*mpp != NULL, ("nfsrv_writedsrpc: NULL mbuf chain"));
5164 	drpc = NULL;
5165 	if (mirrorcnt > 1)
5166 		tdrpc = drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP,
5167 		    M_WAITOK);
5168 
5169 	/* Calculate offset in mbuf chain that data starts. */
5170 	offs = cp - mtod(*mpp, char *);
5171 	NFSD_DEBUG(4, "nfsrv_writedsrpc: mcopy offs=%d len=%d\n", offs, len);
5172 
5173 	/*
5174 	 * Do the write RPC for every DS, using a separate kernel process
5175 	 * for every DS except the last one.
5176 	 */
5177 	error = 0;
5178 	for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) {
5179 		tdrpc->done = 0;
5180 		NFSBCOPY(fhp, &tdrpc->fh, sizeof(*fhp));
5181 		tdrpc->off = off;
5182 		tdrpc->len = len;
5183 		tdrpc->nmp = *nmpp;
5184 		tdrpc->cred = cred;
5185 		tdrpc->p = p;
5186 		tdrpc->inprog = 0;
5187 		tdrpc->err = 0;
5188 		tdrpc->m = m_copym(*mpp, offs, NFSM_RNDUP(len), M_WAITOK);
5189 		ret = EIO;
5190 		if (nfs_pnfsiothreads != 0) {
5191 			ret = nfs_pnfsio(start_writedsdorpc, tdrpc);
5192 			NFSD_DEBUG(4, "nfsrv_writedsrpc: nfs_pnfsio=%d\n",
5193 			    ret);
5194 		}
5195 		if (ret != 0) {
5196 			ret = nfsrv_writedsdorpc(*nmpp, fhp, off, len, NULL,
5197 			    tdrpc->m, cred, p);
5198 			if (nfsds_failerr(ret) && *failposp == -1)
5199 				*failposp = i;
5200 			else if (error == 0 && ret != 0)
5201 				error = ret;
5202 		}
5203 		nmpp++;
5204 		fhp++;
5205 	}
5206 	m = m_copym(*mpp, offs, NFSM_RNDUP(len), M_WAITOK);
5207 	ret = nfsrv_writedsdorpc(*nmpp, fhp, off, len, &na, m, cred, p);
5208 	if (nfsds_failerr(ret) && *failposp == -1 && mirrorcnt > 1)
5209 		*failposp = mirrorcnt - 1;
5210 	else if (error == 0 && ret != 0)
5211 		error = ret;
5212 	if (error == 0)
5213 		error = nfsrv_setextattr(vp, &na, p);
5214 	NFSD_DEBUG(4, "nfsrv_writedsrpc: aft setextat=%d\n", error);
5215 	tdrpc = drpc;
5216 	timo = hz / 50;		/* Wait for 20msec. */
5217 	if (timo < 1)
5218 		timo = 1;
5219 	for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) {
5220 		/* Wait for RPCs on separate threads to complete. */
5221 		while (tdrpc->inprog != 0 && tdrpc->done == 0)
5222 			tsleep(&tdrpc->tsk, PVFS, "srvwrds", timo);
5223 		if (nfsds_failerr(tdrpc->err) && *failposp == -1)
5224 			*failposp = i;
5225 		else if (error == 0 && tdrpc->err != 0)
5226 			error = tdrpc->err;
5227 	}
5228 	free(drpc, M_TEMP);
5229 	return (error);
5230 }
5231 
/*
 * Do an allocate RPC on a DS data file, using this structure for the
 * arguments, so that this function can be executed by a separate kernel
 * process.
 *
 * nfsrv_allocatedsrpc() fills in one of these for each mirror except the
 * last and passes it to nfs_pnfsio(); start_allocatedsdorpc() consumes it.
 * NOTE(review): done/inprog/tsk lead each of the RPC argument structures
 * visible in this part of the file, presumably because nfs_pnfsio() depends
 * on that layout - confirm before reordering the members.
 */
struct nfsrvallocatedsdorpc {
	/* Set to 1 by start_allocatedsdorpc() once the RPC has returned. */
	int			done;
	/*
	 * Cleared by nfsrv_allocatedsrpc() before dispatch.  The issuer only
	 * waits for "done" while this is non-zero (presumably it is set by
	 * nfs_pnfsio() when the task is queued - TODO confirm).
	 */
	int			inprog;
	/* Its address is the tsleep() wait channel used by the issuer. */
	struct task		tsk;
	/* Private copy of the DS file handle for this mirror. */
	fhandle_t		fh;
	/* Offset and length arguments of the allocate operation. */
	off_t			off;
	off_t			len;
	/* DS mount point the RPC is sent to. */
	struct nfsmount		*nmp;
	/* Credentials and thread handed to nfsrv_allocatedsdorpc(). */
	struct ucred		*cred;
	NFSPROC_T		*p;
	/* Return value of nfsrv_allocatedsdorpc(); 0 until it completes. */
	int			err;
};
5248 
5249 static int
5250 nfsrv_allocatedsdorpc(struct nfsmount *nmp, fhandle_t *fhp, off_t off,
5251     off_t len, struct nfsvattr *nap, struct ucred *cred, NFSPROC_T *p)
5252 {
5253 	uint32_t *tl;
5254 	struct nfsrv_descript *nd;
5255 	nfsattrbit_t attrbits;
5256 	nfsv4stateid_t st;
5257 	int error;
5258 
5259 	nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO);
5260 	nfscl_reqstart(nd, NFSPROC_ALLOCATE, nmp, (u_int8_t *)fhp,
5261 	    sizeof(fhandle_t), NULL, NULL, 0, 0);
5262 
5263 	/*
5264 	 * Use a stateid where other is an alternating 01010 pattern and
5265 	 * seqid is 0xffffffff.  This value is not defined as special by
5266 	 * the RFC and is used by the FreeBSD NFS server to indicate an
5267 	 * MDS->DS proxy operation.
5268 	 */
5269 	st.other[0] = 0x55555555;
5270 	st.other[1] = 0x55555555;
5271 	st.other[2] = 0x55555555;
5272 	st.seqid = 0xffffffff;
5273 	nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID);
5274 	NFSM_BUILD(tl, uint32_t *, 2 * NFSX_HYPER + NFSX_UNSIGNED);
5275 	txdr_hyper(off, tl); tl += 2;
5276 	txdr_hyper(len, tl); tl += 2;
5277 	NFSD_DEBUG(4, "nfsrv_allocatedsdorpc: len=%jd\n", (intmax_t)len);
5278 
5279 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
5280 	NFSGETATTR_ATTRBIT(&attrbits);
5281 	nfsrv_putattrbit(nd, &attrbits);
5282 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p,
5283 	    cred, NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
5284 	if (error != 0) {
5285 		free(nd, M_TEMP);
5286 		return (error);
5287 	}
5288 	NFSD_DEBUG(4, "nfsrv_allocatedsdorpc: aft allocaterpc=%d\n",
5289 	    nd->nd_repstat);
5290 	if (nd->nd_repstat == 0) {
5291 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
5292 		error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0, NULL, NULL,
5293 		    NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL);
5294 	} else
5295 		error = nd->nd_repstat;
5296 	NFSD_DEBUG(4, "nfsrv_allocatedsdorpc: aft loadattr=%d\n", error);
5297 nfsmout:
5298 	m_freem(nd->nd_mrep);
5299 	free(nd, M_TEMP);
5300 	NFSD_DEBUG(4, "nfsrv_allocatedsdorpc error=%d\n", error);
5301 	return (error);
5302 }
5303 
5304 /*
5305  * Start up the thread that will execute nfsrv_allocatedsdorpc().
5306  */
5307 static void
5308 start_allocatedsdorpc(void *arg, int pending)
5309 {
5310 	struct nfsrvallocatedsdorpc *drpc;
5311 
5312 	drpc = (struct nfsrvallocatedsdorpc *)arg;
5313 	drpc->err = nfsrv_allocatedsdorpc(drpc->nmp, &drpc->fh, drpc->off,
5314 	    drpc->len, NULL, drpc->cred, drpc->p);
5315 	drpc->done = 1;
5316 	NFSD_DEBUG(4, "start_allocatedsdorpc: err=%d\n", drpc->err);
5317 }
5318 
5319 static int
5320 nfsrv_allocatedsrpc(fhandle_t *fhp, off_t off, off_t len, struct ucred *cred,
5321     NFSPROC_T *p, struct vnode *vp, struct nfsmount **nmpp, int mirrorcnt,
5322     int *failposp)
5323 {
5324 	struct nfsrvallocatedsdorpc *drpc, *tdrpc = NULL;
5325 	struct nfsvattr na;
5326 	int error, i, ret, timo;
5327 
5328 	NFSD_DEBUG(4, "in nfsrv_allocatedsrpc\n");
5329 	drpc = NULL;
5330 	if (mirrorcnt > 1)
5331 		tdrpc = drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP,
5332 		    M_WAITOK);
5333 
5334 	/*
5335 	 * Do the allocate RPC for every DS, using a separate kernel process
5336 	 * for every DS except the last one.
5337 	 */
5338 	error = 0;
5339 	for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) {
5340 		tdrpc->done = 0;
5341 		NFSBCOPY(fhp, &tdrpc->fh, sizeof(*fhp));
5342 		tdrpc->off = off;
5343 		tdrpc->len = len;
5344 		tdrpc->nmp = *nmpp;
5345 		tdrpc->cred = cred;
5346 		tdrpc->p = p;
5347 		tdrpc->inprog = 0;
5348 		tdrpc->err = 0;
5349 		ret = EIO;
5350 		if (nfs_pnfsiothreads != 0) {
5351 			ret = nfs_pnfsio(start_allocatedsdorpc, tdrpc);
5352 			NFSD_DEBUG(4, "nfsrv_allocatedsrpc: nfs_pnfsio=%d\n",
5353 			    ret);
5354 		}
5355 		if (ret != 0) {
5356 			ret = nfsrv_allocatedsdorpc(*nmpp, fhp, off, len, NULL,
5357 			    cred, p);
5358 			if (nfsds_failerr(ret) && *failposp == -1)
5359 				*failposp = i;
5360 			else if (error == 0 && ret != 0)
5361 				error = ret;
5362 		}
5363 		nmpp++;
5364 		fhp++;
5365 	}
5366 	ret = nfsrv_allocatedsdorpc(*nmpp, fhp, off, len, &na, cred, p);
5367 	if (nfsds_failerr(ret) && *failposp == -1 && mirrorcnt > 1)
5368 		*failposp = mirrorcnt - 1;
5369 	else if (error == 0 && ret != 0)
5370 		error = ret;
5371 	if (error == 0)
5372 		error = nfsrv_setextattr(vp, &na, p);
5373 	NFSD_DEBUG(4, "nfsrv_allocatedsrpc: aft setextat=%d\n", error);
5374 	tdrpc = drpc;
5375 	timo = hz / 50;		/* Wait for 20msec. */
5376 	if (timo < 1)
5377 		timo = 1;
5378 	for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) {
5379 		/* Wait for RPCs on separate threads to complete. */
5380 		while (tdrpc->inprog != 0 && tdrpc->done == 0)
5381 			tsleep(&tdrpc->tsk, PVFS, "srvalds", timo);
5382 		if (nfsds_failerr(tdrpc->err) && *failposp == -1)
5383 			*failposp = i;
5384 		else if (error == 0 && tdrpc->err != 0)
5385 			error = tdrpc->err;
5386 	}
5387 	free(drpc, M_TEMP);
5388 	return (error);
5389 }
5390 
5391 static int
5392 nfsrv_setattrdsdorpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p,
5393     struct vnode *vp, struct nfsmount *nmp, struct nfsvattr *nap,
5394     struct nfsvattr *dsnap)
5395 {
5396 	uint32_t *tl;
5397 	struct nfsrv_descript *nd;
5398 	nfsv4stateid_t st;
5399 	nfsattrbit_t attrbits;
5400 	int error;
5401 
5402 	NFSD_DEBUG(4, "in nfsrv_setattrdsdorpc\n");
5403 	nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO);
5404 	/*
5405 	 * Use a stateid where other is an alternating 01010 pattern and
5406 	 * seqid is 0xffffffff.  This value is not defined as special by
5407 	 * the RFC and is used by the FreeBSD NFS server to indicate an
5408 	 * MDS->DS proxy operation.
5409 	 */
5410 	st.other[0] = 0x55555555;
5411 	st.other[1] = 0x55555555;
5412 	st.other[2] = 0x55555555;
5413 	st.seqid = 0xffffffff;
5414 	nfscl_reqstart(nd, NFSPROC_SETATTR, nmp, (u_int8_t *)fhp, sizeof(*fhp),
5415 	    NULL, NULL, 0, 0);
5416 	nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID);
5417 	nfscl_fillsattr(nd, &nap->na_vattr, vp, NFSSATTR_FULL, 0);
5418 
5419 	/* Do a Getattr for the attributes that change due to writing. */
5420 	NFSZERO_ATTRBIT(&attrbits);
5421 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SIZE);
5422 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE);
5423 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESS);
5424 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFY);
5425 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SPACEUSED);
5426 	NFSM_BUILD(tl, u_int32_t *, NFSX_UNSIGNED);
5427 	*tl = txdr_unsigned(NFSV4OP_GETATTR);
5428 	(void) nfsrv_putattrbit(nd, &attrbits);
5429 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
5430 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
5431 	if (error != 0) {
5432 		free(nd, M_TEMP);
5433 		return (error);
5434 	}
5435 	NFSD_DEBUG(4, "nfsrv_setattrdsdorpc: aft setattrrpc=%d\n",
5436 	    nd->nd_repstat);
5437 	/* Get rid of weak cache consistency data for now. */
5438 	if ((nd->nd_flag & (ND_NOMOREDATA | ND_NFSV4 | ND_V4WCCATTR)) ==
5439 	    (ND_NFSV4 | ND_V4WCCATTR)) {
5440 		error = nfsv4_loadattr(nd, NULL, dsnap, NULL, NULL, 0, NULL,
5441 		    NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL);
5442 		NFSD_DEBUG(4, "nfsrv_setattrdsdorpc: wcc attr=%d\n", error);
5443 		if (error != 0)
5444 			goto nfsmout;
5445 		/*
5446 		 * Get rid of Op# and status for next op.
5447 		 */
5448 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
5449 		if (*++tl != 0)
5450 			nd->nd_flag |= ND_NOMOREDATA;
5451 	}
5452 	error = nfsrv_getattrbits(nd, &attrbits, NULL, NULL);
5453 	if (error != 0)
5454 		goto nfsmout;
5455 	if (nd->nd_repstat != 0)
5456 		error = nd->nd_repstat;
5457 	/*
5458 	 * Get the Change, Size, Access Time and Modify Time attributes and set
5459 	 * on the Metadata file, so its attributes will be what the file's
5460 	 * would be if it had been written.
5461 	 */
5462 	if (error == 0) {
5463 		NFSM_DISSECT(tl, uint32_t *, 2 * NFSX_UNSIGNED);
5464 		error = nfsv4_loadattr(nd, NULL, dsnap, NULL, NULL, 0, NULL,
5465 		    NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL, NULL, NULL);
5466 	}
5467 	NFSD_DEBUG(4, "nfsrv_setattrdsdorpc: aft setattr loadattr=%d\n", error);
5468 nfsmout:
5469 	m_freem(nd->nd_mrep);
5470 	free(nd, M_TEMP);
5471 	NFSD_DEBUG(4, "nfsrv_setattrdsdorpc error=%d\n", error);
5472 	return (error);
5473 }
5474 
/*
 * Arguments for a Setattr RPC on a DS data file, packaged so that
 * nfsrv_setattrdsdorpc() can be run via nfs_pnfsio() by
 * start_setattrdsdorpc().  nfsrv_setattrdsrpc() fills in one of these for
 * each mirror except the last.
 * NOTE(review): done/inprog/tsk lead each of the RPC argument structures
 * visible in this part of the file, presumably because nfs_pnfsio() depends
 * on that layout - confirm before reordering the members.
 */
struct nfsrvsetattrdsdorpc {
	/* Set to 1 by start_setattrdsdorpc() once the RPC has returned. */
	int			done;
	/*
	 * Cleared by nfsrv_setattrdsrpc() before dispatch.  The issuer only
	 * waits for "done" while this is non-zero (presumably it is set by
	 * nfs_pnfsio() when the task is queued - TODO confirm).
	 */
	int			inprog;
	/* Its address is the tsleep() wait channel used by the issuer. */
	struct task		tsk;
	/* Private copy of the DS file handle for this mirror. */
	fhandle_t		fh;
	/* DS mount point the RPC is sent to. */
	struct nfsmount		*nmp;
	/* Metadata file vnode, passed through to nfsrv_setattrdsdorpc(). */
	struct vnode		*vp;
	/* Credentials and thread handed to nfsrv_setattrdsdorpc(). */
	struct ucred		*cred;
	NFSPROC_T		*p;
	/* Copy of the attributes to be set. */
	struct nfsvattr		na;
	/* Receives the attributes returned by the DS. */
	struct nfsvattr		dsna;
	/* Return value of nfsrv_setattrdsdorpc(); 0 until it completes. */
	int			err;
};
5488 
5489 /*
5490  * Start up the thread that will execute nfsrv_setattrdsdorpc().
5491  */
5492 static void
5493 start_setattrdsdorpc(void *arg, int pending)
5494 {
5495 	struct nfsrvsetattrdsdorpc *drpc;
5496 
5497 	drpc = (struct nfsrvsetattrdsdorpc *)arg;
5498 	drpc->err = nfsrv_setattrdsdorpc(&drpc->fh, drpc->cred, drpc->p,
5499 	    drpc->vp, drpc->nmp, &drpc->na, &drpc->dsna);
5500 	drpc->done = 1;
5501 }
5502 
5503 static int
5504 nfsrv_setattrdsrpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p,
5505     struct vnode *vp, struct nfsmount **nmpp, int mirrorcnt,
5506     struct nfsvattr *nap, int *failposp)
5507 {
5508 	struct nfsrvsetattrdsdorpc *drpc, *tdrpc = NULL;
5509 	struct nfsvattr na;
5510 	int error, i, ret, timo;
5511 
5512 	NFSD_DEBUG(4, "in nfsrv_setattrdsrpc\n");
5513 	drpc = NULL;
5514 	if (mirrorcnt > 1)
5515 		tdrpc = drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP,
5516 		    M_WAITOK);
5517 
5518 	/*
5519 	 * Do the setattr RPC for every DS, using a separate kernel process
5520 	 * for every DS except the last one.
5521 	 */
5522 	error = 0;
5523 	for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) {
5524 		tdrpc->done = 0;
5525 		tdrpc->inprog = 0;
5526 		NFSBCOPY(fhp, &tdrpc->fh, sizeof(*fhp));
5527 		tdrpc->nmp = *nmpp;
5528 		tdrpc->vp = vp;
5529 		tdrpc->cred = cred;
5530 		tdrpc->p = p;
5531 		tdrpc->na = *nap;
5532 		tdrpc->err = 0;
5533 		ret = EIO;
5534 		if (nfs_pnfsiothreads != 0) {
5535 			ret = nfs_pnfsio(start_setattrdsdorpc, tdrpc);
5536 			NFSD_DEBUG(4, "nfsrv_setattrdsrpc: nfs_pnfsio=%d\n",
5537 			    ret);
5538 		}
5539 		if (ret != 0) {
5540 			ret = nfsrv_setattrdsdorpc(fhp, cred, p, vp, *nmpp, nap,
5541 			    &na);
5542 			if (nfsds_failerr(ret) && *failposp == -1)
5543 				*failposp = i;
5544 			else if (error == 0 && ret != 0)
5545 				error = ret;
5546 		}
5547 		nmpp++;
5548 		fhp++;
5549 	}
5550 	ret = nfsrv_setattrdsdorpc(fhp, cred, p, vp, *nmpp, nap, &na);
5551 	if (nfsds_failerr(ret) && *failposp == -1 && mirrorcnt > 1)
5552 		*failposp = mirrorcnt - 1;
5553 	else if (error == 0 && ret != 0)
5554 		error = ret;
5555 	if (error == 0)
5556 		error = nfsrv_setextattr(vp, &na, p);
5557 	NFSD_DEBUG(4, "nfsrv_setattrdsrpc: aft setextat=%d\n", error);
5558 	tdrpc = drpc;
5559 	timo = hz / 50;		/* Wait for 20msec. */
5560 	if (timo < 1)
5561 		timo = 1;
5562 	for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) {
5563 		/* Wait for RPCs on separate threads to complete. */
5564 		while (tdrpc->inprog != 0 && tdrpc->done == 0)
5565 			tsleep(&tdrpc->tsk, PVFS, "srvsads", timo);
5566 		if (nfsds_failerr(tdrpc->err) && *failposp == -1)
5567 			*failposp = i;
5568 		else if (error == 0 && tdrpc->err != 0)
5569 			error = tdrpc->err;
5570 	}
5571 	free(drpc, M_TEMP);
5572 	return (error);
5573 }
5574 
5575 /*
5576  * Do a Setattr of an NFSv4 ACL on the DS file.
5577  */
5578 static int
5579 nfsrv_setacldsdorpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p,
5580     struct vnode *vp, struct nfsmount *nmp, struct acl *aclp)
5581 {
5582 	struct nfsrv_descript *nd;
5583 	nfsv4stateid_t st;
5584 	nfsattrbit_t attrbits;
5585 	int error;
5586 
5587 	NFSD_DEBUG(4, "in nfsrv_setacldsdorpc\n");
5588 	nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO);
5589 	/*
5590 	 * Use a stateid where other is an alternating 01010 pattern and
5591 	 * seqid is 0xffffffff.  This value is not defined as special by
5592 	 * the RFC and is used by the FreeBSD NFS server to indicate an
5593 	 * MDS->DS proxy operation.
5594 	 */
5595 	st.other[0] = 0x55555555;
5596 	st.other[1] = 0x55555555;
5597 	st.other[2] = 0x55555555;
5598 	st.seqid = 0xffffffff;
5599 	nfscl_reqstart(nd, NFSPROC_SETACL, nmp, (u_int8_t *)fhp, sizeof(*fhp),
5600 	    NULL, NULL, 0, 0);
5601 	nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID);
5602 	NFSZERO_ATTRBIT(&attrbits);
5603 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_ACL);
5604 	/*
5605 	 * The "vp" argument to nfsv4_fillattr() is only used for vnode_type(),
5606 	 * so passing in the metadata "vp" will be ok, since it is of
5607 	 * the same type (VREG).
5608 	 */
5609 	nfsv4_fillattr(nd, NULL, vp, aclp, NULL, NULL, 0, &attrbits, NULL,
5610 	    NULL, 0, 0, 0, 0, 0, NULL);
5611 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
5612 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
5613 	if (error != 0) {
5614 		free(nd, M_TEMP);
5615 		return (error);
5616 	}
5617 	NFSD_DEBUG(4, "nfsrv_setacldsdorpc: aft setaclrpc=%d\n",
5618 	    nd->nd_repstat);
5619 	error = nd->nd_repstat;
5620 	m_freem(nd->nd_mrep);
5621 	free(nd, M_TEMP);
5622 	return (error);
5623 }
5624 
/*
 * Arguments for a Setattr of an ACL on a DS data file, packaged so that
 * nfsrv_setacldsdorpc() can be run via nfs_pnfsio() by
 * start_setacldsdorpc().  nfsrv_setacldsrpc() fills in one of these for
 * each mirror except the last.
 * NOTE(review): done/inprog/tsk lead each of the RPC argument structures
 * visible in this part of the file, presumably because nfs_pnfsio() depends
 * on that layout - confirm before reordering the members.
 */
struct nfsrvsetacldsdorpc {
	/* Set to 1 by start_setacldsdorpc() once the RPC has returned. */
	int			done;
	/*
	 * Cleared by nfsrv_setacldsrpc() before dispatch.  The issuer only
	 * waits for "done" while this is non-zero (presumably it is set by
	 * nfs_pnfsio() when the task is queued - TODO confirm).
	 */
	int			inprog;
	/* Its address is the tsleep() wait channel used by the issuer. */
	struct task		tsk;
	/* Private copy of the DS file handle for this mirror. */
	fhandle_t		fh;
	/* DS mount point the RPC is sent to. */
	struct nfsmount		*nmp;
	/* Metadata file vnode, passed through to nfsrv_setacldsdorpc(). */
	struct vnode		*vp;
	/* Credentials and thread handed to nfsrv_setacldsdorpc(). */
	struct ucred		*cred;
	NFSPROC_T		*p;
	/* The ACL to set; shared with the issuer, not copied. */
	struct acl		*aclp;
	/* Return value of nfsrv_setacldsdorpc(); 0 until it completes. */
	int			err;
};
5637 
5638 /*
5639  * Start up the thread that will execute nfsrv_setacldsdorpc().
5640  */
5641 static void
5642 start_setacldsdorpc(void *arg, int pending)
5643 {
5644 	struct nfsrvsetacldsdorpc *drpc;
5645 
5646 	drpc = (struct nfsrvsetacldsdorpc *)arg;
5647 	drpc->err = nfsrv_setacldsdorpc(&drpc->fh, drpc->cred, drpc->p,
5648 	    drpc->vp, drpc->nmp, drpc->aclp);
5649 	drpc->done = 1;
5650 }
5651 
5652 static int
5653 nfsrv_setacldsrpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p,
5654     struct vnode *vp, struct nfsmount **nmpp, int mirrorcnt, struct acl *aclp,
5655     int *failposp)
5656 {
5657 	struct nfsrvsetacldsdorpc *drpc, *tdrpc = NULL;
5658 	int error, i, ret, timo;
5659 
5660 	NFSD_DEBUG(4, "in nfsrv_setacldsrpc\n");
5661 	drpc = NULL;
5662 	if (mirrorcnt > 1)
5663 		tdrpc = drpc = malloc(sizeof(*drpc) * (mirrorcnt - 1), M_TEMP,
5664 		    M_WAITOK);
5665 
5666 	/*
5667 	 * Do the setattr RPC for every DS, using a separate kernel process
5668 	 * for every DS except the last one.
5669 	 */
5670 	error = 0;
5671 	for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) {
5672 		tdrpc->done = 0;
5673 		tdrpc->inprog = 0;
5674 		NFSBCOPY(fhp, &tdrpc->fh, sizeof(*fhp));
5675 		tdrpc->nmp = *nmpp;
5676 		tdrpc->vp = vp;
5677 		tdrpc->cred = cred;
5678 		tdrpc->p = p;
5679 		tdrpc->aclp = aclp;
5680 		tdrpc->err = 0;
5681 		ret = EIO;
5682 		if (nfs_pnfsiothreads != 0) {
5683 			ret = nfs_pnfsio(start_setacldsdorpc, tdrpc);
5684 			NFSD_DEBUG(4, "nfsrv_setacldsrpc: nfs_pnfsio=%d\n",
5685 			    ret);
5686 		}
5687 		if (ret != 0) {
5688 			ret = nfsrv_setacldsdorpc(fhp, cred, p, vp, *nmpp,
5689 			    aclp);
5690 			if (nfsds_failerr(ret) && *failposp == -1)
5691 				*failposp = i;
5692 			else if (error == 0 && ret != 0)
5693 				error = ret;
5694 		}
5695 		nmpp++;
5696 		fhp++;
5697 	}
5698 	ret = nfsrv_setacldsdorpc(fhp, cred, p, vp, *nmpp, aclp);
5699 	if (nfsds_failerr(ret) && *failposp == -1 && mirrorcnt > 1)
5700 		*failposp = mirrorcnt - 1;
5701 	else if (error == 0 && ret != 0)
5702 		error = ret;
5703 	NFSD_DEBUG(4, "nfsrv_setacldsrpc: aft setextat=%d\n", error);
5704 	tdrpc = drpc;
5705 	timo = hz / 50;		/* Wait for 20msec. */
5706 	if (timo < 1)
5707 		timo = 1;
5708 	for (i = 0; i < mirrorcnt - 1; i++, tdrpc++) {
5709 		/* Wait for RPCs on separate threads to complete. */
5710 		while (tdrpc->inprog != 0 && tdrpc->done == 0)
5711 			tsleep(&tdrpc->tsk, PVFS, "srvacds", timo);
5712 		if (nfsds_failerr(tdrpc->err) && *failposp == -1)
5713 			*failposp = i;
5714 		else if (error == 0 && tdrpc->err != 0)
5715 			error = tdrpc->err;
5716 	}
5717 	free(drpc, M_TEMP);
5718 	return (error);
5719 }
5720 
5721 /*
5722  * Getattr call to the DS for the attributes that change due to writing.
5723  */
5724 static int
5725 nfsrv_getattrdsrpc(fhandle_t *fhp, struct ucred *cred, NFSPROC_T *p,
5726     struct vnode *vp, struct nfsmount *nmp, struct nfsvattr *nap)
5727 {
5728 	struct nfsrv_descript *nd;
5729 	int error;
5730 	nfsattrbit_t attrbits;
5731 
5732 	NFSD_DEBUG(4, "in nfsrv_getattrdsrpc\n");
5733 	nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO);
5734 	nfscl_reqstart(nd, NFSPROC_GETATTR, nmp, (u_int8_t *)fhp,
5735 	    sizeof(fhandle_t), NULL, NULL, 0, 0);
5736 	NFSZERO_ATTRBIT(&attrbits);
5737 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SIZE);
5738 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_CHANGE);
5739 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEACCESS);
5740 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_TIMEMODIFY);
5741 	NFSSETBIT_ATTRBIT(&attrbits, NFSATTRBIT_SPACEUSED);
5742 	(void) nfsrv_putattrbit(nd, &attrbits);
5743 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
5744 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
5745 	if (error != 0) {
5746 		free(nd, M_TEMP);
5747 		return (error);
5748 	}
5749 	NFSD_DEBUG(4, "nfsrv_getattrdsrpc: aft getattrrpc=%d\n",
5750 	    nd->nd_repstat);
5751 	if (nd->nd_repstat == 0) {
5752 		error = nfsv4_loadattr(nd, NULL, nap, NULL, NULL, 0,
5753 		    NULL, NULL, NULL, NULL, NULL, 0, NULL, NULL, NULL,
5754 		    NULL, NULL);
5755 		/*
5756 		 * We can only save the updated values in the extended
5757 		 * attribute if the vp is exclusively locked.
5758 		 * This should happen when any of the following operations
5759 		 * occur on the vnode:
5760 		 *    Close, Delegreturn, LayoutCommit, LayoutReturn
5761 		 * As such, the updated extended attribute should get saved
5762 		 * before nfsrv_checkdsattr() returns 0 and allows the cached
5763 		 * attributes to be returned without calling this function.
5764 		 */
5765 		if (error == 0 && VOP_ISLOCKED(vp) == LK_EXCLUSIVE) {
5766 			error = nfsrv_setextattr(vp, nap, p);
5767 			NFSD_DEBUG(4, "nfsrv_getattrdsrpc: aft setextat=%d\n",
5768 			    error);
5769 		}
5770 	} else
5771 		error = nd->nd_repstat;
5772 	m_freem(nd->nd_mrep);
5773 	free(nd, M_TEMP);
5774 	NFSD_DEBUG(4, "nfsrv_getattrdsrpc error=%d\n", error);
5775 	return (error);
5776 }
5777 
5778 /*
5779  * Seek call to a DS.
5780  */
5781 static int
5782 nfsrv_seekdsrpc(fhandle_t *fhp, off_t *offp, int content, bool *eofp,
5783     struct ucred *cred, NFSPROC_T *p, struct nfsmount *nmp)
5784 {
5785 	uint32_t *tl;
5786 	struct nfsrv_descript *nd;
5787 	nfsv4stateid_t st;
5788 	int error;
5789 
5790 	NFSD_DEBUG(4, "in nfsrv_seekdsrpc\n");
5791 	/*
5792 	 * Use a stateid where other is an alternating 01010 pattern and
5793 	 * seqid is 0xffffffff.  This value is not defined as special by
5794 	 * the RFC and is used by the FreeBSD NFS server to indicate an
5795 	 * MDS->DS proxy operation.
5796 	 */
5797 	st.other[0] = 0x55555555;
5798 	st.other[1] = 0x55555555;
5799 	st.other[2] = 0x55555555;
5800 	st.seqid = 0xffffffff;
5801 	nd = malloc(sizeof(*nd), M_TEMP, M_WAITOK | M_ZERO);
5802 	nfscl_reqstart(nd, NFSPROC_SEEKDS, nmp, (u_int8_t *)fhp,
5803 	    sizeof(fhandle_t), NULL, NULL, 0, 0);
5804 	nfsm_stateidtom(nd, &st, NFSSTATEID_PUTSTATEID);
5805 	NFSM_BUILD(tl, uint32_t *, NFSX_HYPER + NFSX_UNSIGNED);
5806 	txdr_hyper(*offp, tl); tl += 2;
5807 	*tl = txdr_unsigned(content);
5808 	error = newnfs_request(nd, nmp, NULL, &nmp->nm_sockreq, NULL, p, cred,
5809 	    NFS_PROG, NFS_VER4, NULL, 1, NULL, NULL);
5810 	if (error != 0) {
5811 		free(nd, M_TEMP);
5812 		return (error);
5813 	}
5814 	NFSD_DEBUG(4, "nfsrv_seekdsrpc: aft seekrpc=%d\n", nd->nd_repstat);
5815 	if (nd->nd_repstat == 0) {
5816 		NFSM_DISSECT(tl, uint32_t *, NFSX_UNSIGNED + NFSX_HYPER);
5817 		if (*tl++ == newnfs_true)
5818 			*eofp = true;
5819 		else
5820 			*eofp = false;
5821 		*offp = fxdr_hyper(tl);
5822 	} else
5823 		error = nd->nd_repstat;
5824 nfsmout:
5825 	m_freem(nd->nd_mrep);
5826 	free(nd, M_TEMP);
5827 	NFSD_DEBUG(4, "nfsrv_seekdsrpc error=%d\n", error);
5828 	return (error);
5829 }
5830 
5831 /*
5832  * Get the device id and file handle for a DS file.
5833  */
5834 int
5835 nfsrv_dsgetdevandfh(struct vnode *vp, NFSPROC_T *p, int *mirrorcntp,
5836     fhandle_t *fhp, char *devid)
5837 {
5838 	int buflen, error;
5839 	char *buf;
5840 
5841 	buflen = 1024;
5842 	buf = malloc(buflen, M_TEMP, M_WAITOK);
5843 	error = nfsrv_dsgetsockmnt(vp, 0, buf, &buflen, mirrorcntp, p, NULL,
5844 	    fhp, devid, NULL, NULL, NULL, NULL, NULL, NULL);
5845 	free(buf, M_TEMP);
5846 	return (error);
5847 }
5848 
5849 /*
5850  * Do a Lookup against the DS for the filename.
5851  */
5852 static int
5853 nfsrv_pnfslookupds(struct vnode *vp, struct vnode *dvp, struct pnfsdsfile *pf,
5854     struct vnode **nvpp, NFSPROC_T *p)
5855 {
5856 	struct nameidata named;
5857 	struct ucred *tcred;
5858 	char *bufp;
5859 	u_long *hashp;
5860 	struct vnode *nvp;
5861 	int error;
5862 
5863 	tcred = newnfs_getcred();
5864 	named.ni_cnd.cn_nameiop = LOOKUP;
5865 	named.ni_cnd.cn_lkflags = LK_SHARED | LK_RETRY;
5866 	named.ni_cnd.cn_cred = tcred;
5867 	named.ni_cnd.cn_thread = p;
5868 	named.ni_cnd.cn_flags = ISLASTCN | LOCKPARENT | LOCKLEAF | SAVENAME;
5869 	nfsvno_setpathbuf(&named, &bufp, &hashp);
5870 	named.ni_cnd.cn_nameptr = bufp;
5871 	named.ni_cnd.cn_namelen = strlen(pf->dsf_filename);
5872 	strlcpy(bufp, pf->dsf_filename, NAME_MAX);
5873 	NFSD_DEBUG(4, "nfsrv_pnfslookupds: filename=%s\n", bufp);
5874 	error = VOP_LOOKUP(dvp, &nvp, &named.ni_cnd);
5875 	NFSD_DEBUG(4, "nfsrv_pnfslookupds: aft LOOKUP=%d\n", error);
5876 	NFSFREECRED(tcred);
5877 	nfsvno_relpathbuf(&named);
5878 	if (error == 0)
5879 		*nvpp = nvp;
5880 	NFSD_DEBUG(4, "eo nfsrv_pnfslookupds=%d\n", error);
5881 	return (error);
5882 }
5883 
5884 /*
5885  * Set the file handle to the correct one.
5886  */
5887 static void
5888 nfsrv_pnfssetfh(struct vnode *vp, struct pnfsdsfile *pf, char *devid,
5889     char *fnamep, struct vnode *nvp, NFSPROC_T *p)
5890 {
5891 	struct nfsnode *np;
5892 	int ret = 0;
5893 
5894 	np = VTONFS(nvp);
5895 	NFSBCOPY(np->n_fhp->nfh_fh, &pf->dsf_fh, NFSX_MYFH);
5896 	/*
5897 	 * We can only do a vn_set_extattr() if the vnode is exclusively
5898 	 * locked and vn_start_write() has been done.  If devid != NULL or
5899 	 * fnamep != NULL or the vnode is shared locked, vn_start_write()
5900 	 * may not have been done.
5901 	 * If not done now, it will be done on a future call.
5902 	 */
5903 	if (devid == NULL && fnamep == NULL && NFSVOPISLOCKED(vp) ==
5904 	    LK_EXCLUSIVE)
5905 		ret = vn_extattr_set(vp, IO_NODELOCKED,
5906 		    EXTATTR_NAMESPACE_SYSTEM, "pnfsd.dsfile", sizeof(*pf),
5907 		    (char *)pf, p);
5908 	NFSD_DEBUG(4, "eo nfsrv_pnfssetfh=%d\n", ret);
5909 }
5910 
/*
 * Cause RPCs waiting on "nmp" to fail.  This is called for a DS mount point
 * when the DS has failed.
 *
 * The NFSMNTP_CANCELRPCS flag should be set when this function is called.
 */
void
nfsrv_killrpcs(struct nfsmount *nmp)
{

	/*
	 * newnfs_nmcancelreqs() makes any RPCs in progress on the mount
	 * point fail.  That in turn makes any process that is waiting for
	 * an RPC to complete while holding a vnode lock on the mounted-on
	 * vnode (such as "df" or a non-forced "umount") fail, which unlocks
	 * the mounted-on vnode so that a forced dismount can succeed.
	 */
	newnfs_nmcancelreqs(nmp);
}
5934 
5935 /*
5936  * Sum up the statfs info for each of the DSs, so that the client will
5937  * receive the total for all DSs.
5938  */
5939 static int
5940 nfsrv_pnfsstatfs(struct statfs *sf, struct mount *mp)
5941 {
5942 	struct statfs *tsf;
5943 	struct nfsdevice *ds;
5944 	struct vnode **dvpp, **tdvpp, *dvp;
5945 	uint64_t tot;
5946 	int cnt, error = 0, i;
5947 
5948 	if (nfsrv_devidcnt <= 0)
5949 		return (ENXIO);
5950 	dvpp = mallocarray(nfsrv_devidcnt, sizeof(*dvpp), M_TEMP, M_WAITOK);
5951 	tsf = malloc(sizeof(*tsf), M_TEMP, M_WAITOK);
5952 
5953 	/* Get an array of the dvps for the DSs. */
5954 	tdvpp = dvpp;
5955 	i = 0;
5956 	NFSDDSLOCK();
5957 	/* First, search for matches for same file system. */
5958 	TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) {
5959 		if (ds->nfsdev_nmp != NULL && ds->nfsdev_mdsisset != 0 &&
5960 		    ds->nfsdev_mdsfsid.val[0] == mp->mnt_stat.f_fsid.val[0] &&
5961 		    ds->nfsdev_mdsfsid.val[1] == mp->mnt_stat.f_fsid.val[1]) {
5962 			if (++i > nfsrv_devidcnt)
5963 				break;
5964 			*tdvpp++ = ds->nfsdev_dvp;
5965 		}
5966 	}
5967 	/*
5968 	 * If no matches for same file system, total all servers not assigned
5969 	 * to a file system.
5970 	 */
5971 	if (i == 0) {
5972 		TAILQ_FOREACH(ds, &nfsrv_devidhead, nfsdev_list) {
5973 			if (ds->nfsdev_nmp != NULL &&
5974 			    ds->nfsdev_mdsisset == 0) {
5975 				if (++i > nfsrv_devidcnt)
5976 					break;
5977 				*tdvpp++ = ds->nfsdev_dvp;
5978 			}
5979 		}
5980 	}
5981 	NFSDDSUNLOCK();
5982 	cnt = i;
5983 
5984 	/* Do a VFS_STATFS() for each of the DSs and sum them up. */
5985 	tdvpp = dvpp;
5986 	for (i = 0; i < cnt && error == 0; i++) {
5987 		dvp = *tdvpp++;
5988 		error = VFS_STATFS(dvp->v_mount, tsf);
5989 		if (error == 0) {
5990 			if (sf->f_bsize == 0) {
5991 				if (tsf->f_bsize > 0)
5992 					sf->f_bsize = tsf->f_bsize;
5993 				else
5994 					sf->f_bsize = 8192;
5995 			}
5996 			if (tsf->f_blocks > 0) {
5997 				if (sf->f_bsize != tsf->f_bsize) {
5998 					tot = tsf->f_blocks * tsf->f_bsize;
5999 					sf->f_blocks += (tot / sf->f_bsize);
6000 				} else
6001 					sf->f_blocks += tsf->f_blocks;
6002 			}
6003 			if (tsf->f_bfree > 0) {
6004 				if (sf->f_bsize != tsf->f_bsize) {
6005 					tot = tsf->f_bfree * tsf->f_bsize;
6006 					sf->f_bfree += (tot / sf->f_bsize);
6007 				} else
6008 					sf->f_bfree += tsf->f_bfree;
6009 			}
6010 			if (tsf->f_bavail > 0) {
6011 				if (sf->f_bsize != tsf->f_bsize) {
6012 					tot = tsf->f_bavail * tsf->f_bsize;
6013 					sf->f_bavail += (tot / sf->f_bsize);
6014 				} else
6015 					sf->f_bavail += tsf->f_bavail;
6016 			}
6017 		}
6018 	}
6019 	free(tsf, M_TEMP);
6020 	free(dvpp, M_TEMP);
6021 	return (error);
6022 }
6023 
6024 /*
6025  * Set an NFSv4 acl.
6026  */
6027 int
6028 nfsrv_setacl(struct vnode *vp, NFSACL_T *aclp, struct ucred *cred, NFSPROC_T *p)
6029 {
6030 	int error;
6031 
6032 	if (nfsrv_useacl == 0 || nfs_supportsnfsv4acls(vp) == 0) {
6033 		error = NFSERR_ATTRNOTSUPP;
6034 		goto out;
6035 	}
6036 	/*
6037 	 * With NFSv4 ACLs, chmod(2) may need to add additional entries.
6038 	 * Make sure it has enough room for that - splitting every entry
6039 	 * into two and appending "canonical six" entries at the end.
6040 	 * Cribbed out of kern/vfs_acl.c - Rick M.
6041 	 */
6042 	if (aclp->acl_cnt > (ACL_MAX_ENTRIES - 6) / 2) {
6043 		error = NFSERR_ATTRNOTSUPP;
6044 		goto out;
6045 	}
6046 	error = VOP_SETACL(vp, ACL_TYPE_NFS4, aclp, cred, p);
6047 	if (error == 0) {
6048 		error = nfsrv_dssetacl(vp, aclp, cred, p);
6049 		if (error == ENOENT)
6050 			error = 0;
6051 	}
6052 
6053 out:
6054 	NFSEXITCODE(error);
6055 	return (error);
6056 }
6057 
6058 /*
6059  * Seek vnode op call (actually it is a VOP_IOCTL()).
6060  * This function is called with the vnode locked, but unlocks and vrele()s
6061  * the vp before returning.
6062  */
6063 int
6064 nfsvno_seek(struct nfsrv_descript *nd, struct vnode *vp, u_long cmd,
6065     off_t *offp, int content, bool *eofp, struct ucred *cred, NFSPROC_T *p)
6066 {
6067 	struct nfsvattr at;
6068 	int error, ret;
6069 
6070 	ASSERT_VOP_LOCKED(vp, "nfsvno_seek vp");
6071 	/*
6072 	 * Attempt to seek on a DS file. A return of ENOENT implies
6073 	 * there is no DS file to seek on.
6074 	 */
6075 	error = nfsrv_proxyds(vp, 0, 0, cred, p, NFSPROC_SEEKDS, NULL,
6076 	    NULL, NULL, NULL, NULL, offp, content, eofp);
6077 	if (error != ENOENT) {
6078 		vput(vp);
6079 		return (error);
6080 	}
6081 
6082 	/*
6083 	 * Do the VOP_IOCTL() call.  For the case where *offp == file_size,
6084 	 * VOP_IOCTL() will return ENXIO.  However, the correct reply for
6085 	 * NFSv4.2 is *eofp == true and error == 0 for this case.
6086 	 */
6087 	NFSVOPUNLOCK(vp);
6088 	error = VOP_IOCTL(vp, cmd, offp, 0, cred, p);
6089 	*eofp = false;
6090 	if (error == ENXIO || (error == 0 && cmd == FIOSEEKHOLE)) {
6091 		/* Handle the cases where we might be at EOF. */
6092 		ret = nfsvno_getattr(vp, &at, nd, p, 0, NULL);
6093 		if (ret == 0 && *offp == at.na_size) {
6094 			*eofp = true;
6095 			error = 0;
6096 		}
6097 		if (ret != 0 && error == 0)
6098 			error = ret;
6099 	}
6100 	vrele(vp);
6101 	NFSEXITCODE(error);
6102 	return (error);
6103 }
6104 
6105 /*
6106  * Allocate vnode op call.
6107  */
6108 int
6109 nfsvno_allocate(struct vnode *vp, off_t off, off_t len, struct ucred *cred,
6110     NFSPROC_T *p)
6111 {
6112 	int error, trycnt;
6113 
6114 	ASSERT_VOP_ELOCKED(vp, "nfsvno_allocate vp");
6115 	/*
6116 	 * Attempt to allocate on a DS file. A return of ENOENT implies
6117 	 * there is no DS file to allocate on.
6118 	 */
6119 	error = nfsrv_proxyds(vp, off, 0, cred, p, NFSPROC_ALLOCATE, NULL,
6120 	    NULL, NULL, NULL, NULL, &len, 0, NULL);
6121 	if (error != ENOENT)
6122 		return (error);
6123 	error = 0;
6124 
6125 	/*
6126 	 * Do the actual VOP_ALLOCATE(), looping a reasonable number of
6127 	 * times to achieve completion.
6128 	 */
6129 	trycnt = 0;
6130 	while (error == 0 && len > 0 && trycnt++ < 20)
6131 		error = VOP_ALLOCATE(vp, &off, &len);
6132 	if (error == 0 && len > 0)
6133 		error = NFSERR_IO;
6134 	NFSEXITCODE(error);
6135 	return (error);
6136 }
6137 
6138 /*
6139  * Get Extended Atribute vnode op into an mbuf list.
6140  */
6141 int
6142 nfsvno_getxattr(struct vnode *vp, char *name, uint32_t maxresp,
6143     struct ucred *cred, struct thread *p, struct mbuf **mpp,
6144     struct mbuf **mpendp, int *lenp)
6145 {
6146 	struct iovec *iv;
6147 	struct uio io, *uiop = &io;
6148 	struct mbuf *m, *m2;
6149 	int alen, error, len, tlen;
6150 	size_t siz;
6151 
6152 	/* First, find out the size of the extended attribute. */
6153 	error = VOP_GETEXTATTR(vp, EXTATTR_NAMESPACE_USER, name, NULL,
6154 	    &siz, cred, p);
6155 	if (error != 0)
6156 		return (NFSERR_NOXATTR);
6157 	if (siz > maxresp - NFS_MAXXDR)
6158 		return (NFSERR_XATTR2BIG);
6159 	len = siz;
6160 	tlen = NFSM_RNDUP(len);
6161 	uiop->uio_iovcnt = nfsrv_createiovec(tlen, &m, &m2, &iv);
6162 	uiop->uio_iov = iv;
6163 	uiop->uio_offset = 0;
6164 	uiop->uio_resid = tlen;
6165 	uiop->uio_rw = UIO_READ;
6166 	uiop->uio_segflg = UIO_SYSSPACE;
6167 	uiop->uio_td = p;
6168 #ifdef MAC
6169 	error = mac_vnode_check_getextattr(cred, vp, EXTATTR_NAMESPACE_USER,
6170 	    name);
6171 	if (error != 0)
6172 		goto out;
6173 #endif
6174 
6175 	error = VOP_GETEXTATTR(vp, EXTATTR_NAMESPACE_USER, name, uiop, NULL,
6176 	    cred, p);
6177 	if (error != 0)
6178 		goto out;
6179 	if (uiop->uio_resid > 0) {
6180 		alen = tlen;
6181 		len = tlen - uiop->uio_resid;
6182 		tlen = NFSM_RNDUP(len);
6183 		if (alen != tlen)
6184 			printf("nfsvno_getxattr: weird size read\n");
6185 		nfsrv_adj(m, alen - tlen, tlen - len);
6186 	}
6187 	*lenp = len;
6188 	*mpp = m;
6189 	*mpendp = m2;
6190 
6191 out:
6192 	if (error != 0) {
6193 		m_freem(m);
6194 		*lenp = 0;
6195 	}
6196 	free(iv, M_TEMP);
6197 	NFSEXITCODE(error);
6198 	return (error);
6199 }
6200 
6201 /*
6202  * Set Extended attribute vnode op from an mbuf list.
6203  */
6204 int
6205 nfsvno_setxattr(struct vnode *vp, char *name, int len, struct mbuf *m,
6206     char *cp, struct ucred *cred, struct thread *p)
6207 {
6208 	struct iovec *iv;
6209 	struct uio uio, *uiop = &uio;
6210 	int cnt, error;
6211 
6212 	error = 0;
6213 #ifdef MAC
6214 	error = mac_vnode_check_setextattr(cred, vp, EXTATTR_NAMESPACE_USER,
6215 	    name);
6216 #endif
6217 	if (error != 0)
6218 		goto out;
6219 
6220 	uiop->uio_rw = UIO_WRITE;
6221 	uiop->uio_segflg = UIO_SYSSPACE;
6222 	uiop->uio_td = p;
6223 	uiop->uio_offset = 0;
6224 	uiop->uio_resid = len;
6225 	error = nfsrv_createiovecw(len, m, cp, &iv, &cnt);
6226 	uiop->uio_iov = iv;
6227 	uiop->uio_iovcnt = cnt;
6228 	if (error == 0) {
6229 		error = VOP_SETEXTATTR(vp, EXTATTR_NAMESPACE_USER, name, uiop,
6230 		    cred, p);
6231 		free(iv, M_TEMP);
6232 	}
6233 
6234 out:
6235 	NFSEXITCODE(error);
6236 	return (error);
6237 }
6238 
6239 /*
6240  * Remove Extended attribute vnode op.
6241  */
6242 int
6243 nfsvno_rmxattr(struct nfsrv_descript *nd, struct vnode *vp, char *name,
6244     struct ucred *cred, struct thread *p)
6245 {
6246 	int error;
6247 
6248 	/*
6249 	 * Get rid of any delegations.  I am not sure why this is required,
6250 	 * but RFC-8276 says so.
6251 	 */
6252 	error = nfsrv_checkremove(vp, 0, nd, nd->nd_clientid, p);
6253 	if (error != 0)
6254 		goto out;
6255 #ifdef MAC
6256 	error = mac_vnode_check_deleteextattr(cred, vp, EXTATTR_NAMESPACE_USER,
6257 	    name);
6258 	if (error != 0)
6259 		goto out;
6260 #endif
6261 
6262 	error = VOP_DELETEEXTATTR(vp, EXTATTR_NAMESPACE_USER, name, cred, p);
6263 	if (error == EOPNOTSUPP)
6264 		error = VOP_SETEXTATTR(vp, EXTATTR_NAMESPACE_USER, name, NULL,
6265 		    cred, p);
6266 out:
6267 	NFSEXITCODE(error);
6268 	return (error);
6269 }
6270 
6271 /*
6272  * List Extended Atribute vnode op into an mbuf list.
6273  */
6274 int
6275 nfsvno_listxattr(struct vnode *vp, uint64_t cookie, struct ucred *cred,
6276     struct thread *p, u_char **bufp, uint32_t *lenp, bool *eofp)
6277 {
6278 	struct iovec iv;
6279 	struct uio io;
6280 	int error;
6281 	size_t siz;
6282 
6283 	*bufp = NULL;
6284 	/* First, find out the size of the extended attribute. */
6285 	error = VOP_LISTEXTATTR(vp, EXTATTR_NAMESPACE_USER, NULL, &siz, cred,
6286 	    p);
6287 	if (error != 0)
6288 		return (NFSERR_NOXATTR);
6289 	if (siz <= cookie) {
6290 		*lenp = 0;
6291 		*eofp = true;
6292 		goto out;
6293 	}
6294 	if (siz > cookie + *lenp) {
6295 		siz = cookie + *lenp;
6296 		*eofp = false;
6297 	} else
6298 		*eofp = true;
6299 	/* Just choose a sanity limit of 10Mbytes for malloc(M_TEMP). */
6300 	if (siz > 10 * 1024 * 1024) {
6301 		error = NFSERR_XATTR2BIG;
6302 		goto out;
6303 	}
6304 	*bufp = malloc(siz, M_TEMP, M_WAITOK);
6305 	iv.iov_base = *bufp;
6306 	iv.iov_len = siz;
6307 	io.uio_iovcnt = 1;
6308 	io.uio_iov = &iv;
6309 	io.uio_offset = 0;
6310 	io.uio_resid = siz;
6311 	io.uio_rw = UIO_READ;
6312 	io.uio_segflg = UIO_SYSSPACE;
6313 	io.uio_td = p;
6314 #ifdef MAC
6315 	error = mac_vnode_check_listextattr(cred, vp, EXTATTR_NAMESPACE_USER);
6316 	if (error != 0)
6317 		goto out;
6318 #endif
6319 
6320 	error = VOP_LISTEXTATTR(vp, EXTATTR_NAMESPACE_USER, &io, NULL, cred,
6321 	    p);
6322 	if (error != 0)
6323 		goto out;
6324 	if (io.uio_resid > 0)
6325 		siz -= io.uio_resid;
6326 	*lenp = siz;
6327 
6328 out:
6329 	if (error != 0) {
6330 		free(*bufp, M_TEMP);
6331 		*bufp = NULL;
6332 	}
6333 	NFSEXITCODE(error);
6334 	return (error);
6335 }
6336 
6337 extern int (*nfsd_call_nfsd)(struct thread *, struct nfssvc_args *);
6338 
6339 /*
6340  * Called once to initialize data structures...
6341  */
6342 static int
6343 nfsd_modevent(module_t mod, int type, void *data)
6344 {
6345 	int error = 0, i;
6346 	static int loaded = 0;
6347 
6348 	switch (type) {
6349 	case MOD_LOAD:
6350 		if (loaded)
6351 			goto out;
6352 		newnfs_portinit();
6353 		for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
6354 			mtx_init(&nfsrchash_table[i].mtx, "nfsrtc", NULL,
6355 			    MTX_DEF);
6356 			mtx_init(&nfsrcahash_table[i].mtx, "nfsrtca", NULL,
6357 			    MTX_DEF);
6358 		}
6359 		mtx_init(&nfsrc_udpmtx, "nfsuc", NULL, MTX_DEF);
6360 		mtx_init(&nfs_v4root_mutex, "nfs4rt", NULL, MTX_DEF);
6361 		mtx_init(&nfsv4root_mnt.mnt_mtx, "nfs4mnt", NULL, MTX_DEF);
6362 		mtx_init(&nfsrv_dontlistlock_mtx, "nfs4dnl", NULL, MTX_DEF);
6363 		mtx_init(&nfsrv_recalllock_mtx, "nfs4rec", NULL, MTX_DEF);
6364 		lockinit(&nfsv4root_mnt.mnt_explock, PVFS, "explock", 0, 0);
6365 		nfsrvd_initcache();
6366 		nfsd_init();
6367 		NFSD_LOCK();
6368 		nfsrvd_init(0);
6369 		NFSD_UNLOCK();
6370 		nfsd_mntinit();
6371 #ifdef VV_DISABLEDELEG
6372 		vn_deleg_ops.vndeleg_recall = nfsd_recalldelegation;
6373 		vn_deleg_ops.vndeleg_disable = nfsd_disabledelegation;
6374 #endif
6375 		nfsd_call_servertimer = nfsrv_servertimer;
6376 		nfsd_call_nfsd = nfssvc_nfsd;
6377 		loaded = 1;
6378 		break;
6379 
6380 	case MOD_UNLOAD:
6381 		if (newnfs_numnfsd != 0) {
6382 			error = EBUSY;
6383 			break;
6384 		}
6385 
6386 #ifdef VV_DISABLEDELEG
6387 		vn_deleg_ops.vndeleg_recall = NULL;
6388 		vn_deleg_ops.vndeleg_disable = NULL;
6389 #endif
6390 		nfsd_call_servertimer = NULL;
6391 		nfsd_call_nfsd = NULL;
6392 
6393 		/* Clean out all NFSv4 state. */
6394 		nfsrv_throwawayallstate(curthread);
6395 
6396 		/* Clean the NFS server reply cache */
6397 		nfsrvd_cleancache();
6398 
6399 		/* Free up the krpc server pool. */
6400 		if (nfsrvd_pool != NULL)
6401 			svcpool_destroy(nfsrvd_pool);
6402 
6403 		/* and get rid of the locks */
6404 		for (i = 0; i < NFSRVCACHE_HASHSIZE; i++) {
6405 			mtx_destroy(&nfsrchash_table[i].mtx);
6406 			mtx_destroy(&nfsrcahash_table[i].mtx);
6407 		}
6408 		mtx_destroy(&nfsrc_udpmtx);
6409 		mtx_destroy(&nfs_v4root_mutex);
6410 		mtx_destroy(&nfsv4root_mnt.mnt_mtx);
6411 		mtx_destroy(&nfsrv_dontlistlock_mtx);
6412 		mtx_destroy(&nfsrv_recalllock_mtx);
6413 		for (i = 0; i < nfsrv_sessionhashsize; i++)
6414 			mtx_destroy(&nfssessionhash[i].mtx);
6415 		if (nfslayouthash != NULL) {
6416 			for (i = 0; i < nfsrv_layouthashsize; i++)
6417 				mtx_destroy(&nfslayouthash[i].mtx);
6418 			free(nfslayouthash, M_NFSDSESSION);
6419 		}
6420 		lockdestroy(&nfsv4root_mnt.mnt_explock);
6421 		free(nfsclienthash, M_NFSDCLIENT);
6422 		free(nfslockhash, M_NFSDLOCKFILE);
6423 		free(nfssessionhash, M_NFSDSESSION);
6424 		loaded = 0;
6425 		break;
6426 	default:
6427 		error = EOPNOTSUPP;
6428 		break;
6429 	}
6430 
6431 out:
6432 	NFSEXITCODE(error);
6433 	return (error);
6434 }
6435 static moduledata_t nfsd_mod = {
6436 	"nfsd",
6437 	nfsd_modevent,
6438 	NULL,
6439 };
6440 DECLARE_MODULE(nfsd, nfsd_mod, SI_SUB_VFS, SI_ORDER_ANY);
6441 
6442 /* So that loader and kldload(2) can find us, wherever we are.. */
6443 MODULE_VERSION(nfsd, 1);
6444 MODULE_DEPEND(nfsd, nfscommon, 1, 1, 1);
6445 MODULE_DEPEND(nfsd, nfslock, 1, 1, 1);
6446 MODULE_DEPEND(nfsd, nfslockd, 1, 1, 1);
6447 MODULE_DEPEND(nfsd, krpc, 1, 1, 1);
6448 MODULE_DEPEND(nfsd, nfssvc, 1, 1, 1);
6449 
6450