xref: /freebsd/contrib/lib9p/backend/fs.c (revision e6bfd18d21b225af6a0ed67ceeaf1293b7b9eba5)
1 /*
2  * Copyright 2016 Jakub Klama <jceel@FreeBSD.org>
3  * All rights reserved
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted providing that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
15  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
18  * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
22  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
23  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
24  * POSSIBILITY OF SUCH DAMAGE.
25  *
26  */
27 
28 /*
29  * Based on libixp code: ©2007-2010 Kris Maglione <maglione.k at Gmail>
30  */
31 
32 #include <stdlib.h>
33 #include <string.h>
34 #include <unistd.h>
35 #include <stdbool.h>
36 #include <fcntl.h>
37 #include <errno.h>
38 #include <assert.h>
39 #include <sys/types.h>
40 #include <sys/stat.h>
41 #include <sys/mount.h>
42 #include <sys/param.h>
43 #include <sys/queue.h>
44 #include <sys/socket.h>
45 #include <sys/un.h>
46 #include <dirent.h>
47 #include <pwd.h>
48 #include <grp.h>
49 #include <libgen.h>
50 #include <pthread.h>
51 #include "../lib9p.h"
52 #include "../lib9p_impl.h"
53 #include "../fid.h"
54 #include "../log.h"
55 #include "../rfuncs.h"
56 #include "../genacl.h"
57 #include "backend.h"
58 #include "fs.h"
59 
60 #if defined(WITH_CASPER)
61   #include <libcasper.h>
62   #include <casper/cap_pwd.h>
63   #include <casper/cap_grp.h>
64 #endif
65 
66 #if defined(__FreeBSD__)
67   #include <sys/param.h>
68   #if __FreeBSD_version >= 1000000
69     #define	HAVE_BINDAT
70   #endif
71 #endif
72 
73 #if defined(__FreeBSD__)
74   #define	HAVE_BIRTHTIME
75 #endif
76 
77 #if defined(__APPLE__)
78   #include <sys/syscall.h>
79   #include "Availability.h"
80   #define ACL_TYPE_NFS4 ACL_TYPE_EXTENDED
81 #endif
82 
/*
 * Backend-wide state.  fs_attach() receives it as its opaque
 * (void *) softc argument and casts it back to this type.
 */
struct fs_softc {
	int 	fs_rootfd;	/* directory fd: fstat()ed in fs_attach() and used as the root fid's dirfd */
	bool	fs_readonly;	/* when set, fs_nde() refuses new entries with EROFS */
#if defined(WITH_CASPER)
	cap_channel_t *fs_cappwd;	/* channel used for passwd lookups */
	cap_channel_t *fs_capgrp;	/* channel used for group lookups */
#endif
};
91 
/*
 * Per-fid data, reached through the lo_aux pointer of a
 * struct l9p_fid and allocated by open_fid().
 */
struct fs_fid {
	DIR	*ff_dir;	/* open directory stream if any; fs_clunk() closedir()s it */
	int	ff_dirfd;	/* directory fd that ff_name is resolved against */
	int	ff_fd;		/* open file descriptor, or -1 if none */
	int	ff_flags;	/* FF_* flags, see below */
	char	*ff_name;	/* strdup()ed path name (set in open_fid()) */
	struct fs_authinfo *ff_ai;	/* authinfo; open_fid() takes a reference */
	pthread_mutex_t ff_mtx;	/* initialized in open_fid(); NOTE(review): what it guards is not visible here */
	struct l9p_acl *ff_acl; /* cached ACL if any */
};

#define	FF_NO_NFSV4_ACL	0x01	/* don't go looking for NFSv4 ACLs */
/*	FF_NO_POSIX_ACL	0x02	-- not yet */
105 
106 /*
107  * Our authinfo consists of:
108  *
109  *  - a reference count
110  *  - a uid
111  *  - a gid-set
112  *
113  * The "default" gid is the first gid in the gid-set, provided the
114  * set size is at least 1.  The set-size may be zero, though.
115  *
116  * Adjustments to the ref-count must be atomic, once it's shared.
117  * It would be nice to use C11 atomics here but they are not common
118  * enough to all systems just yet; for now, we use a mutex.
119  *
120  * Note that some ops (Linux style ones) pass an effective gid for
121  * the op, in which case, that gid may override.  To achieve this
122  * effect, permissions testing functions also take an extra gid.
123  * If this gid is (gid_t)-1 it is not used and only the remaining
124  * gids take part.
125  *
126  * The uid may also be (uid_t)-1, meaning "no uid was available
127  * at all at attach time".  In this case, new files inherit parent
128  * directory uids.
129  *
130  * The refcount is simply the number of "openfile"s using this
131  * authinfo (so that when the last ref goes away, we can free it).
132  *
133  * There are also master ACL flags (same as in ff_flags).
134  */
struct fs_authinfo {
	pthread_mutex_t ai_mtx;	/* lock for refcnt */
	uint32_t ai_refcnt;	/* incremented under ai_mtx by open_fid() */
	int	ai_flags;	/* master ACL flags; dropacl() copies them to ff_flags */
	uid_t	ai_uid;		/* uid established by fs_attach() */
	int	ai_ngids;	/* number of valid entries in ai_gids[] (may be 0) */
	gid_t	ai_gids[];	/* NB: flexible array member */
};
143 
144 /*
145  * We have a global-static mutex for single-threading Tattach
146  * requests, which use getpwnam (and indirectly, getgr* functions)
147  * which are not reentrant.
148  */
/*
 * NOTE(review): presumably records that fs_attach_mutex has been
 * initialized; the code that sets it is not in this part of the file.
 */
static bool fs_attach_mutex_inited;
/* Held by fs_attach() around its passwd and group lookups. */
static pthread_mutex_t fs_attach_mutex;
151 
152 /*
153  * Internal functions (except inline functions).
154  */
155 static struct passwd *fs_getpwuid(struct fs_softc *, uid_t, struct r_pgdata *);
156 static struct group *fs_getgrgid(struct fs_softc *, gid_t, struct r_pgdata *);
157 static int fs_buildname(struct l9p_fid *, char *, char *, size_t);
158 static int fs_pdir(struct fs_softc *, struct l9p_fid *, char *, size_t,
159     struct stat *st);
160 static int fs_dpf(char *, char *, size_t);
161 static int fs_oflags_dotu(int, int *);
162 static int fs_oflags_dotl(uint32_t, int *, enum l9p_omode *);
163 static int fs_nde(struct fs_softc *, struct l9p_fid *, bool, gid_t,
164     struct stat *, uid_t *, gid_t *);
165 static struct fs_fid *open_fid(int, const char *, struct fs_authinfo *, bool);
166 static void dostat(struct fs_softc *, struct l9p_stat *, char *,
167     struct stat *, bool dotu);
168 static void dostatfs(struct l9p_statfs *, struct statfs *, long);
169 static void fillacl(struct fs_fid *ff);
170 static struct l9p_acl *getacl(struct fs_fid *ff, int fd, const char *path);
171 static void dropacl(struct fs_fid *ff);
172 static struct l9p_acl *look_for_nfsv4_acl(struct fs_fid *ff, int fd,
173     const char *path);
174 static int check_access(int32_t,
175     struct l9p_acl *, struct stat *, struct l9p_acl *, struct stat *,
176     struct fs_authinfo *, gid_t);
177 static void generate_qid(struct stat *, struct l9p_qid *);
178 
179 static int fs_icreate(void *, struct l9p_fid *, char *, int,
180     bool, mode_t, gid_t, struct stat *);
181 static int fs_iopen(void *, struct l9p_fid *, int, enum l9p_omode,
182     gid_t, struct stat *);
183 static int fs_imkdir(void *, struct l9p_fid *, char *,
184     bool, mode_t, gid_t, struct stat *);
185 static int fs_imkfifo(void *, struct l9p_fid *, char *,
186     bool, mode_t, gid_t, struct stat *);
187 static int fs_imknod(void *, struct l9p_fid *, char *,
188     bool, mode_t, dev_t, gid_t, struct stat *);
189 static int fs_imksocket(void *, struct l9p_fid *, char *,
190     bool, mode_t, gid_t, struct stat *);
191 static int fs_isymlink(void *, struct l9p_fid *, char *, char *,
192     gid_t, struct stat *);
193 
194 /*
195  * Internal functions implementing backend.
196  */
197 static int fs_attach(void *, struct l9p_request *);
198 static int fs_clunk(void *, struct l9p_fid *);
199 static int fs_create(void *, struct l9p_request *);
200 static int fs_open(void *, struct l9p_request *);
201 static int fs_read(void *, struct l9p_request *);
202 static int fs_remove(void *, struct l9p_fid *);
203 static int fs_stat(void *, struct l9p_request *);
204 static int fs_walk(void *, struct l9p_request *);
205 static int fs_write(void *, struct l9p_request *);
206 static int fs_wstat(void *, struct l9p_request *);
207 static int fs_statfs(void *, struct l9p_request *);
208 static int fs_lopen(void *, struct l9p_request *);
209 static int fs_lcreate(void *, struct l9p_request *);
210 static int fs_symlink(void *, struct l9p_request *);
211 static int fs_mknod(void *, struct l9p_request *);
212 static int fs_rename(void *, struct l9p_request *);
213 static int fs_readlink(void *, struct l9p_request *);
214 static int fs_getattr(void *, struct l9p_request *);
215 static int fs_setattr(void *, struct l9p_request *);
216 static int fs_xattrwalk(void *, struct l9p_request *);
217 static int fs_xattrcreate(void *, struct l9p_request *);
218 static int fs_readdir(void *, struct l9p_request *);
219 static int fs_fsync(void *, struct l9p_request *);
220 static int fs_lock(void *, struct l9p_request *);
221 static int fs_getlock(void *, struct l9p_request *);
222 static int fs_link(void *, struct l9p_request *);
223 static int fs_renameat(void *, struct l9p_request *);
224 static int fs_unlinkat(void *, struct l9p_request *);
225 static void fs_freefid(void *, struct l9p_fid *);
226 
227 /*
228  * Convert from 9p2000 open/create mode to Unix-style O_* flags.
229  * This includes 9p2000.u extensions, but not 9p2000.L protocol,
230  * which has entirely different open, create, etc., flag bits.
231  *
232  * The <mode> given here is the one-byte (uint8_t) "mode"
233  * argument to Tcreate or Topen, so it can have at most 8 bits.
234  *
235  * https://swtch.com/plan9port/man/man9/open.html and
236  * http://plan9.bell-labs.com/magic/man2html/5/open
237  * both say:
238  *
239  *   The [low two bits of the] mode field determines the
240  *   type of I/O ... [I]f mode has the OTRUNC (0x10) bit
241  *   set, the file is to be truncated, which requires write
242  *   permission ...; if the mode has the ORCLOSE (0x40) bit
243  *   set, the file is to be removed when the fid is clunked,
244  *   which requires permission to remove the file from its
245  *   directory.  All other bits in mode should be zero.  It
246  *   is illegal to write a directory, truncate it, or
247  *   attempt to remove it on close.
248  *
249  * 9P2000.u may add ODIRECT (0x80); this is not completely clear.
250  * The fcall.h header defines OCEXEC (0x20) as well, but it makes
251  * no sense to send this to a server.  There seem to be no bits
252  * 0x04 and 0x08.
253  *
254  * We always turn on O_NOCTTY since as a server, we never want
255  * to gain a controlling terminal.  We always turn on O_NOFOLLOW
256  * for reasons described elsewhere.
257  */
258 static int
259 fs_oflags_dotu(int mode, int *aflags)
260 {
261 	int flags;
262 #define	CONVERT(theirs, ours) \
263 	do { \
264 		if (mode & (theirs)) { \
265 			mode &= ~(theirs); \
266 			flags |= ours; \
267 		} \
268 	} while (0)
269 
270 	switch (mode & L9P_OACCMODE) {
271 
272 	case L9P_OREAD:
273 	default:
274 		flags = O_RDONLY;
275 		break;
276 
277 	case L9P_OWRITE:
278 		flags = O_WRONLY;
279 		break;
280 
281 	case L9P_ORDWR:
282 		flags = O_RDWR;
283 		break;
284 
285 	case L9P_OEXEC:
286 		if (mode & L9P_OTRUNC)
287 			return (EINVAL);
288 		flags = O_RDONLY;
289 		break;
290 	}
291 
292 	flags |= O_NOCTTY | O_NOFOLLOW;
293 
294 	CONVERT(L9P_OTRUNC, O_TRUNC);
295 
296 	/*
297 	 * Now take away some flags locally:
298 	 *   the access mode (already translated)
299 	 *   ORCLOSE - caller only
300 	 *   OCEXEC - makes no sense in server
301 	 *   ODIRECT - not applicable here
302 	 * If there are any flag bits left after this,
303 	 * we were unable to translate them.  For now, let's
304 	 * treat this as EINVAL so that we can catch problems.
305 	 */
306 	mode &= ~(L9P_OACCMODE | L9P_ORCLOSE | L9P_OCEXEC | L9P_ODIRECT);
307 	if (mode != 0) {
308 		L9P_LOG(L9P_INFO,
309 		    "fs_oflags_dotu: untranslated bits: %#x",
310 		    (unsigned)mode);
311 		return (EINVAL);
312 	}
313 
314 	*aflags = flags;
315 	return (0);
316 #undef CONVERT
317 }
318 
319 /*
320  * Convert from 9P2000.L (Linux) open mode bits to O_* flags.
321  * See fs_oflags_dotu above.
322  *
323  * Linux currently does not have open-for-exec, but there is a
324  * proposal for it using O_PATH|O_NOFOLLOW, now handled here.
325  *
326  * We may eventually also set L9P_ORCLOSE for L_O_TMPFILE.
327  */
328 static int
329 fs_oflags_dotl(uint32_t l_mode, int *aflags, enum l9p_omode *ap9)
330 {
331 	int flags;
332 	enum l9p_omode p9;
333 #define	CLEAR(theirs)	l_mode &= ~(uint32_t)(theirs)
334 #define	CONVERT(theirs, ours) \
335 	do { \
336 		if (l_mode & (theirs)) { \
337 			CLEAR(theirs); \
338 			flags |= ours; \
339 		} \
340 	} while (0)
341 
342 	/*
343 	 * Linux O_RDONLY, O_WRONLY, O_RDWR (0,1,2) match BSD/MacOS.
344 	 */
345 	flags = l_mode & O_ACCMODE;
346 	if (flags == 3)
347 		return (EINVAL);
348 	CLEAR(O_ACCMODE);
349 
350 	if ((l_mode & (L9P_L_O_PATH | L9P_L_O_NOFOLLOW)) ==
351 		    (L9P_L_O_PATH | L9P_L_O_NOFOLLOW)) {
352 		CLEAR(L9P_L_O_PATH | L9P_L_O_NOFOLLOW);
353 		p9 = L9P_OEXEC;
354 	} else {
355 		/*
356 		 * Slightly dirty, but same dirt, really, as
357 		 * setting flags from l_mode & O_ACCMODE.
358 		 */
359 		p9 = (enum l9p_omode)flags;	/* slightly dirty */
360 	}
361 
362 	/* turn L_O_TMPFILE into L9P_ORCLOSE in *p9? */
363 	if (l_mode & L9P_L_O_TRUNC)
364 		p9 |= L9P_OTRUNC;	/* but don't CLEAR yet */
365 
366 	flags |= O_NOCTTY | O_NOFOLLOW;
367 
368 	/*
369 	 * L_O_CREAT seems to be noise, since we get separate open
370 	 * and create.  But it is actually set sometimes.  We just
371 	 * throw it out here; create ops must set it themselves and
372 	 * open ops have no permissions bits and hence cannot create.
373 	 *
374 	 * L_O_EXCL does make sense on create ops, i.e., we can
375 	 * take a create op with or without L_O_EXCL.  We pass that
376 	 * through.
377 	 */
378 	CLEAR(L9P_L_O_CREAT);
379 	CONVERT(L9P_L_O_EXCL, O_EXCL);
380 	CONVERT(L9P_L_O_TRUNC, O_TRUNC);
381 	CONVERT(L9P_L_O_DIRECTORY, O_DIRECTORY);
382 	CONVERT(L9P_L_O_APPEND, O_APPEND);
383 	CONVERT(L9P_L_O_NONBLOCK, O_NONBLOCK);
384 
385 	/*
386 	 * Discard these as useless noise at our (server) end.
387 	 * (NOATIME might be useful but we can only set it on a
388 	 * per-mount basis.)
389 	 */
390 	CLEAR(L9P_L_O_CLOEXEC);
391 	CLEAR(L9P_L_O_DIRECT);
392 	CLEAR(L9P_L_O_DSYNC);
393 	CLEAR(L9P_L_O_FASYNC);
394 	CLEAR(L9P_L_O_LARGEFILE);
395 	CLEAR(L9P_L_O_NOATIME);
396 	CLEAR(L9P_L_O_NOCTTY);
397 	CLEAR(L9P_L_O_NOFOLLOW);
398 	CLEAR(L9P_L_O_SYNC);
399 
400 	if (l_mode != 0) {
401 		L9P_LOG(L9P_INFO,
402 		    "fs_oflags_dotl: untranslated bits: %#x",
403 		    (unsigned)l_mode);
404 		return (EINVAL);
405 	}
406 
407 	*aflags = flags;
408 	*ap9 = p9;
409 	return (0);
410 #undef CLEAR
411 #undef CONVERT
412 }
413 
/*
 * Look up the passwd entry for <uid>, going through the Casper
 * channel when built WITH_CASPER and through r_getpwuid()
 * otherwise.  <pg> is handed to the lookup routine; callers pass
 * it to r_pgfree() when done with the result.
 */
static struct passwd *
fs_getpwuid(struct fs_softc *sc, uid_t uid, struct r_pgdata *pg)
{
	struct passwd *pw;

#if defined(WITH_CASPER)
	pw = r_cap_getpwuid(sc->fs_cappwd, uid, pg);
#else
	(void)sc;	/* only needed for the Casper channel */
	pw = r_getpwuid(uid, pg);
#endif
	return (pw);
}
424 
/*
 * Look up the group entry for <gid>, going through the Casper
 * channel when built WITH_CASPER and through r_getgrgid()
 * otherwise.  <pg> is handed to the lookup routine; callers pass
 * it to r_pgfree() when done with the result.
 */
static struct group *
fs_getgrgid(struct fs_softc *sc, gid_t gid, struct r_pgdata *pg)
{
	struct group *gr;

#if defined(WITH_CASPER)
	gr = r_cap_getgrgid(sc->fs_capgrp, gid, pg);
#else
	(void)sc;	/* only needed for the Casper channel */
	gr = r_getgrgid(gid, pg);
#endif
	return (gr);
}
435 
436 /*
437  * Build full name of file by appending given name to directory name.
438  */
439 static int
440 fs_buildname(struct l9p_fid *dir, char *name, char *buf, size_t size)
441 {
442 	struct fs_fid *dirf = dir->lo_aux;
443 	size_t dlen, nlen1;
444 
445 	assert(dirf != NULL);
446 	dlen = strlen(dirf->ff_name);
447 	nlen1 = strlen(name) + 1;	/* +1 for '\0' */
448 	if (dlen + 1 + nlen1 > size)
449 		return (ENAMETOOLONG);
450 	memcpy(buf, dirf->ff_name, dlen);
451 	buf[dlen] = '/';
452 	memcpy(buf + dlen + 1, name, nlen1);
453 	return (0);
454 }
455 
456 /*
457  * Build parent name of file by splitting it off.  Return an error
458  * if the given fid represents the root, so that there is no such
459  * parent, or if the discovered parent is not a directory.
460  */
461 static int
462 fs_pdir(struct fs_softc *sc __unused, struct l9p_fid *fid, char *buf,
463     size_t size, struct stat *st)
464 {
465 	struct fs_fid *ff;
466 	char *path;
467 
468 	ff = fid->lo_aux;
469 	assert(ff != NULL);
470 	path = ff->ff_name;
471 	path = r_dirname(path, buf, size);
472 	if (path == NULL)
473 		return (ENAMETOOLONG);
474 	if (fstatat(ff->ff_dirfd, path, st, AT_SYMLINK_NOFOLLOW) != 0)
475 		return (errno);
476 	if (!S_ISDIR(st->st_mode))
477 		return (ENOTDIR);
478 	return (0);
479 }
480 
/*
 * "Directory plus-equals file": append "/<fname>" to the directory
 * name already stored in <dbuf>, i.e. the bounded equivalent of
 *     strcat(dbuf, "/");
 *     strcat(dbuf, fname);
 * <size> is the total capacity of <dbuf>.  Returns 0 on success,
 * or ENAMETOOLONG (leaving <dbuf> unchanged) when the combined
 * name plus its '\0' would not fit.
 */
static int
fs_dpf(char *dbuf, char *fname, size_t size)
{
	const size_t dirlen = strlen(dbuf);
	const size_t namelen = strlen(fname);

	/* directory + '/' + file name + '\0' */
	if (dirlen + 1 + namelen + 1 > size)
		return (ENAMETOOLONG);
	dbuf[dirlen] = '/';
	memcpy(&dbuf[dirlen + 1], fname, namelen + 1);
	return (0);
}
503 
504 /*
505  * Prepare to create a new directory entry (open with O_CREAT,
506  * mkdir, etc -- any operation that creates a new inode),
507  * operating in parent data <dir>, based on authinfo <ai> and
508  * effective gid <egid>.
509  *
510  * The new entity should be owned by user/group <*nuid, *ngid>,
511  * if it's really a new entity.  It will be a directory if isdir.
512  *
513  * Returns an error number if the entry should not be created
514  * (e.g., read-only file system or no permission to write in
515  * parent directory).  Always sets *nuid and *ngid on success:
516  * in the worst case, when there is no available ID, this will
517  * use the parent directory's IDs.  Fills in <*st> on success.
518  */
519 static int
520 fs_nde(struct fs_softc *sc, struct l9p_fid *dir, bool isdir, gid_t egid,
521     struct stat *st, uid_t *nuid, gid_t *ngid)
522 {
523 	struct fs_fid *dirf;
524 	struct fs_authinfo *ai;
525 	int32_t op;
526 	int error;
527 
528 	if (sc->fs_readonly)
529 		return (EROFS);
530 	dirf = dir->lo_aux;
531 	assert(dirf != NULL);
532 	if (fstatat(dirf->ff_dirfd, dirf->ff_name, st,
533 	    AT_SYMLINK_NOFOLLOW) != 0)
534 		return (errno);
535 	if (!S_ISDIR(st->st_mode))
536 		return (ENOTDIR);
537 	dirf = dir->lo_aux;
538 	ai = dirf->ff_ai;
539 	fillacl(dirf);
540 	op = isdir ? L9P_ACE_ADD_SUBDIRECTORY : L9P_ACE_ADD_FILE;
541 	error = check_access(op, dirf->ff_acl, st, NULL, NULL, ai, egid);
542 	if (error)
543 		return (EPERM);
544 
545 	*nuid = ai->ai_uid != (uid_t)-1 ? ai->ai_uid : st->st_uid;
546 	*ngid = egid != (gid_t)-1 ? egid :
547 	    ai->ai_ngids > 0 ?  ai->ai_gids[0] : st->st_gid;
548 	return (0);
549 }
550 
551 /*
552  * Allocate new open-file data structure to attach to a fid.
553  *
554  * The new file's authinfo is the same as the old one's, and
555  * we gain a reference.
556  */
557 static struct fs_fid *
558 open_fid(int dirfd, const char *path, struct fs_authinfo *ai, bool creating)
559 {
560 	struct fs_fid *ret;
561 	uint32_t newcount;
562 	int error;
563 
564 	ret = l9p_calloc(1, sizeof(*ret));
565 	error = pthread_mutex_init(&ret->ff_mtx, NULL);
566 	if (error) {
567 		free(ret);
568 		return (NULL);
569 	}
570 	ret->ff_fd = -1;
571 	ret->ff_dirfd = dirfd;
572 	ret->ff_name = strdup(path);
573 	if (ret->ff_name == NULL) {
574 		pthread_mutex_destroy(&ret->ff_mtx);
575 		free(ret);
576 		return (NULL);
577 	}
578 	pthread_mutex_lock(&ai->ai_mtx);
579 	newcount = ++ai->ai_refcnt;
580 	pthread_mutex_unlock(&ai->ai_mtx);
581 	/*
582 	 * If we just incremented the count to 1, we're the *first*
583 	 * reference.  This is only allowed when creating the authinfo,
584 	 * otherwise it means something has gone wrong.  This cannot
585 	 * catch every bad (re)use of a freed authinfo but it may catch
586 	 * a few.
587 	 */
588 	assert(newcount > 1 || creating);
589 	L9P_LOG(L9P_DEBUG, "authinfo %p now used by %lu",
590 	    (void *)ai, (u_long)newcount);
591 	ret->ff_ai = ai;
592 	return (ret);
593 }
594 
595 static void
596 dostat(struct fs_softc *sc, struct l9p_stat *s, char *name,
597     struct stat *buf, bool dotu)
598 {
599 	struct passwd *user;
600 	struct group *group;
601 
602 	memset(s, 0, sizeof(struct l9p_stat));
603 
604 	generate_qid(buf, &s->qid);
605 
606 	s->type = 0;
607 	s->dev = 0;
608 	s->mode = buf->st_mode & 0777;
609 
610 	if (S_ISDIR(buf->st_mode))
611 		s->mode |= L9P_DMDIR;
612 
613 	if (S_ISLNK(buf->st_mode) && dotu)
614 		s->mode |= L9P_DMSYMLINK;
615 
616 	if (S_ISCHR(buf->st_mode) || S_ISBLK(buf->st_mode))
617 		s->mode |= L9P_DMDEVICE;
618 
619 	if (S_ISSOCK(buf->st_mode))
620 		s->mode |= L9P_DMSOCKET;
621 
622 	if (S_ISFIFO(buf->st_mode))
623 		s->mode |= L9P_DMNAMEDPIPE;
624 
625 	s->atime = (uint32_t)buf->st_atime;
626 	s->mtime = (uint32_t)buf->st_mtime;
627 	s->length = (uint64_t)buf->st_size;
628 
629 	s->name = r_basename(name, NULL, 0);
630 
631 	if (!dotu) {
632 		struct r_pgdata udata, gdata;
633 
634 		user = fs_getpwuid(sc, buf->st_uid, &udata);
635 		group = fs_getgrgid(sc, buf->st_gid, &gdata);
636 		s->uid = user != NULL ? strdup(user->pw_name) : NULL;
637 		s->gid = group != NULL ? strdup(group->gr_name) : NULL;
638 		s->muid = user != NULL ? strdup(user->pw_name) : NULL;
639 		r_pgfree(&udata);
640 		r_pgfree(&gdata);
641 	} else {
642 		/*
643 		 * When using 9P2000.u, we don't need to bother about
644 		 * providing user and group names in textual form.
645 		 *
646 		 * NB: if the asprintf()s fail, s->extension should
647 		 * be unset so we can ignore these.
648 		 */
649 		s->n_uid = buf->st_uid;
650 		s->n_gid = buf->st_gid;
651 		s->n_muid = buf->st_uid;
652 
653 		if (S_ISLNK(buf->st_mode)) {
654 			char target[MAXPATHLEN];
655 			ssize_t ret = readlink(name, target, MAXPATHLEN);
656 
657 			if (ret < 0) {
658 				s->extension = NULL;
659 				return;
660 			}
661 
662 			s->extension = strndup(target, (size_t)ret);
663 		}
664 
665 		if (S_ISBLK(buf->st_mode)) {
666 			asprintf(&s->extension, "b %d %d", major(buf->st_rdev),
667 			    minor(buf->st_rdev));
668 		}
669 
670 		if (S_ISCHR(buf->st_mode)) {
671 			asprintf(&s->extension, "c %d %d", major(buf->st_rdev),
672 			    minor(buf->st_rdev));
673 		}
674 	}
675 }
676 
677 static void dostatfs(struct l9p_statfs *out, struct statfs *in, long namelen)
678 {
679 
680 	out->type = L9P_FSTYPE;
681 	out->bsize = in->f_bsize;
682 	out->blocks = in->f_blocks;
683 	out->bfree = in->f_bfree;
684 	out->bavail = in->f_bavail;
685 	out->files = in->f_files;
686 	out->ffree = in->f_ffree;
687 	out->namelen = (uint32_t)namelen;
688 	out->fsid = ((uint64_t)in->f_fsid.val[0] << 32) |
689 	    (uint64_t)in->f_fsid.val[1];
690 }
691 
692 static void
693 generate_qid(struct stat *buf, struct l9p_qid *qid)
694 {
695 	qid->path = buf->st_ino;
696 	qid->version = 0;
697 
698 	if (S_ISREG(buf->st_mode))
699 		qid->type |= L9P_QTFILE;
700 
701 	if (S_ISDIR(buf->st_mode))
702 		qid->type |= L9P_QTDIR;
703 
704 	if (S_ISLNK(buf->st_mode))
705 		qid->type |= L9P_QTSYMLINK;
706 }
707 
708 /*
709  * Fill in ff->ff_acl if it's not set yet.  Skip if the "don't use
710  * ACLs" flag is set, and use the flag to remember failure so
711  * we don't bother retrying either.
712  */
713 static void
714 fillacl(struct fs_fid *ff)
715 {
716 
717 	if (ff->ff_acl == NULL && (ff->ff_flags & FF_NO_NFSV4_ACL) == 0) {
718 		ff->ff_acl = look_for_nfsv4_acl(ff, ff->ff_fd, ff->ff_name);
719 		if (ff->ff_acl == NULL)
720 			ff->ff_flags |= FF_NO_NFSV4_ACL;
721 	}
722 }
723 
724 /*
725  * Get an ACL given fd and/or path name.  We check for the "don't get
726  * ACL" flag in the given ff_fid data structure first, but don't set
727  * the flag here.  The fillacl() code is similar but will set the
728  * flag; it also uses the ff_fd and ff_name directly.
729  *
730  * (This is used to get ACLs for parent directories, for instance.)
731  */
732 static struct l9p_acl *
733 getacl(struct fs_fid *ff, int fd, const char *path)
734 {
735 
736 	if (ff->ff_flags & FF_NO_NFSV4_ACL)
737 		return (NULL);
738 	return look_for_nfsv4_acl(ff, fd, path);
739 }
740 
741 /*
742  * Drop cached ff->ff_acl, e.g., after moving from one directory to
743  * another, where inherited ACLs might change.
744  */
745 static void
746 dropacl(struct fs_fid *ff)
747 {
748 
749 	l9p_acl_free(ff->ff_acl);
750 	ff->ff_acl = NULL;
751 	ff->ff_flags = ff->ff_ai->ai_flags;
752 }
753 
754 /*
755  * Check to see if we can find NFSv4 ACLs for the given file.
756  * If we have an open fd, we can use that, otherwise we need
757  * to use the path.
758  */
759 static struct l9p_acl *
760 look_for_nfsv4_acl(struct fs_fid *ff, int fd, const char *path)
761 {
762 	struct l9p_acl *acl;
763 	acl_t sysacl;
764 	int doclose = 0;
765 
766 	if (fd < 0) {
767 		fd = openat(ff->ff_dirfd, path, 0);
768 		doclose = 1;
769 	}
770 
771 	sysacl = acl_get_fd_np(fd, ACL_TYPE_NFS4);
772 	if (sysacl == NULL) {
773 		/*
774 		 * EINVAL means no NFSv4 ACLs apply for this file.
775 		 * Other error numbers indicate some kind of problem.
776 		 */
777 		if (errno != EINVAL) {
778 			L9P_LOG(L9P_ERROR,
779 			    "error retrieving NFSv4 ACL from "
780 			    "fdesc %d (%s): %s", fd,
781 			    path, strerror(errno));
782 		}
783 
784 		if (doclose)
785 			close(fd);
786 
787 		return (NULL);
788 	}
789 #if defined(HAVE_FREEBSD_ACLS)
790 	acl = l9p_freebsd_nfsv4acl_to_acl(sysacl);
791 #else
792 	acl = NULL; /* XXX need a l9p_darwin_acl_to_acl */
793 #endif
794 	acl_free(sysacl);
795 
796 	if (doclose)
797 		close(fd);
798 
799 	return (acl);
800 }
801 
802 /*
803  * Verify that the user whose authinfo is in <ai> and effective
804  * group ID is <egid> ((gid_t)-1 means no egid supplied) has
805  * permission to do something.
806  *
807  * The "something" may be rather complex: we allow NFSv4 style
808  * operation masks here, and provide parent and child ACLs and
809  * stat data.  At most one of pacl+pst and cacl+cst can be NULL,
810  * unless ACLs are not supported; then pacl and cacl can both
811  * be NULL but pst or cst must be non-NULL depending on the
812  * operation.
813  */
814 static int
815 check_access(int32_t opmask,
816     struct l9p_acl *pacl, struct stat *pst,
817     struct l9p_acl *cacl, struct stat *cst,
818     struct fs_authinfo *ai, gid_t egid)
819 {
820 	struct l9p_acl_check_args args;
821 
822 	/*
823 	 * If we have ACLs, use them exclusively, ignoring Unix
824 	 * permissions.  Otherwise, fall back on stat st_mode
825 	 * bits, and allow super-user as well.
826 	 */
827 	args.aca_uid = ai->ai_uid;
828 	args.aca_gid = egid;
829 	args.aca_groups = ai->ai_gids;
830 	args.aca_ngroups = (size_t)ai->ai_ngids;
831 	args.aca_parent = pacl;
832 	args.aca_pstat = pst;
833 	args.aca_child = cacl;
834 	args.aca_cstat = cst;
835 	args.aca_aclmode = pacl == NULL && cacl == NULL
836 	    ? L9P_ACM_STAT_MODE
837 	    : L9P_ACM_NFS_ACL | L9P_ACM_ZFS_ACL;
838 
839 	args.aca_superuser = true;
840 	return (l9p_acl_check_access(opmask, &args));
841 }
842 
843 static int
844 fs_attach(void *softc, struct l9p_request *req)
845 {
846 	struct fs_authinfo *ai;
847 	struct fs_softc *sc = (struct fs_softc *)softc;
848 	struct fs_fid *file;
849 	struct passwd *pwd;
850 	struct stat st;
851 	struct r_pgdata udata;
852 	uint32_t n_uname;
853 	gid_t *gids;
854 	uid_t uid;
855 	int error;
856 	int ngroups;
857 
858 	assert(req->lr_fid != NULL);
859 
860 	/*
861 	 * Single-thread pwd/group related items.  We have a reentrant
862 	 * r_getpwuid but not a reentrant r_getpwnam, and l9p_getgrlist
863 	 * may use non-reentrant C library getgr* routines.
864 	 */
865 	pthread_mutex_lock(&fs_attach_mutex);
866 
867 	n_uname = req->lr_req.tattach.n_uname;
868 	if (n_uname != L9P_NONUNAME) {
869 		uid = (uid_t)n_uname;
870 		pwd = fs_getpwuid(sc, uid, &udata);
871 		if (pwd == NULL)
872 			L9P_LOG(L9P_DEBUG,
873 			    "Tattach: uid %ld: no such user", (long)uid);
874 	} else {
875 		uid = (uid_t)-1;
876 #if defined(WITH_CASPER)
877 		pwd = cap_getpwnam(sc->fs_cappwd, req->lr_req.tattach.uname);
878 #else
879 		pwd = getpwnam(req->lr_req.tattach.uname);
880 #endif
881 		if (pwd == NULL)
882 			L9P_LOG(L9P_DEBUG,
883 			    "Tattach: %s: no such user",
884 			    req->lr_req.tattach.uname);
885 	}
886 
887 	/*
888 	 * If caller didn't give a numeric UID, pick it up from pwd
889 	 * if possible.  If that doesn't work we can't continue.
890 	 *
891 	 * Note that pwd also supplies the group set.  This assumes
892 	 * the server has the right mapping; this needs improvement.
893 	 * We do at least support ai->ai_ngids==0 properly now though.
894 	 */
895 	if (uid == (uid_t)-1 && pwd != NULL)
896 		uid = pwd->pw_uid;
897 	if (uid == (uid_t)-1)
898 		error = EPERM;
899 	else {
900 		error = 0;
901 		if (fstat(sc->fs_rootfd, &st) != 0)
902 			error = errno;
903 		else if (!S_ISDIR(st.st_mode))
904 			error = ENOTDIR;
905 	}
906 	if (error) {
907 		pthread_mutex_unlock(&fs_attach_mutex);
908 		L9P_LOG(L9P_DEBUG,
909 		    "Tattach: denying uid=%ld access to rootdir: %s",
910 		    (long)uid, strerror(error));
911 		/*
912 		 * Pass ENOENT and ENOTDIR through for diagnosis;
913 		 * others become EPERM.  This should not leak too
914 		 * much security.
915 		 */
916 		return (error == ENOENT || error == ENOTDIR ? error : EPERM);
917 	}
918 
919 	if (pwd != NULL) {
920 		/*
921 		 * This either succeeds and fills in ngroups and
922 		 * returns non-NULL, or fails and sets ngroups to 0
923 		 * and returns NULL.  Either way ngroups is correct.
924 		 */
925 		gids = l9p_getgrlist(pwd->pw_name, pwd->pw_gid, &ngroups);
926 	} else {
927 		gids = NULL;
928 		ngroups = 0;
929 	}
930 
931 	/*
932 	 * Done with pwd and group related items that may use
933 	 * non-reentrant C library routines; allow other threads in.
934 	 */
935 	pthread_mutex_unlock(&fs_attach_mutex);
936 
937 	ai = malloc(sizeof(*ai) + (size_t)ngroups * sizeof(gid_t));
938 	if (ai == NULL) {
939 		free(gids);
940 		return (ENOMEM);
941 	}
942 	error = pthread_mutex_init(&ai->ai_mtx, NULL);
943 	if (error) {
944 		free(gids);
945 		free(ai);
946 		return (error);
947 	}
948 	ai->ai_refcnt = 0;
949 	ai->ai_uid = uid;
950 	ai->ai_flags = 0;	/* XXX for now */
951 	ai->ai_ngids = ngroups;
952 	memcpy(ai->ai_gids, gids, (size_t)ngroups * sizeof(gid_t));
953 	free(gids);
954 
955 	file = open_fid(sc->fs_rootfd, ".", ai, true);
956 	if (file == NULL) {
957 		pthread_mutex_destroy(&ai->ai_mtx);
958 		free(ai);
959 		return (ENOMEM);
960 	}
961 
962 	req->lr_fid->lo_aux = file;
963 	generate_qid(&st, &req->lr_resp.rattach.qid);
964 	return (0);
965 }
966 
967 static int
968 fs_clunk(void *softc __unused, struct l9p_fid *fid)
969 {
970 	struct fs_fid *file;
971 
972 	file = fid->lo_aux;
973 	assert(file != NULL);
974 
975 	if (file->ff_dir) {
976 		closedir(file->ff_dir);
977 		file->ff_dir = NULL;
978 	} else if (file->ff_fd != -1) {
979 		close(file->ff_fd);
980 		file->ff_fd = -1;
981 	}
982 
983 	return (0);
984 }
985 
986 /*
987  * Create ops.
988  *
989  * We are to create a new file under some existing path,
990  * where the new file's name is in the Tcreate request and the
991  * existing path is due to a fid-based file (req->lr_fid).
992  *
993  * One op (create regular file) sets file->fd, the rest do not.
994  */
995 static int
996 fs_create(void *softc, struct l9p_request *req)
997 {
998 	struct l9p_fid *dir;
999 	struct stat st;
1000 	uint32_t dmperm;
1001 	mode_t perm;
1002 	char *name;
1003 	int error;
1004 
1005 	dir = req->lr_fid;
1006 	name = req->lr_req.tcreate.name;
1007 	dmperm = req->lr_req.tcreate.perm;
1008 	perm = (mode_t)(dmperm & 0777);
1009 
1010 	if (dmperm & L9P_DMDIR)
1011 		error = fs_imkdir(softc, dir, name, true,
1012 		    perm, (gid_t)-1, &st);
1013 	else if (dmperm & L9P_DMSYMLINK)
1014 		error = fs_isymlink(softc, dir, name,
1015 		    req->lr_req.tcreate.extension, (gid_t)-1, &st);
1016 	else if (dmperm & L9P_DMNAMEDPIPE)
1017 		error = fs_imkfifo(softc, dir, name, true,
1018 		    perm, (gid_t)-1, &st);
1019 	else if (dmperm & L9P_DMSOCKET)
1020 		error = fs_imksocket(softc, dir, name, true,
1021 		    perm, (gid_t)-1, &st);
1022 	else if (dmperm & L9P_DMDEVICE) {
1023 		unsigned int major, minor;
1024 		char type;
1025 		dev_t dev;
1026 
1027 		/*
1028 		 * ??? Should this be testing < 3?  For now, allow a single
1029 		 * integer mode with minor==0 implied.
1030 		 */
1031 		minor = 0;
1032 		if (sscanf(req->lr_req.tcreate.extension, "%c %u %u",
1033 		    &type, &major, &minor) < 2) {
1034 			return (EINVAL);
1035 		}
1036 
1037 		switch (type) {
1038 		case 'b':
1039 			perm |= S_IFBLK;
1040 			break;
1041 		case 'c':
1042 			perm |= S_IFCHR;
1043 			break;
1044 		default:
1045 			return (EINVAL);
1046 		}
1047 		dev = makedev(major, minor);
1048 		error = fs_imknod(softc, dir, name, true, perm, dev,
1049 		    (gid_t)-1, &st);
1050 	} else {
1051 		enum l9p_omode p9;
1052 		int flags;
1053 
1054 		p9 = req->lr_req.tcreate.mode;
1055 		error = fs_oflags_dotu(p9, &flags);
1056 		if (error)
1057 			return (error);
1058 		error = fs_icreate(softc, dir, name, flags,
1059 		    true, perm, (gid_t)-1, &st);
1060 		req->lr_resp.rcreate.iounit = req->lr_conn->lc_max_io_size;
1061 	}
1062 
1063 	if (error == 0)
1064 		generate_qid(&st, &req->lr_resp.rcreate.qid);
1065 
1066 	return (error);
1067 }
1068 
/*
 * Plan 9 permission masking for newly created entries.
 *
 * https://swtch.com/plan9port/man/man9/open.html and
 * http://plan9.bell-labs.com/magic/man2html/5/open
 * say that permissions are actually
 *     perm & (~0666 | (dir.perm & 0666))
 * for files, and
 *     perm & (~0777 | (dir.perm & 0777))
 * for directories.  That is, the parent directory may
 * take away permissions granted by the operation.
 *
 * Bits of <perm> outside the 0666/0777 window pass through
 * unchanged; the result is returned.
 *
 * This seems a bit restrictive; probably
 * there should be a control knob for this.
 */
static inline mode_t
fs_p9perm(mode_t perm, mode_t dir_perm, bool isdir)
{
	/* Window of bits the parent directory is allowed to veto. */
	const mode_t window = isdir ? 0777 : 0666;

	return (perm & (~window | (dir_perm & window)));
}
1092 
1093 /*
1094  * Internal form of create (plain file).
1095  *
1096  * Our caller takes care of splitting off all the special
1097  * types of create (mknod, etc), so this is purely for files.
1098  * We receive the fs_softc <softc>, the directory fid <dir>
1099  * in which the new file is to be created, the name of the
1100  * new file, a flag <isp9> indicating whether to do plan9 style
1101  * permissions or Linux style permissions, the permissions <perm>,
1102  * an effective group id <egid>, and a pointer to a stat structure
1103  * <st> to fill in describing the final result on success.
1104  *
1105  * On successful create, the fid switches to the newly created
1106  * file, which is now open; its associated file-name changes too.
1107  *
1108  * Note that the original (dir) fid is never currently open,
1109  * so there is nothing to close.
1110  */
1111 static int
1112 fs_icreate(void *softc, struct l9p_fid *dir, char *name, int flags,
1113     bool isp9, mode_t perm, gid_t egid, struct stat *st)
1114 {
1115 	struct fs_fid *file;
1116 	gid_t gid;
1117 	uid_t uid;
1118 	char newname[MAXPATHLEN];
1119 	int error, fd;
1120 
1121 	file = dir->lo_aux;
1122 
1123 	/*
1124 	 * Build full path name from directory + file name.  We'll
1125 	 * check permissions on the parent directory, then race to
1126 	 * create the file before anything bad happens like symlinks.
1127 	 *
1128 	 * (To close this race we need to use openat(), which is
1129 	 * left for a later version of this code.)
1130 	 */
1131 	error = fs_buildname(dir, name, newname, sizeof(newname));
1132 	if (error)
1133 		return (error);
1134 
1135 	/* In case of success, we will need a new file->ff_name. */
1136 	name = strdup(newname);
1137 	if (name == NULL)
1138 		return (ENOMEM);
1139 
1140 	/* Check create permission and compute new file ownership. */
1141 	error = fs_nde(softc, dir, false, egid, st, &uid, &gid);
1142 	if (error) {
1143 		free(name);
1144 		return (error);
1145 	}
1146 
1147 	/* Adjust new-file permissions for Plan9 protocol. */
1148 	if (isp9)
1149 		perm = fs_p9perm(perm, st->st_mode, false);
1150 
1151 	/* Create is always exclusive so O_TRUNC is irrelevant. */
1152 	fd = openat(file->ff_dirfd, newname, flags | O_CREAT | O_EXCL, perm);
1153 	if (fd < 0) {
1154 		error = errno;
1155 		free(name);
1156 		return (error);
1157 	}
1158 
1159 	/* Fix permissions and owner. */
1160 	if (fchmod(fd, perm) != 0 ||
1161 	    fchown(fd, uid, gid) != 0 ||
1162 	    fstat(fd, st) != 0) {
1163 		error = errno;
1164 		(void) close(fd);
1165 		/* unlink(newname); ? */
1166 		free(name);
1167 		return (error);
1168 	}
1169 
1170 	/* It *was* a directory; now it's a file, and it's open. */
1171 	free(file->ff_name);
1172 	file->ff_name = name;
1173 	file->ff_fd = fd;
1174 	return (0);
1175 }
1176 
1177 /*
1178  * Internal form of open: stat file and verify permissions (from p9
1179  * argument), then open the file-or-directory, leaving the internal
1180  * fs_fid fields set up.  If we cannot open the file, return a
1181  * suitable error number, and leave everything unchanged.
1182  *
1183  * To mitigate the race between permissions testing and the actual
1184  * open, we can stat the file twice (once with lstat() before open,
1185  * then with fstat() after).  We assume O_NOFOLLOW is set in flags,
1186  * so if some other race-winner substitutes in a symlink we won't
1187  * open it here.  (However, embedded symlinks, if they occur, are
1188  * still an issue.  Ideally we would like to have an O_NEVERFOLLOW
1189  * that fails on embedded symlinks, and a way to pass this to
1190  * lstat() as well.)
1191  *
1192  * When we use opendir() we cannot pass O_NOFOLLOW, so we must rely
1193  * on substitution-detection via fstat().  To simplify the code we
1194  * just always re-check.
1195  *
1196  * (For a proper fix in the future, we can require openat(), keep
1197  * each parent directory open during walk etc, and allow only final
1198  * name components with O_NOFOLLOW.)
1199  *
1200  * On successful return, st has been filled in.
1201  */
1202 static int
1203 fs_iopen(void *softc, struct l9p_fid *fid, int flags, enum l9p_omode p9,
1204     gid_t egid __unused, struct stat *st)
1205 {
1206 	struct fs_softc *sc = softc;
1207 	struct fs_fid *file;
1208 	struct stat first;
1209 	int32_t op;
1210 	char *name;
1211 	int error;
1212 	int fd;
1213 	DIR *dirp;
1214 
1215 	/* Forbid write ops on read-only file system. */
1216 	if (sc->fs_readonly) {
1217 		if ((flags & O_TRUNC) != 0)
1218 			return (EROFS);
1219 		if ((flags & O_ACCMODE) != O_RDONLY)
1220 			return (EROFS);
1221 		if (p9 & L9P_ORCLOSE)
1222 			return (EROFS);
1223 	}
1224 
1225 	file = fid->lo_aux;
1226 	assert(file != NULL);
1227 	name = file->ff_name;
1228 
1229 	if (fstatat(file->ff_dirfd, name, &first, AT_SYMLINK_NOFOLLOW) != 0)
1230 		return (errno);
1231 	if (S_ISLNK(first.st_mode))
1232 		return (EPERM);
1233 
1234 	/* Can we rely on O_APPEND here?  Best not, can be cleared. */
1235 	switch (flags & O_ACCMODE) {
1236 	case O_RDONLY:
1237 		op = L9P_ACE_READ_DATA;
1238 		break;
1239 	case O_WRONLY:
1240 		op = L9P_ACE_WRITE_DATA;
1241 		break;
1242 	case O_RDWR:
1243 		op = L9P_ACE_READ_DATA | L9P_ACE_WRITE_DATA;
1244 		break;
1245 	default:
1246 		return (EINVAL);
1247 	}
1248 	fillacl(file);
1249 	error = check_access(op, NULL, NULL, file->ff_acl, &first,
1250 	    file->ff_ai, (gid_t)-1);
1251 	if (error)
1252 		return (error);
1253 
1254 	if (S_ISDIR(first.st_mode)) {
1255 		/* Forbid write or truncate on directory. */
1256 		if ((flags & O_ACCMODE) != O_RDONLY || (flags & O_TRUNC))
1257 			return (EPERM);
1258 		fd = openat(file->ff_dirfd, name, O_DIRECTORY);
1259 		dirp = fdopendir(fd);
1260 		if (dirp == NULL)
1261 			return (EPERM);
1262 		fd = dirfd(dirp);
1263 	} else {
1264 		dirp = NULL;
1265 		fd = openat(file->ff_dirfd, name, flags);
1266 		if (fd < 0)
1267 			return (EPERM);
1268 	}
1269 
1270 	/*
1271 	 * We have a valid fd, and maybe non-null dirp.  Re-check
1272 	 * the file, and fail if st_dev or st_ino changed.
1273 	 */
1274 	if (fstat(fd, st) != 0 ||
1275 	    first.st_dev != st->st_dev ||
1276 	    first.st_ino != st->st_ino) {
1277 		if (dirp != NULL)
1278 			(void) closedir(dirp);
1279 		else
1280 			(void) close(fd);
1281 		return (EPERM);
1282 	}
1283 	if (dirp != NULL)
1284 		file->ff_dir = dirp;
1285 	else
1286 		file->ff_fd = fd;
1287 	return (0);
1288 }
1289 
1290 /*
1291  * Internal form of mkdir (common code for all forms).
1292  * We receive the fs_softc <softc>, the directory fid <dir>
1293  * in which the new entry is to be created, the name of the
1294  * new entry, a flag <isp9> indicating whether to do plan9 style
1295  * permissions or Linux style permissions, the permissions <perm>,
1296  * an effective group id <egid>, and a pointer to a stat structure
1297  * <st> to fill in describing the final result on success.
1298  *
1299  * See also fs_icreate() above.
1300  */
1301 static int
1302 fs_imkdir(void *softc, struct l9p_fid *dir, char *name,
1303     bool isp9, mode_t perm, gid_t egid, struct stat *st)
1304 {
1305 	struct fs_fid *ff;
1306 	gid_t gid;
1307 	uid_t uid;
1308 	char newname[MAXPATHLEN];
1309 	int error, fd;
1310 
1311 	ff = dir->lo_aux;
1312 	error = fs_buildname(dir, name, newname, sizeof(newname));
1313 	if (error)
1314 		return (error);
1315 
1316 	error = fs_nde(softc, dir, true, egid, st, &uid, &gid);
1317 	if (error)
1318 		return (error);
1319 
1320 	if (isp9)
1321 		perm = fs_p9perm(perm, st->st_mode, true);
1322 
1323 	if (mkdirat(ff->ff_dirfd, newname, perm) != 0)
1324 		return (errno);
1325 
1326 	fd = openat(ff->ff_dirfd, newname,
1327 	    O_DIRECTORY | O_RDONLY | O_NOFOLLOW);
1328 	if (fd < 0 ||
1329 	    fchown(fd, uid, gid) != 0 ||
1330 	    fchmod(fd, perm) != 0 ||
1331 	    fstat(fd, st) != 0) {
1332 		error = errno;
1333 		/* rmdir(newname) ? */
1334 	}
1335 	if (fd >= 0)
1336 		(void) close(fd);
1337 
1338 	return (error);
1339 }
1340 
#ifdef __APPLE__
/*
 * Set the calling thread's working directory to <fd> by way of the
 * SYS___pthread_fchdir system call; fs_imknod() passes -1 afterwards
 * to stop using the thread-local cwd.
 *
 * This is an undocumented OS X syscall. It would be best to avoid it,
 * but there doesn't seem to be another safe way to implement mknodat.
 * Dear Apple, please implement mknodat before you remove this syscall.
 *
 * Returns the value returned by syscall().
 */
static int
fs_ifchdir_thread_local(int fd)
{
	int rv;

	/* syscall() itself is deprecated there; silence that warning. */
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wdeprecated-declarations"
	rv = syscall(SYS___pthread_fchdir, fd);
#pragma clang diagnostic pop
	return (rv);
}
#endif
1355 
1356 /*
1357  * Internal form of mknod (special device).
1358  *
1359  * The device type (S_IFBLK, S_IFCHR) is included in the <mode> parameter.
1360  */
1361 static int
1362 fs_imknod(void *softc, struct l9p_fid *dir, char *name,
1363     bool isp9, mode_t mode, dev_t dev, gid_t egid, struct stat *st)
1364 {
1365 	struct fs_fid *ff;
1366 	mode_t perm;
1367 	gid_t gid;
1368 	uid_t uid;
1369 	char newname[MAXPATHLEN];
1370 	int error;
1371 
1372 	ff = dir->lo_aux;
1373 	error = fs_buildname(dir, name, newname, sizeof(newname));
1374 	if (error)
1375 		return (error);
1376 
1377 	error = fs_nde(softc, dir, false, egid, st, &uid, &gid);
1378 	if (error)
1379 		return (error);
1380 
1381 	if (isp9) {
1382 		perm = fs_p9perm(mode & 0777, st->st_mode, false);
1383 		mode = (mode & ~0777) | perm;
1384 	} else {
1385 		perm = mode & 0777;
1386 	}
1387 
1388 #ifdef __APPLE__
1389 	if (fs_ifchdir_thread_local(ff->ff_dirfd) < 0) {
1390 		return -1;
1391 	}
1392 	error = mknod(newname, mode, dev);
1393 	int preserved_errno = errno;
1394 	/* Stop using the thread-local cwd */
1395 	fs_ifchdir_thread_local(-1);
1396 	if (error < 0) {
1397 		errno = preserved_errno;
1398 		return errno;
1399 	}
1400 #else
1401 	if (mknodat(ff->ff_dirfd, newname, mode, dev) != 0)
1402 		return (errno);
1403 #endif
1404 
1405 	/* We cannot open the new name; race to use l* syscalls. */
1406 	if (fchownat(ff->ff_dirfd, newname, uid, gid, AT_SYMLINK_NOFOLLOW) != 0 ||
1407 	    fchmodat(ff->ff_dirfd, newname, perm, AT_SYMLINK_NOFOLLOW) != 0 ||
1408 	    fstatat(ff->ff_dirfd, newname, st, AT_SYMLINK_NOFOLLOW) != 0)
1409 		error = errno;
1410 	else if ((st->st_mode & S_IFMT) != (mode & S_IFMT))
1411 		error = EPERM;		/* ??? lost a race anyway */
1412 
1413 	/* if (error) unlink(newname) ? */
1414 
1415 	return (error);
1416 }
1417 
1418 /*
1419  * Internal form of mkfifo.
1420  */
1421 static int
1422 fs_imkfifo(void *softc, struct l9p_fid *dir, char *name,
1423     bool isp9, mode_t perm, gid_t egid, struct stat *st)
1424 {
1425 	struct fs_fid *ff;
1426 	gid_t gid;
1427 	uid_t uid;
1428 	char newname[MAXPATHLEN];
1429 	int error;
1430 
1431 	ff = dir->lo_aux;
1432 	error = fs_buildname(dir, name, newname, sizeof(newname));
1433 	if (error)
1434 		return (error);
1435 
1436 	error = fs_nde(softc, dir, false, egid, st, &uid, &gid);
1437 	if (error)
1438 		return (error);
1439 
1440 	if (isp9)
1441 		perm = fs_p9perm(perm, st->st_mode, false);
1442 
1443 	if (mkfifo(newname, perm) != 0)
1444 		return (errno);
1445 
1446 	/* We cannot open the new name; race to use l* syscalls. */
1447 	if (fchownat(ff->ff_dirfd, newname, uid, gid, AT_SYMLINK_NOFOLLOW) != 0 ||
1448 	    fchmodat(ff->ff_dirfd, newname, perm, AT_SYMLINK_NOFOLLOW) != 0 ||
1449 	    fstatat(ff->ff_dirfd, newname, st, AT_SYMLINK_NOFOLLOW) != 0)
1450 		error = errno;
1451 	else if (!S_ISFIFO(st->st_mode))
1452 		error = EPERM;		/* ??? lost a race anyway */
1453 
1454 	/* if (error) unlink(newname) ? */
1455 
1456 	return (error);
1457 }
1458 
1459 /*
1460  * Internal form of mksocket.
1461  *
1462  * This is a bit different because of the horrible socket naming
1463  * system (bind() with sockaddr_un sun_path).
1464  */
1465 static int
1466 fs_imksocket(void *softc, struct l9p_fid *dir, char *name,
1467     bool isp9, mode_t perm, gid_t egid, struct stat *st)
1468 {
1469 	struct fs_fid *ff;
1470 	struct sockaddr_un sun;
1471 	char *path;
1472 	char newname[MAXPATHLEN];
1473 	gid_t gid;
1474 	uid_t uid;
1475 	int error = 0, s, fd;
1476 
1477 	ff = dir->lo_aux;
1478 	error = fs_buildname(dir, name, newname, sizeof(newname));
1479 	if (error)
1480 		return (error);
1481 
1482 	error = fs_nde(softc, dir, false, egid, st, &uid, &gid);
1483 	if (error)
1484 		return (error);
1485 
1486 	if (isp9)
1487 		perm = fs_p9perm(perm, st->st_mode, false);
1488 
1489 	s = socket(AF_UNIX, SOCK_STREAM, 0);
1490 	if (s < 0)
1491 		return (errno);
1492 
1493 	path = newname;
1494 	fd = -1;
1495 #ifdef HAVE_BINDAT
1496 	/* Try bindat() if needed. */
1497 	if (strlen(path) >= sizeof(sun.sun_path)) {
1498 		fd = openat(ff->ff_dirfd, ff->ff_name,
1499 		    O_RDONLY | O_DIRECTORY | O_NOFOLLOW);
1500 		if (fd >= 0)
1501 			path = name;
1502 	}
1503 #endif
1504 
1505 	/*
1506 	 * Can only create the socket if the path will fit.
1507 	 * Even if we are using bindat() there are limits
1508 	 * (the API for AF_UNIX sockets is ... not good).
1509 	 *
1510 	 * Note: in theory we can fill sun_path to the end
1511 	 * (omitting a terminating '\0') but in at least one
1512 	 * Unix-like system, this was known to behave oddly,
1513 	 * so we test for ">=" rather than just ">".
1514 	 */
1515 	if (strlen(path) >= sizeof(sun.sun_path)) {
1516 		error = ENAMETOOLONG;
1517 		goto out;
1518 	}
1519 	sun.sun_family = AF_UNIX;
1520 	sun.sun_len = sizeof(struct sockaddr_un);
1521 	strncpy(sun.sun_path, path, sizeof(sun.sun_path));
1522 
1523 #ifdef HAVE_BINDAT
1524 	if (fd >= 0) {
1525 		if (bindat(fd, s, (struct sockaddr *)&sun, sun.sun_len) < 0)
1526 			error = errno;
1527 		goto out;	/* done now, for good or ill */
1528 	}
1529 #endif
1530 
1531 	if (bind(s, (struct sockaddr *)&sun, sun.sun_len) < 0)
1532 		error = errno;
1533 out:
1534 
1535 	if (error == 0) {
1536 		/*
1537 		 * We believe we created the socket-inode.  Fix
1538 		 * permissions etc.  Note that we cannot use
1539 		 * fstat() on the socket descriptor: it succeeds,
1540 		 * but we get bogus data!
1541 		 */
1542 		if (fchownat(ff->ff_dirfd, newname, uid, gid, AT_SYMLINK_NOFOLLOW) != 0 ||
1543 		    fchmodat(ff->ff_dirfd, newname, perm, AT_SYMLINK_NOFOLLOW) != 0 ||
1544 		    fstatat(ff->ff_dirfd, newname, st, AT_SYMLINK_NOFOLLOW) != 0)
1545 			error = errno;
1546 		else if (!S_ISSOCK(st->st_mode))
1547 			error = EPERM;		/* ??? lost a race anyway */
1548 
1549 		/* if (error) unlink(newname) ? */
1550 	}
1551 
1552 	/*
1553 	 * It's not clear which error should override, although
1554 	 * ideally we should never see either close() call fail.
1555 	 * In any case we do want to try to close both fd and s,
1556 	 * always.  Let's set error only if it is not already set,
1557 	 * so that all exit paths can use the same code.
1558 	 */
1559 	if (fd >= 0 && close(fd) != 0)
1560 		if (error == 0)
1561 			error = errno;
1562 	if (close(s) != 0)
1563 		if (error == 0)
1564 			error = errno;
1565 
1566 	return (error);
1567 }
1568 
1569 /*
1570  * Internal form of symlink.
1571  *
1572  * Note that symlinks are presumed to carry no permission bits.
1573  * They do have owners, however (who may be charged for quotas).
1574  */
1575 static int
1576 fs_isymlink(void *softc, struct l9p_fid *dir, char *name,
1577     char *symtgt, gid_t egid, struct stat *st)
1578 {
1579 	struct fs_fid *ff;
1580 	gid_t gid;
1581 	uid_t uid;
1582 	char newname[MAXPATHLEN];
1583 	int error;
1584 
1585 	ff = dir->lo_aux;
1586 	error = fs_buildname(dir, name, newname, sizeof(newname));
1587 	if (error)
1588 		return (error);
1589 
1590 	error = fs_nde(softc, dir, false, egid, st, &uid, &gid);
1591 	if (error)
1592 		return (error);
1593 
1594 	if (symlinkat(symtgt, ff->ff_dirfd, newname) != 0)
1595 		return (errno);
1596 
1597 	/* We cannot open the new name; race to use l* syscalls. */
1598 	if (fchownat(ff->ff_dirfd, newname, uid, gid, AT_SYMLINK_NOFOLLOW) != 0 ||
1599 	    fstatat(ff->ff_dirfd, newname, st, AT_SYMLINK_NOFOLLOW) != 0)
1600 		error = errno;
1601 	else if (!S_ISLNK(st->st_mode))
1602 		error = EPERM;		/* ??? lost a race anyway */
1603 
1604 	/* if (error) unlink(newname) ? */
1605 
1606 	return (error);
1607 }
1608 
1609 static int
1610 fs_open(void *softc, struct l9p_request *req)
1611 {
1612 	struct l9p_fid *fid = req->lr_fid;
1613 	struct stat st;
1614 	enum l9p_omode p9;
1615 	int error, flags;
1616 
1617 	p9 = req->lr_req.topen.mode;
1618 	error = fs_oflags_dotu(p9, &flags);
1619 	if (error)
1620 		return (error);
1621 
1622 	error = fs_iopen(softc, fid, flags, p9, (gid_t)-1, &st);
1623 	if (error)
1624 		return (error);
1625 
1626 	generate_qid(&st, &req->lr_resp.ropen.qid);
1627 	req->lr_resp.ropen.iounit = req->lr_conn->lc_max_io_size;
1628 	return (0);
1629 }
1630 
1631 /*
1632  * Helper for directory read.  We want to run an lstat on each
1633  * file name within the directory.  This is a lot faster if we
1634  * have lstatat (or fstatat with AT_SYMLINK_NOFOLLOW), but not
1635  * all systems do, so hide the ifdef-ed code in an inline function.
1636  */
1637 static inline int
1638 fs_lstatat(struct fs_fid *file, char *name, struct stat *st)
1639 {
1640 
1641 	return (fstatat(dirfd(file->ff_dir), name, st, AT_SYMLINK_NOFOLLOW));
1642 }
1643 
1644 static int
1645 fs_read(void *softc, struct l9p_request *req)
1646 {
1647 	struct l9p_stat l9stat;
1648 	struct fs_softc *sc;
1649 	struct fs_fid *file;
1650 	bool dotu = req->lr_conn->lc_version >= L9P_2000U;
1651 	ssize_t ret;
1652 
1653 	sc = softc;
1654 	file = req->lr_fid->lo_aux;
1655 	assert(file != NULL);
1656 
1657 	if (file->ff_dir != NULL) {
1658 		struct dirent *d;
1659 		struct stat st;
1660 		struct l9p_message msg;
1661 		long o;
1662 
1663 		pthread_mutex_lock(&file->ff_mtx);
1664 
1665 		/*
1666 		 * Must use telldir before readdir since seekdir
1667 		 * takes cookie values.  Unfortunately this wastes
1668 		 * a lot of time (and memory) building unneeded
1669 		 * cookies that can only be flushed by closing
1670 		 * the directory.
1671 		 *
1672 		 * NB: FreeBSD libc seekdir has SINGLEUSE defined,
1673 		 * so in fact, we can discard the cookies by
1674 		 * calling seekdir on them.  This clears up wasted
1675 		 * memory at the cost of even more wasted time...
1676 		 *
1677 		 * XXX: readdir/telldir/seekdir not thread safe
1678 		 */
1679 		l9p_init_msg(&msg, req, L9P_PACK);
1680 		for (;;) {
1681 			o = telldir(file->ff_dir);
1682 			d = readdir(file->ff_dir);
1683 			if (d == NULL)
1684 				break;
1685 			if (fs_lstatat(file, d->d_name, &st))
1686 				continue;
1687 			dostat(sc, &l9stat, d->d_name, &st, dotu);
1688 			if (l9p_pack_stat(&msg, req, &l9stat) != 0) {
1689 				seekdir(file->ff_dir, o);
1690 				break;
1691 			}
1692 #if defined(__FreeBSD__)
1693 			seekdir(file->ff_dir, o);
1694 			(void) readdir(file->ff_dir);
1695 #endif
1696 		}
1697 
1698 		pthread_mutex_unlock(&file->ff_mtx);
1699 	} else {
1700 		size_t niov = l9p_truncate_iov(req->lr_data_iov,
1701                     req->lr_data_niov, req->lr_req.io.count);
1702 
1703 #if defined(__FreeBSD__)
1704 		ret = preadv(file->ff_fd, req->lr_data_iov, niov,
1705 		    req->lr_req.io.offset);
1706 #else
1707 		/* XXX: not thread safe, should really use aio_listio. */
1708 		if (lseek(file->ff_fd, (off_t)req->lr_req.io.offset, SEEK_SET) < 0)
1709 			return (errno);
1710 
1711 		ret = (uint32_t)readv(file->ff_fd, req->lr_data_iov, (int)niov);
1712 #endif
1713 
1714 		if (ret < 0)
1715 			return (errno);
1716 
1717 		req->lr_resp.io.count = (uint32_t)ret;
1718 	}
1719 
1720 	return (0);
1721 }
1722 
1723 static int
1724 fs_remove(void *softc, struct l9p_fid *fid)
1725 {
1726 	struct fs_softc *sc = softc;
1727 	struct l9p_acl *parent_acl;
1728 	struct fs_fid *file;
1729 	struct stat pst, cst;
1730 	char dirname[MAXPATHLEN];
1731 	int error;
1732 
1733 	if (sc->fs_readonly)
1734 		return (EROFS);
1735 
1736 	error = fs_pdir(sc, fid, dirname, sizeof(dirname), &pst);
1737 	if (error)
1738 		return (error);
1739 
1740 	file = fid->lo_aux;
1741 	if (fstatat(file->ff_dirfd, file->ff_name, &cst, AT_SYMLINK_NOFOLLOW) != 0)
1742 		return (error);
1743 
1744 	parent_acl = getacl(file, -1, dirname);
1745 	fillacl(file);
1746 
1747 	error = check_access(L9P_ACOP_UNLINK,
1748 	    parent_acl, &pst, file->ff_acl, &cst, file->ff_ai, (gid_t)-1);
1749 	l9p_acl_free(parent_acl);
1750 	if (error)
1751 		return (error);
1752 
1753 	if (unlinkat(file->ff_dirfd, file->ff_name,
1754 	    S_ISDIR(cst.st_mode) ? AT_REMOVEDIR : 0) != 0)
1755 		error = errno;
1756 
1757 	return (error);
1758 }
1759 
1760 static int
1761 fs_stat(void *softc, struct l9p_request *req)
1762 {
1763 	struct fs_softc *sc;
1764 	struct fs_fid *file;
1765 	struct stat st;
1766 	bool dotu = req->lr_conn->lc_version >= L9P_2000U;
1767 
1768 	sc = softc;
1769 	file = req->lr_fid->lo_aux;
1770 	assert(file);
1771 
1772 	if (fstatat(file->ff_dirfd, file->ff_name, &st,
1773 	    AT_SYMLINK_NOFOLLOW) != 0)
1774 		return (errno);
1775 
1776 	dostat(sc, &req->lr_resp.rstat.stat, file->ff_name, &st, dotu);
1777 	return (0);
1778 }
1779 
1780 static int
1781 fs_walk(void *softc, struct l9p_request *req)
1782 {
1783 	struct l9p_acl *acl;
1784 	struct fs_authinfo *ai;
1785 	struct fs_fid *file = req->lr_fid->lo_aux;
1786 	struct fs_fid *newfile;
1787 	struct stat st;
1788 	size_t clen, namelen, need;
1789 	char *comp, *succ, *next, *swtmp;
1790 	bool atroot;
1791 	bool dotdot;
1792 	int i, nwname;
1793 	int error = 0;
1794 	char namebufs[2][MAXPATHLEN];
1795 
1796 	/*
1797 	 * https://swtch.com/plan9port/man/man9/walk.html:
1798 	 *
1799 	 *    It is legal for nwname to be zero, in which case newfid
1800 	 *    will represent the same file as fid and the walk will
1801 	 *    usually succeed; this is equivalent to walking to dot.
1802 	 * [Aside: it's not clear if we should test S_ISDIR here.]
1803 	 *    ...
1804 	 *    The name ".." ... represents the parent directory.
1805 	 *    The name "." ... is not used in the protocol.
1806 	 *    ... A walk of the name ".." in the root directory
1807 	 *    of the server is equivalent to a walk with no name
1808 	 *    elements.
1809 	 *
1810 	 * Note that req.twalk.nwname never exceeds L9P_MAX_WELEM,
1811 	 * so it is safe to convert to plain int.
1812 	 *
1813 	 * We are to return an error only if the first walk fails,
1814 	 * else stop at the end of the names or on the first error.
1815 	 * The final fid is based on the last name successfully
1816 	 * walked.
1817 	 *
1818 	 * Note that we *do* get Twalk requests with nwname==0 on files.
1819 	 *
1820 	 * Set up "successful name" buffer pointer with base fid name,
1821 	 * initially.  We'll swap each new success into it as we go.
1822 	 *
1823 	 * Invariant: atroot and stat data correspond to current
1824 	 * (succ) path.
1825 	 */
1826 	succ = namebufs[0];
1827 	next = namebufs[1];
1828 	namelen = strlcpy(succ, file->ff_name, MAXPATHLEN);
1829 	if (namelen >= MAXPATHLEN)
1830 		return (ENAMETOOLONG);
1831 	if (fstatat(file->ff_dirfd, succ, &st, AT_SYMLINK_NOFOLLOW) < 0)
1832 		return (errno);
1833 	ai = file->ff_ai;
1834 	atroot = strlen(succ) == 0; /* XXX? */
1835 	fillacl(file);
1836 	acl = file->ff_acl;
1837 
1838 	nwname = (int)req->lr_req.twalk.nwname;
1839 
1840 	for (i = 0; i < nwname; i++) {
1841 		/*
1842 		 * Must have execute permission to search a directory.
1843 		 * Then, look up each component in its directory-so-far.
1844 		 * Check for ".." along the way, handlng specially
1845 		 * as needed.  Forbid "/" in name components.
1846 		 *
1847 		 */
1848 		if (!S_ISDIR(st.st_mode)) {
1849 			error = ENOTDIR;
1850 			goto out;
1851 		}
1852 		error = check_access(L9P_ACE_EXECUTE,
1853 		     NULL, NULL, acl, &st, ai, (gid_t)-1);
1854 		if (error) {
1855 			L9P_LOG(L9P_DEBUG,
1856 			    "Twalk: denying dir-walk on \"%s\" for uid %u",
1857 			    succ, (unsigned)ai->ai_uid);
1858 			error = EPERM;
1859 			goto out;
1860 		}
1861 		comp = req->lr_req.twalk.wname[i];
1862 		if (strchr(comp, '/') != NULL) {
1863 			error = EINVAL;
1864 			break;
1865 		}
1866 
1867 		clen = strlen(comp);
1868 		dotdot = false;
1869 
1870 		/*
1871 		 * Build next pathname (into "next").  If "..",
1872 		 * just strip one name component off the success
1873 		 * name so far.  Since we know this name fits, the
1874 		 * stripped down version also fits.  Otherwise,
1875 		 * the name is the base name plus '/' plus the
1876 		 * component name plus terminating '\0'; this may
1877 		 * or may not fit.
1878 		 */
1879 		if (comp[0] == '.') {
1880 			if (clen == 1) {
1881 				error = EINVAL;
1882 				break;
1883 			}
1884 			if (comp[1] == '.' && clen == 2)
1885 				dotdot = true;
1886 		}
1887 		if (dotdot) {
1888 			/*
1889 			 * It's not clear how ".." at root should
1890 			 * be handled when i > 0.  Obeying the man
1891 			 * page exactly, we reset i to 0 and stop,
1892 			 * declaring terminal success.
1893 			 *
1894 			 * Otherwise, we just climbed up one level
1895 			 * so adjust "atroot".
1896 			 */
1897 			if (atroot) {
1898 				i = 0;
1899 				break;
1900 			}
1901 			(void) r_dirname(succ, next, MAXPATHLEN);
1902 			namelen = strlen(next);
1903 			atroot = strlen(next) == 0; /* XXX? */
1904 		} else {
1905 			need = namelen + 1 + clen + 1;
1906 			if (need > MAXPATHLEN) {
1907 				error = ENAMETOOLONG;
1908 				break;
1909 			}
1910 			memcpy(next, succ, namelen);
1911 			next[namelen++] = '/';
1912 			memcpy(&next[namelen], comp, clen + 1);
1913 			namelen += clen;
1914 			/*
1915 			 * Since name is never ".", we are necessarily
1916 			 * descending below the root now.
1917 			 */
1918 			atroot = false;
1919 		}
1920 
1921 		if (fstatat(file->ff_dirfd, next, &st, AT_SYMLINK_NOFOLLOW) < 0) {
1922 			error = ENOENT;
1923 			break;
1924 		}
1925 
1926 		/*
1927 		 * Success: generate qid and swap this
1928 		 * successful name into place.  Update acl.
1929 		 */
1930 		generate_qid(&st, &req->lr_resp.rwalk.wqid[i]);
1931 		swtmp = succ;
1932 		succ = next;
1933 		next = swtmp;
1934 		if (acl != NULL && acl != file->ff_acl)
1935 			l9p_acl_free(acl);
1936 		acl = getacl(file, -1, next);
1937 	}
1938 
1939 	/*
1940 	 * Fail only if we failed on the first name.
1941 	 * Otherwise we succeeded on something, and "succ"
1942 	 * points to the last successful name in namebufs[].
1943 	 */
1944 	if (error) {
1945 		if (i == 0)
1946 			goto out;
1947 		error = 0;
1948 	}
1949 
1950 	newfile = open_fid(file->ff_dirfd, succ, ai, false);
1951 	if (newfile == NULL) {
1952 		error = ENOMEM;
1953 		goto out;
1954 	}
1955 	if (req->lr_newfid == req->lr_fid) {
1956 		/*
1957 		 * Before overwriting fid->lo_aux, free the old value.
1958 		 * Note that this doesn't free the l9p_fid data,
1959 		 * just the fs_fid data.  (But it does ditch ff_acl.)
1960 		 */
1961 		if (acl == file->ff_acl)
1962 			acl = NULL;
1963 		fs_freefid(softc, req->lr_fid);
1964 		file = NULL;
1965 	}
1966 	req->lr_newfid->lo_aux = newfile;
1967 	if (file != NULL && acl != file->ff_acl) {
1968 		newfile->ff_acl = acl;
1969 		acl = NULL;
1970 	}
1971 	req->lr_resp.rwalk.nwqid = (uint16_t)i;
1972 out:
1973 	if (file != NULL && acl != file->ff_acl)
1974 		l9p_acl_free(acl);
1975 	return (error);
1976 }
1977 
1978 static int
1979 fs_write(void *softc, struct l9p_request *req)
1980 {
1981 	struct fs_softc *sc = softc;
1982 	struct fs_fid *file;
1983 	ssize_t ret;
1984 
1985 	file = req->lr_fid->lo_aux;
1986 	assert(file != NULL);
1987 
1988 	if (sc->fs_readonly)
1989 		return (EROFS);
1990 
1991 	size_t niov = l9p_truncate_iov(req->lr_data_iov,
1992             req->lr_data_niov, req->lr_req.io.count);
1993 
1994 #if defined(__FreeBSD__)
1995 	ret = pwritev(file->ff_fd, req->lr_data_iov, niov,
1996 	    req->lr_req.io.offset);
1997 #else
1998 	/* XXX: not thread safe, should really use aio_listio. */
1999 	if (lseek(file->ff_fd, (off_t)req->lr_req.io.offset, SEEK_SET) < 0)
2000 		return (errno);
2001 
2002 	ret = writev(file->ff_fd, req->lr_data_iov,
2003 	    (int)niov);
2004 #endif
2005 
2006 	if (ret < 0)
2007 		return (errno);
2008 
2009 	req->lr_resp.io.count = (uint32_t)ret;
2010 	return (0);
2011 }
2012 
2013 static int
2014 fs_wstat(void *softc, struct l9p_request *req)
2015 {
2016 	struct fs_softc *sc = softc;
2017 	struct l9p_stat *l9stat = &req->lr_req.twstat.stat;
2018 	struct l9p_fid *fid;
2019 	struct fs_fid *file;
2020 	int error = 0;
2021 
2022 	fid = req->lr_fid;
2023 	file = fid->lo_aux;
2024 	assert(file != NULL);
2025 
2026 	/*
2027 	 * XXX:
2028 	 *
2029 	 * stat(9P) sez:
2030 	 *
2031 	 * Either all the changes in wstat request happen, or none of them
2032 	 * does: if the request succeeds, all changes were made; if it fails,
2033 	 * none were.
2034 	 *
2035 	 * Atomicity is clearly missing in current implementation.
2036 	 */
2037 
2038 	if (sc->fs_readonly)
2039 		return (EROFS);
2040 
2041 	if (l9stat->atime != (uint32_t)~0) {
2042 		/* XXX: not implemented, ignore */
2043 	}
2044 
2045 	if (l9stat->mtime != (uint32_t)~0) {
2046 		/* XXX: not implemented, ignore */
2047 	}
2048 
2049 	if (l9stat->dev != (uint32_t)~0) {
2050 		error = EPERM;
2051 		goto out;
2052 	}
2053 
2054 	if (l9stat->length != (uint64_t)~0) {
2055 		if (file->ff_dir != NULL) {
2056 			error = EINVAL;
2057 			goto out;
2058 		}
2059 
2060 		if (truncate(file->ff_name, (off_t)l9stat->length) != 0) {
2061 			error = errno;
2062 			goto out;
2063 		}
2064 	}
2065 
2066 	if (req->lr_conn->lc_version >= L9P_2000U) {
2067 		if (fchownat(file->ff_dirfd, file->ff_name, l9stat->n_uid,
2068 		    l9stat->n_gid, AT_SYMLINK_NOFOLLOW) != 0) {
2069 			error = errno;
2070 			goto out;
2071 		}
2072 	}
2073 
2074 	if (l9stat->mode != (uint32_t)~0) {
2075 		if (fchmodat(file->ff_dirfd, file->ff_name,
2076 		    l9stat->mode & 0777, 0) != 0) {
2077 			error = errno;
2078 			goto out;
2079 		}
2080 	}
2081 
2082 	if (strlen(l9stat->name) > 0) {
2083 		struct l9p_acl *parent_acl;
2084 		struct stat st;
2085 		char *tmp;
2086 		char newname[MAXPATHLEN];
2087 
2088 		/*
2089 		 * Rename-within-directory: it's not deleting anything,
2090 		 * but we need write permission on the directory.  This
2091 		 * should suffice.
2092 		 */
2093 		error = fs_pdir(softc, fid, newname, sizeof(newname), &st);
2094 		if (error)
2095 			goto out;
2096 		parent_acl = getacl(file, -1, newname);
2097 		error = check_access(L9P_ACE_ADD_FILE,
2098 		    parent_acl, &st, NULL, NULL, file->ff_ai, (gid_t)-1);
2099 		l9p_acl_free(parent_acl);
2100 		if (error)
2101 			goto out;
2102 		error = fs_dpf(newname, l9stat->name, sizeof(newname));
2103 		if (error)
2104 			goto out;
2105 		tmp = strdup(newname);
2106 		if (tmp == NULL) {
2107 			error = ENOMEM;
2108 			goto out;
2109 		}
2110 		if (renameat(file->ff_dirfd, file->ff_name, file->ff_dirfd,
2111 		    tmp) != 0) {
2112 			error = errno;
2113 			free(tmp);
2114 			goto out;
2115 		}
2116 		/* Successful rename, update file->ff_name.  ACL can stay. */
2117 		free(file->ff_name);
2118 		file->ff_name = tmp;
2119 	}
2120 out:
2121 	return (error);
2122 }
2123 
2124 static int
2125 fs_statfs(void *softc __unused, struct l9p_request *req)
2126 {
2127 	struct fs_fid *file;
2128 	struct stat st;
2129 	struct statfs f;
2130 	long name_max;
2131 	int error;
2132 	int fd;
2133 
2134 	file = req->lr_fid->lo_aux;
2135 	assert(file);
2136 
2137 	if (fstatat(file->ff_dirfd, file->ff_name, &st,
2138 	    AT_SYMLINK_NOFOLLOW) != 0)
2139 		return (errno);
2140 
2141 	/*
2142 	 * Not entirely clear what access to require; we'll go
2143 	 * for "read data".
2144 	 */
2145 	fillacl(file);
2146 	error = check_access(L9P_ACE_READ_DATA, NULL, NULL,
2147 	    file->ff_acl, &st, file->ff_ai, (gid_t)-1);
2148 	if (error)
2149 		return (error);
2150 
2151 	fd = openat(file->ff_dirfd, file->ff_name, 0);
2152 	if (fd < 0)
2153 		return (errno);
2154 
2155 	if (fstatfs(fd, &f) != 0)
2156 		return (errno);
2157 
2158 	name_max = fpathconf(fd, _PC_NAME_MAX);
2159 	error = errno;
2160 	close(fd);
2161 
2162 	if (name_max == -1)
2163 		return (error);
2164 
2165 	dostatfs(&req->lr_resp.rstatfs.statfs, &f, name_max);
2166 
2167 	return (0);
2168 }
2169 
2170 static int
2171 fs_lopen(void *softc, struct l9p_request *req)
2172 {
2173 	struct l9p_fid *fid = req->lr_fid;
2174 	struct stat st;
2175 	enum l9p_omode p9;
2176 	gid_t gid;
2177 	int error, flags;
2178 
2179 	error = fs_oflags_dotl(req->lr_req.tlopen.flags, &flags, &p9);
2180 	if (error)
2181 		return (error);
2182 
2183 	gid = req->lr_req.tlopen.gid;
2184 	error = fs_iopen(softc, fid, flags, p9, gid, &st);
2185 	if (error)
2186 		return (error);
2187 
2188 	generate_qid(&st, &req->lr_resp.rlopen.qid);
2189 	req->lr_resp.rlopen.iounit = req->lr_conn->lc_max_io_size;
2190 	return (0);
2191 }
2192 
2193 static int
2194 fs_lcreate(void *softc, struct l9p_request *req)
2195 {
2196 	struct l9p_fid *dir;
2197 	struct stat st;
2198 	enum l9p_omode p9;
2199 	char *name;
2200 	mode_t perm;
2201 	gid_t gid;
2202 	int error, flags;
2203 
2204 	dir = req->lr_fid;
2205 	name = req->lr_req.tlcreate.name;
2206 
2207 	error = fs_oflags_dotl(req->lr_req.tlcreate.flags, &flags, &p9);
2208 	if (error)
2209 		return (error);
2210 
2211 	perm = (mode_t)req->lr_req.tlcreate.mode & 0777; /* ? set-id bits? */
2212 	gid = req->lr_req.tlcreate.gid;
2213 	error = fs_icreate(softc, dir, name, flags, false, perm, gid, &st);
2214 	if (error == 0)
2215 		generate_qid(&st, &req->lr_resp.rlcreate.qid);
2216 	req->lr_resp.rlcreate.iounit = req->lr_conn->lc_max_io_size;
2217 	return (error);
2218 }
2219 
2220 static int
2221 fs_symlink(void *softc, struct l9p_request *req)
2222 {
2223 	struct l9p_fid *dir;
2224 	struct stat st;
2225 	gid_t gid;
2226 	char *name, *symtgt;
2227 	int error;
2228 
2229 	dir = req->lr_fid;
2230 	name = req->lr_req.tsymlink.name;
2231 	symtgt = req->lr_req.tsymlink.symtgt;
2232 	gid = req->lr_req.tsymlink.gid;
2233 	error = fs_isymlink(softc, dir, name, symtgt, gid, &st);
2234 	if (error == 0)
2235 		generate_qid(&st, &req->lr_resp.rsymlink.qid);
2236 	return (error);
2237 }
2238 
2239 static int
2240 fs_mknod(void *softc, struct l9p_request *req)
2241 {
2242 	struct l9p_fid *dir;
2243 	struct stat st;
2244 	uint32_t mode, major, minor;
2245 	dev_t dev;
2246 	gid_t gid;
2247 	char *name;
2248 	int error;
2249 
2250 	dir = req->lr_fid;
2251 	name = req->lr_req.tmknod.name;
2252 	mode = req->lr_req.tmknod.mode;
2253 	gid = req->lr_req.tmknod.gid;
2254 
2255 	switch (mode & S_IFMT) {
2256 	case S_IFBLK:
2257 	case S_IFCHR:
2258 		mode = (mode & S_IFMT) | (mode & 0777);	/* ??? */
2259 		major = req->lr_req.tmknod.major;
2260 		minor = req->lr_req.tmknod.major;
2261 		dev = makedev(major, minor);
2262 		error = fs_imknod(softc, dir, name, false,
2263 		    (mode_t)mode, dev, gid, &st);
2264 		break;
2265 
2266 	case S_IFIFO:
2267 		error = fs_imkfifo(softc, dir, name, false,
2268 		    (mode_t)(mode & 0777), gid, &st);
2269 		break;
2270 
2271 	case S_IFSOCK:
2272 		error = fs_imksocket(softc, dir, name, false,
2273 		    (mode_t)(mode & 0777), gid, &st);
2274 		break;
2275 
2276 	default:
2277 		error = EINVAL;
2278 		break;
2279 	}
2280 	if (error == 0)
2281 		generate_qid(&st, &req->lr_resp.rmknod.qid);
2282 	return (error);
2283 }
2284 
2285 static int
2286 fs_rename(void *softc, struct l9p_request *req)
2287 {
2288 	struct fs_softc *sc = softc;
2289 	struct fs_authinfo *ai;
2290 	struct l9p_acl *oparent_acl;
2291 	struct l9p_fid *fid, *f2;
2292 	struct fs_fid *file, *f2ff;
2293 	struct stat cst, opst, npst;
2294 	int32_t op;
2295 	bool reparenting;
2296 	char *tmp;
2297 	char olddir[MAXPATHLEN], newname[MAXPATHLEN];
2298 	int error;
2299 
2300 	if (sc->fs_readonly)
2301 		return (EROFS);
2302 
2303 	/*
2304 	 * Note: lr_fid represents the file that is to be renamed,
2305 	 * so we must locate its parent directory and verify that
2306 	 * both this parent directory and the new directory f2 are
2307 	 * writable.  But if the new parent directory is the same
2308 	 * path as the old parent directory, our job is simpler.
2309 	 */
2310 	fid = req->lr_fid;
2311 	file = fid->lo_aux;
2312 	assert(file != NULL);
2313 	ai = file->ff_ai;
2314 
2315 	error = fs_pdir(sc, fid, olddir, sizeof(olddir), &opst);
2316 	if (error)
2317 		return (error);
2318 
2319 	f2 = req->lr_fid2;
2320 	f2ff = f2->lo_aux;
2321 	assert(f2ff != NULL);
2322 
2323 	reparenting = strcmp(olddir, f2ff->ff_name) != 0;
2324 
2325 	fillacl(file);
2326 	fillacl(f2ff);
2327 
2328 	if (fstatat(file->ff_dirfd, file->ff_name, &cst,
2329 	    AT_SYMLINK_NOFOLLOW) != 0)
2330 		return (errno);
2331 
2332 	/*
2333 	 * Are we moving from olddir?  If so, we're unlinking
2334 	 * from it, in terms of ACL access.
2335 	 */
2336 	if (reparenting) {
2337 		oparent_acl = getacl(file, -1, olddir);
2338 		error = check_access(L9P_ACOP_UNLINK,
2339 		    oparent_acl, &opst, file->ff_acl, &cst, ai, (gid_t)-1);
2340 		l9p_acl_free(oparent_acl);
2341 		if (error)
2342 			return (error);
2343 	}
2344 
2345 	/*
2346 	 * Now check that we're allowed to "create" a file or directory in
2347 	 * f2.  (Should we do this, too, only if reparenting?  Maybe check
2348 	 * for dir write permission if not reparenting -- but that's just
2349 	 * add-file/add-subdir, which means doing this always.)
2350 	 */
2351 	if (fstatat(f2ff->ff_dirfd, f2ff->ff_name, &npst,
2352 	    AT_SYMLINK_NOFOLLOW) != 0)
2353 		return (errno);
2354 
2355 	op = S_ISDIR(cst.st_mode) ? L9P_ACE_ADD_SUBDIRECTORY : L9P_ACE_ADD_FILE;
2356 	error = check_access(op, f2ff->ff_acl, &npst, NULL, NULL,
2357 	    ai, (gid_t)-1);
2358 	if (error)
2359 		return (error);
2360 
2361 	/*
2362 	 * Directories OK, file systems not R/O, etc; build final name.
2363 	 * f2ff->ff_name cannot exceed MAXPATHLEN, but out of general
2364 	 * paranoia, let's double check anyway.
2365 	 */
2366 	if (strlcpy(newname, f2ff->ff_name, sizeof(newname)) >= sizeof(newname))
2367 		return (ENAMETOOLONG);
2368 	error = fs_dpf(newname, req->lr_req.trename.name, sizeof(newname));
2369 	if (error)
2370 		return (error);
2371 	tmp = strdup(newname);
2372 	if (tmp == NULL)
2373 		return (ENOMEM);
2374 
2375 	if (renameat(file->ff_dirfd, file->ff_name, file->ff_dirfd, tmp) != 0) {
2376 		error = errno;
2377 		free(tmp);
2378 		return (error);
2379 	}
2380 
2381 	/* file has been renamed but old fid is not clunked */
2382 	free(file->ff_name);
2383 	file->ff_name = tmp;
2384 
2385 	dropacl(file);
2386 	return (0);
2387 }
2388 
2389 static int
2390 fs_readlink(void *softc __unused, struct l9p_request *req)
2391 {
2392 	struct fs_fid *file;
2393 	ssize_t linklen;
2394 	char buf[MAXPATHLEN];
2395 	int error = 0;
2396 
2397 	file = req->lr_fid->lo_aux;
2398 	assert(file);
2399 
2400 	linklen = readlinkat(file->ff_dirfd, file->ff_name, buf, sizeof(buf));
2401 	if (linklen < 0)
2402 		error = errno;
2403 	else if ((size_t)linklen >= sizeof(buf))
2404 		error = ENOMEM; /* todo: allocate dynamically */
2405 	else if ((req->lr_resp.rreadlink.target = strndup(buf,
2406 	    (size_t)linklen)) == NULL)
2407 		error = ENOMEM;
2408 	return (error);
2409 }
2410 
2411 static int
2412 fs_getattr(void *softc __unused, struct l9p_request *req)
2413 {
2414 	uint64_t mask, valid;
2415 	struct fs_fid *file;
2416 	struct stat st;
2417 	int error = 0;
2418 
2419 	file = req->lr_fid->lo_aux;
2420 	assert(file);
2421 
2422 	valid = 0;
2423 	if (fstatat(file->ff_dirfd, file->ff_name, &st, AT_SYMLINK_NOFOLLOW)) {
2424 		error = errno;
2425 		goto out;
2426 	}
2427 	/* ?? Can we provide items not-requested? If so, can skip tests. */
2428 	mask = req->lr_req.tgetattr.request_mask;
2429 	if (mask & L9PL_GETATTR_MODE) {
2430 		/* It is not clear if we need any translations. */
2431 		req->lr_resp.rgetattr.mode = st.st_mode;
2432 		valid |= L9PL_GETATTR_MODE;
2433 	}
2434 	if (mask & L9PL_GETATTR_NLINK) {
2435 		req->lr_resp.rgetattr.nlink = st.st_nlink;
2436 		valid |= L9PL_GETATTR_NLINK;
2437 	}
2438 	if (mask & L9PL_GETATTR_UID) {
2439 		/* provide st_uid, or file->ff_uid? */
2440 		req->lr_resp.rgetattr.uid = st.st_uid;
2441 		valid |= L9PL_GETATTR_UID;
2442 	}
2443 	if (mask & L9PL_GETATTR_GID) {
2444 		/* provide st_gid, or file->ff_gid? */
2445 		req->lr_resp.rgetattr.gid = st.st_gid;
2446 		valid |= L9PL_GETATTR_GID;
2447 	}
2448 	if (mask & L9PL_GETATTR_RDEV) {
2449 		/* It is not clear if we need any translations. */
2450 		req->lr_resp.rgetattr.rdev = (uint64_t)st.st_rdev;
2451 		valid |= L9PL_GETATTR_RDEV;
2452 	}
2453 	if (mask & L9PL_GETATTR_ATIME) {
2454 		req->lr_resp.rgetattr.atime_sec =
2455 		    (uint64_t)st.st_atimespec.tv_sec;
2456 		req->lr_resp.rgetattr.atime_nsec =
2457 		    (uint64_t)st.st_atimespec.tv_nsec;
2458 		valid |= L9PL_GETATTR_ATIME;
2459 	}
2460 	if (mask & L9PL_GETATTR_MTIME) {
2461 		req->lr_resp.rgetattr.mtime_sec =
2462 		    (uint64_t)st.st_mtimespec.tv_sec;
2463 		req->lr_resp.rgetattr.mtime_nsec =
2464 		    (uint64_t)st.st_mtimespec.tv_nsec;
2465 		valid |= L9PL_GETATTR_MTIME;
2466 	}
2467 	if (mask & L9PL_GETATTR_CTIME) {
2468 		req->lr_resp.rgetattr.ctime_sec =
2469 		    (uint64_t)st.st_ctimespec.tv_sec;
2470 		req->lr_resp.rgetattr.ctime_nsec =
2471 		    (uint64_t)st.st_ctimespec.tv_nsec;
2472 		valid |= L9PL_GETATTR_CTIME;
2473 	}
2474 	if (mask & L9PL_GETATTR_BTIME) {
2475 #if defined(HAVE_BIRTHTIME)
2476 		req->lr_resp.rgetattr.btime_sec =
2477 		    (uint64_t)st.st_birthtim.tv_sec;
2478 		req->lr_resp.rgetattr.btime_nsec =
2479 		    (uint64_t)st.st_birthtim.tv_nsec;
2480 #else
2481 		req->lr_resp.rgetattr.btime_sec = 0;
2482 		req->lr_resp.rgetattr.btime_nsec = 0;
2483 #endif
2484 		valid |= L9PL_GETATTR_BTIME;
2485 	}
2486 	if (mask & L9PL_GETATTR_INO)
2487 		valid |= L9PL_GETATTR_INO;
2488 	if (mask & L9PL_GETATTR_SIZE) {
2489 		req->lr_resp.rgetattr.size = (uint64_t)st.st_size;
2490 		valid |= L9PL_GETATTR_SIZE;
2491 	}
2492 	if (mask & L9PL_GETATTR_BLOCKS) {
2493 		req->lr_resp.rgetattr.blksize = (uint64_t)st.st_blksize;
2494 		req->lr_resp.rgetattr.blocks = (uint64_t)st.st_blocks;
2495 		valid |= L9PL_GETATTR_BLOCKS;
2496 	}
2497 	if (mask & L9PL_GETATTR_GEN) {
2498 		req->lr_resp.rgetattr.gen = st.st_gen;
2499 		valid |= L9PL_GETATTR_GEN;
2500 	}
2501 	/* don't know what to do with data version yet */
2502 
2503 	generate_qid(&st, &req->lr_resp.rgetattr.qid);
2504 out:
2505 	req->lr_resp.rgetattr.valid = valid;
2506 	return (error);
2507 }
2508 
2509 /*
2510  * Should combine some of this with wstat code.
2511  */
2512 static int
2513 fs_setattr(void *softc, struct l9p_request *req)
2514 {
2515 	uint64_t mask;
2516 	struct fs_softc *sc = softc;
2517 	struct timespec ts[2];
2518 	struct fs_fid *file;
2519 	struct stat st;
2520 	int error = 0;
2521 	uid_t uid, gid;
2522 
2523 	file = req->lr_fid->lo_aux;
2524 	assert(file);
2525 
2526 	if (sc->fs_readonly)
2527 		return (EROFS);
2528 
2529 	/*
2530 	 * As with WSTAT we have atomicity issues.
2531 	 */
2532 	mask = req->lr_req.tsetattr.valid;
2533 
2534 	if (fstatat(file->ff_dirfd, file->ff_name, &st, AT_SYMLINK_NOFOLLOW)) {
2535 		error = errno;
2536 		goto out;
2537 	}
2538 
2539 	if ((mask & L9PL_SETATTR_SIZE) && S_ISDIR(st.st_mode)) {
2540 		error = EISDIR;
2541 		goto out;
2542 	}
2543 
2544 	if (mask & L9PL_SETATTR_MODE) {
2545 		if (fchmodat(file->ff_dirfd, file->ff_name,
2546 		    req->lr_req.tsetattr.mode & 0777,
2547 		    AT_SYMLINK_NOFOLLOW)) {
2548 			error = errno;
2549 			goto out;
2550 		}
2551 	}
2552 
2553 	if (mask & (L9PL_SETATTR_UID | L9PL_SETATTR_GID)) {
2554 		uid = mask & L9PL_SETATTR_UID
2555 		    ? req->lr_req.tsetattr.uid
2556 		    : (uid_t)-1;
2557 
2558 		gid = mask & L9PL_SETATTR_GID
2559 		    ? req->lr_req.tsetattr.gid
2560 		    : (gid_t)-1;
2561 
2562 		if (fchownat(file->ff_dirfd, file->ff_name, uid, gid,
2563 		    AT_SYMLINK_NOFOLLOW)) {
2564 			error = errno;
2565 			goto out;
2566 		}
2567 	}
2568 
2569 	if (mask & L9PL_SETATTR_SIZE) {
2570 		/* Truncate follows symlinks, is this OK? */
2571 		int fd = openat(file->ff_dirfd, file->ff_name, O_RDWR);
2572 		if (ftruncate(fd, (off_t)req->lr_req.tsetattr.size)) {
2573 			error = errno;
2574 			(void) close(fd);
2575 			goto out;
2576 		}
2577 		(void) close(fd);
2578 	}
2579 
2580 	if (mask & (L9PL_SETATTR_ATIME | L9PL_SETATTR_MTIME)) {
2581 		ts[0].tv_sec = st.st_atimespec.tv_sec;
2582 		ts[0].tv_nsec = st.st_atimespec.tv_nsec;
2583 		ts[1].tv_sec = st.st_mtimespec.tv_sec;
2584 		ts[1].tv_nsec = st.st_mtimespec.tv_nsec;
2585 
2586 		if (mask & L9PL_SETATTR_ATIME) {
2587 			if (mask & L9PL_SETATTR_ATIME_SET) {
2588 				ts[0].tv_sec = req->lr_req.tsetattr.atime_sec;
2589 				ts[0].tv_nsec = req->lr_req.tsetattr.atime_nsec;
2590 			} else {
2591 				if (clock_gettime(CLOCK_REALTIME, &ts[0]) != 0) {
2592 					error = errno;
2593 					goto out;
2594 				}
2595 			}
2596 		}
2597 
2598 		if (mask & L9PL_SETATTR_MTIME) {
2599 			if (mask & L9PL_SETATTR_MTIME_SET) {
2600 				ts[1].tv_sec = req->lr_req.tsetattr.mtime_sec;
2601 				ts[1].tv_nsec = req->lr_req.tsetattr.mtime_nsec;
2602 			} else {
2603 				if (clock_gettime(CLOCK_REALTIME, &ts[1]) != 0) {
2604 					error = errno;
2605 					goto out;
2606 				}
2607 			}
2608 		}
2609 
2610 		if (utimensat(file->ff_dirfd, file->ff_name, ts,
2611 		    AT_SYMLINK_NOFOLLOW)) {
2612 			error = errno;
2613 			goto out;
2614 		}
2615 	}
2616 out:
2617 	return (error);
2618 }
2619 
/*
 * Handle Txattrwalk.  This backend does not implement it; every
 * request is answered with EOPNOTSUPP.
 */
static int
fs_xattrwalk(void *softc __unused, struct l9p_request *req __unused)
{
	/* Neither argument is examined. */
	return (EOPNOTSUPP);
}
2625 
/*
 * Handle Txattrcreate.  This backend does not implement it; every
 * request is answered with EOPNOTSUPP.
 */
static int
fs_xattrcreate(void *softc __unused, struct l9p_request *req __unused)
{
	/* Neither argument is examined. */
	return (EOPNOTSUPP);
}
2631 
2632 static int
2633 fs_readdir(void *softc __unused, struct l9p_request *req)
2634 {
2635 	struct l9p_message msg;
2636 	struct l9p_dirent de;
2637 	struct fs_fid *file;
2638 	struct dirent *dp;
2639 	struct stat st;
2640 	uint32_t count;
2641 	int error = 0;
2642 
2643 	file = req->lr_fid->lo_aux;
2644 	assert(file);
2645 
2646 	if (file->ff_dir == NULL)
2647 		return (ENOTDIR);
2648 
2649 	pthread_mutex_lock(&file->ff_mtx);
2650 
2651 	/*
2652 	 * It's not clear whether we can use the same trick for
2653 	 * discarding offsets here as we do in fs_read.  It
2654 	 * probably should work, we'll have to see if some
2655 	 * client(s) use the zero-offset thing to rescan without
2656 	 * clunking the directory first.
2657 	 *
2658 	 * Probably the thing to do is switch to calling
2659 	 * getdirentries() / getdents() directly, instead of
2660 	 * going through libc.
2661 	 */
2662 	if (req->lr_req.io.offset == 0)
2663 		rewinddir(file->ff_dir);
2664 	else
2665 		seekdir(file->ff_dir, (long)req->lr_req.io.offset);
2666 
2667 	l9p_init_msg(&msg, req, L9P_PACK);
2668 	count = (uint32_t)msg.lm_size; /* in case we get no entries */
2669 	while ((dp = readdir(file->ff_dir)) != NULL) {
2670 		/*
2671 		 * Although "." is forbidden in naming and ".." is
2672 		 * special cased, testing shows that we must transmit
2673 		 * them through readdir.  (For ".." at root, we
2674 		 * should perhaps alter the inode number, but not
2675 		 * yet.)
2676 		 */
2677 
2678 		/*
2679 		 * TODO: we do a full lstat here; could use dp->d_*
2680 		 * to construct the qid more efficiently, as long
2681 		 * as dp->d_type != DT_UNKNOWN.
2682 		 */
2683 		if (fs_lstatat(file, dp->d_name, &st))
2684 			continue;
2685 
2686 		de.qid.type = 0;
2687 		generate_qid(&st, &de.qid);
2688 		de.offset = (uint64_t)telldir(file->ff_dir);
2689 		de.type = dp->d_type;
2690 		de.name = dp->d_name;
2691 
2692 		/* Update count only if we completely pack the dirent. */
2693 		if (l9p_pudirent(&msg, &de) < 0)
2694 			break;
2695 		count = (uint32_t)msg.lm_size;
2696 	}
2697 
2698 	pthread_mutex_unlock(&file->ff_mtx);
2699 	req->lr_resp.io.count = count;
2700 	return (error);
2701 }
2702 
2703 static int
2704 fs_fsync(void *softc __unused, struct l9p_request *req)
2705 {
2706 	struct fs_fid *file;
2707 	int error = 0;
2708 
2709 	file = req->lr_fid->lo_aux;
2710 	assert(file);
2711 	if (fsync(file->ff_dir != NULL ? dirfd(file->ff_dir) : file->ff_fd))
2712 		error = errno;
2713 	return (error);
2714 }
2715 
2716 static int
2717 fs_lock(void *softc __unused, struct l9p_request *req)
2718 {
2719 
2720 	switch (req->lr_req.tlock.type) {
2721 	case L9PL_LOCK_TYPE_RDLOCK:
2722 	case L9PL_LOCK_TYPE_WRLOCK:
2723 	case L9PL_LOCK_TYPE_UNLOCK:
2724 		break;
2725 	default:
2726 		return (EINVAL);
2727 	}
2728 
2729 	req->lr_resp.rlock.status = L9PL_LOCK_SUCCESS;
2730 	return (0);
2731 }
2732 
2733 static int
2734 fs_getlock(void *softc __unused, struct l9p_request *req)
2735 {
2736 
2737 	/*
2738 	 * Client wants to see if a request to lock a region would
2739 	 * block.  This is, of course, not atomic anyway, so the
2740 	 * op is useless.  QEMU simply says "unlocked!", so we do
2741 	 * too.
2742 	 */
2743 	switch (req->lr_req.getlock.type) {
2744 	case L9PL_LOCK_TYPE_RDLOCK:
2745 	case L9PL_LOCK_TYPE_WRLOCK:
2746 	case L9PL_LOCK_TYPE_UNLOCK:
2747 		break;
2748 	default:
2749 		return (EINVAL);
2750 	}
2751 
2752 	req->lr_resp.getlock = req->lr_req.getlock;
2753 	req->lr_resp.getlock.type = L9PL_LOCK_TYPE_UNLOCK;
2754 	req->lr_resp.getlock.client_id = strdup("");  /* XXX what should go here? */
2755 	return (0);
2756 }
2757 
2758 static int
2759 fs_link(void *softc __unused, struct l9p_request *req)
2760 {
2761 	struct l9p_fid *dir;
2762 	struct fs_fid *file;
2763 	struct fs_fid *dirf;
2764 	struct stat fst, tdst;
2765 	int32_t op;
2766 	char *name;
2767 	char newname[MAXPATHLEN];
2768 	int error;
2769 
2770 	/* N.B.: lr_fid is the file to link, lr_fid2 is the target dir */
2771 	dir = req->lr_fid2;
2772 	dirf = dir->lo_aux;
2773 	assert(dirf != NULL);
2774 
2775 	name = req->lr_req.tlink.name;
2776 	error = fs_buildname(dir, name, newname, sizeof(newname));
2777 	if (error)
2778 		return (error);
2779 
2780 	file = req->lr_fid->lo_aux;
2781 	assert(file != NULL);
2782 
2783 	if (fstatat(dirf->ff_dirfd, dirf->ff_name, &tdst, AT_SYMLINK_NOFOLLOW) != 0 ||
2784 	    fstatat(file->ff_dirfd, file->ff_name, &fst, AT_SYMLINK_NOFOLLOW) != 0)
2785 		return (errno);
2786 	if (S_ISDIR(fst.st_mode))
2787 		return (EISDIR);
2788 	fillacl(dirf);
2789 	op = S_ISDIR(fst.st_mode) ? L9P_ACE_ADD_SUBDIRECTORY : L9P_ACE_ADD_FILE;
2790 	error = check_access(op,
2791 	    dirf->ff_acl, &tdst, NULL, NULL, file->ff_ai, (gid_t)-1);
2792 	if (error)
2793 		return (error);
2794 
2795 	if (linkat(file->ff_dirfd, file->ff_name, file->ff_dirfd,
2796 	    newname, 0) != 0)
2797 		error = errno;
2798 	else
2799 		dropacl(file);
2800 
2801 	return (error);
2802 }
2803 
2804 static int
2805 fs_mkdir(void *softc, struct l9p_request *req)
2806 {
2807 	struct l9p_fid *dir;
2808 	struct stat st;
2809 	mode_t perm;
2810 	gid_t gid;
2811 	char *name;
2812 	int error;
2813 
2814 	dir = req->lr_fid;
2815 	name = req->lr_req.tmkdir.name;
2816 	perm = (mode_t)req->lr_req.tmkdir.mode;
2817 	gid = req->lr_req.tmkdir.gid;
2818 
2819 	error = fs_imkdir(softc, dir, name, false, perm, gid, &st);
2820 	if (error == 0)
2821 		generate_qid(&st, &req->lr_resp.rmkdir.qid);
2822 	return (error);
2823 }
2824 
2825 static int
2826 fs_renameat(void *softc, struct l9p_request *req)
2827 {
2828 	struct fs_softc *sc = softc;
2829 	struct l9p_fid *olddir, *newdir;
2830 	struct l9p_acl *facl;
2831 	struct fs_fid *off, *nff;
2832 	struct stat odst, ndst, fst;
2833 	int32_t op;
2834 	bool reparenting;
2835 	char *onp, *nnp;
2836 	char onb[MAXPATHLEN], nnb[MAXPATHLEN];
2837 	int error;
2838 
2839 	if (sc->fs_readonly)
2840 		return (EROFS);
2841 
2842 	olddir = req->lr_fid;
2843 	newdir = req->lr_fid2;
2844 	assert(olddir != NULL && newdir != NULL);
2845 	off = olddir->lo_aux;
2846 	nff = newdir->lo_aux;
2847 	assert(off != NULL && nff != NULL);
2848 
2849 	onp = req->lr_req.trenameat.oldname;
2850 	nnp = req->lr_req.trenameat.newname;
2851 	error = fs_buildname(olddir, onp, onb, sizeof(onb));
2852 	if (error)
2853 		return (error);
2854 	error = fs_buildname(newdir, nnp, nnb, sizeof(nnb));
2855 	if (error)
2856 		return (error);
2857 	if (fstatat(off->ff_dirfd, onb, &fst, AT_SYMLINK_NOFOLLOW) != 0)
2858 		return (errno);
2859 
2860 	reparenting = olddir != newdir &&
2861 	    strcmp(off->ff_name, nff->ff_name) != 0;
2862 
2863 	if (fstatat(off->ff_dirfd, off->ff_name, &odst, AT_SYMLINK_NOFOLLOW) != 0)
2864 		return (errno);
2865 	if (!S_ISDIR(odst.st_mode))
2866 		return (ENOTDIR);
2867 	fillacl(off);
2868 
2869 	if (reparenting) {
2870 		if (fstatat(nff->ff_dirfd, nff->ff_name, &ndst, AT_SYMLINK_NOFOLLOW) != 0)
2871 			return (errno);
2872 		if (!S_ISDIR(ndst.st_mode))
2873 			return (ENOTDIR);
2874 		facl = getacl(off, -1, onb);
2875 		fillacl(nff);
2876 
2877 		error = check_access(L9P_ACOP_UNLINK,
2878 		    off->ff_acl, &odst, facl, &fst, off->ff_ai, (gid_t)-1);
2879 		l9p_acl_free(facl);
2880 		if (error)
2881 			return (error);
2882 		op = S_ISDIR(fst.st_mode) ? L9P_ACE_ADD_SUBDIRECTORY :
2883 		    L9P_ACE_ADD_FILE;
2884 		error = check_access(op,
2885 		    nff->ff_acl, &ndst, NULL, NULL, nff->ff_ai, (gid_t)-1);
2886 		if (error)
2887 			return (error);
2888 	}
2889 
2890 	if (renameat(off->ff_dirfd, onb, nff->ff_dirfd, nnb))
2891 		error = errno;
2892 
2893 	return (error);
2894 }
2895 
2896 /*
2897  * Unlink file in given directory, or remove directory in given
2898  * directory, based on flags.
2899  */
2900 static int
2901 fs_unlinkat(void *softc, struct l9p_request *req)
2902 {
2903 	struct fs_softc *sc = softc;
2904 	struct l9p_acl *facl;
2905 	struct l9p_fid *dir;
2906 	struct fs_fid *dirff;
2907 	struct stat dirst, fst;
2908 	char *name;
2909 	char newname[MAXPATHLEN];
2910 	int error;
2911 
2912 	if (sc->fs_readonly)
2913 		return (EROFS);
2914 
2915 	dir = req->lr_fid;
2916 	dirff = dir->lo_aux;
2917 	assert(dirff != NULL);
2918 	name = req->lr_req.tunlinkat.name;
2919 	error = fs_buildname(dir, name, newname, sizeof(newname));
2920 	if (error)
2921 		return (error);
2922 	if (fstatat(dirff->ff_dirfd, newname, &fst, AT_SYMLINK_NOFOLLOW) != 0 ||
2923 	    fstatat(dirff->ff_dirfd, dirff->ff_name, &dirst, AT_SYMLINK_NOFOLLOW) != 0)
2924 		return (errno);
2925 	fillacl(dirff);
2926 	facl = getacl(dirff, -1, newname);
2927 	error = check_access(L9P_ACOP_UNLINK,
2928 	    dirff->ff_acl, &dirst, facl, &fst, dirff->ff_ai, (gid_t)-1);
2929 	l9p_acl_free(facl);
2930 	if (error)
2931 		return (error);
2932 
2933 	if (req->lr_req.tunlinkat.flags & L9PL_AT_REMOVEDIR) {
2934 		if (unlinkat(dirff->ff_dirfd, newname, AT_REMOVEDIR) != 0)
2935 			error = errno;
2936 	} else {
2937 		if (unlinkat(dirff->ff_dirfd, newname, 0) != 0)
2938 			error = errno;
2939 	}
2940 	return (error);
2941 }
2942 
2943 static void
2944 fs_freefid(void *softc __unused, struct l9p_fid *fid)
2945 {
2946 	struct fs_fid *f = fid->lo_aux;
2947 	struct fs_authinfo *ai;
2948 	uint32_t newcount;
2949 
2950 	if (f == NULL) {
2951 		/* Nothing to do here */
2952 		return;
2953 	}
2954 
2955 	if (f->ff_fd != -1)
2956 		close(f->ff_fd);
2957 
2958 	if (f->ff_dir)
2959 		closedir(f->ff_dir);
2960 
2961 	pthread_mutex_destroy(&f->ff_mtx);
2962 	free(f->ff_name);
2963 	ai = f->ff_ai;
2964 	l9p_acl_free(f->ff_acl);
2965 	free(f);
2966 	pthread_mutex_lock(&ai->ai_mtx);
2967 	newcount = --ai->ai_refcnt;
2968 	pthread_mutex_unlock(&ai->ai_mtx);
2969 	if (newcount == 0) {
2970 		/*
2971 		 * We *were* the last ref, no one can have gained a ref.
2972 		 */
2973 		L9P_LOG(L9P_DEBUG, "dropped last ref to authinfo %p",
2974 		    (void *)ai);
2975 		pthread_mutex_destroy(&ai->ai_mtx);
2976 		free(ai);
2977 	} else {
2978 		L9P_LOG(L9P_DEBUG, "authinfo %p now used by %lu",
2979 		    (void *)ai, (u_long)newcount);
2980 	}
2981 }
2982 
2983 int
2984 l9p_backend_fs_init(struct l9p_backend **backendp, int rootfd, bool ro)
2985 {
2986 	struct l9p_backend *backend;
2987 	struct fs_softc *sc;
2988 	int error;
2989 #if defined(WITH_CASPER)
2990 	cap_channel_t *capcas;
2991 #endif
2992 
2993 	if (!fs_attach_mutex_inited) {
2994 		error = pthread_mutex_init(&fs_attach_mutex, NULL);
2995 		if (error) {
2996 			errno = error;
2997 			return (-1);
2998 		}
2999 		fs_attach_mutex_inited = true;
3000 	}
3001 
3002 	backend = l9p_malloc(sizeof(*backend));
3003 	backend->attach = fs_attach;
3004 	backend->clunk = fs_clunk;
3005 	backend->create = fs_create;
3006 	backend->open = fs_open;
3007 	backend->read = fs_read;
3008 	backend->remove = fs_remove;
3009 	backend->stat = fs_stat;
3010 	backend->walk = fs_walk;
3011 	backend->write = fs_write;
3012 	backend->wstat = fs_wstat;
3013 	backend->statfs = fs_statfs;
3014 	backend->lopen = fs_lopen;
3015 	backend->lcreate = fs_lcreate;
3016 	backend->symlink = fs_symlink;
3017 	backend->mknod = fs_mknod;
3018 	backend->rename = fs_rename;
3019 	backend->readlink = fs_readlink;
3020 	backend->getattr = fs_getattr;
3021 	backend->setattr = fs_setattr;
3022 	backend->xattrwalk = fs_xattrwalk;
3023 	backend->xattrcreate = fs_xattrcreate;
3024 	backend->readdir = fs_readdir;
3025 	backend->fsync = fs_fsync;
3026 	backend->lock = fs_lock;
3027 	backend->getlock = fs_getlock;
3028 	backend->link = fs_link;
3029 	backend->mkdir = fs_mkdir;
3030 	backend->renameat = fs_renameat;
3031 	backend->unlinkat = fs_unlinkat;
3032 	backend->freefid = fs_freefid;
3033 
3034 	sc = l9p_malloc(sizeof(*sc));
3035 	sc->fs_rootfd = rootfd;
3036 	sc->fs_readonly = ro;
3037 	backend->softc = sc;
3038 
3039 #if defined(WITH_CASPER)
3040 	capcas = cap_init();
3041 	if (capcas == NULL)
3042 		return (-1);
3043 
3044 	sc->fs_cappwd = cap_service_open(capcas, "system.pwd");
3045 	if (sc->fs_cappwd == NULL)
3046 		return (-1);
3047 
3048 	sc->fs_capgrp = cap_service_open(capcas, "system.grp");
3049 	if (sc->fs_capgrp == NULL)
3050 		return (-1);
3051 
3052 	cap_setpassent(sc->fs_cappwd, 1);
3053 	cap_setgroupent(sc->fs_capgrp, 1);
3054 	cap_close(capcas);
3055 #else
3056 	setpassent(1);
3057 #endif
3058 
3059 	*backendp = backend;
3060 	return (0);
3061 }
3062