xref: /freebsd/stand/libsa/zfs/zfs.c (revision c7046f76c2c027b00c0e6ba57cfd28f1a78f5e23)
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  *	$FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 /*
33  *	Stand-alone file reading package.
34  */
35 
36 #include <stand.h>
37 #include <sys/disk.h>
38 #include <sys/param.h>
39 #include <sys/time.h>
40 #include <sys/queue.h>
41 #include <part.h>
42 #include <stddef.h>
43 #include <stdarg.h>
44 #include <string.h>
45 #include <bootstrap.h>
46 
47 #include "libzfs.h"
48 
49 #include "zfsimpl.c"
50 
51 /* Define the range of indexes to be populated with ZFS Boot Environments */
52 #define		ZFS_BE_FIRST	4
53 #define		ZFS_BE_LAST	8
54 
55 static int	zfs_open(const char *path, struct open_file *f);
56 static int	zfs_close(struct open_file *f);
57 static int	zfs_read(struct open_file *f, void *buf, size_t size, size_t *resid);
58 static off_t	zfs_seek(struct open_file *f, off_t offset, int where);
59 static int	zfs_stat(struct open_file *f, struct stat *sb);
60 static int	zfs_readdir(struct open_file *f, struct dirent *d);
61 static int	zfs_mount(const char *dev, const char *path, void **data);
62 static int	zfs_unmount(const char *dev, void *data);
63 
64 static void	zfs_bootenv_initial(const char *envname, spa_t *spa,
65 		    const char *name, const char *dsname, int checkpoint);
66 static void	zfs_checkpoints_initial(spa_t *spa, const char *name,
67 		    const char *dsname);
68 
69 struct devsw zfs_dev;
70 
/*
 * File system operations vector for ZFS.  Every entry points at a handler
 * declared above in this file, except fo_write, which is set to null_write.
 */
struct fs_ops zfs_fsops = {
	.fs_name = "zfs",
	.fo_open = zfs_open,
	.fo_close = zfs_close,
	.fo_read = zfs_read,
	.fo_write = null_write,
	.fo_seek = zfs_seek,
	.fo_stat = zfs_stat,
	.fo_readdir = zfs_readdir,
	.fo_mount = zfs_mount,
	.fo_unmount = zfs_unmount
};
83 
/*
 * In-core open file.
 *
 * One instance is allocated per open and stored in open_file->f_fsdata.
 * The f_zap_* members are only filled in by zfs_readdir().
 */
struct file {
	off_t		f_seekp;	/* seek pointer */
	dnode_phys_t	f_dnode;	/* dnode filled in by zfs_lookup() */
	uint64_t	f_zap_type;	/* zap type for readdir */
	uint64_t	f_num_leafs;	/* number of fzap leaf blocks */
	zap_leaf_phys_t	*f_zap_leaf;	/* zap leaf buffer */
};
94 
/*
 * NOTE(review): presumably a cursor and a total used while publishing boot
 * environments; nothing in this part of the file reads or writes them.
 */
static int	zfs_env_index;
static int	zfs_env_count;

/*
 * Singly-linked list of boot environment entries; each entry carries one
 * name string.  zfs_be and zfs_be_tmp are file-wide scratch pointers of the
 * entry type.
 */
SLIST_HEAD(zfs_be_list, zfs_be_entry) zfs_be_head = SLIST_HEAD_INITIALIZER(zfs_be_head);
struct zfs_be_list *zfs_be_headp;
struct zfs_be_entry {
	char *name;
	SLIST_ENTRY(zfs_be_entry) entries;
} *zfs_be, *zfs_be_tmp;
104 
105 /*
106  * Open a file.
107  */
108 static int
109 zfs_open(const char *upath, struct open_file *f)
110 {
111 	struct devdesc *dev = f->f_devdata;
112 	struct zfsmount *mount = dev->d_opendata;
113 	struct file *fp;
114 	int rc;
115 
116 	if (f->f_dev != &zfs_dev)
117 		return (EINVAL);
118 
119 	/* allocate file system specific data structure */
120 	fp = calloc(1, sizeof(struct file));
121 	if (fp == NULL)
122 		return (ENOMEM);
123 	f->f_fsdata = fp;
124 
125 	rc = zfs_lookup(mount, upath, &fp->f_dnode);
126 	fp->f_seekp = 0;
127 	if (rc) {
128 		f->f_fsdata = NULL;
129 		free(fp);
130 	}
131 	return (rc);
132 }
133 
134 static int
135 zfs_close(struct open_file *f)
136 {
137 	struct file *fp = (struct file *)f->f_fsdata;
138 
139 	dnode_cache_obj = NULL;
140 	f->f_fsdata = NULL;
141 
142 	free(fp);
143 	return (0);
144 }
145 
146 /*
147  * Copy a portion of a file into kernel memory.
148  * Cross block boundaries when necessary.
149  */
150 static int
151 zfs_read(struct open_file *f, void *start, size_t size, size_t *resid	/* out */)
152 {
153 	struct devdesc *dev = f->f_devdata;
154 	const spa_t *spa = ((struct zfsmount *)dev->d_opendata)->spa;
155 	struct file *fp = (struct file *)f->f_fsdata;
156 	struct stat sb;
157 	size_t n;
158 	int rc;
159 
160 	rc = zfs_stat(f, &sb);
161 	if (rc)
162 		return (rc);
163 	n = size;
164 	if (fp->f_seekp + n > sb.st_size)
165 		n = sb.st_size - fp->f_seekp;
166 
167 	rc = dnode_read(spa, &fp->f_dnode, fp->f_seekp, start, n);
168 	if (rc)
169 		return (rc);
170 
171 	if (0) {
172 	    int i;
173 	    for (i = 0; i < n; i++)
174 		putchar(((char*) start)[i]);
175 	}
176 	fp->f_seekp += n;
177 	if (resid)
178 		*resid = size - n;
179 
180 	return (0);
181 }
182 
183 static off_t
184 zfs_seek(struct open_file *f, off_t offset, int where)
185 {
186 	struct file *fp = (struct file *)f->f_fsdata;
187 
188 	switch (where) {
189 	case SEEK_SET:
190 		fp->f_seekp = offset;
191 		break;
192 	case SEEK_CUR:
193 		fp->f_seekp += offset;
194 		break;
195 	case SEEK_END:
196 	    {
197 		struct stat sb;
198 		int error;
199 
200 		error = zfs_stat(f, &sb);
201 		if (error != 0) {
202 			errno = error;
203 			return (-1);
204 		}
205 		fp->f_seekp = sb.st_size - offset;
206 		break;
207 	    }
208 	default:
209 		errno = EINVAL;
210 		return (-1);
211 	}
212 	return (fp->f_seekp);
213 }
214 
215 static int
216 zfs_stat(struct open_file *f, struct stat *sb)
217 {
218 	struct devdesc *dev = f->f_devdata;
219 	const spa_t *spa = ((struct zfsmount *)dev->d_opendata)->spa;
220 	struct file *fp = (struct file *)f->f_fsdata;
221 
222 	return (zfs_dnode_stat(spa, &fp->f_dnode, sb));
223 }
224 
225 static int
226 zfs_readdir(struct open_file *f, struct dirent *d)
227 {
228 	struct devdesc *dev = f->f_devdata;
229 	const spa_t *spa = ((struct zfsmount *)dev->d_opendata)->spa;
230 	struct file *fp = (struct file *)f->f_fsdata;
231 	mzap_ent_phys_t mze;
232 	struct stat sb;
233 	size_t bsize = fp->f_dnode.dn_datablkszsec << SPA_MINBLOCKSHIFT;
234 	int rc;
235 
236 	rc = zfs_stat(f, &sb);
237 	if (rc)
238 		return (rc);
239 	if (!S_ISDIR(sb.st_mode))
240 		return (ENOTDIR);
241 
242 	/*
243 	 * If this is the first read, get the zap type.
244 	 */
245 	if (fp->f_seekp == 0) {
246 		rc = dnode_read(spa, &fp->f_dnode,
247 				0, &fp->f_zap_type, sizeof(fp->f_zap_type));
248 		if (rc)
249 			return (rc);
250 
251 		if (fp->f_zap_type == ZBT_MICRO) {
252 			fp->f_seekp = offsetof(mzap_phys_t, mz_chunk);
253 		} else {
254 			rc = dnode_read(spa, &fp->f_dnode,
255 					offsetof(zap_phys_t, zap_num_leafs),
256 					&fp->f_num_leafs,
257 					sizeof(fp->f_num_leafs));
258 			if (rc)
259 				return (rc);
260 
261 			fp->f_seekp = bsize;
262 			fp->f_zap_leaf = malloc(bsize);
263 			if (fp->f_zap_leaf == NULL)
264 				return (ENOMEM);
265 			rc = dnode_read(spa, &fp->f_dnode,
266 					fp->f_seekp,
267 					fp->f_zap_leaf,
268 					bsize);
269 			if (rc)
270 				return (rc);
271 		}
272 	}
273 
274 	if (fp->f_zap_type == ZBT_MICRO) {
275 	mzap_next:
276 		if (fp->f_seekp >= bsize)
277 			return (ENOENT);
278 
279 		rc = dnode_read(spa, &fp->f_dnode,
280 				fp->f_seekp, &mze, sizeof(mze));
281 		if (rc)
282 			return (rc);
283 		fp->f_seekp += sizeof(mze);
284 
285 		if (!mze.mze_name[0])
286 			goto mzap_next;
287 
288 		d->d_fileno = ZFS_DIRENT_OBJ(mze.mze_value);
289 		d->d_type = ZFS_DIRENT_TYPE(mze.mze_value);
290 		strcpy(d->d_name, mze.mze_name);
291 		d->d_namlen = strlen(d->d_name);
292 		return (0);
293 	} else {
294 		zap_leaf_t zl;
295 		zap_leaf_chunk_t *zc, *nc;
296 		int chunk;
297 		size_t namelen;
298 		char *p;
299 		uint64_t value;
300 
301 		/*
302 		 * Initialise this so we can use the ZAP size
303 		 * calculating macros.
304 		 */
305 		zl.l_bs = ilog2(bsize);
306 		zl.l_phys = fp->f_zap_leaf;
307 
308 		/*
309 		 * Figure out which chunk we are currently looking at
310 		 * and consider seeking to the next leaf. We use the
311 		 * low bits of f_seekp as a simple chunk index.
312 		 */
313 	fzap_next:
314 		chunk = fp->f_seekp & (bsize - 1);
315 		if (chunk == ZAP_LEAF_NUMCHUNKS(&zl)) {
316 			fp->f_seekp = rounddown2(fp->f_seekp, bsize) + bsize;
317 			chunk = 0;
318 
319 			/*
320 			 * Check for EOF and read the new leaf.
321 			 */
322 			if (fp->f_seekp >= bsize * fp->f_num_leafs)
323 				return (ENOENT);
324 
325 			rc = dnode_read(spa, &fp->f_dnode,
326 					fp->f_seekp,
327 					fp->f_zap_leaf,
328 					bsize);
329 			if (rc)
330 				return (rc);
331 		}
332 
333 		zc = &ZAP_LEAF_CHUNK(&zl, chunk);
334 		fp->f_seekp++;
335 		if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
336 			goto fzap_next;
337 
338 		namelen = zc->l_entry.le_name_numints;
339 		if (namelen > sizeof(d->d_name))
340 			namelen = sizeof(d->d_name);
341 
342 		/*
343 		 * Paste the name back together.
344 		 */
345 		nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
346 		p = d->d_name;
347 		while (namelen > 0) {
348 			int len;
349 			len = namelen;
350 			if (len > ZAP_LEAF_ARRAY_BYTES)
351 				len = ZAP_LEAF_ARRAY_BYTES;
352 			memcpy(p, nc->l_array.la_array, len);
353 			p += len;
354 			namelen -= len;
355 			nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
356 		}
357 		d->d_name[sizeof(d->d_name) - 1] = 0;
358 
359 		/*
360 		 * Assume the first eight bytes of the value are
361 		 * a uint64_t.
362 		 */
363 		value = fzap_leaf_value(&zl, zc);
364 
365 		d->d_fileno = ZFS_DIRENT_OBJ(value);
366 		d->d_type = ZFS_DIRENT_TYPE(value);
367 		d->d_namlen = strlen(d->d_name);
368 
369 		return (0);
370 	}
371 }
372 
373 /*
374  * if path is NULL, create mount structure, but do not add it to list.
375  */
376 static int
377 zfs_mount(const char *dev, const char *path, void **data)
378 {
379 	struct zfs_devdesc *zfsdev;
380 	spa_t *spa;
381 	struct zfsmount *mnt;
382 	int rv;
383 
384 	errno = 0;
385 	zfsdev = malloc(sizeof(*zfsdev));
386 	if (zfsdev == NULL)
387 		return (errno);
388 
389 	rv = zfs_parsedev(zfsdev, dev + 3, NULL);
390 	if (rv != 0) {
391 		free(zfsdev);
392 		return (rv);
393 	}
394 
395 	spa = spa_find_by_dev(zfsdev);
396 	if (spa == NULL)
397 		return (ENXIO);
398 
399 	mnt = calloc(1, sizeof(*mnt));
400 	if (mnt != NULL && path != NULL)
401 		mnt->path = strdup(path);
402 	rv = errno;
403 
404 	if (mnt != NULL)
405 		rv = zfs_mount_impl(spa, zfsdev->root_guid, mnt);
406 	free(zfsdev);
407 
408 	if (rv == 0 && mnt != NULL && mnt->objset.os_type != DMU_OST_ZFS) {
409 		printf("Unexpected object set type %ju\n",
410 		    (uintmax_t)mnt->objset.os_type);
411 		rv = EIO;
412 	}
413 
414 	if (rv != 0) {
415 		if (mnt != NULL)
416 			free(mnt->path);
417 		free(mnt);
418 		return (rv);
419 	}
420 
421 	if (mnt != NULL) {
422 		*data = mnt;
423 		if (path != NULL)
424 			STAILQ_INSERT_TAIL(&zfsmount, mnt, next);
425 	}
426 
427 	return (rv);
428 }
429 
430 static int
431 zfs_unmount(const char *dev, void *data)
432 {
433 	struct zfsmount *mnt = data;
434 
435 	STAILQ_REMOVE(&zfsmount, mnt, zfsmount, next);
436 	free(mnt->path);
437 	free(mnt);
438 	return (0);
439 }
440 
441 static int
442 vdev_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t bytes)
443 {
444 	int fd, ret;
445 	size_t res, head, tail, total_size, full_sec_size;
446 	unsigned secsz, do_tail_read;
447 	off_t start_sec;
448 	char *outbuf, *bouncebuf;
449 
450 	fd = (uintptr_t) priv;
451 	outbuf = (char *) buf;
452 	bouncebuf = NULL;
453 
454 	ret = ioctl(fd, DIOCGSECTORSIZE, &secsz);
455 	if (ret != 0)
456 		return (ret);
457 
458 	/*
459 	 * Handling reads of arbitrary offset and size - multi-sector case
460 	 * and single-sector case.
461 	 *
462 	 *                        Multi-sector Case
463 	 *                (do_tail_read = true if tail > 0)
464 	 *
465 	 *   |<----------------------total_size--------------------->|
466 	 *   |                                                       |
467 	 *   |<--head-->|<--------------bytes------------>|<--tail-->|
468 	 *   |          |                                 |          |
469 	 *   |          |       |<~full_sec_size~>|       |          |
470 	 *   +------------------+                 +------------------+
471 	 *   |          |0101010|     .  .  .     |0101011|          |
472 	 *   +------------------+                 +------------------+
473 	 *         start_sec                         start_sec + n
474 	 *
475 	 *
476 	 *                      Single-sector Case
477 	 *                    (do_tail_read = false)
478 	 *
479 	 *              |<------total_size = secsz----->|
480 	 *              |                               |
481 	 *              |<-head->|<---bytes--->|<-tail->|
482 	 *              +-------------------------------+
483 	 *              |        |0101010101010|        |
484 	 *              +-------------------------------+
485 	 *                          start_sec
486 	 */
487 	start_sec = offset / secsz;
488 	head = offset % secsz;
489 	total_size = roundup2(head + bytes, secsz);
490 	tail = total_size - (head + bytes);
491 	do_tail_read = ((tail > 0) && (head + bytes > secsz));
492 	full_sec_size = total_size;
493 	if (head > 0)
494 		full_sec_size -= secsz;
495 	if (do_tail_read)
496 		full_sec_size -= secsz;
497 
498 	/* Return of partial sector data requires a bounce buffer. */
499 	if ((head > 0) || do_tail_read || bytes < secsz) {
500 		bouncebuf = malloc(secsz);
501 		if (bouncebuf == NULL) {
502 			printf("vdev_read: out of memory\n");
503 			return (ENOMEM);
504 		}
505 	}
506 
507 	if (lseek(fd, start_sec * secsz, SEEK_SET) == -1) {
508 		ret = errno;
509 		goto error;
510 	}
511 
512 	/* Partial data return from first sector */
513 	if (head > 0) {
514 		res = read(fd, bouncebuf, secsz);
515 		if (res != secsz) {
516 			ret = EIO;
517 			goto error;
518 		}
519 		memcpy(outbuf, bouncebuf + head, min(secsz - head, bytes));
520 		outbuf += min(secsz - head, bytes);
521 	}
522 
523 	/*
524 	 * Full data return from read sectors.
525 	 * Note, there is still corner case where we read
526 	 * from sector boundary, but less than sector size, e.g. reading 512B
527 	 * from 4k sector.
528 	 */
529 	if (full_sec_size > 0) {
530 		if (bytes < full_sec_size) {
531 			res = read(fd, bouncebuf, secsz);
532 			if (res != secsz) {
533 				ret = EIO;
534 				goto error;
535 			}
536 			memcpy(outbuf, bouncebuf, bytes);
537 		} else {
538 			res = read(fd, outbuf, full_sec_size);
539 			if (res != full_sec_size) {
540 				ret = EIO;
541 				goto error;
542 			}
543 			outbuf += full_sec_size;
544 		}
545 	}
546 
547 	/* Partial data return from last sector */
548 	if (do_tail_read) {
549 		res = read(fd, bouncebuf, secsz);
550 		if (res != secsz) {
551 			ret = EIO;
552 			goto error;
553 		}
554 		memcpy(outbuf, bouncebuf, secsz - tail);
555 	}
556 
557 	ret = 0;
558 error:
559 	free(bouncebuf);
560 	return (ret);
561 }
562 
563 static int
564 vdev_write(vdev_t *vdev, off_t offset, void *buf, size_t bytes)
565 {
566 	int fd, ret;
567 	size_t head, tail, total_size, full_sec_size;
568 	unsigned secsz, do_tail_write;
569 	off_t start_sec;
570 	ssize_t res;
571 	char *outbuf, *bouncebuf;
572 
573 	fd = (uintptr_t)vdev->v_priv;
574 	outbuf = (char *)buf;
575 	bouncebuf = NULL;
576 
577 	ret = ioctl(fd, DIOCGSECTORSIZE, &secsz);
578 	if (ret != 0)
579 		return (ret);
580 
581 	start_sec = offset / secsz;
582 	head = offset % secsz;
583 	total_size = roundup2(head + bytes, secsz);
584 	tail = total_size - (head + bytes);
585 	do_tail_write = ((tail > 0) && (head + bytes > secsz));
586 	full_sec_size = total_size;
587 	if (head > 0)
588 		full_sec_size -= secsz;
589 	if (do_tail_write)
590 		full_sec_size -= secsz;
591 
592 	/* Partial sector write requires a bounce buffer. */
593 	if ((head > 0) || do_tail_write || bytes < secsz) {
594 		bouncebuf = malloc(secsz);
595 		if (bouncebuf == NULL) {
596 			printf("vdev_write: out of memory\n");
597 			return (ENOMEM);
598 		}
599 	}
600 
601 	if (lseek(fd, start_sec * secsz, SEEK_SET) == -1) {
602 		ret = errno;
603 		goto error;
604 	}
605 
606 	/* Partial data for first sector */
607 	if (head > 0) {
608 		res = read(fd, bouncebuf, secsz);
609 		if ((unsigned)res != secsz) {
610 			ret = EIO;
611 			goto error;
612 		}
613 		memcpy(bouncebuf + head, outbuf, min(secsz - head, bytes));
614 		(void) lseek(fd, -secsz, SEEK_CUR);
615 		res = write(fd, bouncebuf, secsz);
616 		if ((unsigned)res != secsz) {
617 			ret = EIO;
618 			goto error;
619 		}
620 		outbuf += min(secsz - head, bytes);
621 	}
622 
623 	/*
624 	 * Full data write to sectors.
625 	 * Note, there is still corner case where we write
626 	 * to sector boundary, but less than sector size, e.g. write 512B
627 	 * to 4k sector.
628 	 */
629 	if (full_sec_size > 0) {
630 		if (bytes < full_sec_size) {
631 			res = read(fd, bouncebuf, secsz);
632 			if ((unsigned)res != secsz) {
633 				ret = EIO;
634 				goto error;
635 			}
636 			memcpy(bouncebuf, outbuf, bytes);
637 			(void) lseek(fd, -secsz, SEEK_CUR);
638 			res = write(fd, bouncebuf, secsz);
639 			if ((unsigned)res != secsz) {
640 				ret = EIO;
641 				goto error;
642 			}
643 		} else {
644 			res = write(fd, outbuf, full_sec_size);
645 			if ((unsigned)res != full_sec_size) {
646 				ret = EIO;
647 				goto error;
648 			}
649 			outbuf += full_sec_size;
650 		}
651 	}
652 
653 	/* Partial data write to last sector */
654 	if (do_tail_write) {
655 		res = read(fd, bouncebuf, secsz);
656 		if ((unsigned)res != secsz) {
657 			ret = EIO;
658 			goto error;
659 		}
660 		memcpy(bouncebuf, outbuf, secsz - tail);
661 		(void) lseek(fd, -secsz, SEEK_CUR);
662 		res = write(fd, bouncebuf, secsz);
663 		if ((unsigned)res != secsz) {
664 			ret = EIO;
665 			goto error;
666 		}
667 	}
668 
669 	ret = 0;
670 error:
671 	free(bouncebuf);
672 	return (ret);
673 }
674 
675 static int
676 zfs_dev_init(void)
677 {
678 	spa_t *spa;
679 	spa_t *next;
680 	spa_t *prev;
681 
682 	zfs_init();
683 	if (archsw.arch_zfs_probe == NULL)
684 		return (ENXIO);
685 	archsw.arch_zfs_probe();
686 
687 	prev = NULL;
688 	spa = STAILQ_FIRST(&zfs_pools);
689 	while (spa != NULL) {
690 		next = STAILQ_NEXT(spa, spa_link);
691 		if (zfs_spa_init(spa)) {
692 			if (prev == NULL)
693 				STAILQ_REMOVE_HEAD(&zfs_pools, spa_link);
694 			else
695 				STAILQ_REMOVE_AFTER(&zfs_pools, prev, spa_link);
696 		} else
697 			prev = spa;
698 		spa = next;
699 	}
700 	return (0);
701 }
702 
/*
 * Argument bundle handed to the partition-table callbacks
 * (zfs_diskread() and zfs_probe_partition()).
 */
struct zfs_probe_args {
	int		fd;		/* open descriptor used for reads */
	const char	*devname;	/* name of the device being probed */
	uint64_t	*pool_guid;	/* in/out pool GUID, see zfs_probe() */
	u_int		secsz;		/* sector size, in bytes */
};
709 
710 static int
711 zfs_diskread(void *arg, void *buf, size_t blocks, uint64_t offset)
712 {
713 	struct zfs_probe_args *ppa;
714 
715 	ppa = (struct zfs_probe_args *)arg;
716 	return (vdev_read(NULL, (void *)(uintptr_t)ppa->fd,
717 	    offset * ppa->secsz, buf, blocks * ppa->secsz));
718 }
719 
720 static int
721 zfs_probe(int fd, uint64_t *pool_guid)
722 {
723 	spa_t *spa;
724 	int ret;
725 
726 	spa = NULL;
727 	ret = vdev_probe(vdev_read, vdev_write, (void *)(uintptr_t)fd, &spa);
728 	if (ret == 0 && pool_guid != NULL)
729 		if (*pool_guid == 0)
730 			*pool_guid = spa->spa_guid;
731 	return (ret);
732 }
733 
734 static int
735 zfs_probe_partition(void *arg, const char *partname,
736     const struct ptable_entry *part)
737 {
738 	struct zfs_probe_args *ppa, pa;
739 	struct ptable *table;
740 	char devname[32];
741 	int ret;
742 
743 	/* Probe only freebsd-zfs and freebsd partitions */
744 	if (part->type != PART_FREEBSD &&
745 	    part->type != PART_FREEBSD_ZFS)
746 		return (0);
747 
748 	ppa = (struct zfs_probe_args *)arg;
749 	strncpy(devname, ppa->devname, strlen(ppa->devname) - 1);
750 	devname[strlen(ppa->devname) - 1] = '\0';
751 	snprintf(devname, sizeof(devname), "%s%s:", devname, partname);
752 	pa.fd = open(devname, O_RDWR);
753 	if (pa.fd == -1)
754 		return (0);
755 	ret = zfs_probe(pa.fd, ppa->pool_guid);
756 	if (ret == 0)
757 		return (0);
758 	/* Do we have BSD label here? */
759 	if (part->type == PART_FREEBSD) {
760 		pa.devname = devname;
761 		pa.pool_guid = ppa->pool_guid;
762 		pa.secsz = ppa->secsz;
763 		table = ptable_open(&pa, part->end - part->start + 1,
764 		    ppa->secsz, zfs_diskread);
765 		if (table != NULL) {
766 			ptable_iterate(table, &pa, zfs_probe_partition);
767 			ptable_close(table);
768 		}
769 	}
770 	close(pa.fd);
771 	return (0);
772 }
773 
774 /*
775  * Return bootenv nvlist from pool label.
776  */
777 int
778 zfs_get_bootenv(void *vdev, nvlist_t **benvp)
779 {
780 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
781 	nvlist_t *benv = NULL;
782 	vdev_t *vd;
783 	spa_t *spa;
784 
785 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
786 		return (ENOTSUP);
787 
788 	if ((spa = spa_find_by_dev(dev)) == NULL)
789 		return (ENXIO);
790 
791 	if (spa->spa_bootenv == NULL) {
792 		STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children,
793 		    v_childlink) {
794 			benv = vdev_read_bootenv(vd);
795 
796 			if (benv != NULL)
797 				break;
798 		}
799 		spa->spa_bootenv = benv;
800 	} else {
801 		benv = spa->spa_bootenv;
802 	}
803 
804 	if (benv == NULL)
805 		return (ENOENT);
806 
807 	*benvp = benv;
808 	return (0);
809 }
810 
811 /*
812  * Store nvlist to pool label bootenv area. Also updates cached pointer in spa.
813  */
814 int
815 zfs_set_bootenv(void *vdev, nvlist_t *benv)
816 {
817 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
818 	spa_t *spa;
819 	vdev_t *vd;
820 
821 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
822 		return (ENOTSUP);
823 
824 	if ((spa = spa_find_by_dev(dev)) == NULL)
825 		return (ENXIO);
826 
827 	STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children, v_childlink) {
828 		vdev_write_bootenv(vd, benv);
829 	}
830 
831 	spa->spa_bootenv = benv;
832 	return (0);
833 }
834 
835 /*
836  * Get bootonce value by key. The bootonce <key, value> pair is removed
837  * from the bootenv nvlist and the remaining nvlist is committed back to disk.
838  */
839 int
840 zfs_get_bootonce(void *vdev, const char *key, char *buf, size_t size)
841 {
842 	nvlist_t *benv;
843 	char *result = NULL;
844 	int result_size, rv;
845 
846 	if ((rv = zfs_get_bootenv(vdev, &benv)) != 0)
847 		return (rv);
848 
849 	if ((rv = nvlist_find(benv, key, DATA_TYPE_STRING, NULL,
850 	    &result, &result_size)) == 0) {
851 		if (result_size == 0) {
852 			/* ignore empty string */
853 			rv = ENOENT;
854 		} else {
855 			size = MIN((size_t)result_size + 1, size);
856 			strlcpy(buf, result, size);
857 		}
858 		(void) nvlist_remove(benv, key, DATA_TYPE_STRING);
859 		(void) zfs_set_bootenv(vdev, benv);
860 	}
861 
862 	return (rv);
863 }
864 
865 /*
866  * nvstore backend.
867  */
868 
869 static int zfs_nvstore_setter(void *, int, const char *,
870     const void *, size_t);
871 static int zfs_nvstore_setter_str(void *, const char *, const char *,
872     const char *);
873 static int zfs_nvstore_unset_impl(void *, const char *, bool);
874 static int zfs_nvstore_setenv(void *, void *);
875 
876 /*
877  * nvstore is only present for current rootfs pool.
878  */
879 static int
880 zfs_nvstore_sethook(struct env_var *ev, int flags __unused, const void *value)
881 {
882 	struct zfs_devdesc *dev;
883 	int rv;
884 
885 	archsw.arch_getdev((void **)&dev, NULL, NULL);
886 	if (dev == NULL)
887 		return (ENXIO);
888 
889 	rv = zfs_nvstore_setter_str(dev, NULL, ev->ev_name, value);
890 
891 	free(dev);
892 	return (rv);
893 }
894 
895 /*
896  * nvstore is only present for current rootfs pool.
897  */
898 static int
899 zfs_nvstore_unsethook(struct env_var *ev)
900 {
901 	struct zfs_devdesc *dev;
902 	int rv;
903 
904 	archsw.arch_getdev((void **)&dev, NULL, NULL);
905 	if (dev == NULL)
906 		return (ENXIO);
907 
908 	rv = zfs_nvstore_unset_impl(dev, ev->ev_name, false);
909 
910 	free(dev);
911 	return (rv);
912 }
913 
914 static int
915 zfs_nvstore_getter(void *vdev, const char *name, void **data)
916 {
917 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
918 	spa_t *spa;
919 	nvlist_t *nv;
920 	char *str, **ptr;
921 	int size;
922 	int rv;
923 
924 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
925 		return (ENOTSUP);
926 
927 	if ((spa = spa_find_by_dev(dev)) == NULL)
928 		return (ENXIO);
929 
930 	if (spa->spa_bootenv == NULL)
931 		return (ENXIO);
932 
933 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
934 	    NULL, &nv, NULL) != 0)
935 		return (ENOENT);
936 
937 	rv = nvlist_find(nv, name, DATA_TYPE_STRING, NULL, &str, &size);
938 	if (rv == 0) {
939 		ptr = (char **)data;
940 		asprintf(ptr, "%.*s", size, str);
941 		if (*data == NULL)
942 			rv = ENOMEM;
943 	}
944 	nvlist_destroy(nv);
945 	return (rv);
946 }
947 
948 static int
949 zfs_nvstore_setter(void *vdev, int type, const char *name,
950     const void *data, size_t size)
951 {
952 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
953 	spa_t *spa;
954 	nvlist_t *nv;
955 	int rv;
956 	bool env_set = true;
957 
958 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
959 		return (ENOTSUP);
960 
961 	if ((spa = spa_find_by_dev(dev)) == NULL)
962 		return (ENXIO);
963 
964 	if (spa->spa_bootenv == NULL)
965 		return (ENXIO);
966 
967 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
968 	    NULL, &nv, NULL) != 0) {
969 		nv = nvlist_create(NV_UNIQUE_NAME);
970 		if (nv == NULL)
971 			return (ENOMEM);
972 	}
973 
974 	rv = 0;
975 	switch (type) {
976         case DATA_TYPE_INT8:
977 		if (size != sizeof (int8_t)) {
978 			rv = EINVAL;
979 			break;
980 		}
981 		rv = nvlist_add_int8(nv, name, *(int8_t *)data);
982 		break;
983 
984         case DATA_TYPE_INT16:
985 		if (size != sizeof (int16_t)) {
986 			rv = EINVAL;
987 			break;
988 		}
989 		rv = nvlist_add_int16(nv, name, *(int16_t *)data);
990 		break;
991 
992         case DATA_TYPE_INT32:
993 		if (size != sizeof (int32_t)) {
994 			rv = EINVAL;
995 			break;
996 		}
997 		rv = nvlist_add_int32(nv, name, *(int32_t *)data);
998 		break;
999 
1000         case DATA_TYPE_INT64:
1001 		if (size != sizeof (int64_t)) {
1002 			rv = EINVAL;
1003 			break;
1004 		}
1005 		rv = nvlist_add_int64(nv, name, *(int64_t *)data);
1006 		break;
1007 
1008         case DATA_TYPE_BYTE:
1009 		if (size != sizeof (uint8_t)) {
1010 			rv = EINVAL;
1011 			break;
1012 		}
1013 		rv = nvlist_add_byte(nv, name, *(int8_t *)data);
1014 		break;
1015 
1016         case DATA_TYPE_UINT8:
1017 		if (size != sizeof (uint8_t)) {
1018 			rv = EINVAL;
1019 			break;
1020 		}
1021 		rv = nvlist_add_uint8(nv, name, *(int8_t *)data);
1022 		break;
1023 
1024         case DATA_TYPE_UINT16:
1025 		if (size != sizeof (uint16_t)) {
1026 			rv = EINVAL;
1027 			break;
1028 		}
1029 		rv = nvlist_add_uint16(nv, name, *(uint16_t *)data);
1030 		break;
1031 
1032         case DATA_TYPE_UINT32:
1033 		if (size != sizeof (uint32_t)) {
1034 			rv = EINVAL;
1035 			break;
1036 		}
1037 		rv = nvlist_add_uint32(nv, name, *(uint32_t *)data);
1038 		break;
1039 
1040         case DATA_TYPE_UINT64:
1041 		if (size != sizeof (uint64_t)) {
1042 			rv = EINVAL;
1043 			break;
1044 		}
1045 		rv = nvlist_add_uint64(nv, name, *(uint64_t *)data);
1046 		break;
1047 
1048         case DATA_TYPE_STRING:
1049 		rv = nvlist_add_string(nv, name, data);
1050 		break;
1051 
1052 	case DATA_TYPE_BOOLEAN_VALUE:
1053 		if (size != sizeof (boolean_t)) {
1054 			rv = EINVAL;
1055 			break;
1056 		}
1057 		rv = nvlist_add_boolean_value(nv, name, *(boolean_t *)data);
1058 		break;
1059 
1060 	default:
1061 		rv = EINVAL;
1062 		break;
1063 	}
1064 
1065 	if (rv == 0) {
1066 		rv = nvlist_add_nvlist(spa->spa_bootenv, OS_NVSTORE, nv);
1067 		if (rv == 0) {
1068 			rv = zfs_set_bootenv(vdev, spa->spa_bootenv);
1069 		}
1070 		if (rv == 0) {
1071 			if (env_set) {
1072 				rv = zfs_nvstore_setenv(vdev,
1073 				    nvpair_find(nv, name));
1074 			} else {
1075 				env_discard(env_getenv(name));
1076 				rv = 0;
1077 			}
1078 		}
1079 	}
1080 
1081 	nvlist_destroy(nv);
1082 	return (rv);
1083 }
1084 
/*
 * Parse data as a signed 64-bit integer.  The base is chosen by strtoll()
 * from the prefix (base argument 0), and the whole string must be consumed.
 *
 * Returns 0 and stores the value in *ip, or EINVAL for an empty string,
 * trailing characters, or any error reported by strtoll() through errno;
 * *ip is left untouched on failure.
 */
static int
get_int64(const char *data, int64_t *ip)
{
	char *tail;
	int64_t parsed;

	errno = 0;
	parsed = strtoll(data, &tail, 0);

	if (*data == '\0' || *tail != '\0' || errno != 0)
		return (EINVAL);

	*ip = parsed;
	return (0);
}
1099 
/*
 * Parse data as an unsigned 64-bit integer.  The base is chosen by
 * strtoull() from the prefix (base argument 0), and the whole string must
 * be consumed.
 *
 * A leading minus sign is rejected: strtoull() would otherwise accept it
 * and hand back the negated value converted to unsigned (e.g. "-1" as the
 * largest representable number).
 *
 * Returns 0 and stores the value in *ip, or EINVAL for an empty string, a
 * negative number, trailing characters, or any error reported by
 * strtoull() through errno; *ip is left untouched on failure.
 */
static int
get_uint64(const char *data, uint64_t *ip)
{
	const char *p;
	char *end;
	uint64_t val;

	errno = 0;

	/* Skip the same leading white space strtoull() would skip. */
	for (p = data; *p == ' ' || (*p >= '\t' && *p <= '\r'); p++)
		continue;
	if (*p == '-')
		return (EINVAL);

	val = strtoull(data, &end, 0);
	if (errno != 0 || *data == '\0' || *end != '\0')
		return (EINVAL);

	*ip = val;
	return (0);
}
1114 
1115 /*
1116  * Translate textual data to data type. If type is not set, and we are
1117  * creating new pair, use DATA_TYPE_STRING.
1118  */
1119 static int
1120 zfs_nvstore_setter_str(void *vdev, const char *type, const char *name,
1121     const char *data)
1122 {
1123 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
1124 	spa_t *spa;
1125 	nvlist_t *nv;
1126 	int rv;
1127 	data_type_t dt;
1128 	int64_t val;
1129 	uint64_t uval;
1130 
1131 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1132 		return (ENOTSUP);
1133 
1134 	if ((spa = spa_find_by_dev(dev)) == NULL)
1135 		return (ENXIO);
1136 
1137 	if (spa->spa_bootenv == NULL)
1138 		return (ENXIO);
1139 
1140 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
1141 	    NULL, &nv, NULL) != 0) {
1142 		nv = NULL;
1143 	}
1144 
1145 	if (type == NULL) {
1146 		nvp_header_t *nvh;
1147 
1148 		/*
1149 		 * if there is no existing pair, default to string.
1150 		 * Otherwise, use type from existing pair.
1151 		 */
1152 		nvh = nvpair_find(nv, name);
1153 		if (nvh == NULL) {
1154 			dt = DATA_TYPE_STRING;
1155 		} else {
1156 			nv_string_t *nvp_name;
1157 			nv_pair_data_t *nvp_data;
1158 
1159 			nvp_name = (nv_string_t *)(nvh + 1);
1160 			nvp_data = (nv_pair_data_t *)(&nvp_name->nv_data[0] +
1161 			    NV_ALIGN4(nvp_name->nv_size));
1162 			dt = nvp_data->nv_type;
1163 		}
1164 	} else {
1165 		dt = nvpair_type_from_name(type);
1166 	}
1167 	nvlist_destroy(nv);
1168 
1169 	rv = 0;
1170 	switch (dt) {
1171         case DATA_TYPE_INT8:
1172 		rv = get_int64(data, &val);
1173 		if (rv == 0) {
1174 			int8_t v = val;
1175 
1176 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1177 		}
1178 		break;
1179         case DATA_TYPE_INT16:
1180 		rv = get_int64(data, &val);
1181 		if (rv == 0) {
1182 			int16_t v = val;
1183 
1184 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1185 		}
1186 		break;
1187         case DATA_TYPE_INT32:
1188 		rv = get_int64(data, &val);
1189 		if (rv == 0) {
1190 			int32_t v = val;
1191 
1192 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1193 		}
1194 		break;
1195         case DATA_TYPE_INT64:
1196 		rv = get_int64(data, &val);
1197 		if (rv == 0) {
1198 			rv = zfs_nvstore_setter(vdev, dt, name, &val,
1199 			    sizeof (val));
1200 		}
1201 		break;
1202 
1203         case DATA_TYPE_BYTE:
1204 		rv = get_uint64(data, &uval);
1205 		if (rv == 0) {
1206 			uint8_t v = uval;
1207 
1208 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1209 		}
1210 		break;
1211 
1212         case DATA_TYPE_UINT8:
1213 		rv = get_uint64(data, &uval);
1214 		if (rv == 0) {
1215 			uint8_t v = uval;
1216 
1217 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1218 		}
1219 		break;
1220 
1221         case DATA_TYPE_UINT16:
1222 		rv = get_uint64(data, &uval);
1223 		if (rv == 0) {
1224 			uint16_t v = uval;
1225 
1226 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1227 		}
1228 		break;
1229 
1230         case DATA_TYPE_UINT32:
1231 		rv = get_uint64(data, &uval);
1232 		if (rv == 0) {
1233 			uint32_t v = uval;
1234 
1235 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1236 		}
1237 		break;
1238 
1239         case DATA_TYPE_UINT64:
1240 		rv = get_uint64(data, &uval);
1241 		if (rv == 0) {
1242 			rv = zfs_nvstore_setter(vdev, dt, name, &uval,
1243 			    sizeof (uval));
1244 		}
1245 		break;
1246 
1247         case DATA_TYPE_STRING:
1248 		rv = zfs_nvstore_setter(vdev, dt, name, data, strlen(data) + 1);
1249 		break;
1250 
1251 	case DATA_TYPE_BOOLEAN_VALUE:
1252 		rv = get_int64(data, &val);
1253 		if (rv == 0) {
1254 			boolean_t v = val;
1255 
1256 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1257 		}
1258 
1259 	default:
1260 		rv = EINVAL;
1261 	}
1262 	return (rv);
1263 }
1264 
1265 static int
1266 zfs_nvstore_unset_impl(void *vdev, const char *name, bool unset_env)
1267 {
1268 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
1269 	spa_t *spa;
1270 	nvlist_t *nv;
1271 	int rv;
1272 
1273 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1274 		return (ENOTSUP);
1275 
1276 	if ((spa = spa_find_by_dev(dev)) == NULL)
1277 		return (ENXIO);
1278 
1279 	if (spa->spa_bootenv == NULL)
1280 		return (ENXIO);
1281 
1282 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
1283 	    NULL, &nv, NULL) != 0)
1284 		return (ENOENT);
1285 
1286 	rv = nvlist_remove(nv, name, DATA_TYPE_UNKNOWN);
1287 	if (rv == 0) {
1288 		if (nvlist_next_nvpair(nv, NULL) == NULL) {
1289 			rv = nvlist_remove(spa->spa_bootenv, OS_NVSTORE,
1290 			    DATA_TYPE_NVLIST);
1291 		} else {
1292 			rv = nvlist_add_nvlist(spa->spa_bootenv,
1293 			    OS_NVSTORE, nv);
1294 		}
1295 		if (rv == 0)
1296 			rv = zfs_set_bootenv(vdev, spa->spa_bootenv);
1297 	}
1298 
1299 	if (unset_env)
1300 		env_discard(env_getenv(name));
1301 	return (rv);
1302 }
1303 
/*
 * nvstore "unset" callback: remove name from the nvstore and ask
 * zfs_nvstore_unset_impl() to discard the environment variable too.
 */
static int
zfs_nvstore_unset(void *vdev, const char *name)
{
	int rv;

	rv = zfs_nvstore_unset_impl(vdev, name, true);
	return (rv);
}
1309 
1310 static int
1311 zfs_nvstore_print(void *vdev __unused, void *ptr)
1312 {
1313 
1314 	nvpair_print(ptr, 0);
1315 	return (0);
1316 }
1317 
1318 /*
1319  * Create environment variable from nvpair.
1320  * set hook will update nvstore with new value, unset hook will remove
1321  * variable from nvstore.
1322  */
1323 static int
1324 zfs_nvstore_setenv(void *vdev __unused, void *ptr)
1325 {
1326 	nvp_header_t *nvh = ptr;
1327 	nv_string_t *nvp_name, *nvp_value;
1328 	nv_pair_data_t *nvp_data;
1329 	char *name, *value;
1330 	int rv = 0;
1331 
1332 	if (nvh == NULL)
1333 		return (ENOENT);
1334 
1335 	nvp_name = (nv_string_t *)(nvh + 1);
1336 	nvp_data = (nv_pair_data_t *)(&nvp_name->nv_data[0] +
1337 	    NV_ALIGN4(nvp_name->nv_size));
1338 
1339 	if ((name = nvstring_get(nvp_name)) == NULL)
1340 		return (ENOMEM);
1341 
1342 	value = NULL;
1343 	switch (nvp_data->nv_type) {
1344 	case DATA_TYPE_BYTE:
1345 	case DATA_TYPE_UINT8:
1346 		(void) asprintf(&value, "%uc",
1347 		    *(unsigned *)&nvp_data->nv_data[0]);
1348 		if (value == NULL)
1349 			rv = ENOMEM;
1350 		break;
1351 
1352 	case DATA_TYPE_INT8:
1353 		(void) asprintf(&value, "%c", *(int *)&nvp_data->nv_data[0]);
1354 		if (value == NULL)
1355 			rv = ENOMEM;
1356 		break;
1357 
1358 	case DATA_TYPE_INT16:
1359 		(void) asprintf(&value, "%hd", *(short *)&nvp_data->nv_data[0]);
1360 		if (value == NULL)
1361 			rv = ENOMEM;
1362 		break;
1363 
1364 	case DATA_TYPE_UINT16:
1365 		(void) asprintf(&value, "%hu",
1366 		    *(unsigned short *)&nvp_data->nv_data[0]);
1367 		if (value == NULL)
1368 			rv = ENOMEM;
1369 		break;
1370 
1371 	case DATA_TYPE_BOOLEAN_VALUE:
1372 	case DATA_TYPE_INT32:
1373 		(void) asprintf(&value, "%d", *(int *)&nvp_data->nv_data[0]);
1374 		if (value == NULL)
1375 			rv = ENOMEM;
1376 		break;
1377 
1378 	case DATA_TYPE_UINT32:
1379 		(void) asprintf(&value, "%u",
1380 		    *(unsigned *)&nvp_data->nv_data[0]);
1381 		if (value == NULL)
1382 			rv = ENOMEM;
1383 		break;
1384 
1385 	case DATA_TYPE_INT64:
1386 		(void) asprintf(&value, "%jd",
1387 		    (intmax_t)*(int64_t *)&nvp_data->nv_data[0]);
1388 		if (value == NULL)
1389 			rv = ENOMEM;
1390 		break;
1391 
1392 	case DATA_TYPE_UINT64:
1393 		(void) asprintf(&value, "%ju",
1394 		    (uintmax_t)*(uint64_t *)&nvp_data->nv_data[0]);
1395 		if (value == NULL)
1396 			rv = ENOMEM;
1397 		break;
1398 
1399 	case DATA_TYPE_STRING:
1400 		nvp_value = (nv_string_t *)&nvp_data->nv_data[0];
1401 		if ((value = nvstring_get(nvp_value)) == NULL) {
1402 			rv = ENOMEM;
1403 			break;
1404 		}
1405 		break;
1406 
1407 	default:
1408 		rv = EINVAL;
1409 		break;
1410 	}
1411 
1412 	if (value != NULL) {
1413 		rv = env_setenv(name, EV_VOLATILE | EV_NOHOOK, value,
1414 		    zfs_nvstore_sethook, zfs_nvstore_unsethook);
1415 		free(value);
1416 	}
1417 	free(name);
1418 	return (rv);
1419 }
1420 
1421 static int
1422 zfs_nvstore_iterate(void *vdev, int (*cb)(void *, void *))
1423 {
1424 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
1425 	spa_t *spa;
1426 	nvlist_t *nv;
1427 	nvp_header_t *nvh;
1428 	int rv;
1429 
1430 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1431 		return (ENOTSUP);
1432 
1433 	if ((spa = spa_find_by_dev(dev)) == NULL)
1434 		return (ENXIO);
1435 
1436 	if (spa->spa_bootenv == NULL)
1437 		return (ENXIO);
1438 
1439 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
1440 	    NULL, &nv, NULL) != 0)
1441 		return (ENOENT);
1442 
1443 	rv = 0;
1444 	nvh = NULL;
1445 	while ((nvh = nvlist_next_nvpair(nv, nvh)) != NULL) {
1446 		rv = cb(vdev, nvh);
1447 		if (rv != 0)
1448 			break;
1449 	}
1450 	return (rv);
1451 }
1452 
1453 nvs_callbacks_t nvstore_zfs_cb = {
1454 	.nvs_getter = zfs_nvstore_getter,
1455 	.nvs_setter = zfs_nvstore_setter,
1456 	.nvs_setter_str = zfs_nvstore_setter_str,
1457 	.nvs_unset = zfs_nvstore_unset,
1458 	.nvs_print = zfs_nvstore_print,
1459 	.nvs_iterate = zfs_nvstore_iterate
1460 };
1461 
1462 int
1463 zfs_attach_nvstore(void *vdev)
1464 {
1465 	struct zfs_devdesc *dev = vdev;
1466 	spa_t *spa;
1467 	uint64_t version;
1468 	int rv;
1469 
1470 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1471 		return (ENOTSUP);
1472 
1473 	if ((spa = spa_find_by_dev(dev)) == NULL)
1474 		return (ENXIO);
1475 
1476 	rv = nvlist_find(spa->spa_bootenv, BOOTENV_VERSION, DATA_TYPE_UINT64,
1477 	    NULL, &version, NULL);
1478 
1479 	if (rv != 0 || version != VB_NVLIST) {
1480 		return (ENXIO);
1481 	}
1482 
1483 	dev = malloc(sizeof (*dev));
1484 	if (dev == NULL)
1485 		return (ENOMEM);
1486 	memcpy(dev, vdev, sizeof (*dev));
1487 
1488 	rv = nvstore_init(spa->spa_name, &nvstore_zfs_cb, dev);
1489 	if (rv != 0)
1490 		free(dev);
1491 	else
1492 		rv = zfs_nvstore_iterate(dev, zfs_nvstore_setenv);
1493 	return (rv);
1494 }
1495 
1496 int
1497 zfs_probe_dev(const char *devname, uint64_t *pool_guid)
1498 {
1499 	struct ptable *table;
1500 	struct zfs_probe_args pa;
1501 	uint64_t mediasz;
1502 	int ret;
1503 
1504 	if (pool_guid)
1505 		*pool_guid = 0;
1506 	pa.fd = open(devname, O_RDWR);
1507 	if (pa.fd == -1)
1508 		return (ENXIO);
1509 	/* Probe the whole disk */
1510 	ret = zfs_probe(pa.fd, pool_guid);
1511 	if (ret == 0)
1512 		return (0);
1513 
1514 	/* Probe each partition */
1515 	ret = ioctl(pa.fd, DIOCGMEDIASIZE, &mediasz);
1516 	if (ret == 0)
1517 		ret = ioctl(pa.fd, DIOCGSECTORSIZE, &pa.secsz);
1518 	if (ret == 0) {
1519 		pa.devname = devname;
1520 		pa.pool_guid = pool_guid;
1521 		table = ptable_open(&pa, mediasz / pa.secsz, pa.secsz,
1522 		    zfs_diskread);
1523 		if (table != NULL) {
1524 			ptable_iterate(table, &pa, zfs_probe_partition);
1525 			ptable_close(table);
1526 		}
1527 	}
1528 	close(pa.fd);
1529 	if (pool_guid && *pool_guid == 0)
1530 		ret = ENXIO;
1531 	return (ret);
1532 }
1533 
1534 /*
1535  * Print information about ZFS pools
1536  */
1537 static int
1538 zfs_dev_print(int verbose)
1539 {
1540 	spa_t *spa;
1541 	char line[80];
1542 	int ret = 0;
1543 
1544 	if (STAILQ_EMPTY(&zfs_pools))
1545 		return (0);
1546 
1547 	printf("%s devices:", zfs_dev.dv_name);
1548 	if ((ret = pager_output("\n")) != 0)
1549 		return (ret);
1550 
1551 	if (verbose) {
1552 		return (spa_all_status());
1553 	}
1554 	STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
1555 		snprintf(line, sizeof(line), "    zfs:%s\n", spa->spa_name);
1556 		ret = pager_output(line);
1557 		if (ret != 0)
1558 			break;
1559 	}
1560 	return (ret);
1561 }
1562 
1563 /*
1564  * Attempt to open the pool described by (dev) for use by (f).
1565  */
1566 static int
1567 zfs_dev_open(struct open_file *f, ...)
1568 {
1569 	va_list		args;
1570 	struct zfs_devdesc	*dev;
1571 	struct zfsmount	*mount;
1572 	spa_t		*spa;
1573 	int		rv;
1574 
1575 	va_start(args, f);
1576 	dev = va_arg(args, struct zfs_devdesc *);
1577 	va_end(args);
1578 
1579 	if ((spa = spa_find_by_dev(dev)) == NULL)
1580 		return (ENXIO);
1581 
1582 	STAILQ_FOREACH(mount, &zfsmount, next) {
1583 		if (spa->spa_guid == mount->spa->spa_guid)
1584 			break;
1585 	}
1586 
1587 	rv = 0;
1588 	/* This device is not set as currdev, mount us private copy. */
1589 	if (mount == NULL)
1590 		rv = zfs_mount(devformat(&dev->dd), NULL, (void **)&mount);
1591 
1592 	if (rv == 0) {
1593 		dev->dd.d_opendata = mount;
1594 	}
1595 	return (rv);
1596 }
1597 
1598 static int
1599 zfs_dev_close(struct open_file *f)
1600 {
1601 	struct devdesc *dev;
1602 	struct zfsmount	*mnt, *mount;
1603 
1604 	dev = f->f_devdata;
1605 	mnt = dev->d_opendata;
1606 
1607 	STAILQ_FOREACH(mount, &zfsmount, next) {
1608 		if (mnt->spa->spa_guid == mount->spa->spa_guid)
1609 			break;
1610 	}
1611 
1612 	/* XXX */
1613 	return (0);
1614 }
1615 
1616 static int
1617 zfs_dev_strategy(void *devdata, int rw, daddr_t dblk, size_t size, char *buf, size_t *rsize)
1618 {
1619 
1620 	return (ENOSYS);
1621 }
1622 
1623 struct devsw zfs_dev = {
1624 	.dv_name = "zfs",
1625 	.dv_type = DEVT_ZFS,
1626 	.dv_init = zfs_dev_init,
1627 	.dv_strategy = zfs_dev_strategy,
1628 	.dv_open = zfs_dev_open,
1629 	.dv_close = zfs_dev_close,
1630 	.dv_ioctl = noioctl,
1631 	.dv_print = zfs_dev_print,
1632 	.dv_cleanup = nullsys,
1633 	.dv_fmtdev = zfs_fmtdev,
1634 };
1635 
1636 int
1637 zfs_parsedev(struct zfs_devdesc *dev, const char *devspec, const char **path)
1638 {
1639 	static char	rootname[ZFS_MAXNAMELEN];
1640 	static char	poolname[ZFS_MAXNAMELEN];
1641 	spa_t		*spa;
1642 	const char	*end;
1643 	const char	*np;
1644 	const char	*sep;
1645 	int		rv;
1646 
1647 	np = devspec;
1648 	if (*np != ':')
1649 		return (EINVAL);
1650 	np++;
1651 	end = strrchr(np, ':');
1652 	if (end == NULL)
1653 		return (EINVAL);
1654 	sep = strchr(np, '/');
1655 	if (sep == NULL || sep >= end)
1656 		sep = end;
1657 	memcpy(poolname, np, sep - np);
1658 	poolname[sep - np] = '\0';
1659 	if (sep < end) {
1660 		sep++;
1661 		memcpy(rootname, sep, end - sep);
1662 		rootname[end - sep] = '\0';
1663 	}
1664 	else
1665 		rootname[0] = '\0';
1666 
1667 	spa = spa_find_by_name(poolname);
1668 	if (!spa)
1669 		return (ENXIO);
1670 	dev->pool_guid = spa->spa_guid;
1671 	rv = zfs_lookup_dataset(spa, rootname, &dev->root_guid);
1672 	if (rv != 0)
1673 		return (rv);
1674 	if (path != NULL)
1675 		*path = (*end == '\0') ? end : end + 1;
1676 	dev->dd.d_dev = &zfs_dev;
1677 	return (0);
1678 }
1679 
1680 char *
1681 zfs_fmtdev(struct devdesc *vdev)
1682 {
1683 	static char		rootname[ZFS_MAXNAMELEN];
1684 	static char		buf[2 * ZFS_MAXNAMELEN + 8];
1685 	struct zfs_devdesc	*dev = (struct zfs_devdesc *)vdev;
1686 	spa_t			*spa;
1687 
1688 	buf[0] = '\0';
1689 	if (vdev->d_dev->dv_type != DEVT_ZFS)
1690 		return (buf);
1691 
1692 	/* Do we have any pools? */
1693 	spa = STAILQ_FIRST(&zfs_pools);
1694 	if (spa == NULL)
1695 		return (buf);
1696 
1697 	if (dev->pool_guid == 0)
1698 		dev->pool_guid = spa->spa_guid;
1699 	else
1700 		spa = spa_find_by_guid(dev->pool_guid);
1701 
1702 	if (spa == NULL) {
1703 		printf("ZFS: can't find pool by guid\n");
1704 		return (buf);
1705 	}
1706 	if (dev->root_guid == 0 && zfs_get_root(spa, &dev->root_guid)) {
1707 		printf("ZFS: can't find root filesystem\n");
1708 		return (buf);
1709 	}
1710 	if (zfs_rlookup(spa, dev->root_guid, rootname)) {
1711 		printf("ZFS: can't find filesystem by guid\n");
1712 		return (buf);
1713 	}
1714 
1715 	if (rootname[0] == '\0')
1716 		snprintf(buf, sizeof(buf), "%s:%s:", dev->dd.d_dev->dv_name,
1717 		    spa->spa_name);
1718 	else
1719 		snprintf(buf, sizeof(buf), "%s:%s/%s:", dev->dd.d_dev->dv_name,
1720 		    spa->spa_name, rootname);
1721 	return (buf);
1722 }
1723 
/*
 * Split "pool[/dataset]" at the first '/'.
 *
 * The pool part is copied into poolname (capacity size bytes).  If dsnamep
 * is not NULL, *dsnamep is set to the text after the '/', which points into
 * name, or to "" when name has no '/'.  Returns EINVAL if the pool part
 * plus terminator does not fit in size bytes, 0 otherwise.
 */
static int
split_devname(const char *name, char *poolname, size_t size,
    const char **dsnamep)
{
	const char *slash, *rest;
	size_t poollen;

	ASSERT(name != NULL);
	ASSERT(poolname != NULL);

	slash = strchr(name, '/');
	if (slash == NULL) {
		poollen = strlen(name);
		rest = "";
	} else {
		poollen = slash - name;
		rest = slash + 1;
	}

	if (poollen + 1 > size)
		return (EINVAL);

	strlcpy(poolname, name, poollen + 1);

	if (dsnamep != NULL)
		*dsnamep = rest;

	return (0);
}
1752 
1753 int
1754 zfs_list(const char *name)
1755 {
1756 	static char	poolname[ZFS_MAXNAMELEN];
1757 	uint64_t	objid;
1758 	spa_t		*spa;
1759 	const char	*dsname;
1760 	int		rv;
1761 
1762 	if (split_devname(name, poolname, sizeof(poolname), &dsname) != 0)
1763 		return (EINVAL);
1764 
1765 	spa = spa_find_by_name(poolname);
1766 	if (!spa)
1767 		return (ENXIO);
1768 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1769 	if (rv != 0)
1770 		return (rv);
1771 
1772 	return (zfs_list_dataset(spa, objid));
1773 }
1774 
1775 void
1776 init_zfs_boot_options(const char *currdev_in)
1777 {
1778 	char poolname[ZFS_MAXNAMELEN];
1779 	char *beroot, *currdev;
1780 	spa_t *spa;
1781 	int currdev_len;
1782 	const char *dsname;
1783 
1784 	currdev = NULL;
1785 	currdev_len = strlen(currdev_in);
1786 	if (currdev_len == 0)
1787 		return;
1788 	if (strncmp(currdev_in, "zfs:", 4) != 0)
1789 		return;
1790 	currdev = strdup(currdev_in);
1791 	if (currdev == NULL)
1792 		return;
1793 	/* Remove the trailing : */
1794 	currdev[currdev_len - 1] = '\0';
1795 
1796 	setenv("zfs_be_active", currdev, 1);
1797 	setenv("zfs_be_currpage", "1", 1);
1798 	/* Remove the last element (current bootenv) */
1799 	beroot = strrchr(currdev, '/');
1800 	if (beroot != NULL)
1801 		beroot[0] = '\0';
1802 	beroot = strchr(currdev, ':') + 1;
1803 	setenv("zfs_be_root", beroot, 1);
1804 
1805 	if (split_devname(beroot, poolname, sizeof(poolname), &dsname) != 0)
1806 		return;
1807 
1808 	spa = spa_find_by_name(poolname);
1809 	if (spa == NULL)
1810 		return;
1811 
1812 	zfs_bootenv_initial("bootenvs", spa, beroot, dsname, 0);
1813 	zfs_checkpoints_initial(spa, beroot, dsname);
1814 
1815 	free(currdev);
1816 }
1817 
1818 static void
1819 zfs_checkpoints_initial(spa_t *spa, const char *name, const char *dsname)
1820 {
1821 	char envname[32];
1822 
1823 	if (spa->spa_uberblock_checkpoint.ub_checkpoint_txg != 0) {
1824 		snprintf(envname, sizeof(envname), "zpool_checkpoint");
1825 		setenv(envname, name, 1);
1826 
1827 		spa->spa_uberblock = &spa->spa_uberblock_checkpoint;
1828 		spa->spa_mos = &spa->spa_mos_checkpoint;
1829 
1830 		zfs_bootenv_initial("bootenvs_check", spa, name, dsname, 1);
1831 
1832 		spa->spa_uberblock = &spa->spa_uberblock_master;
1833 		spa->spa_mos = &spa->spa_mos_master;
1834 	}
1835 }
1836 
1837 static void
1838 zfs_bootenv_initial(const char *envprefix, spa_t *spa, const char *rootname,
1839    const char *dsname, int checkpoint)
1840 {
1841 	char		envname[32], envval[256];
1842 	uint64_t	objid;
1843 	int		bootenvs_idx, rv;
1844 
1845 	SLIST_INIT(&zfs_be_head);
1846 	zfs_env_count = 0;
1847 
1848 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1849 	if (rv != 0)
1850 		return;
1851 
1852 	rv = zfs_callback_dataset(spa, objid, zfs_belist_add);
1853 	bootenvs_idx = 0;
1854 	/* Populate the initial environment variables */
1855 	SLIST_FOREACH_SAFE(zfs_be, &zfs_be_head, entries, zfs_be_tmp) {
1856 		/* Enumerate all bootenvs for general usage */
1857 		snprintf(envname, sizeof(envname), "%s[%d]",
1858 		    envprefix, bootenvs_idx);
1859 		snprintf(envval, sizeof(envval), "zfs:%s%s/%s",
1860 		    checkpoint ? "!" : "", rootname, zfs_be->name);
1861 		rv = setenv(envname, envval, 1);
1862 		if (rv != 0)
1863 			break;
1864 		bootenvs_idx++;
1865 	}
1866 	snprintf(envname, sizeof(envname), "%s_count", envprefix);
1867 	snprintf(envval, sizeof(envval), "%d", bootenvs_idx);
1868 	setenv(envname, envval, 1);
1869 
1870 	/* Clean up the SLIST of ZFS BEs */
1871 	while (!SLIST_EMPTY(&zfs_be_head)) {
1872 		zfs_be = SLIST_FIRST(&zfs_be_head);
1873 		SLIST_REMOVE_HEAD(&zfs_be_head, entries);
1874 		free(zfs_be->name);
1875 		free(zfs_be);
1876 	}
1877 }
1878 
1879 int
1880 zfs_bootenv(const char *name)
1881 {
1882 	char		poolname[ZFS_MAXNAMELEN], *root;
1883 	const char	*dsname;
1884 	char		becount[4];
1885 	uint64_t	objid;
1886 	spa_t		*spa;
1887 	int		rv, pages, perpage, currpage;
1888 
1889 	if (name == NULL)
1890 		return (EINVAL);
1891 	if ((root = getenv("zfs_be_root")) == NULL)
1892 		return (EINVAL);
1893 
1894 	if (strcmp(name, root) != 0) {
1895 		if (setenv("zfs_be_root", name, 1) != 0)
1896 			return (ENOMEM);
1897 	}
1898 
1899 	SLIST_INIT(&zfs_be_head);
1900 	zfs_env_count = 0;
1901 
1902 	if (split_devname(name, poolname, sizeof(poolname), &dsname) != 0)
1903 		return (EINVAL);
1904 
1905 	spa = spa_find_by_name(poolname);
1906 	if (!spa)
1907 		return (ENXIO);
1908 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1909 	if (rv != 0)
1910 		return (rv);
1911 	rv = zfs_callback_dataset(spa, objid, zfs_belist_add);
1912 
1913 	/* Calculate and store the number of pages of BEs */
1914 	perpage = (ZFS_BE_LAST - ZFS_BE_FIRST + 1);
1915 	pages = (zfs_env_count / perpage) + ((zfs_env_count % perpage) > 0 ? 1 : 0);
1916 	snprintf(becount, 4, "%d", pages);
1917 	if (setenv("zfs_be_pages", becount, 1) != 0)
1918 		return (ENOMEM);
1919 
1920 	/* Roll over the page counter if it has exceeded the maximum */
1921 	currpage = strtol(getenv("zfs_be_currpage"), NULL, 10);
1922 	if (currpage > pages) {
1923 		if (setenv("zfs_be_currpage", "1", 1) != 0)
1924 			return (ENOMEM);
1925 	}
1926 
1927 	/* Populate the menu environment variables */
1928 	zfs_set_env();
1929 
1930 	/* Clean up the SLIST of ZFS BEs */
1931 	while (!SLIST_EMPTY(&zfs_be_head)) {
1932 		zfs_be = SLIST_FIRST(&zfs_be_head);
1933 		SLIST_REMOVE_HEAD(&zfs_be_head, entries);
1934 		free(zfs_be->name);
1935 		free(zfs_be);
1936 	}
1937 
1938 	return (rv);
1939 }
1940 
1941 int
1942 zfs_belist_add(const char *name, uint64_t value __unused)
1943 {
1944 
1945 	/* Skip special datasets that start with a $ character */
1946 	if (strncmp(name, "$", 1) == 0) {
1947 		return (0);
1948 	}
1949 	/* Add the boot environment to the head of the SLIST */
1950 	zfs_be = malloc(sizeof(struct zfs_be_entry));
1951 	if (zfs_be == NULL) {
1952 		return (ENOMEM);
1953 	}
1954 	zfs_be->name = strdup(name);
1955 	if (zfs_be->name == NULL) {
1956 		free(zfs_be);
1957 		return (ENOMEM);
1958 	}
1959 	SLIST_INSERT_HEAD(&zfs_be_head, zfs_be, entries);
1960 	zfs_env_count++;
1961 
1962 	return (0);
1963 }
1964 
1965 int
1966 zfs_set_env(void)
1967 {
1968 	char envname[32], envval[256];
1969 	char *beroot, *pagenum;
1970 	int rv, page, ctr;
1971 
1972 	beroot = getenv("zfs_be_root");
1973 	if (beroot == NULL) {
1974 		return (1);
1975 	}
1976 
1977 	pagenum = getenv("zfs_be_currpage");
1978 	if (pagenum != NULL) {
1979 		page = strtol(pagenum, NULL, 10);
1980 	} else {
1981 		page = 1;
1982 	}
1983 
1984 	ctr = 1;
1985 	rv = 0;
1986 	zfs_env_index = ZFS_BE_FIRST;
1987 	SLIST_FOREACH_SAFE(zfs_be, &zfs_be_head, entries, zfs_be_tmp) {
1988 		/* Skip to the requested page number */
1989 		if (ctr <= ((ZFS_BE_LAST - ZFS_BE_FIRST + 1) * (page - 1))) {
1990 			ctr++;
1991 			continue;
1992 		}
1993 
1994 		snprintf(envname, sizeof(envname), "bootenvmenu_caption[%d]", zfs_env_index);
1995 		snprintf(envval, sizeof(envval), "%s", zfs_be->name);
1996 		rv = setenv(envname, envval, 1);
1997 		if (rv != 0) {
1998 			break;
1999 		}
2000 
2001 		snprintf(envname, sizeof(envname), "bootenvansi_caption[%d]", zfs_env_index);
2002 		rv = setenv(envname, envval, 1);
2003 		if (rv != 0){
2004 			break;
2005 		}
2006 
2007 		snprintf(envname, sizeof(envname), "bootenvmenu_command[%d]", zfs_env_index);
2008 		rv = setenv(envname, "set_bootenv", 1);
2009 		if (rv != 0){
2010 			break;
2011 		}
2012 
2013 		snprintf(envname, sizeof(envname), "bootenv_root[%d]", zfs_env_index);
2014 		snprintf(envval, sizeof(envval), "zfs:%s/%s", beroot, zfs_be->name);
2015 		rv = setenv(envname, envval, 1);
2016 		if (rv != 0){
2017 			break;
2018 		}
2019 
2020 		zfs_env_index++;
2021 		if (zfs_env_index > ZFS_BE_LAST) {
2022 			break;
2023 		}
2024 
2025 	}
2026 
2027 	for (; zfs_env_index <= ZFS_BE_LAST; zfs_env_index++) {
2028 		snprintf(envname, sizeof(envname), "bootenvmenu_caption[%d]", zfs_env_index);
2029 		(void)unsetenv(envname);
2030 		snprintf(envname, sizeof(envname), "bootenvansi_caption[%d]", zfs_env_index);
2031 		(void)unsetenv(envname);
2032 		snprintf(envname, sizeof(envname), "bootenvmenu_command[%d]", zfs_env_index);
2033 		(void)unsetenv(envname);
2034 		snprintf(envname, sizeof(envname), "bootenv_root[%d]", zfs_env_index);
2035 		(void)unsetenv(envname);
2036 	}
2037 
2038 	return (rv);
2039 }
2040