xref: /freebsd/stand/libsa/zfs/zfs.c (revision a12eb9e4ae534557867d49803a1e28bfe519a207)
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  *	$FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 /*
33  *	Stand-alone file reading package.
34  */
35 
36 #include <stand.h>
37 #include <sys/disk.h>
38 #include <sys/param.h>
39 #include <sys/time.h>
40 #include <sys/queue.h>
41 #include <part.h>
42 #include <stddef.h>
43 #include <stdarg.h>
44 #include <string.h>
45 #include <bootstrap.h>
46 
47 #include "libzfs.h"
48 
49 #include "zfsimpl.c"
50 
51 /* Define the range of indexes to be populated with ZFS Boot Environments */
/*
 * NOTE(review): the code consuming these bounds is not adjacent to the
 * definitions; whether ZFS_BE_LAST is an inclusive bound should be confirmed
 * at the point of use.
 */
52 #define		ZFS_BE_FIRST	4
53 #define		ZFS_BE_LAST	8
54 
/*
 * File system entry points.  Each of these is installed in the zfs_fsops
 * operations table defined further down.
 */
55 static int	zfs_open(const char *path, struct open_file *f);
56 static int	zfs_close(struct open_file *f);
57 static int	zfs_read(struct open_file *f, void *buf, size_t size, size_t *resid);
58 static off_t	zfs_seek(struct open_file *f, off_t offset, int where);
59 static int	zfs_stat(struct open_file *f, struct stat *sb);
60 static int	zfs_readdir(struct open_file *f, struct dirent *d);
61 static int	zfs_mount(const char *dev, const char *path, void **data);
62 static int	zfs_unmount(const char *dev, void *data);
63 
/*
 * Boot environment / checkpoint helpers.  Only the prototypes appear here;
 * consult the definitions for their exact semantics.
 */
64 static void	zfs_bootenv_initial(const char *envname, spa_t *spa,
65 		    const char *name, const char *dsname, int checkpoint);
66 static void	zfs_checkpoints_initial(spa_t *spa, const char *name,
67 		    const char *dsname);
68 
/*
 * Device switch object for ZFS.  zfs_open() compares an open file's f_dev
 * against the address of this object and rejects anything else.
 */
69 struct devsw zfs_dev;
70 
/*
 * File system operations table for ZFS.  Every slot points at the matching
 * zfs_*() routine of this file except fo_write, which is wired to
 * null_write.
 */
71 struct fs_ops zfs_fsops = {
72 	.fs_name = "zfs",
73 	.fo_open = zfs_open,
74 	.fo_close = zfs_close,
75 	.fo_read = zfs_read,
76 	.fo_write = null_write,
77 	.fo_seek = zfs_seek,
78 	.fo_stat = zfs_stat,
79 	.fo_readdir = zfs_readdir,
80 	.fo_mount = zfs_mount,
81 	.fo_unmount = zfs_unmount
82 };
83 
84 /*
85  * In-core open file.
86  */
/*
 * One instance is allocated by zfs_open() and stored in f->f_fsdata.
 *
 * f_seekp is the read offset for regular files; zfs_readdir() reuses it as
 * its directory cursor (0 means iteration has not started).
 * f_dnode is filled in by zfs_lookup() at open time.
 * f_zap_type, f_num_leafs and f_zap_leaf are only set by zfs_readdir();
 * f_zap_leaf is heap memory sized to one directory block.
 */
87 struct file {
88 	off_t		f_seekp;	/* seek pointer */
89 	dnode_phys_t	f_dnode;
90 	uint64_t	f_zap_type;	/* zap type for readdir */
91 	uint64_t	f_num_leafs;	/* number of fzap leaf blocks */
92 	zap_leaf_phys_t	*f_zap_leaf;	/* zap leaf buffer */
93 };
94 
/*
 * Boot environment bookkeeping.
 * NOTE(review): the code that updates zfs_env_index / zfs_env_count and walks
 * the list is not adjacent to these declarations; the meaning of the two
 * counters should be confirmed there.
 */
95 static int	zfs_env_index;
96 static int	zfs_env_count;
97 
/* Singly-linked list head for zfs_be_entry records, statically initialised empty. */
98 SLIST_HEAD(zfs_be_list, zfs_be_entry) zfs_be_head = SLIST_HEAD_INITIALIZER(zfs_be_head);
99 struct zfs_be_list *zfs_be_headp;
/* One list element: a name string plus the SLIST linkage. */
100 struct zfs_be_entry {
101 	char *name;
102 	SLIST_ENTRY(zfs_be_entry) entries;
103 } *zfs_be, *zfs_be_tmp;
104 
105 /*
106  * Open a file.
107  */
108 static int
109 zfs_open(const char *upath, struct open_file *f)
110 {
111 	struct zfsmount *mount = (struct zfsmount *)f->f_devdata;
112 	struct file *fp;
113 	int rc;
114 
115 	if (f->f_dev != &zfs_dev)
116 		return (EINVAL);
117 
118 	/* allocate file system specific data structure */
119 	fp = calloc(1, sizeof(struct file));
120 	if (fp == NULL)
121 		return (ENOMEM);
122 	f->f_fsdata = fp;
123 
124 	rc = zfs_lookup(mount, upath, &fp->f_dnode);
125 	fp->f_seekp = 0;
126 	if (rc) {
127 		f->f_fsdata = NULL;
128 		free(fp);
129 	}
130 	return (rc);
131 }
132 
133 static int
134 zfs_close(struct open_file *f)
135 {
136 	struct file *fp = (struct file *)f->f_fsdata;
137 
138 	dnode_cache_obj = NULL;
139 	f->f_fsdata = NULL;
140 
141 	free(fp);
142 	return (0);
143 }
144 
145 /*
146  * Copy a portion of a file into kernel memory.
147  * Cross block boundaries when necessary.
148  */
149 static int
150 zfs_read(struct open_file *f, void *start, size_t size, size_t *resid	/* out */)
151 {
152 	const spa_t *spa = ((struct zfsmount *)f->f_devdata)->spa;
153 	struct file *fp = (struct file *)f->f_fsdata;
154 	struct stat sb;
155 	size_t n;
156 	int rc;
157 
158 	rc = zfs_stat(f, &sb);
159 	if (rc)
160 		return (rc);
161 	n = size;
162 	if (fp->f_seekp + n > sb.st_size)
163 		n = sb.st_size - fp->f_seekp;
164 
165 	rc = dnode_read(spa, &fp->f_dnode, fp->f_seekp, start, n);
166 	if (rc)
167 		return (rc);
168 
169 	if (0) {
170 	    int i;
171 	    for (i = 0; i < n; i++)
172 		putchar(((char*) start)[i]);
173 	}
174 	fp->f_seekp += n;
175 	if (resid)
176 		*resid = size - n;
177 
178 	return (0);
179 }
180 
181 static off_t
182 zfs_seek(struct open_file *f, off_t offset, int where)
183 {
184 	struct file *fp = (struct file *)f->f_fsdata;
185 
186 	switch (where) {
187 	case SEEK_SET:
188 		fp->f_seekp = offset;
189 		break;
190 	case SEEK_CUR:
191 		fp->f_seekp += offset;
192 		break;
193 	case SEEK_END:
194 	    {
195 		struct stat sb;
196 		int error;
197 
198 		error = zfs_stat(f, &sb);
199 		if (error != 0) {
200 			errno = error;
201 			return (-1);
202 		}
203 		fp->f_seekp = sb.st_size - offset;
204 		break;
205 	    }
206 	default:
207 		errno = EINVAL;
208 		return (-1);
209 	}
210 	return (fp->f_seekp);
211 }
212 
213 static int
214 zfs_stat(struct open_file *f, struct stat *sb)
215 {
216 	const spa_t *spa = ((struct zfsmount *)f->f_devdata)->spa;
217 	struct file *fp = (struct file *)f->f_fsdata;
218 
219 	return (zfs_dnode_stat(spa, &fp->f_dnode, sb));
220 }
221 
222 static int
223 zfs_readdir(struct open_file *f, struct dirent *d)
224 {
225 	const spa_t *spa = ((struct zfsmount *)f->f_devdata)->spa;
226 	struct file *fp = (struct file *)f->f_fsdata;
227 	mzap_ent_phys_t mze;
228 	struct stat sb;
229 	size_t bsize = fp->f_dnode.dn_datablkszsec << SPA_MINBLOCKSHIFT;
230 	int rc;
231 
232 	rc = zfs_stat(f, &sb);
233 	if (rc)
234 		return (rc);
235 	if (!S_ISDIR(sb.st_mode))
236 		return (ENOTDIR);
237 
238 	/*
239 	 * If this is the first read, get the zap type.
240 	 */
241 	if (fp->f_seekp == 0) {
242 		rc = dnode_read(spa, &fp->f_dnode,
243 				0, &fp->f_zap_type, sizeof(fp->f_zap_type));
244 		if (rc)
245 			return (rc);
246 
247 		if (fp->f_zap_type == ZBT_MICRO) {
248 			fp->f_seekp = offsetof(mzap_phys_t, mz_chunk);
249 		} else {
250 			rc = dnode_read(spa, &fp->f_dnode,
251 					offsetof(zap_phys_t, zap_num_leafs),
252 					&fp->f_num_leafs,
253 					sizeof(fp->f_num_leafs));
254 			if (rc)
255 				return (rc);
256 
257 			fp->f_seekp = bsize;
258 			fp->f_zap_leaf = malloc(bsize);
259 			if (fp->f_zap_leaf == NULL)
260 				return (ENOMEM);
261 			rc = dnode_read(spa, &fp->f_dnode,
262 					fp->f_seekp,
263 					fp->f_zap_leaf,
264 					bsize);
265 			if (rc)
266 				return (rc);
267 		}
268 	}
269 
270 	if (fp->f_zap_type == ZBT_MICRO) {
271 	mzap_next:
272 		if (fp->f_seekp >= bsize)
273 			return (ENOENT);
274 
275 		rc = dnode_read(spa, &fp->f_dnode,
276 				fp->f_seekp, &mze, sizeof(mze));
277 		if (rc)
278 			return (rc);
279 		fp->f_seekp += sizeof(mze);
280 
281 		if (!mze.mze_name[0])
282 			goto mzap_next;
283 
284 		d->d_fileno = ZFS_DIRENT_OBJ(mze.mze_value);
285 		d->d_type = ZFS_DIRENT_TYPE(mze.mze_value);
286 		strcpy(d->d_name, mze.mze_name);
287 		d->d_namlen = strlen(d->d_name);
288 		return (0);
289 	} else {
290 		zap_leaf_t zl;
291 		zap_leaf_chunk_t *zc, *nc;
292 		int chunk;
293 		size_t namelen;
294 		char *p;
295 		uint64_t value;
296 
297 		/*
298 		 * Initialise this so we can use the ZAP size
299 		 * calculating macros.
300 		 */
301 		zl.l_bs = ilog2(bsize);
302 		zl.l_phys = fp->f_zap_leaf;
303 
304 		/*
305 		 * Figure out which chunk we are currently looking at
306 		 * and consider seeking to the next leaf. We use the
307 		 * low bits of f_seekp as a simple chunk index.
308 		 */
309 	fzap_next:
310 		chunk = fp->f_seekp & (bsize - 1);
311 		if (chunk == ZAP_LEAF_NUMCHUNKS(&zl)) {
312 			fp->f_seekp = rounddown2(fp->f_seekp, bsize) + bsize;
313 			chunk = 0;
314 
315 			/*
316 			 * Check for EOF and read the new leaf.
317 			 */
318 			if (fp->f_seekp >= bsize * fp->f_num_leafs)
319 				return (ENOENT);
320 
321 			rc = dnode_read(spa, &fp->f_dnode,
322 					fp->f_seekp,
323 					fp->f_zap_leaf,
324 					bsize);
325 			if (rc)
326 				return (rc);
327 		}
328 
329 		zc = &ZAP_LEAF_CHUNK(&zl, chunk);
330 		fp->f_seekp++;
331 		if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
332 			goto fzap_next;
333 
334 		namelen = zc->l_entry.le_name_numints;
335 		if (namelen > sizeof(d->d_name))
336 			namelen = sizeof(d->d_name);
337 
338 		/*
339 		 * Paste the name back together.
340 		 */
341 		nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
342 		p = d->d_name;
343 		while (namelen > 0) {
344 			int len;
345 			len = namelen;
346 			if (len > ZAP_LEAF_ARRAY_BYTES)
347 				len = ZAP_LEAF_ARRAY_BYTES;
348 			memcpy(p, nc->l_array.la_array, len);
349 			p += len;
350 			namelen -= len;
351 			nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
352 		}
353 		d->d_name[sizeof(d->d_name) - 1] = 0;
354 
355 		/*
356 		 * Assume the first eight bytes of the value are
357 		 * a uint64_t.
358 		 */
359 		value = fzap_leaf_value(&zl, zc);
360 
361 		d->d_fileno = ZFS_DIRENT_OBJ(value);
362 		d->d_type = ZFS_DIRENT_TYPE(value);
363 		d->d_namlen = strlen(d->d_name);
364 
365 		return (0);
366 	}
367 }
368 
369 /*
370  * if path is NULL, create mount structure, but do not add it to list.
371  */
372 static int
373 zfs_mount(const char *dev, const char *path, void **data)
374 {
375 	struct zfs_devdesc *zfsdev;
376 	spa_t *spa;
377 	struct zfsmount *mnt;
378 	int rv;
379 
380 	errno = 0;
381 	zfsdev = malloc(sizeof(*zfsdev));
382 	if (zfsdev == NULL)
383 		return (errno);
384 
385 	rv = zfs_parsedev(zfsdev, dev + 3, NULL);
386 	if (rv != 0) {
387 		free(zfsdev);
388 		return (rv);
389 	}
390 
391 	spa = spa_find_by_dev(zfsdev);
392 	if (spa == NULL)
393 		return (ENXIO);
394 
395 	mnt = calloc(1, sizeof(*mnt));
396 	if (mnt != NULL && path != NULL)
397 		mnt->path = strdup(path);
398 	rv = errno;
399 
400 	if (mnt != NULL)
401 		rv = zfs_mount_impl(spa, zfsdev->root_guid, mnt);
402 	free(zfsdev);
403 
404 	if (rv == 0 && mnt != NULL && mnt->objset.os_type != DMU_OST_ZFS) {
405 		printf("Unexpected object set type %ju\n",
406 		    (uintmax_t)mnt->objset.os_type);
407 		rv = EIO;
408 	}
409 
410 	if (rv != 0) {
411 		if (mnt != NULL)
412 			free(mnt->path);
413 		free(mnt);
414 		return (rv);
415 	}
416 
417 	if (mnt != NULL) {
418 		*data = mnt;
419 		if (path != NULL)
420 			STAILQ_INSERT_TAIL(&zfsmount, mnt, next);
421 	}
422 
423 	return (rv);
424 }
425 
426 static int
427 zfs_unmount(const char *dev, void *data)
428 {
429 	struct zfsmount *mnt = data;
430 
431 	STAILQ_REMOVE(&zfsmount, mnt, zfsmount, next);
432 	free(mnt->path);
433 	free(mnt);
434 	return (0);
435 }
436 
437 static int
438 vdev_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t bytes)
439 {
440 	int fd, ret;
441 	size_t res, head, tail, total_size, full_sec_size;
442 	unsigned secsz, do_tail_read;
443 	off_t start_sec;
444 	char *outbuf, *bouncebuf;
445 
446 	fd = (uintptr_t) priv;
447 	outbuf = (char *) buf;
448 	bouncebuf = NULL;
449 
450 	ret = ioctl(fd, DIOCGSECTORSIZE, &secsz);
451 	if (ret != 0)
452 		return (ret);
453 
454 	/*
455 	 * Handling reads of arbitrary offset and size - multi-sector case
456 	 * and single-sector case.
457 	 *
458 	 *                        Multi-sector Case
459 	 *                (do_tail_read = true if tail > 0)
460 	 *
461 	 *   |<----------------------total_size--------------------->|
462 	 *   |                                                       |
463 	 *   |<--head-->|<--------------bytes------------>|<--tail-->|
464 	 *   |          |                                 |          |
465 	 *   |          |       |<~full_sec_size~>|       |          |
466 	 *   +------------------+                 +------------------+
467 	 *   |          |0101010|     .  .  .     |0101011|          |
468 	 *   +------------------+                 +------------------+
469 	 *         start_sec                         start_sec + n
470 	 *
471 	 *
472 	 *                      Single-sector Case
473 	 *                    (do_tail_read = false)
474 	 *
475 	 *              |<------total_size = secsz----->|
476 	 *              |                               |
477 	 *              |<-head->|<---bytes--->|<-tail->|
478 	 *              +-------------------------------+
479 	 *              |        |0101010101010|        |
480 	 *              +-------------------------------+
481 	 *                          start_sec
482 	 */
483 	start_sec = offset / secsz;
484 	head = offset % secsz;
485 	total_size = roundup2(head + bytes, secsz);
486 	tail = total_size - (head + bytes);
487 	do_tail_read = ((tail > 0) && (head + bytes > secsz));
488 	full_sec_size = total_size;
489 	if (head > 0)
490 		full_sec_size -= secsz;
491 	if (do_tail_read)
492 		full_sec_size -= secsz;
493 
494 	/* Return of partial sector data requires a bounce buffer. */
495 	if ((head > 0) || do_tail_read || bytes < secsz) {
496 		bouncebuf = malloc(secsz);
497 		if (bouncebuf == NULL) {
498 			printf("vdev_read: out of memory\n");
499 			return (ENOMEM);
500 		}
501 	}
502 
503 	if (lseek(fd, start_sec * secsz, SEEK_SET) == -1) {
504 		ret = errno;
505 		goto error;
506 	}
507 
508 	/* Partial data return from first sector */
509 	if (head > 0) {
510 		res = read(fd, bouncebuf, secsz);
511 		if (res != secsz) {
512 			ret = EIO;
513 			goto error;
514 		}
515 		memcpy(outbuf, bouncebuf + head, min(secsz - head, bytes));
516 		outbuf += min(secsz - head, bytes);
517 	}
518 
519 	/*
520 	 * Full data return from read sectors.
521 	 * Note, there is still corner case where we read
522 	 * from sector boundary, but less than sector size, e.g. reading 512B
523 	 * from 4k sector.
524 	 */
525 	if (full_sec_size > 0) {
526 		if (bytes < full_sec_size) {
527 			res = read(fd, bouncebuf, secsz);
528 			if (res != secsz) {
529 				ret = EIO;
530 				goto error;
531 			}
532 			memcpy(outbuf, bouncebuf, bytes);
533 		} else {
534 			res = read(fd, outbuf, full_sec_size);
535 			if (res != full_sec_size) {
536 				ret = EIO;
537 				goto error;
538 			}
539 			outbuf += full_sec_size;
540 		}
541 	}
542 
543 	/* Partial data return from last sector */
544 	if (do_tail_read) {
545 		res = read(fd, bouncebuf, secsz);
546 		if (res != secsz) {
547 			ret = EIO;
548 			goto error;
549 		}
550 		memcpy(outbuf, bouncebuf, secsz - tail);
551 	}
552 
553 	ret = 0;
554 error:
555 	free(bouncebuf);
556 	return (ret);
557 }
558 
559 static int
560 vdev_write(vdev_t *vdev, off_t offset, void *buf, size_t bytes)
561 {
562 	int fd, ret;
563 	size_t head, tail, total_size, full_sec_size;
564 	unsigned secsz, do_tail_write;
565 	off_t start_sec;
566 	ssize_t res;
567 	char *outbuf, *bouncebuf;
568 
569 	fd = (uintptr_t)vdev->v_priv;
570 	outbuf = (char *)buf;
571 	bouncebuf = NULL;
572 
573 	ret = ioctl(fd, DIOCGSECTORSIZE, &secsz);
574 	if (ret != 0)
575 		return (ret);
576 
577 	start_sec = offset / secsz;
578 	head = offset % secsz;
579 	total_size = roundup2(head + bytes, secsz);
580 	tail = total_size - (head + bytes);
581 	do_tail_write = ((tail > 0) && (head + bytes > secsz));
582 	full_sec_size = total_size;
583 	if (head > 0)
584 		full_sec_size -= secsz;
585 	if (do_tail_write)
586 		full_sec_size -= secsz;
587 
588 	/* Partial sector write requires a bounce buffer. */
589 	if ((head > 0) || do_tail_write || bytes < secsz) {
590 		bouncebuf = malloc(secsz);
591 		if (bouncebuf == NULL) {
592 			printf("vdev_write: out of memory\n");
593 			return (ENOMEM);
594 		}
595 	}
596 
597 	if (lseek(fd, start_sec * secsz, SEEK_SET) == -1) {
598 		ret = errno;
599 		goto error;
600 	}
601 
602 	/* Partial data for first sector */
603 	if (head > 0) {
604 		res = read(fd, bouncebuf, secsz);
605 		if ((unsigned)res != secsz) {
606 			ret = EIO;
607 			goto error;
608 		}
609 		memcpy(bouncebuf + head, outbuf, min(secsz - head, bytes));
610 		(void) lseek(fd, -secsz, SEEK_CUR);
611 		res = write(fd, bouncebuf, secsz);
612 		if ((unsigned)res != secsz) {
613 			ret = EIO;
614 			goto error;
615 		}
616 		outbuf += min(secsz - head, bytes);
617 	}
618 
619 	/*
620 	 * Full data write to sectors.
621 	 * Note, there is still corner case where we write
622 	 * to sector boundary, but less than sector size, e.g. write 512B
623 	 * to 4k sector.
624 	 */
625 	if (full_sec_size > 0) {
626 		if (bytes < full_sec_size) {
627 			res = read(fd, bouncebuf, secsz);
628 			if ((unsigned)res != secsz) {
629 				ret = EIO;
630 				goto error;
631 			}
632 			memcpy(bouncebuf, outbuf, bytes);
633 			(void) lseek(fd, -secsz, SEEK_CUR);
634 			res = write(fd, bouncebuf, secsz);
635 			if ((unsigned)res != secsz) {
636 				ret = EIO;
637 				goto error;
638 			}
639 		} else {
640 			res = write(fd, outbuf, full_sec_size);
641 			if ((unsigned)res != full_sec_size) {
642 				ret = EIO;
643 				goto error;
644 			}
645 			outbuf += full_sec_size;
646 		}
647 	}
648 
649 	/* Partial data write to last sector */
650 	if (do_tail_write) {
651 		res = read(fd, bouncebuf, secsz);
652 		if ((unsigned)res != secsz) {
653 			ret = EIO;
654 			goto error;
655 		}
656 		memcpy(bouncebuf, outbuf, secsz - tail);
657 		(void) lseek(fd, -secsz, SEEK_CUR);
658 		res = write(fd, bouncebuf, secsz);
659 		if ((unsigned)res != secsz) {
660 			ret = EIO;
661 			goto error;
662 		}
663 	}
664 
665 	ret = 0;
666 error:
667 	free(bouncebuf);
668 	return (ret);
669 }
670 
671 static int
672 zfs_dev_init(void)
673 {
674 	spa_t *spa;
675 	spa_t *next;
676 	spa_t *prev;
677 
678 	zfs_init();
679 	if (archsw.arch_zfs_probe == NULL)
680 		return (ENXIO);
681 	archsw.arch_zfs_probe();
682 
683 	prev = NULL;
684 	spa = STAILQ_FIRST(&zfs_pools);
685 	while (spa != NULL) {
686 		next = STAILQ_NEXT(spa, spa_link);
687 		if (zfs_spa_init(spa)) {
688 			if (prev == NULL)
689 				STAILQ_REMOVE_HEAD(&zfs_pools, spa_link);
690 			else
691 				STAILQ_REMOVE_AFTER(&zfs_pools, prev, spa_link);
692 		} else
693 			prev = spa;
694 		spa = next;
695 	}
696 	return (0);
697 }
698 
/*
 * Arguments handed through the partition-table callbacks (zfs_diskread()
 * and zfs_probe_partition()) while a disk is probed for ZFS pools.
 *
 * fd        - descriptor zfs_diskread() passes on to vdev_read().
 * devname   - name of the device being probed; zfs_probe_partition() drops
 *             its last character and appends the partition name to it.
 * pool_guid - forwarded to zfs_probe(), which stores the pool GUID through
 *             it when it is non-NULL and currently points at 0.
 * secsz     - multiplier zfs_diskread() uses to turn block counts and block
 *             offsets into bytes.
 */
699 struct zfs_probe_args {
700 	int		fd;
701 	const char	*devname;
702 	uint64_t	*pool_guid;
703 	u_int		secsz;
704 };
705 
706 static int
707 zfs_diskread(void *arg, void *buf, size_t blocks, uint64_t offset)
708 {
709 	struct zfs_probe_args *ppa;
710 
711 	ppa = (struct zfs_probe_args *)arg;
712 	return (vdev_read(NULL, (void *)(uintptr_t)ppa->fd,
713 	    offset * ppa->secsz, buf, blocks * ppa->secsz));
714 }
715 
716 static int
717 zfs_probe(int fd, uint64_t *pool_guid)
718 {
719 	spa_t *spa;
720 	int ret;
721 
722 	spa = NULL;
723 	ret = vdev_probe(vdev_read, vdev_write, (void *)(uintptr_t)fd, &spa);
724 	if (ret == 0 && pool_guid != NULL)
725 		if (*pool_guid == 0)
726 			*pool_guid = spa->spa_guid;
727 	return (ret);
728 }
729 
730 static int
731 zfs_probe_partition(void *arg, const char *partname,
732     const struct ptable_entry *part)
733 {
734 	struct zfs_probe_args *ppa, pa;
735 	struct ptable *table;
736 	char devname[32];
737 	int ret;
738 
739 	/* Probe only freebsd-zfs and freebsd partitions */
740 	if (part->type != PART_FREEBSD &&
741 	    part->type != PART_FREEBSD_ZFS)
742 		return (0);
743 
744 	ppa = (struct zfs_probe_args *)arg;
745 	strncpy(devname, ppa->devname, strlen(ppa->devname) - 1);
746 	devname[strlen(ppa->devname) - 1] = '\0';
747 	snprintf(devname, sizeof(devname), "%s%s:", devname, partname);
748 	pa.fd = open(devname, O_RDWR);
749 	if (pa.fd == -1)
750 		return (0);
751 	ret = zfs_probe(pa.fd, ppa->pool_guid);
752 	if (ret == 0)
753 		return (0);
754 	/* Do we have BSD label here? */
755 	if (part->type == PART_FREEBSD) {
756 		pa.devname = devname;
757 		pa.pool_guid = ppa->pool_guid;
758 		pa.secsz = ppa->secsz;
759 		table = ptable_open(&pa, part->end - part->start + 1,
760 		    ppa->secsz, zfs_diskread);
761 		if (table != NULL) {
762 			ptable_iterate(table, &pa, zfs_probe_partition);
763 			ptable_close(table);
764 		}
765 	}
766 	close(pa.fd);
767 	return (0);
768 }
769 
770 /*
771  * Return bootenv nvlist from pool label.
772  */
773 int
774 zfs_get_bootenv(void *vdev, nvlist_t **benvp)
775 {
776 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
777 	nvlist_t *benv = NULL;
778 	vdev_t *vd;
779 	spa_t *spa;
780 
781 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
782 		return (ENOTSUP);
783 
784 	if ((spa = spa_find_by_dev(dev)) == NULL)
785 		return (ENXIO);
786 
787 	if (spa->spa_bootenv == NULL) {
788 		STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children,
789 		    v_childlink) {
790 			benv = vdev_read_bootenv(vd);
791 
792 			if (benv != NULL)
793 				break;
794 		}
795 		spa->spa_bootenv = benv;
796 	} else {
797 		benv = spa->spa_bootenv;
798 	}
799 
800 	if (benv == NULL)
801 		return (ENOENT);
802 
803 	*benvp = benv;
804 	return (0);
805 }
806 
807 /*
808  * Store nvlist to pool label bootenv area. Also updates cached pointer in spa.
809  */
810 int
811 zfs_set_bootenv(void *vdev, nvlist_t *benv)
812 {
813 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
814 	spa_t *spa;
815 	vdev_t *vd;
816 
817 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
818 		return (ENOTSUP);
819 
820 	if ((spa = spa_find_by_dev(dev)) == NULL)
821 		return (ENXIO);
822 
823 	STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children, v_childlink) {
824 		vdev_write_bootenv(vd, benv);
825 	}
826 
827 	spa->spa_bootenv = benv;
828 	return (0);
829 }
830 
831 /*
832  * Get bootonce value by key. The bootonce <key, value> pair is removed
833  * from the bootenv nvlist and the remaining nvlist is committed back to disk.
834  */
835 int
836 zfs_get_bootonce(void *vdev, const char *key, char *buf, size_t size)
837 {
838 	nvlist_t *benv;
839 	char *result = NULL;
840 	int result_size, rv;
841 
842 	if ((rv = zfs_get_bootenv(vdev, &benv)) != 0)
843 		return (rv);
844 
845 	if ((rv = nvlist_find(benv, key, DATA_TYPE_STRING, NULL,
846 	    &result, &result_size)) == 0) {
847 		if (result_size == 0) {
848 			/* ignore empty string */
849 			rv = ENOENT;
850 		} else {
851 			size = MIN((size_t)result_size + 1, size);
852 			strlcpy(buf, result, size);
853 		}
854 		(void) nvlist_remove(benv, key, DATA_TYPE_STRING);
855 		(void) zfs_set_bootenv(vdev, benv);
856 	}
857 
858 	return (rv);
859 }
860 
861 /*
862  * nvstore backend.
863  */
864 
/*
 * Forward declarations for the nvstore helpers that are used before their
 * definitions.
 */
865 static int zfs_nvstore_setter(void *, int, const char *,
866     const void *, size_t);
867 static int zfs_nvstore_setter_str(void *, const char *, const char *,
868     const char *);
869 static int zfs_nvstore_unset_impl(void *, const char *, bool);
870 static int zfs_nvstore_setenv(void *, void *);
871 
872 /*
873  * nvstore is only present for current rootfs pool.
874  */
875 static int
876 zfs_nvstore_sethook(struct env_var *ev, int flags __unused, const void *value)
877 {
878 	struct zfs_devdesc *dev;
879 	int rv;
880 
881 	archsw.arch_getdev((void **)&dev, NULL, NULL);
882 	if (dev == NULL)
883 		return (ENXIO);
884 
885 	rv = zfs_nvstore_setter_str(dev, NULL, ev->ev_name, value);
886 
887 	free(dev);
888 	return (rv);
889 }
890 
891 /*
892  * nvstore is only present for current rootfs pool.
893  */
894 static int
895 zfs_nvstore_unsethook(struct env_var *ev)
896 {
897 	struct zfs_devdesc *dev;
898 	int rv;
899 
900 	archsw.arch_getdev((void **)&dev, NULL, NULL);
901 	if (dev == NULL)
902 		return (ENXIO);
903 
904 	rv = zfs_nvstore_unset_impl(dev, ev->ev_name, false);
905 
906 	free(dev);
907 	return (rv);
908 }
909 
910 static int
911 zfs_nvstore_getter(void *vdev, const char *name, void **data)
912 {
913 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
914 	spa_t *spa;
915 	nvlist_t *nv;
916 	char *str, **ptr;
917 	int size;
918 	int rv;
919 
920 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
921 		return (ENOTSUP);
922 
923 	if ((spa = spa_find_by_dev(dev)) == NULL)
924 		return (ENXIO);
925 
926 	if (spa->spa_bootenv == NULL)
927 		return (ENXIO);
928 
929 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
930 	    NULL, &nv, NULL) != 0)
931 		return (ENOENT);
932 
933 	rv = nvlist_find(nv, name, DATA_TYPE_STRING, NULL, &str, &size);
934 	if (rv == 0) {
935 		ptr = (char **)data;
936 		asprintf(ptr, "%.*s", size, str);
937 		if (*data == NULL)
938 			rv = ENOMEM;
939 	}
940 	nvlist_destroy(nv);
941 	return (rv);
942 }
943 
944 static int
945 zfs_nvstore_setter(void *vdev, int type, const char *name,
946     const void *data, size_t size)
947 {
948 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
949 	spa_t *spa;
950 	nvlist_t *nv;
951 	int rv;
952 	bool env_set = true;
953 
954 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
955 		return (ENOTSUP);
956 
957 	if ((spa = spa_find_by_dev(dev)) == NULL)
958 		return (ENXIO);
959 
960 	if (spa->spa_bootenv == NULL)
961 		return (ENXIO);
962 
963 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
964 	    NULL, &nv, NULL) != 0) {
965 		nv = nvlist_create(NV_UNIQUE_NAME);
966 		if (nv == NULL)
967 			return (ENOMEM);
968 	}
969 
970 	rv = 0;
971 	switch (type) {
972         case DATA_TYPE_INT8:
973 		if (size != sizeof (int8_t)) {
974 			rv = EINVAL;
975 			break;
976 		}
977 		rv = nvlist_add_int8(nv, name, *(int8_t *)data);
978 		break;
979 
980         case DATA_TYPE_INT16:
981 		if (size != sizeof (int16_t)) {
982 			rv = EINVAL;
983 			break;
984 		}
985 		rv = nvlist_add_int16(nv, name, *(int16_t *)data);
986 		break;
987 
988         case DATA_TYPE_INT32:
989 		if (size != sizeof (int32_t)) {
990 			rv = EINVAL;
991 			break;
992 		}
993 		rv = nvlist_add_int32(nv, name, *(int32_t *)data);
994 		break;
995 
996         case DATA_TYPE_INT64:
997 		if (size != sizeof (int64_t)) {
998 			rv = EINVAL;
999 			break;
1000 		}
1001 		rv = nvlist_add_int64(nv, name, *(int64_t *)data);
1002 		break;
1003 
1004         case DATA_TYPE_BYTE:
1005 		if (size != sizeof (uint8_t)) {
1006 			rv = EINVAL;
1007 			break;
1008 		}
1009 		rv = nvlist_add_byte(nv, name, *(int8_t *)data);
1010 		break;
1011 
1012         case DATA_TYPE_UINT8:
1013 		if (size != sizeof (uint8_t)) {
1014 			rv = EINVAL;
1015 			break;
1016 		}
1017 		rv = nvlist_add_uint8(nv, name, *(int8_t *)data);
1018 		break;
1019 
1020         case DATA_TYPE_UINT16:
1021 		if (size != sizeof (uint16_t)) {
1022 			rv = EINVAL;
1023 			break;
1024 		}
1025 		rv = nvlist_add_uint16(nv, name, *(uint16_t *)data);
1026 		break;
1027 
1028         case DATA_TYPE_UINT32:
1029 		if (size != sizeof (uint32_t)) {
1030 			rv = EINVAL;
1031 			break;
1032 		}
1033 		rv = nvlist_add_uint32(nv, name, *(uint32_t *)data);
1034 		break;
1035 
1036         case DATA_TYPE_UINT64:
1037 		if (size != sizeof (uint64_t)) {
1038 			rv = EINVAL;
1039 			break;
1040 		}
1041 		rv = nvlist_add_uint64(nv, name, *(uint64_t *)data);
1042 		break;
1043 
1044         case DATA_TYPE_STRING:
1045 		rv = nvlist_add_string(nv, name, data);
1046 		break;
1047 
1048 	case DATA_TYPE_BOOLEAN_VALUE:
1049 		if (size != sizeof (boolean_t)) {
1050 			rv = EINVAL;
1051 			break;
1052 		}
1053 		rv = nvlist_add_boolean_value(nv, name, *(boolean_t *)data);
1054 		break;
1055 
1056 	default:
1057 		rv = EINVAL;
1058 		break;
1059 	}
1060 
1061 	if (rv == 0) {
1062 		rv = nvlist_add_nvlist(spa->spa_bootenv, OS_NVSTORE, nv);
1063 		if (rv == 0) {
1064 			rv = zfs_set_bootenv(vdev, spa->spa_bootenv);
1065 		}
1066 		if (rv == 0) {
1067 			if (env_set) {
1068 				rv = zfs_nvstore_setenv(vdev,
1069 				    nvpair_find(nv, name));
1070 			} else {
1071 				env_discard(env_getenv(name));
1072 				rv = 0;
1073 			}
1074 		}
1075 	}
1076 
1077 	nvlist_destroy(nv);
1078 	return (rv);
1079 }
1080 
/*
 * Parse the whole of `data` as a signed integer using strtoll() with base
 * auto-detection (base 0) and store the result in *ip.
 *
 * Returns 0 on success.  Returns EINVAL, leaving *ip untouched, when the
 * string is empty, has trailing characters that are not part of the number,
 * or strtoll() reports an error through errno.
 */
static int
get_int64(const char *data, int64_t *ip)
{
	char *stop;
	int64_t parsed;

	errno = 0;
	parsed = strtoll(data, &stop, 0);

	if (errno != 0)
		return (EINVAL);
	/* Reject empty input and anything left over after the number. */
	if (data[0] == '\0' || stop[0] != '\0')
		return (EINVAL);

	*ip = parsed;
	return (0);
}
1095 
/*
 * Parse the whole of `data` as an unsigned integer using strtoull() with
 * base auto-detection (base 0) and store the result in *ip.
 *
 * Returns 0 on success.  Returns EINVAL, leaving *ip untouched, when the
 * string is empty, has trailing characters that are not part of the number,
 * or strtoull() reports an error through errno.
 */
static int
get_uint64(const char *data, uint64_t *ip)
{
	char *stop;
	uint64_t parsed;

	errno = 0;
	parsed = strtoull(data, &stop, 0);

	if (errno != 0)
		return (EINVAL);
	/* Reject empty input and anything left over after the number. */
	if (data[0] == '\0' || stop[0] != '\0')
		return (EINVAL);

	*ip = parsed;
	return (0);
}
1110 
1111 /*
1112  * Translate textual data to data type. If type is not set, and we are
1113  * creating new pair, use DATA_TYPE_STRING.
1114  */
1115 static int
1116 zfs_nvstore_setter_str(void *vdev, const char *type, const char *name,
1117     const char *data)
1118 {
1119 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
1120 	spa_t *spa;
1121 	nvlist_t *nv;
1122 	int rv;
1123 	data_type_t dt;
1124 	int64_t val;
1125 	uint64_t uval;
1126 
1127 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1128 		return (ENOTSUP);
1129 
1130 	if ((spa = spa_find_by_dev(dev)) == NULL)
1131 		return (ENXIO);
1132 
1133 	if (spa->spa_bootenv == NULL)
1134 		return (ENXIO);
1135 
1136 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
1137 	    NULL, &nv, NULL) != 0) {
1138 		nv = NULL;
1139 	}
1140 
1141 	if (type == NULL) {
1142 		nvp_header_t *nvh;
1143 
1144 		/*
1145 		 * if there is no existing pair, default to string.
1146 		 * Otherwise, use type from existing pair.
1147 		 */
1148 		nvh = nvpair_find(nv, name);
1149 		if (nvh == NULL) {
1150 			dt = DATA_TYPE_STRING;
1151 		} else {
1152 			nv_string_t *nvp_name;
1153 			nv_pair_data_t *nvp_data;
1154 
1155 			nvp_name = (nv_string_t *)(nvh + 1);
1156 			nvp_data = (nv_pair_data_t *)(&nvp_name->nv_data[0] +
1157 			    NV_ALIGN4(nvp_name->nv_size));
1158 			dt = nvp_data->nv_type;
1159 		}
1160 	} else {
1161 		dt = nvpair_type_from_name(type);
1162 	}
1163 	nvlist_destroy(nv);
1164 
1165 	rv = 0;
1166 	switch (dt) {
1167         case DATA_TYPE_INT8:
1168 		rv = get_int64(data, &val);
1169 		if (rv == 0) {
1170 			int8_t v = val;
1171 
1172 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1173 		}
1174 		break;
1175         case DATA_TYPE_INT16:
1176 		rv = get_int64(data, &val);
1177 		if (rv == 0) {
1178 			int16_t v = val;
1179 
1180 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1181 		}
1182 		break;
1183         case DATA_TYPE_INT32:
1184 		rv = get_int64(data, &val);
1185 		if (rv == 0) {
1186 			int32_t v = val;
1187 
1188 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1189 		}
1190 		break;
1191         case DATA_TYPE_INT64:
1192 		rv = get_int64(data, &val);
1193 		if (rv == 0) {
1194 			rv = zfs_nvstore_setter(vdev, dt, name, &val,
1195 			    sizeof (val));
1196 		}
1197 		break;
1198 
1199         case DATA_TYPE_BYTE:
1200 		rv = get_uint64(data, &uval);
1201 		if (rv == 0) {
1202 			uint8_t v = uval;
1203 
1204 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1205 		}
1206 		break;
1207 
1208         case DATA_TYPE_UINT8:
1209 		rv = get_uint64(data, &uval);
1210 		if (rv == 0) {
1211 			uint8_t v = uval;
1212 
1213 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1214 		}
1215 		break;
1216 
1217         case DATA_TYPE_UINT16:
1218 		rv = get_uint64(data, &uval);
1219 		if (rv == 0) {
1220 			uint16_t v = uval;
1221 
1222 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1223 		}
1224 		break;
1225 
1226         case DATA_TYPE_UINT32:
1227 		rv = get_uint64(data, &uval);
1228 		if (rv == 0) {
1229 			uint32_t v = uval;
1230 
1231 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1232 		}
1233 		break;
1234 
1235         case DATA_TYPE_UINT64:
1236 		rv = get_uint64(data, &uval);
1237 		if (rv == 0) {
1238 			rv = zfs_nvstore_setter(vdev, dt, name, &uval,
1239 			    sizeof (uval));
1240 		}
1241 		break;
1242 
1243         case DATA_TYPE_STRING:
1244 		rv = zfs_nvstore_setter(vdev, dt, name, data, strlen(data) + 1);
1245 		break;
1246 
1247 	case DATA_TYPE_BOOLEAN_VALUE:
1248 		rv = get_int64(data, &val);
1249 		if (rv == 0) {
1250 			boolean_t v = val;
1251 
1252 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1253 		}
1254 
1255 	default:
1256 		rv = EINVAL;
1257 	}
1258 	return (rv);
1259 }
1260 
1261 static int
1262 zfs_nvstore_unset_impl(void *vdev, const char *name, bool unset_env)
1263 {
1264 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
1265 	spa_t *spa;
1266 	nvlist_t *nv;
1267 	int rv;
1268 
1269 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1270 		return (ENOTSUP);
1271 
1272 	if ((spa = spa_find_by_dev(dev)) == NULL)
1273 		return (ENXIO);
1274 
1275 	if (spa->spa_bootenv == NULL)
1276 		return (ENXIO);
1277 
1278 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
1279 	    NULL, &nv, NULL) != 0)
1280 		return (ENOENT);
1281 
1282 	rv = nvlist_remove(nv, name, DATA_TYPE_UNKNOWN);
1283 	if (rv == 0) {
1284 		if (nvlist_next_nvpair(nv, NULL) == NULL) {
1285 			rv = nvlist_remove(spa->spa_bootenv, OS_NVSTORE,
1286 			    DATA_TYPE_NVLIST);
1287 		} else {
1288 			rv = nvlist_add_nvlist(spa->spa_bootenv,
1289 			    OS_NVSTORE, nv);
1290 		}
1291 		if (rv == 0)
1292 			rv = zfs_set_bootenv(vdev, spa->spa_bootenv);
1293 	}
1294 
1295 	if (unset_env)
1296 		env_discard(env_getenv(name));
1297 	return (rv);
1298 }
1299 
/*
 * nvstore "unset" callback: remove `name' from the pool's nvstore and ask
 * zfs_nvstore_unset_impl() to discard the matching environment variable
 * as well.
 */
static int
zfs_nvstore_unset(void *vdev, const char *name)
{
	const bool discard_env = true;

	return (zfs_nvstore_unset_impl(vdev, name, discard_env));
}
1305 
1306 static int
1307 zfs_nvstore_print(void *vdev __unused, void *ptr)
1308 {
1309 
1310 	nvpair_print(ptr, 0);
1311 	return (0);
1312 }
1313 
1314 /*
1315  * Create environment variable from nvpair.
1316  * set hook will update nvstore with new value, unset hook will remove
1317  * variable from nvstore.
1318  */
1319 static int
1320 zfs_nvstore_setenv(void *vdev __unused, void *ptr)
1321 {
1322 	nvp_header_t *nvh = ptr;
1323 	nv_string_t *nvp_name, *nvp_value;
1324 	nv_pair_data_t *nvp_data;
1325 	char *name, *value;
1326 	int rv = 0;
1327 
1328 	if (nvh == NULL)
1329 		return (ENOENT);
1330 
1331 	nvp_name = (nv_string_t *)(nvh + 1);
1332 	nvp_data = (nv_pair_data_t *)(&nvp_name->nv_data[0] +
1333 	    NV_ALIGN4(nvp_name->nv_size));
1334 
1335 	if ((name = nvstring_get(nvp_name)) == NULL)
1336 		return (ENOMEM);
1337 
1338 	value = NULL;
1339 	switch (nvp_data->nv_type) {
1340 	case DATA_TYPE_BYTE:
1341 	case DATA_TYPE_UINT8:
1342 		(void) asprintf(&value, "%uc",
1343 		    *(unsigned *)&nvp_data->nv_data[0]);
1344 		if (value == NULL)
1345 			rv = ENOMEM;
1346 		break;
1347 
1348 	case DATA_TYPE_INT8:
1349 		(void) asprintf(&value, "%c", *(int *)&nvp_data->nv_data[0]);
1350 		if (value == NULL)
1351 			rv = ENOMEM;
1352 		break;
1353 
1354 	case DATA_TYPE_INT16:
1355 		(void) asprintf(&value, "%hd", *(short *)&nvp_data->nv_data[0]);
1356 		if (value == NULL)
1357 			rv = ENOMEM;
1358 		break;
1359 
1360 	case DATA_TYPE_UINT16:
1361 		(void) asprintf(&value, "%hu",
1362 		    *(unsigned short *)&nvp_data->nv_data[0]);
1363 		if (value == NULL)
1364 			rv = ENOMEM;
1365 		break;
1366 
1367 	case DATA_TYPE_BOOLEAN_VALUE:
1368 	case DATA_TYPE_INT32:
1369 		(void) asprintf(&value, "%d", *(int *)&nvp_data->nv_data[0]);
1370 		if (value == NULL)
1371 			rv = ENOMEM;
1372 		break;
1373 
1374 	case DATA_TYPE_UINT32:
1375 		(void) asprintf(&value, "%u",
1376 		    *(unsigned *)&nvp_data->nv_data[0]);
1377 		if (value == NULL)
1378 			rv = ENOMEM;
1379 		break;
1380 
1381 	case DATA_TYPE_INT64:
1382 		(void) asprintf(&value, "%jd",
1383 		    (intmax_t)*(int64_t *)&nvp_data->nv_data[0]);
1384 		if (value == NULL)
1385 			rv = ENOMEM;
1386 		break;
1387 
1388 	case DATA_TYPE_UINT64:
1389 		(void) asprintf(&value, "%ju",
1390 		    (uintmax_t)*(uint64_t *)&nvp_data->nv_data[0]);
1391 		if (value == NULL)
1392 			rv = ENOMEM;
1393 		break;
1394 
1395 	case DATA_TYPE_STRING:
1396 		nvp_value = (nv_string_t *)&nvp_data->nv_data[0];
1397 		if ((value = nvstring_get(nvp_value)) == NULL) {
1398 			rv = ENOMEM;
1399 			break;
1400 		}
1401 		break;
1402 
1403 	default:
1404 		rv = EINVAL;
1405 		break;
1406 	}
1407 
1408 	if (value != NULL) {
1409 		rv = env_setenv(name, EV_VOLATILE | EV_NOHOOK, value,
1410 		    zfs_nvstore_sethook, zfs_nvstore_unsethook);
1411 		free(value);
1412 	}
1413 	free(name);
1414 	return (rv);
1415 }
1416 
1417 static int
1418 zfs_nvstore_iterate(void *vdev, int (*cb)(void *, void *))
1419 {
1420 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
1421 	spa_t *spa;
1422 	nvlist_t *nv;
1423 	nvp_header_t *nvh;
1424 	int rv;
1425 
1426 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1427 		return (ENOTSUP);
1428 
1429 	if ((spa = spa_find_by_dev(dev)) == NULL)
1430 		return (ENXIO);
1431 
1432 	if (spa->spa_bootenv == NULL)
1433 		return (ENXIO);
1434 
1435 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
1436 	    NULL, &nv, NULL) != 0)
1437 		return (ENOENT);
1438 
1439 	rv = 0;
1440 	nvh = NULL;
1441 	while ((nvh = nvlist_next_nvpair(nv, nvh)) != NULL) {
1442 		rv = cb(vdev, nvh);
1443 		if (rv != 0)
1444 			break;
1445 	}
1446 	return (rv);
1447 }
1448 
1449 nvs_callbacks_t nvstore_zfs_cb = {
1450 	.nvs_getter = zfs_nvstore_getter,
1451 	.nvs_setter = zfs_nvstore_setter,
1452 	.nvs_setter_str = zfs_nvstore_setter_str,
1453 	.nvs_unset = zfs_nvstore_unset,
1454 	.nvs_print = zfs_nvstore_print,
1455 	.nvs_iterate = zfs_nvstore_iterate
1456 };
1457 
1458 int
1459 zfs_attach_nvstore(void *vdev)
1460 {
1461 	struct zfs_devdesc *dev = vdev;
1462 	spa_t *spa;
1463 	uint64_t version;
1464 	int rv;
1465 
1466 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1467 		return (ENOTSUP);
1468 
1469 	if ((spa = spa_find_by_dev(dev)) == NULL)
1470 		return (ENXIO);
1471 
1472 	rv = nvlist_find(spa->spa_bootenv, BOOTENV_VERSION, DATA_TYPE_UINT64,
1473 	    NULL, &version, NULL);
1474 
1475 	if (rv != 0 || version != VB_NVLIST) {
1476 		return (ENXIO);
1477 	}
1478 
1479 	dev = malloc(sizeof (*dev));
1480 	if (dev == NULL)
1481 		return (ENOMEM);
1482 	memcpy(dev, vdev, sizeof (*dev));
1483 
1484 	rv = nvstore_init(spa->spa_name, &nvstore_zfs_cb, dev);
1485 	if (rv != 0)
1486 		free(dev);
1487 	else
1488 		rv = zfs_nvstore_iterate(dev, zfs_nvstore_setenv);
1489 	return (rv);
1490 }
1491 
1492 int
1493 zfs_probe_dev(const char *devname, uint64_t *pool_guid)
1494 {
1495 	struct ptable *table;
1496 	struct zfs_probe_args pa;
1497 	uint64_t mediasz;
1498 	int ret;
1499 
1500 	if (pool_guid)
1501 		*pool_guid = 0;
1502 	pa.fd = open(devname, O_RDWR);
1503 	if (pa.fd == -1)
1504 		return (ENXIO);
1505 	/* Probe the whole disk */
1506 	ret = zfs_probe(pa.fd, pool_guid);
1507 	if (ret == 0)
1508 		return (0);
1509 
1510 	/* Probe each partition */
1511 	ret = ioctl(pa.fd, DIOCGMEDIASIZE, &mediasz);
1512 	if (ret == 0)
1513 		ret = ioctl(pa.fd, DIOCGSECTORSIZE, &pa.secsz);
1514 	if (ret == 0) {
1515 		pa.devname = devname;
1516 		pa.pool_guid = pool_guid;
1517 		table = ptable_open(&pa, mediasz / pa.secsz, pa.secsz,
1518 		    zfs_diskread);
1519 		if (table != NULL) {
1520 			ptable_iterate(table, &pa, zfs_probe_partition);
1521 			ptable_close(table);
1522 		}
1523 	}
1524 	close(pa.fd);
1525 	if (pool_guid && *pool_guid == 0)
1526 		ret = ENXIO;
1527 	return (ret);
1528 }
1529 
1530 /*
1531  * Print information about ZFS pools
1532  */
1533 static int
1534 zfs_dev_print(int verbose)
1535 {
1536 	spa_t *spa;
1537 	char line[80];
1538 	int ret = 0;
1539 
1540 	if (STAILQ_EMPTY(&zfs_pools))
1541 		return (0);
1542 
1543 	printf("%s devices:", zfs_dev.dv_name);
1544 	if ((ret = pager_output("\n")) != 0)
1545 		return (ret);
1546 
1547 	if (verbose) {
1548 		return (spa_all_status());
1549 	}
1550 	STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
1551 		snprintf(line, sizeof(line), "    zfs:%s\n", spa->spa_name);
1552 		ret = pager_output(line);
1553 		if (ret != 0)
1554 			break;
1555 	}
1556 	return (ret);
1557 }
1558 
1559 /*
1560  * Attempt to open the pool described by (dev) for use by (f).
1561  */
1562 static int
1563 zfs_dev_open(struct open_file *f, ...)
1564 {
1565 	va_list		args;
1566 	struct zfs_devdesc	*dev;
1567 	struct zfsmount	*mount;
1568 	spa_t		*spa;
1569 	int		rv;
1570 
1571 	va_start(args, f);
1572 	dev = va_arg(args, struct zfs_devdesc *);
1573 	va_end(args);
1574 
1575 	if ((spa = spa_find_by_dev(dev)) == NULL)
1576 		return (ENXIO);
1577 
1578 	STAILQ_FOREACH(mount, &zfsmount, next) {
1579 		if (spa->spa_guid == mount->spa->spa_guid)
1580 			break;
1581 	}
1582 
1583 	rv = 0;
1584 	/* This device is not set as currdev, mount us private copy. */
1585 	if (mount == NULL)
1586 		rv = zfs_mount(zfs_fmtdev(dev), NULL, (void **)&mount);
1587 
1588 	if (rv == 0) {
1589 		f->f_devdata = mount;
1590 		free(dev);
1591 	}
1592 	return (rv);
1593 }
1594 
1595 static int
1596 zfs_dev_close(struct open_file *f)
1597 {
1598 	struct zfsmount	*mnt, *mount;
1599 
1600 	mnt = f->f_devdata;
1601 
1602 	STAILQ_FOREACH(mount, &zfsmount, next) {
1603 		if (mnt->spa->spa_guid == mount->spa->spa_guid)
1604 			break;
1605 	}
1606 
1607 	/*
1608 	 * devclose() will free f->f_devdata, but since we do have
1609 	 * pointer to zfsmount structure in f->f_devdata, and
1610 	 * zfs_unmount() will also free the zfsmount structure,
1611 	 * we will get double free. To prevent double free,
1612 	 * we must set f_devdata to NULL there.
1613 	 */
1614 	if (mount != NULL)
1615 		f->f_devdata = NULL;
1616 
1617 	return (0);
1618 }
1619 
1620 static int
1621 zfs_dev_strategy(void *devdata, int rw, daddr_t dblk, size_t size, char *buf, size_t *rsize)
1622 {
1623 
1624 	return (ENOSYS);
1625 }
1626 
1627 struct devsw zfs_dev = {
1628 	.dv_name = "zfs",
1629 	.dv_type = DEVT_ZFS,
1630 	.dv_init = zfs_dev_init,
1631 	.dv_strategy = zfs_dev_strategy,
1632 	.dv_open = zfs_dev_open,
1633 	.dv_close = zfs_dev_close,
1634 	.dv_ioctl = noioctl,
1635 	.dv_print = zfs_dev_print,
1636 	.dv_cleanup = nullsys,
1637 };
1638 
1639 int
1640 zfs_parsedev(struct zfs_devdesc *dev, const char *devspec, const char **path)
1641 {
1642 	static char	rootname[ZFS_MAXNAMELEN];
1643 	static char	poolname[ZFS_MAXNAMELEN];
1644 	spa_t		*spa;
1645 	const char	*end;
1646 	const char	*np;
1647 	const char	*sep;
1648 	int		rv;
1649 
1650 	np = devspec;
1651 	if (*np != ':')
1652 		return (EINVAL);
1653 	np++;
1654 	end = strrchr(np, ':');
1655 	if (end == NULL)
1656 		return (EINVAL);
1657 	sep = strchr(np, '/');
1658 	if (sep == NULL || sep >= end)
1659 		sep = end;
1660 	memcpy(poolname, np, sep - np);
1661 	poolname[sep - np] = '\0';
1662 	if (sep < end) {
1663 		sep++;
1664 		memcpy(rootname, sep, end - sep);
1665 		rootname[end - sep] = '\0';
1666 	}
1667 	else
1668 		rootname[0] = '\0';
1669 
1670 	spa = spa_find_by_name(poolname);
1671 	if (!spa)
1672 		return (ENXIO);
1673 	dev->pool_guid = spa->spa_guid;
1674 	rv = zfs_lookup_dataset(spa, rootname, &dev->root_guid);
1675 	if (rv != 0)
1676 		return (rv);
1677 	if (path != NULL)
1678 		*path = (*end == '\0') ? end : end + 1;
1679 	dev->dd.d_dev = &zfs_dev;
1680 	return (0);
1681 }
1682 
1683 char *
1684 zfs_fmtdev(void *vdev)
1685 {
1686 	static char		rootname[ZFS_MAXNAMELEN];
1687 	static char		buf[2 * ZFS_MAXNAMELEN + 8];
1688 	struct zfs_devdesc	*dev = (struct zfs_devdesc *)vdev;
1689 	spa_t			*spa;
1690 
1691 	buf[0] = '\0';
1692 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1693 		return (buf);
1694 
1695 	/* Do we have any pools? */
1696 	spa = STAILQ_FIRST(&zfs_pools);
1697 	if (spa == NULL)
1698 		return (buf);
1699 
1700 	if (dev->pool_guid == 0)
1701 		dev->pool_guid = spa->spa_guid;
1702 	else
1703 		spa = spa_find_by_guid(dev->pool_guid);
1704 
1705 	if (spa == NULL) {
1706 		printf("ZFS: can't find pool by guid\n");
1707 		return (buf);
1708 	}
1709 	if (dev->root_guid == 0 && zfs_get_root(spa, &dev->root_guid)) {
1710 		printf("ZFS: can't find root filesystem\n");
1711 		return (buf);
1712 	}
1713 	if (zfs_rlookup(spa, dev->root_guid, rootname)) {
1714 		printf("ZFS: can't find filesystem by guid\n");
1715 		return (buf);
1716 	}
1717 
1718 	if (rootname[0] == '\0')
1719 		snprintf(buf, sizeof(buf), "%s:%s:", dev->dd.d_dev->dv_name,
1720 		    spa->spa_name);
1721 	else
1722 		snprintf(buf, sizeof(buf), "%s:%s/%s:", dev->dd.d_dev->dv_name,
1723 		    spa->spa_name, rootname);
1724 	return (buf);
1725 }
1726 
/*
 * Split "pool[/dataset]" into its two parts.
 *
 * The pool part (everything before the first '/') is copied into
 * poolname, which has room for size bytes.  If dsnamep is not NULL it is
 * set to point into name just past the first '/', or to an empty string
 * when name has no '/'.
 *
 * Returns EINVAL if the pool part plus terminator does not fit in size
 * bytes, 0 otherwise.
 */
static int
split_devname(const char *name, char *poolname, size_t size,
    const char **dsnamep)
{
	const char *slash, *rest;
	size_t poollen;

	ASSERT(name != NULL);
	ASSERT(poolname != NULL);

	slash = strchr(name, '/');
	if (slash == NULL) {
		poollen = strlen(name);
		rest = "";
	} else {
		poollen = slash - name;
		rest = slash + 1;
	}

	if (poollen + 1 > size)
		return (EINVAL);

	strlcpy(poolname, name, poollen + 1);

	if (dsnamep != NULL)
		*dsnamep = rest;

	return (0);
}
1755 
1756 int
1757 zfs_list(const char *name)
1758 {
1759 	static char	poolname[ZFS_MAXNAMELEN];
1760 	uint64_t	objid;
1761 	spa_t		*spa;
1762 	const char	*dsname;
1763 	int		rv;
1764 
1765 	if (split_devname(name, poolname, sizeof(poolname), &dsname) != 0)
1766 		return (EINVAL);
1767 
1768 	spa = spa_find_by_name(poolname);
1769 	if (!spa)
1770 		return (ENXIO);
1771 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1772 	if (rv != 0)
1773 		return (rv);
1774 
1775 	return (zfs_list_dataset(spa, objid));
1776 }
1777 
1778 void
1779 init_zfs_boot_options(const char *currdev_in)
1780 {
1781 	char poolname[ZFS_MAXNAMELEN];
1782 	char *beroot, *currdev;
1783 	spa_t *spa;
1784 	int currdev_len;
1785 	const char *dsname;
1786 
1787 	currdev = NULL;
1788 	currdev_len = strlen(currdev_in);
1789 	if (currdev_len == 0)
1790 		return;
1791 	if (strncmp(currdev_in, "zfs:", 4) != 0)
1792 		return;
1793 	currdev = strdup(currdev_in);
1794 	if (currdev == NULL)
1795 		return;
1796 	/* Remove the trailing : */
1797 	currdev[currdev_len - 1] = '\0';
1798 
1799 	setenv("zfs_be_active", currdev, 1);
1800 	setenv("zfs_be_currpage", "1", 1);
1801 	/* Remove the last element (current bootenv) */
1802 	beroot = strrchr(currdev, '/');
1803 	if (beroot != NULL)
1804 		beroot[0] = '\0';
1805 	beroot = strchr(currdev, ':') + 1;
1806 	setenv("zfs_be_root", beroot, 1);
1807 
1808 	if (split_devname(beroot, poolname, sizeof(poolname), &dsname) != 0)
1809 		return;
1810 
1811 	spa = spa_find_by_name(poolname);
1812 	if (spa == NULL)
1813 		return;
1814 
1815 	zfs_bootenv_initial("bootenvs", spa, beroot, dsname, 0);
1816 	zfs_checkpoints_initial(spa, beroot, dsname);
1817 
1818 	free(currdev);
1819 }
1820 
1821 static void
1822 zfs_checkpoints_initial(spa_t *spa, const char *name, const char *dsname)
1823 {
1824 	char envname[32];
1825 
1826 	if (spa->spa_uberblock_checkpoint.ub_checkpoint_txg != 0) {
1827 		snprintf(envname, sizeof(envname), "zpool_checkpoint");
1828 		setenv(envname, name, 1);
1829 
1830 		spa->spa_uberblock = &spa->spa_uberblock_checkpoint;
1831 		spa->spa_mos = &spa->spa_mos_checkpoint;
1832 
1833 		zfs_bootenv_initial("bootenvs_check", spa, name, dsname, 1);
1834 
1835 		spa->spa_uberblock = &spa->spa_uberblock_master;
1836 		spa->spa_mos = &spa->spa_mos_master;
1837 	}
1838 }
1839 
1840 static void
1841 zfs_bootenv_initial(const char *envprefix, spa_t *spa, const char *rootname,
1842    const char *dsname, int checkpoint)
1843 {
1844 	char		envname[32], envval[256];
1845 	uint64_t	objid;
1846 	int		bootenvs_idx, rv;
1847 
1848 	SLIST_INIT(&zfs_be_head);
1849 	zfs_env_count = 0;
1850 
1851 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1852 	if (rv != 0)
1853 		return;
1854 
1855 	rv = zfs_callback_dataset(spa, objid, zfs_belist_add);
1856 	bootenvs_idx = 0;
1857 	/* Populate the initial environment variables */
1858 	SLIST_FOREACH_SAFE(zfs_be, &zfs_be_head, entries, zfs_be_tmp) {
1859 		/* Enumerate all bootenvs for general usage */
1860 		snprintf(envname, sizeof(envname), "%s[%d]",
1861 		    envprefix, bootenvs_idx);
1862 		snprintf(envval, sizeof(envval), "zfs:%s%s/%s",
1863 		    checkpoint ? "!" : "", rootname, zfs_be->name);
1864 		rv = setenv(envname, envval, 1);
1865 		if (rv != 0)
1866 			break;
1867 		bootenvs_idx++;
1868 	}
1869 	snprintf(envname, sizeof(envname), "%s_count", envprefix);
1870 	snprintf(envval, sizeof(envval), "%d", bootenvs_idx);
1871 	setenv(envname, envval, 1);
1872 
1873 	/* Clean up the SLIST of ZFS BEs */
1874 	while (!SLIST_EMPTY(&zfs_be_head)) {
1875 		zfs_be = SLIST_FIRST(&zfs_be_head);
1876 		SLIST_REMOVE_HEAD(&zfs_be_head, entries);
1877 		free(zfs_be->name);
1878 		free(zfs_be);
1879 	}
1880 }
1881 
1882 int
1883 zfs_bootenv(const char *name)
1884 {
1885 	char		poolname[ZFS_MAXNAMELEN], *root;
1886 	const char	*dsname;
1887 	char		becount[4];
1888 	uint64_t	objid;
1889 	spa_t		*spa;
1890 	int		rv, pages, perpage, currpage;
1891 
1892 	if (name == NULL)
1893 		return (EINVAL);
1894 	if ((root = getenv("zfs_be_root")) == NULL)
1895 		return (EINVAL);
1896 
1897 	if (strcmp(name, root) != 0) {
1898 		if (setenv("zfs_be_root", name, 1) != 0)
1899 			return (ENOMEM);
1900 	}
1901 
1902 	SLIST_INIT(&zfs_be_head);
1903 	zfs_env_count = 0;
1904 
1905 	if (split_devname(name, poolname, sizeof(poolname), &dsname) != 0)
1906 		return (EINVAL);
1907 
1908 	spa = spa_find_by_name(poolname);
1909 	if (!spa)
1910 		return (ENXIO);
1911 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1912 	if (rv != 0)
1913 		return (rv);
1914 	rv = zfs_callback_dataset(spa, objid, zfs_belist_add);
1915 
1916 	/* Calculate and store the number of pages of BEs */
1917 	perpage = (ZFS_BE_LAST - ZFS_BE_FIRST + 1);
1918 	pages = (zfs_env_count / perpage) + ((zfs_env_count % perpage) > 0 ? 1 : 0);
1919 	snprintf(becount, 4, "%d", pages);
1920 	if (setenv("zfs_be_pages", becount, 1) != 0)
1921 		return (ENOMEM);
1922 
1923 	/* Roll over the page counter if it has exceeded the maximum */
1924 	currpage = strtol(getenv("zfs_be_currpage"), NULL, 10);
1925 	if (currpage > pages) {
1926 		if (setenv("zfs_be_currpage", "1", 1) != 0)
1927 			return (ENOMEM);
1928 	}
1929 
1930 	/* Populate the menu environment variables */
1931 	zfs_set_env();
1932 
1933 	/* Clean up the SLIST of ZFS BEs */
1934 	while (!SLIST_EMPTY(&zfs_be_head)) {
1935 		zfs_be = SLIST_FIRST(&zfs_be_head);
1936 		SLIST_REMOVE_HEAD(&zfs_be_head, entries);
1937 		free(zfs_be->name);
1938 		free(zfs_be);
1939 	}
1940 
1941 	return (rv);
1942 }
1943 
1944 int
1945 zfs_belist_add(const char *name, uint64_t value __unused)
1946 {
1947 
1948 	/* Skip special datasets that start with a $ character */
1949 	if (strncmp(name, "$", 1) == 0) {
1950 		return (0);
1951 	}
1952 	/* Add the boot environment to the head of the SLIST */
1953 	zfs_be = malloc(sizeof(struct zfs_be_entry));
1954 	if (zfs_be == NULL) {
1955 		return (ENOMEM);
1956 	}
1957 	zfs_be->name = strdup(name);
1958 	if (zfs_be->name == NULL) {
1959 		free(zfs_be);
1960 		return (ENOMEM);
1961 	}
1962 	SLIST_INSERT_HEAD(&zfs_be_head, zfs_be, entries);
1963 	zfs_env_count++;
1964 
1965 	return (0);
1966 }
1967 
1968 int
1969 zfs_set_env(void)
1970 {
1971 	char envname[32], envval[256];
1972 	char *beroot, *pagenum;
1973 	int rv, page, ctr;
1974 
1975 	beroot = getenv("zfs_be_root");
1976 	if (beroot == NULL) {
1977 		return (1);
1978 	}
1979 
1980 	pagenum = getenv("zfs_be_currpage");
1981 	if (pagenum != NULL) {
1982 		page = strtol(pagenum, NULL, 10);
1983 	} else {
1984 		page = 1;
1985 	}
1986 
1987 	ctr = 1;
1988 	rv = 0;
1989 	zfs_env_index = ZFS_BE_FIRST;
1990 	SLIST_FOREACH_SAFE(zfs_be, &zfs_be_head, entries, zfs_be_tmp) {
1991 		/* Skip to the requested page number */
1992 		if (ctr <= ((ZFS_BE_LAST - ZFS_BE_FIRST + 1) * (page - 1))) {
1993 			ctr++;
1994 			continue;
1995 		}
1996 
1997 		snprintf(envname, sizeof(envname), "bootenvmenu_caption[%d]", zfs_env_index);
1998 		snprintf(envval, sizeof(envval), "%s", zfs_be->name);
1999 		rv = setenv(envname, envval, 1);
2000 		if (rv != 0) {
2001 			break;
2002 		}
2003 
2004 		snprintf(envname, sizeof(envname), "bootenvansi_caption[%d]", zfs_env_index);
2005 		rv = setenv(envname, envval, 1);
2006 		if (rv != 0){
2007 			break;
2008 		}
2009 
2010 		snprintf(envname, sizeof(envname), "bootenvmenu_command[%d]", zfs_env_index);
2011 		rv = setenv(envname, "set_bootenv", 1);
2012 		if (rv != 0){
2013 			break;
2014 		}
2015 
2016 		snprintf(envname, sizeof(envname), "bootenv_root[%d]", zfs_env_index);
2017 		snprintf(envval, sizeof(envval), "zfs:%s/%s", beroot, zfs_be->name);
2018 		rv = setenv(envname, envval, 1);
2019 		if (rv != 0){
2020 			break;
2021 		}
2022 
2023 		zfs_env_index++;
2024 		if (zfs_env_index > ZFS_BE_LAST) {
2025 			break;
2026 		}
2027 
2028 	}
2029 
2030 	for (; zfs_env_index <= ZFS_BE_LAST; zfs_env_index++) {
2031 		snprintf(envname, sizeof(envname), "bootenvmenu_caption[%d]", zfs_env_index);
2032 		(void)unsetenv(envname);
2033 		snprintf(envname, sizeof(envname), "bootenvansi_caption[%d]", zfs_env_index);
2034 		(void)unsetenv(envname);
2035 		snprintf(envname, sizeof(envname), "bootenvmenu_command[%d]", zfs_env_index);
2036 		(void)unsetenv(envname);
2037 		snprintf(envname, sizeof(envname), "bootenv_root[%d]", zfs_env_index);
2038 		(void)unsetenv(envname);
2039 	}
2040 
2041 	return (rv);
2042 }
2043