xref: /freebsd/stand/libsa/zfs/zfs.c (revision d9a42747950146bf03cda7f6e25d219253f8a57a)
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  *	$FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 /*
33  *	Stand-alone file reading package.
34  */
35 
36 #include <stand.h>
37 #include <sys/disk.h>
38 #include <sys/param.h>
39 #include <sys/time.h>
40 #include <sys/queue.h>
41 #include <part.h>
42 #include <stddef.h>
43 #include <stdarg.h>
44 #include <string.h>
45 #include <bootstrap.h>
46 
47 #include "libzfs.h"
48 
49 #include "zfsimpl.c"
50 
/*
 * Bounds of the index range that gets populated with ZFS Boot
 * Environments.
 */
#define		ZFS_BE_FIRST	4
#define		ZFS_BE_LAST	8
54 
55 static int	zfs_open(const char *path, struct open_file *f);
56 static int	zfs_close(struct open_file *f);
57 static int	zfs_read(struct open_file *f, void *buf, size_t size, size_t *resid);
58 static off_t	zfs_seek(struct open_file *f, off_t offset, int where);
59 static int	zfs_stat(struct open_file *f, struct stat *sb);
60 static int	zfs_readdir(struct open_file *f, struct dirent *d);
61 static int	zfs_mount(const char *dev, const char *path, void **data);
62 static int	zfs_unmount(const char *dev, void *data);
63 
64 static void	zfs_bootenv_initial(const char *envname, spa_t *spa,
65 		    const char *name, const char *dsname, int checkpoint);
66 static void	zfs_checkpoints_initial(spa_t *spa, const char *name,
67 		    const char *dsname);
68 
69 static int	zfs_parsedev(struct devdesc **idev, const char *devspec,
70 		    const char **path);
71 
72 struct devsw zfs_dev;
73 
74 struct fs_ops zfs_fsops = {
75 	.fs_name = "zfs",
76 	.fo_open = zfs_open,
77 	.fo_close = zfs_close,
78 	.fo_read = zfs_read,
79 	.fo_write = null_write,
80 	.fo_seek = zfs_seek,
81 	.fo_stat = zfs_stat,
82 	.fo_readdir = zfs_readdir,
83 	.fo_mount = zfs_mount,
84 	.fo_unmount = zfs_unmount
85 };
86 
87 /*
88  * In-core open file.
89  */
90 struct file {
91 	off_t		f_seekp;	/* seek pointer */
92 	dnode_phys_t	f_dnode;
93 	uint64_t	f_zap_type;	/* zap type for readdir */
94 	uint64_t	f_num_leafs;	/* number of fzap leaf blocks */
95 	zap_leaf_phys_t	*f_zap_leaf;	/* zap leaf buffer */
96 };
97 
/* Boot environment counters; their users are outside this part of the file. */
static int	zfs_env_index;
static int	zfs_env_count;

/* Singly-linked list of boot environment entries, one name per entry. */
SLIST_HEAD(zfs_be_list, zfs_be_entry) zfs_be_head =
    SLIST_HEAD_INITIALIZER(zfs_be_head);
struct zfs_be_list *zfs_be_headp;
struct zfs_be_entry {
	char *name;
	SLIST_ENTRY(zfs_be_entry) entries;
} *zfs_be, *zfs_be_tmp;
107 
108 /*
109  * Open a file.
110  */
111 static int
112 zfs_open(const char *upath, struct open_file *f)
113 {
114 	struct devdesc *dev = f->f_devdata;
115 	struct zfsmount *mount = dev->d_opendata;
116 	struct file *fp;
117 	int rc;
118 
119 	if (f->f_dev != &zfs_dev)
120 		return (EINVAL);
121 
122 	/* allocate file system specific data structure */
123 	fp = calloc(1, sizeof(struct file));
124 	if (fp == NULL)
125 		return (ENOMEM);
126 	f->f_fsdata = fp;
127 
128 	rc = zfs_lookup(mount, upath, &fp->f_dnode);
129 	fp->f_seekp = 0;
130 	if (rc) {
131 		f->f_fsdata = NULL;
132 		free(fp);
133 	}
134 	return (rc);
135 }
136 
137 static int
138 zfs_close(struct open_file *f)
139 {
140 	struct file *fp = (struct file *)f->f_fsdata;
141 
142 	dnode_cache_obj = NULL;
143 	f->f_fsdata = NULL;
144 
145 	free(fp);
146 	return (0);
147 }
148 
149 /*
150  * Copy a portion of a file into kernel memory.
151  * Cross block boundaries when necessary.
152  */
153 static int
154 zfs_read(struct open_file *f, void *start, size_t size, size_t *resid	/* out */)
155 {
156 	struct devdesc *dev = f->f_devdata;
157 	const spa_t *spa = ((struct zfsmount *)dev->d_opendata)->spa;
158 	struct file *fp = (struct file *)f->f_fsdata;
159 	struct stat sb;
160 	size_t n;
161 	int rc;
162 
163 	rc = zfs_stat(f, &sb);
164 	if (rc)
165 		return (rc);
166 	n = size;
167 	if (fp->f_seekp + n > sb.st_size)
168 		n = sb.st_size - fp->f_seekp;
169 
170 	rc = dnode_read(spa, &fp->f_dnode, fp->f_seekp, start, n);
171 	if (rc)
172 		return (rc);
173 
174 	if (0) {
175 	    int i;
176 	    for (i = 0; i < n; i++)
177 		putchar(((char*) start)[i]);
178 	}
179 	fp->f_seekp += n;
180 	if (resid)
181 		*resid = size - n;
182 
183 	return (0);
184 }
185 
186 static off_t
187 zfs_seek(struct open_file *f, off_t offset, int where)
188 {
189 	struct file *fp = (struct file *)f->f_fsdata;
190 
191 	switch (where) {
192 	case SEEK_SET:
193 		fp->f_seekp = offset;
194 		break;
195 	case SEEK_CUR:
196 		fp->f_seekp += offset;
197 		break;
198 	case SEEK_END:
199 	    {
200 		struct stat sb;
201 		int error;
202 
203 		error = zfs_stat(f, &sb);
204 		if (error != 0) {
205 			errno = error;
206 			return (-1);
207 		}
208 		fp->f_seekp = sb.st_size - offset;
209 		break;
210 	    }
211 	default:
212 		errno = EINVAL;
213 		return (-1);
214 	}
215 	return (fp->f_seekp);
216 }
217 
218 static int
219 zfs_stat(struct open_file *f, struct stat *sb)
220 {
221 	struct devdesc *dev = f->f_devdata;
222 	const spa_t *spa = ((struct zfsmount *)dev->d_opendata)->spa;
223 	struct file *fp = (struct file *)f->f_fsdata;
224 
225 	return (zfs_dnode_stat(spa, &fp->f_dnode, sb));
226 }
227 
228 static int
229 zfs_readdir(struct open_file *f, struct dirent *d)
230 {
231 	struct devdesc *dev = f->f_devdata;
232 	const spa_t *spa = ((struct zfsmount *)dev->d_opendata)->spa;
233 	struct file *fp = (struct file *)f->f_fsdata;
234 	mzap_ent_phys_t mze;
235 	struct stat sb;
236 	size_t bsize = fp->f_dnode.dn_datablkszsec << SPA_MINBLOCKSHIFT;
237 	int rc;
238 
239 	rc = zfs_stat(f, &sb);
240 	if (rc)
241 		return (rc);
242 	if (!S_ISDIR(sb.st_mode))
243 		return (ENOTDIR);
244 
245 	/*
246 	 * If this is the first read, get the zap type.
247 	 */
248 	if (fp->f_seekp == 0) {
249 		rc = dnode_read(spa, &fp->f_dnode,
250 				0, &fp->f_zap_type, sizeof(fp->f_zap_type));
251 		if (rc)
252 			return (rc);
253 
254 		if (fp->f_zap_type == ZBT_MICRO) {
255 			fp->f_seekp = offsetof(mzap_phys_t, mz_chunk);
256 		} else {
257 			rc = dnode_read(spa, &fp->f_dnode,
258 					offsetof(zap_phys_t, zap_num_leafs),
259 					&fp->f_num_leafs,
260 					sizeof(fp->f_num_leafs));
261 			if (rc)
262 				return (rc);
263 
264 			fp->f_seekp = bsize;
265 			fp->f_zap_leaf = malloc(bsize);
266 			if (fp->f_zap_leaf == NULL)
267 				return (ENOMEM);
268 			rc = dnode_read(spa, &fp->f_dnode,
269 					fp->f_seekp,
270 					fp->f_zap_leaf,
271 					bsize);
272 			if (rc)
273 				return (rc);
274 		}
275 	}
276 
277 	if (fp->f_zap_type == ZBT_MICRO) {
278 	mzap_next:
279 		if (fp->f_seekp >= bsize)
280 			return (ENOENT);
281 
282 		rc = dnode_read(spa, &fp->f_dnode,
283 				fp->f_seekp, &mze, sizeof(mze));
284 		if (rc)
285 			return (rc);
286 		fp->f_seekp += sizeof(mze);
287 
288 		if (!mze.mze_name[0])
289 			goto mzap_next;
290 
291 		d->d_fileno = ZFS_DIRENT_OBJ(mze.mze_value);
292 		d->d_type = ZFS_DIRENT_TYPE(mze.mze_value);
293 		strcpy(d->d_name, mze.mze_name);
294 		d->d_namlen = strlen(d->d_name);
295 		return (0);
296 	} else {
297 		zap_leaf_t zl;
298 		zap_leaf_chunk_t *zc, *nc;
299 		int chunk;
300 		size_t namelen;
301 		char *p;
302 		uint64_t value;
303 
304 		/*
305 		 * Initialise this so we can use the ZAP size
306 		 * calculating macros.
307 		 */
308 		zl.l_bs = ilog2(bsize);
309 		zl.l_phys = fp->f_zap_leaf;
310 
311 		/*
312 		 * Figure out which chunk we are currently looking at
313 		 * and consider seeking to the next leaf. We use the
314 		 * low bits of f_seekp as a simple chunk index.
315 		 */
316 	fzap_next:
317 		chunk = fp->f_seekp & (bsize - 1);
318 		if (chunk == ZAP_LEAF_NUMCHUNKS(&zl)) {
319 			fp->f_seekp = rounddown2(fp->f_seekp, bsize) + bsize;
320 			chunk = 0;
321 
322 			/*
323 			 * Check for EOF and read the new leaf.
324 			 */
325 			if (fp->f_seekp >= bsize * fp->f_num_leafs)
326 				return (ENOENT);
327 
328 			rc = dnode_read(spa, &fp->f_dnode,
329 					fp->f_seekp,
330 					fp->f_zap_leaf,
331 					bsize);
332 			if (rc)
333 				return (rc);
334 		}
335 
336 		zc = &ZAP_LEAF_CHUNK(&zl, chunk);
337 		fp->f_seekp++;
338 		if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
339 			goto fzap_next;
340 
341 		namelen = zc->l_entry.le_name_numints;
342 		if (namelen > sizeof(d->d_name))
343 			namelen = sizeof(d->d_name);
344 
345 		/*
346 		 * Paste the name back together.
347 		 */
348 		nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
349 		p = d->d_name;
350 		while (namelen > 0) {
351 			int len;
352 			len = namelen;
353 			if (len > ZAP_LEAF_ARRAY_BYTES)
354 				len = ZAP_LEAF_ARRAY_BYTES;
355 			memcpy(p, nc->l_array.la_array, len);
356 			p += len;
357 			namelen -= len;
358 			nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
359 		}
360 		d->d_name[sizeof(d->d_name) - 1] = 0;
361 
362 		/*
363 		 * Assume the first eight bytes of the value are
364 		 * a uint64_t.
365 		 */
366 		value = fzap_leaf_value(&zl, zc);
367 
368 		d->d_fileno = ZFS_DIRENT_OBJ(value);
369 		d->d_type = ZFS_DIRENT_TYPE(value);
370 		d->d_namlen = strlen(d->d_name);
371 
372 		return (0);
373 	}
374 }
375 
376 /*
377  * if path is NULL, create mount structure, but do not add it to list.
378  */
379 static int
380 zfs_mount(const char *dev, const char *path, void **data)
381 {
382 	struct zfs_devdesc *zfsdev = NULL;
383 	spa_t *spa;
384 	struct zfsmount *mnt = NULL;
385 	int rv;
386 
387 	errno = 0;
388 	rv = zfs_parsedev((struct devdesc **)&zfsdev, dev, NULL);
389 	if (rv != 0) {
390 		return (rv);
391 	}
392 
393 	spa = spa_find_by_dev(zfsdev);
394 	if (spa == NULL) {
395 		rv = ENXIO;
396 		goto err;
397 	}
398 
399 	mnt = calloc(1, sizeof(*mnt));
400 	if (mnt == NULL) {
401 		rv = ENOMEM;
402 		goto err;
403 	}
404 
405 	if (mnt->path != NULL) {
406 		mnt->path = strdup(path);
407 		if (mnt->path == NULL) {
408 			rv = ENOMEM;
409 			goto err;
410 		}
411 	}
412 
413 	rv = zfs_mount_impl(spa, zfsdev->root_guid, mnt);
414 
415 	if (rv == 0 && mnt->objset.os_type != DMU_OST_ZFS) {
416 		printf("Unexpected object set type %ju\n",
417 		    (uintmax_t)mnt->objset.os_type);
418 		rv = EIO;
419 	}
420 err:
421 	if (rv != 0) {
422 		if (mnt != NULL)
423 			free(mnt->path);
424 		free(mnt);
425 		free(zfsdev);
426 		return (rv);
427 	}
428 
429 	*data = mnt;
430 	if (path != NULL)
431 		STAILQ_INSERT_TAIL(&zfsmount, mnt, next);
432 
433 	free(zfsdev);
434 
435 	return (rv);
436 }
437 
438 static int
439 zfs_unmount(const char *dev, void *data)
440 {
441 	struct zfsmount *mnt = data;
442 
443 	STAILQ_REMOVE(&zfsmount, mnt, zfsmount, next);
444 	free(mnt->path);
445 	free(mnt);
446 	return (0);
447 }
448 
449 static int
450 vdev_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t bytes)
451 {
452 	int fd, ret;
453 	size_t res, head, tail, total_size, full_sec_size;
454 	unsigned secsz, do_tail_read;
455 	off_t start_sec;
456 	char *outbuf, *bouncebuf;
457 
458 	fd = (uintptr_t) priv;
459 	outbuf = (char *) buf;
460 	bouncebuf = NULL;
461 
462 	ret = ioctl(fd, DIOCGSECTORSIZE, &secsz);
463 	if (ret != 0)
464 		return (ret);
465 
466 	/*
467 	 * Handling reads of arbitrary offset and size - multi-sector case
468 	 * and single-sector case.
469 	 *
470 	 *                        Multi-sector Case
471 	 *                (do_tail_read = true if tail > 0)
472 	 *
473 	 *   |<----------------------total_size--------------------->|
474 	 *   |                                                       |
475 	 *   |<--head-->|<--------------bytes------------>|<--tail-->|
476 	 *   |          |                                 |          |
477 	 *   |          |       |<~full_sec_size~>|       |          |
478 	 *   +------------------+                 +------------------+
479 	 *   |          |0101010|     .  .  .     |0101011|          |
480 	 *   +------------------+                 +------------------+
481 	 *         start_sec                         start_sec + n
482 	 *
483 	 *
484 	 *                      Single-sector Case
485 	 *                    (do_tail_read = false)
486 	 *
487 	 *              |<------total_size = secsz----->|
488 	 *              |                               |
489 	 *              |<-head->|<---bytes--->|<-tail->|
490 	 *              +-------------------------------+
491 	 *              |        |0101010101010|        |
492 	 *              +-------------------------------+
493 	 *                          start_sec
494 	 */
495 	start_sec = offset / secsz;
496 	head = offset % secsz;
497 	total_size = roundup2(head + bytes, secsz);
498 	tail = total_size - (head + bytes);
499 	do_tail_read = ((tail > 0) && (head + bytes > secsz));
500 	full_sec_size = total_size;
501 	if (head > 0)
502 		full_sec_size -= secsz;
503 	if (do_tail_read)
504 		full_sec_size -= secsz;
505 
506 	/* Return of partial sector data requires a bounce buffer. */
507 	if ((head > 0) || do_tail_read || bytes < secsz) {
508 		bouncebuf = malloc(secsz);
509 		if (bouncebuf == NULL) {
510 			printf("vdev_read: out of memory\n");
511 			return (ENOMEM);
512 		}
513 	}
514 
515 	if (lseek(fd, start_sec * secsz, SEEK_SET) == -1) {
516 		ret = errno;
517 		goto error;
518 	}
519 
520 	/* Partial data return from first sector */
521 	if (head > 0) {
522 		res = read(fd, bouncebuf, secsz);
523 		if (res != secsz) {
524 			ret = EIO;
525 			goto error;
526 		}
527 		memcpy(outbuf, bouncebuf + head, min(secsz - head, bytes));
528 		outbuf += min(secsz - head, bytes);
529 	}
530 
531 	/*
532 	 * Full data return from read sectors.
533 	 * Note, there is still corner case where we read
534 	 * from sector boundary, but less than sector size, e.g. reading 512B
535 	 * from 4k sector.
536 	 */
537 	if (full_sec_size > 0) {
538 		if (bytes < full_sec_size) {
539 			res = read(fd, bouncebuf, secsz);
540 			if (res != secsz) {
541 				ret = EIO;
542 				goto error;
543 			}
544 			memcpy(outbuf, bouncebuf, bytes);
545 		} else {
546 			res = read(fd, outbuf, full_sec_size);
547 			if (res != full_sec_size) {
548 				ret = EIO;
549 				goto error;
550 			}
551 			outbuf += full_sec_size;
552 		}
553 	}
554 
555 	/* Partial data return from last sector */
556 	if (do_tail_read) {
557 		res = read(fd, bouncebuf, secsz);
558 		if (res != secsz) {
559 			ret = EIO;
560 			goto error;
561 		}
562 		memcpy(outbuf, bouncebuf, secsz - tail);
563 	}
564 
565 	ret = 0;
566 error:
567 	free(bouncebuf);
568 	return (ret);
569 }
570 
571 static int
572 vdev_write(vdev_t *vdev, off_t offset, void *buf, size_t bytes)
573 {
574 	int fd, ret;
575 	size_t head, tail, total_size, full_sec_size;
576 	unsigned secsz, do_tail_write;
577 	off_t start_sec;
578 	ssize_t res;
579 	char *outbuf, *bouncebuf;
580 
581 	fd = (uintptr_t)vdev->v_priv;
582 	outbuf = (char *)buf;
583 	bouncebuf = NULL;
584 
585 	ret = ioctl(fd, DIOCGSECTORSIZE, &secsz);
586 	if (ret != 0)
587 		return (ret);
588 
589 	start_sec = offset / secsz;
590 	head = offset % secsz;
591 	total_size = roundup2(head + bytes, secsz);
592 	tail = total_size - (head + bytes);
593 	do_tail_write = ((tail > 0) && (head + bytes > secsz));
594 	full_sec_size = total_size;
595 	if (head > 0)
596 		full_sec_size -= secsz;
597 	if (do_tail_write)
598 		full_sec_size -= secsz;
599 
600 	/* Partial sector write requires a bounce buffer. */
601 	if ((head > 0) || do_tail_write || bytes < secsz) {
602 		bouncebuf = malloc(secsz);
603 		if (bouncebuf == NULL) {
604 			printf("vdev_write: out of memory\n");
605 			return (ENOMEM);
606 		}
607 	}
608 
609 	if (lseek(fd, start_sec * secsz, SEEK_SET) == -1) {
610 		ret = errno;
611 		goto error;
612 	}
613 
614 	/* Partial data for first sector */
615 	if (head > 0) {
616 		res = read(fd, bouncebuf, secsz);
617 		if ((unsigned)res != secsz) {
618 			ret = EIO;
619 			goto error;
620 		}
621 		memcpy(bouncebuf + head, outbuf, min(secsz - head, bytes));
622 		(void) lseek(fd, -secsz, SEEK_CUR);
623 		res = write(fd, bouncebuf, secsz);
624 		if ((unsigned)res != secsz) {
625 			ret = EIO;
626 			goto error;
627 		}
628 		outbuf += min(secsz - head, bytes);
629 	}
630 
631 	/*
632 	 * Full data write to sectors.
633 	 * Note, there is still corner case where we write
634 	 * to sector boundary, but less than sector size, e.g. write 512B
635 	 * to 4k sector.
636 	 */
637 	if (full_sec_size > 0) {
638 		if (bytes < full_sec_size) {
639 			res = read(fd, bouncebuf, secsz);
640 			if ((unsigned)res != secsz) {
641 				ret = EIO;
642 				goto error;
643 			}
644 			memcpy(bouncebuf, outbuf, bytes);
645 			(void) lseek(fd, -secsz, SEEK_CUR);
646 			res = write(fd, bouncebuf, secsz);
647 			if ((unsigned)res != secsz) {
648 				ret = EIO;
649 				goto error;
650 			}
651 		} else {
652 			res = write(fd, outbuf, full_sec_size);
653 			if ((unsigned)res != full_sec_size) {
654 				ret = EIO;
655 				goto error;
656 			}
657 			outbuf += full_sec_size;
658 		}
659 	}
660 
661 	/* Partial data write to last sector */
662 	if (do_tail_write) {
663 		res = read(fd, bouncebuf, secsz);
664 		if ((unsigned)res != secsz) {
665 			ret = EIO;
666 			goto error;
667 		}
668 		memcpy(bouncebuf, outbuf, secsz - tail);
669 		(void) lseek(fd, -secsz, SEEK_CUR);
670 		res = write(fd, bouncebuf, secsz);
671 		if ((unsigned)res != secsz) {
672 			ret = EIO;
673 			goto error;
674 		}
675 	}
676 
677 	ret = 0;
678 error:
679 	free(bouncebuf);
680 	return (ret);
681 }
682 
683 static int
684 zfs_dev_init(void)
685 {
686 	spa_t *spa;
687 	spa_t *next;
688 	spa_t *prev;
689 
690 	zfs_init();
691 	if (archsw.arch_zfs_probe == NULL)
692 		return (ENXIO);
693 	archsw.arch_zfs_probe();
694 
695 	prev = NULL;
696 	spa = STAILQ_FIRST(&zfs_pools);
697 	while (spa != NULL) {
698 		next = STAILQ_NEXT(spa, spa_link);
699 		if (zfs_spa_init(spa)) {
700 			if (prev == NULL)
701 				STAILQ_REMOVE_HEAD(&zfs_pools, spa_link);
702 			else
703 				STAILQ_REMOVE_AFTER(&zfs_pools, prev, spa_link);
704 		} else
705 			prev = spa;
706 		spa = next;
707 	}
708 	return (0);
709 }
710 
711 struct zfs_probe_args {
712 	int		fd;
713 	const char	*devname;
714 	uint64_t	*pool_guid;
715 	u_int		secsz;
716 };
717 
718 static int
719 zfs_diskread(void *arg, void *buf, size_t blocks, uint64_t offset)
720 {
721 	struct zfs_probe_args *ppa;
722 
723 	ppa = (struct zfs_probe_args *)arg;
724 	return (vdev_read(NULL, (void *)(uintptr_t)ppa->fd,
725 	    offset * ppa->secsz, buf, blocks * ppa->secsz));
726 }
727 
728 static int
729 zfs_probe(int fd, uint64_t *pool_guid)
730 {
731 	spa_t *spa;
732 	int ret;
733 
734 	spa = NULL;
735 	ret = vdev_probe(vdev_read, vdev_write, (void *)(uintptr_t)fd, &spa);
736 	if (ret == 0 && pool_guid != NULL)
737 		if (*pool_guid == 0)
738 			*pool_guid = spa->spa_guid;
739 	return (ret);
740 }
741 
742 static int
743 zfs_probe_partition(void *arg, const char *partname,
744     const struct ptable_entry *part)
745 {
746 	struct zfs_probe_args *ppa, pa;
747 	struct ptable *table;
748 	char devname[32];
749 	int ret;
750 
751 	/* Probe only freebsd-zfs and freebsd partitions */
752 	if (part->type != PART_FREEBSD &&
753 	    part->type != PART_FREEBSD_ZFS)
754 		return (0);
755 
756 	ppa = (struct zfs_probe_args *)arg;
757 	strncpy(devname, ppa->devname, strlen(ppa->devname) - 1);
758 	devname[strlen(ppa->devname) - 1] = '\0';
759 	snprintf(devname, sizeof(devname), "%s%s:", devname, partname);
760 	pa.fd = open(devname, O_RDWR);
761 	if (pa.fd == -1)
762 		return (0);
763 	ret = zfs_probe(pa.fd, ppa->pool_guid);
764 	if (ret == 0)
765 		return (0);
766 	/* Do we have BSD label here? */
767 	if (part->type == PART_FREEBSD) {
768 		pa.devname = devname;
769 		pa.pool_guid = ppa->pool_guid;
770 		pa.secsz = ppa->secsz;
771 		table = ptable_open(&pa, part->end - part->start + 1,
772 		    ppa->secsz, zfs_diskread);
773 		if (table != NULL) {
774 			ptable_iterate(table, &pa, zfs_probe_partition);
775 			ptable_close(table);
776 		}
777 	}
778 	close(pa.fd);
779 	return (0);
780 }
781 
782 /*
783  * Return bootenv nvlist from pool label.
784  */
785 int
786 zfs_get_bootenv(void *vdev, nvlist_t **benvp)
787 {
788 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
789 	nvlist_t *benv = NULL;
790 	vdev_t *vd;
791 	spa_t *spa;
792 
793 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
794 		return (ENOTSUP);
795 
796 	if ((spa = spa_find_by_dev(dev)) == NULL)
797 		return (ENXIO);
798 
799 	if (spa->spa_bootenv == NULL) {
800 		STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children,
801 		    v_childlink) {
802 			benv = vdev_read_bootenv(vd);
803 
804 			if (benv != NULL)
805 				break;
806 		}
807 		spa->spa_bootenv = benv;
808 	} else {
809 		benv = spa->spa_bootenv;
810 	}
811 
812 	if (benv == NULL)
813 		return (ENOENT);
814 
815 	*benvp = benv;
816 	return (0);
817 }
818 
819 /*
820  * Store nvlist to pool label bootenv area. Also updates cached pointer in spa.
821  */
822 int
823 zfs_set_bootenv(void *vdev, nvlist_t *benv)
824 {
825 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
826 	spa_t *spa;
827 	vdev_t *vd;
828 
829 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
830 		return (ENOTSUP);
831 
832 	if ((spa = spa_find_by_dev(dev)) == NULL)
833 		return (ENXIO);
834 
835 	STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children, v_childlink) {
836 		vdev_write_bootenv(vd, benv);
837 	}
838 
839 	spa->spa_bootenv = benv;
840 	return (0);
841 }
842 
843 /*
844  * Get bootonce value by key. The bootonce <key, value> pair is removed
845  * from the bootenv nvlist and the remaining nvlist is committed back to disk.
846  */
847 int
848 zfs_get_bootonce(void *vdev, const char *key, char *buf, size_t size)
849 {
850 	nvlist_t *benv;
851 	char *result = NULL;
852 	int result_size, rv;
853 
854 	if ((rv = zfs_get_bootenv(vdev, &benv)) != 0)
855 		return (rv);
856 
857 	if ((rv = nvlist_find(benv, key, DATA_TYPE_STRING, NULL,
858 	    &result, &result_size)) == 0) {
859 		if (result_size == 0) {
860 			/* ignore empty string */
861 			rv = ENOENT;
862 		} else {
863 			size = MIN((size_t)result_size + 1, size);
864 			strlcpy(buf, result, size);
865 		}
866 		(void) nvlist_remove(benv, key, DATA_TYPE_STRING);
867 		(void) zfs_set_bootenv(vdev, benv);
868 	}
869 
870 	return (rv);
871 }
872 
/*
 * nvstore backend: forward declarations.
 */

static int	zfs_nvstore_setter(void *, int, const char *, const void *,
		    size_t);
static int	zfs_nvstore_setter_str(void *, const char *, const char *,
		    const char *);
static int	zfs_nvstore_unset_impl(void *, const char *, bool);
static int	zfs_nvstore_setenv(void *, void *);
883 
884 /*
885  * nvstore is only present for current rootfs pool.
886  */
887 static int
888 zfs_nvstore_sethook(struct env_var *ev, int flags __unused, const void *value)
889 {
890 	struct zfs_devdesc *dev;
891 	int rv;
892 
893 	archsw.arch_getdev((void **)&dev, NULL, NULL);
894 	if (dev == NULL)
895 		return (ENXIO);
896 
897 	rv = zfs_nvstore_setter_str(dev, NULL, ev->ev_name, value);
898 
899 	free(dev);
900 	return (rv);
901 }
902 
903 /*
904  * nvstore is only present for current rootfs pool.
905  */
906 static int
907 zfs_nvstore_unsethook(struct env_var *ev)
908 {
909 	struct zfs_devdesc *dev;
910 	int rv;
911 
912 	archsw.arch_getdev((void **)&dev, NULL, NULL);
913 	if (dev == NULL)
914 		return (ENXIO);
915 
916 	rv = zfs_nvstore_unset_impl(dev, ev->ev_name, false);
917 
918 	free(dev);
919 	return (rv);
920 }
921 
922 static int
923 zfs_nvstore_getter(void *vdev, const char *name, void **data)
924 {
925 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
926 	spa_t *spa;
927 	nvlist_t *nv;
928 	char *str, **ptr;
929 	int size;
930 	int rv;
931 
932 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
933 		return (ENOTSUP);
934 
935 	if ((spa = spa_find_by_dev(dev)) == NULL)
936 		return (ENXIO);
937 
938 	if (spa->spa_bootenv == NULL)
939 		return (ENXIO);
940 
941 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
942 	    NULL, &nv, NULL) != 0)
943 		return (ENOENT);
944 
945 	rv = nvlist_find(nv, name, DATA_TYPE_STRING, NULL, &str, &size);
946 	if (rv == 0) {
947 		ptr = (char **)data;
948 		asprintf(ptr, "%.*s", size, str);
949 		if (*data == NULL)
950 			rv = ENOMEM;
951 	}
952 	nvlist_destroy(nv);
953 	return (rv);
954 }
955 
956 static int
957 zfs_nvstore_setter(void *vdev, int type, const char *name,
958     const void *data, size_t size)
959 {
960 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
961 	spa_t *spa;
962 	nvlist_t *nv;
963 	int rv;
964 	bool env_set = true;
965 
966 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
967 		return (ENOTSUP);
968 
969 	if ((spa = spa_find_by_dev(dev)) == NULL)
970 		return (ENXIO);
971 
972 	if (spa->spa_bootenv == NULL)
973 		return (ENXIO);
974 
975 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
976 	    NULL, &nv, NULL) != 0) {
977 		nv = nvlist_create(NV_UNIQUE_NAME);
978 		if (nv == NULL)
979 			return (ENOMEM);
980 	}
981 
982 	rv = 0;
983 	switch (type) {
984         case DATA_TYPE_INT8:
985 		if (size != sizeof (int8_t)) {
986 			rv = EINVAL;
987 			break;
988 		}
989 		rv = nvlist_add_int8(nv, name, *(int8_t *)data);
990 		break;
991 
992         case DATA_TYPE_INT16:
993 		if (size != sizeof (int16_t)) {
994 			rv = EINVAL;
995 			break;
996 		}
997 		rv = nvlist_add_int16(nv, name, *(int16_t *)data);
998 		break;
999 
1000         case DATA_TYPE_INT32:
1001 		if (size != sizeof (int32_t)) {
1002 			rv = EINVAL;
1003 			break;
1004 		}
1005 		rv = nvlist_add_int32(nv, name, *(int32_t *)data);
1006 		break;
1007 
1008         case DATA_TYPE_INT64:
1009 		if (size != sizeof (int64_t)) {
1010 			rv = EINVAL;
1011 			break;
1012 		}
1013 		rv = nvlist_add_int64(nv, name, *(int64_t *)data);
1014 		break;
1015 
1016         case DATA_TYPE_BYTE:
1017 		if (size != sizeof (uint8_t)) {
1018 			rv = EINVAL;
1019 			break;
1020 		}
1021 		rv = nvlist_add_byte(nv, name, *(int8_t *)data);
1022 		break;
1023 
1024         case DATA_TYPE_UINT8:
1025 		if (size != sizeof (uint8_t)) {
1026 			rv = EINVAL;
1027 			break;
1028 		}
1029 		rv = nvlist_add_uint8(nv, name, *(int8_t *)data);
1030 		break;
1031 
1032         case DATA_TYPE_UINT16:
1033 		if (size != sizeof (uint16_t)) {
1034 			rv = EINVAL;
1035 			break;
1036 		}
1037 		rv = nvlist_add_uint16(nv, name, *(uint16_t *)data);
1038 		break;
1039 
1040         case DATA_TYPE_UINT32:
1041 		if (size != sizeof (uint32_t)) {
1042 			rv = EINVAL;
1043 			break;
1044 		}
1045 		rv = nvlist_add_uint32(nv, name, *(uint32_t *)data);
1046 		break;
1047 
1048         case DATA_TYPE_UINT64:
1049 		if (size != sizeof (uint64_t)) {
1050 			rv = EINVAL;
1051 			break;
1052 		}
1053 		rv = nvlist_add_uint64(nv, name, *(uint64_t *)data);
1054 		break;
1055 
1056         case DATA_TYPE_STRING:
1057 		rv = nvlist_add_string(nv, name, data);
1058 		break;
1059 
1060 	case DATA_TYPE_BOOLEAN_VALUE:
1061 		if (size != sizeof (boolean_t)) {
1062 			rv = EINVAL;
1063 			break;
1064 		}
1065 		rv = nvlist_add_boolean_value(nv, name, *(boolean_t *)data);
1066 		break;
1067 
1068 	default:
1069 		rv = EINVAL;
1070 		break;
1071 	}
1072 
1073 	if (rv == 0) {
1074 		rv = nvlist_add_nvlist(spa->spa_bootenv, OS_NVSTORE, nv);
1075 		if (rv == 0) {
1076 			rv = zfs_set_bootenv(vdev, spa->spa_bootenv);
1077 		}
1078 		if (rv == 0) {
1079 			if (env_set) {
1080 				rv = zfs_nvstore_setenv(vdev,
1081 				    nvpair_find(nv, name));
1082 			} else {
1083 				env_discard(env_getenv(name));
1084 				rv = 0;
1085 			}
1086 		}
1087 	}
1088 
1089 	nvlist_destroy(nv);
1090 	return (rv);
1091 }
1092 
/*
 * Parse the whole of data as a signed integer; the base is detected by
 * strtoll() (decimal, 0x hex, leading-0 octal).
 *
 * Returns 0 and stores the value in *ip on success.  Returns EINVAL,
 * leaving *ip untouched, if data is empty, has trailing characters, or
 * strtoll() set errno (e.g. out of range).  errno is cleared before
 * parsing.
 */
static int
get_int64(const char *data, int64_t *ip)
{
	char *tail;
	int64_t parsed;

	errno = 0;
	parsed = strtoll(data, &tail, 0);
	if (errno == 0 && *data != '\0' && *tail == '\0') {
		*ip = parsed;
		return (0);
	}

	return (EINVAL);
}
1107 
/*
 * Parse the whole of data as an unsigned integer; the base is detected
 * by strtoull() (decimal, 0x hex, leading-0 octal).
 *
 * Returns 0 and stores the value in *ip on success.  Returns EINVAL,
 * leaving *ip untouched, if data is empty, has trailing characters, or
 * strtoull() set errno (e.g. out of range).  errno is cleared before
 * parsing.
 */
static int
get_uint64(const char *data, uint64_t *ip)
{
	char *tail;
	uint64_t parsed;

	errno = 0;
	parsed = strtoull(data, &tail, 0);
	if (errno == 0 && *data != '\0' && *tail == '\0') {
		*ip = parsed;
		return (0);
	}

	return (EINVAL);
}
1122 
1123 /*
1124  * Translate textual data to data type. If type is not set, and we are
1125  * creating new pair, use DATA_TYPE_STRING.
1126  */
1127 static int
1128 zfs_nvstore_setter_str(void *vdev, const char *type, const char *name,
1129     const char *data)
1130 {
1131 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
1132 	spa_t *spa;
1133 	nvlist_t *nv;
1134 	int rv;
1135 	data_type_t dt;
1136 	int64_t val;
1137 	uint64_t uval;
1138 
1139 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1140 		return (ENOTSUP);
1141 
1142 	if ((spa = spa_find_by_dev(dev)) == NULL)
1143 		return (ENXIO);
1144 
1145 	if (spa->spa_bootenv == NULL)
1146 		return (ENXIO);
1147 
1148 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
1149 	    NULL, &nv, NULL) != 0) {
1150 		nv = NULL;
1151 	}
1152 
1153 	if (type == NULL) {
1154 		nvp_header_t *nvh;
1155 
1156 		/*
1157 		 * if there is no existing pair, default to string.
1158 		 * Otherwise, use type from existing pair.
1159 		 */
1160 		nvh = nvpair_find(nv, name);
1161 		if (nvh == NULL) {
1162 			dt = DATA_TYPE_STRING;
1163 		} else {
1164 			nv_string_t *nvp_name;
1165 			nv_pair_data_t *nvp_data;
1166 
1167 			nvp_name = (nv_string_t *)(nvh + 1);
1168 			nvp_data = (nv_pair_data_t *)(&nvp_name->nv_data[0] +
1169 			    NV_ALIGN4(nvp_name->nv_size));
1170 			dt = nvp_data->nv_type;
1171 		}
1172 	} else {
1173 		dt = nvpair_type_from_name(type);
1174 	}
1175 	nvlist_destroy(nv);
1176 
1177 	rv = 0;
1178 	switch (dt) {
1179         case DATA_TYPE_INT8:
1180 		rv = get_int64(data, &val);
1181 		if (rv == 0) {
1182 			int8_t v = val;
1183 
1184 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1185 		}
1186 		break;
1187         case DATA_TYPE_INT16:
1188 		rv = get_int64(data, &val);
1189 		if (rv == 0) {
1190 			int16_t v = val;
1191 
1192 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1193 		}
1194 		break;
1195         case DATA_TYPE_INT32:
1196 		rv = get_int64(data, &val);
1197 		if (rv == 0) {
1198 			int32_t v = val;
1199 
1200 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1201 		}
1202 		break;
1203         case DATA_TYPE_INT64:
1204 		rv = get_int64(data, &val);
1205 		if (rv == 0) {
1206 			rv = zfs_nvstore_setter(vdev, dt, name, &val,
1207 			    sizeof (val));
1208 		}
1209 		break;
1210 
1211         case DATA_TYPE_BYTE:
1212 		rv = get_uint64(data, &uval);
1213 		if (rv == 0) {
1214 			uint8_t v = uval;
1215 
1216 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1217 		}
1218 		break;
1219 
1220         case DATA_TYPE_UINT8:
1221 		rv = get_uint64(data, &uval);
1222 		if (rv == 0) {
1223 			uint8_t v = uval;
1224 
1225 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1226 		}
1227 		break;
1228 
1229         case DATA_TYPE_UINT16:
1230 		rv = get_uint64(data, &uval);
1231 		if (rv == 0) {
1232 			uint16_t v = uval;
1233 
1234 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1235 		}
1236 		break;
1237 
1238         case DATA_TYPE_UINT32:
1239 		rv = get_uint64(data, &uval);
1240 		if (rv == 0) {
1241 			uint32_t v = uval;
1242 
1243 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1244 		}
1245 		break;
1246 
1247         case DATA_TYPE_UINT64:
1248 		rv = get_uint64(data, &uval);
1249 		if (rv == 0) {
1250 			rv = zfs_nvstore_setter(vdev, dt, name, &uval,
1251 			    sizeof (uval));
1252 		}
1253 		break;
1254 
1255         case DATA_TYPE_STRING:
1256 		rv = zfs_nvstore_setter(vdev, dt, name, data, strlen(data) + 1);
1257 		break;
1258 
1259 	case DATA_TYPE_BOOLEAN_VALUE:
1260 		rv = get_int64(data, &val);
1261 		if (rv == 0) {
1262 			boolean_t v = val;
1263 
1264 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1265 		}
1266 
1267 	default:
1268 		rv = EINVAL;
1269 	}
1270 	return (rv);
1271 }
1272 
1273 static int
1274 zfs_nvstore_unset_impl(void *vdev, const char *name, bool unset_env)
1275 {
1276 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
1277 	spa_t *spa;
1278 	nvlist_t *nv;
1279 	int rv;
1280 
1281 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1282 		return (ENOTSUP);
1283 
1284 	if ((spa = spa_find_by_dev(dev)) == NULL)
1285 		return (ENXIO);
1286 
1287 	if (spa->spa_bootenv == NULL)
1288 		return (ENXIO);
1289 
1290 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
1291 	    NULL, &nv, NULL) != 0)
1292 		return (ENOENT);
1293 
1294 	rv = nvlist_remove(nv, name, DATA_TYPE_UNKNOWN);
1295 	if (rv == 0) {
1296 		if (nvlist_next_nvpair(nv, NULL) == NULL) {
1297 			rv = nvlist_remove(spa->spa_bootenv, OS_NVSTORE,
1298 			    DATA_TYPE_NVLIST);
1299 		} else {
1300 			rv = nvlist_add_nvlist(spa->spa_bootenv,
1301 			    OS_NVSTORE, nv);
1302 		}
1303 		if (rv == 0)
1304 			rv = zfs_set_bootenv(vdev, spa->spa_bootenv);
1305 	}
1306 
1307 	if (unset_env)
1308 		env_discard(env_getenv(name));
1309 	return (rv);
1310 }
1311 
/*
 * nvstore "unset" callback: remove (name) from the pool's nvstore and ask
 * the implementation to discard the matching environment variable too.
 */
static int
zfs_nvstore_unset(void *vdev, const char *name)
{
	int rv;

	rv = zfs_nvstore_unset_impl(vdev, name, true);
	return (rv);
}
1317 
1318 static int
1319 zfs_nvstore_print(void *vdev __unused, void *ptr)
1320 {
1321 
1322 	nvpair_print(ptr, 0);
1323 	return (0);
1324 }
1325 
1326 /*
1327  * Create environment variable from nvpair.
1328  * set hook will update nvstore with new value, unset hook will remove
1329  * variable from nvstore.
1330  */
1331 static int
1332 zfs_nvstore_setenv(void *vdev __unused, void *ptr)
1333 {
1334 	nvp_header_t *nvh = ptr;
1335 	nv_string_t *nvp_name, *nvp_value;
1336 	nv_pair_data_t *nvp_data;
1337 	char *name, *value;
1338 	int rv = 0;
1339 
1340 	if (nvh == NULL)
1341 		return (ENOENT);
1342 
1343 	nvp_name = (nv_string_t *)(nvh + 1);
1344 	nvp_data = (nv_pair_data_t *)(&nvp_name->nv_data[0] +
1345 	    NV_ALIGN4(nvp_name->nv_size));
1346 
1347 	if ((name = nvstring_get(nvp_name)) == NULL)
1348 		return (ENOMEM);
1349 
1350 	value = NULL;
1351 	switch (nvp_data->nv_type) {
1352 	case DATA_TYPE_BYTE:
1353 	case DATA_TYPE_UINT8:
1354 		(void) asprintf(&value, "%uc",
1355 		    *(unsigned *)&nvp_data->nv_data[0]);
1356 		if (value == NULL)
1357 			rv = ENOMEM;
1358 		break;
1359 
1360 	case DATA_TYPE_INT8:
1361 		(void) asprintf(&value, "%c", *(int *)&nvp_data->nv_data[0]);
1362 		if (value == NULL)
1363 			rv = ENOMEM;
1364 		break;
1365 
1366 	case DATA_TYPE_INT16:
1367 		(void) asprintf(&value, "%hd", *(short *)&nvp_data->nv_data[0]);
1368 		if (value == NULL)
1369 			rv = ENOMEM;
1370 		break;
1371 
1372 	case DATA_TYPE_UINT16:
1373 		(void) asprintf(&value, "%hu",
1374 		    *(unsigned short *)&nvp_data->nv_data[0]);
1375 		if (value == NULL)
1376 			rv = ENOMEM;
1377 		break;
1378 
1379 	case DATA_TYPE_BOOLEAN_VALUE:
1380 	case DATA_TYPE_INT32:
1381 		(void) asprintf(&value, "%d", *(int *)&nvp_data->nv_data[0]);
1382 		if (value == NULL)
1383 			rv = ENOMEM;
1384 		break;
1385 
1386 	case DATA_TYPE_UINT32:
1387 		(void) asprintf(&value, "%u",
1388 		    *(unsigned *)&nvp_data->nv_data[0]);
1389 		if (value == NULL)
1390 			rv = ENOMEM;
1391 		break;
1392 
1393 	case DATA_TYPE_INT64:
1394 		(void) asprintf(&value, "%jd",
1395 		    (intmax_t)*(int64_t *)&nvp_data->nv_data[0]);
1396 		if (value == NULL)
1397 			rv = ENOMEM;
1398 		break;
1399 
1400 	case DATA_TYPE_UINT64:
1401 		(void) asprintf(&value, "%ju",
1402 		    (uintmax_t)*(uint64_t *)&nvp_data->nv_data[0]);
1403 		if (value == NULL)
1404 			rv = ENOMEM;
1405 		break;
1406 
1407 	case DATA_TYPE_STRING:
1408 		nvp_value = (nv_string_t *)&nvp_data->nv_data[0];
1409 		if ((value = nvstring_get(nvp_value)) == NULL) {
1410 			rv = ENOMEM;
1411 			break;
1412 		}
1413 		break;
1414 
1415 	default:
1416 		rv = EINVAL;
1417 		break;
1418 	}
1419 
1420 	if (value != NULL) {
1421 		rv = env_setenv(name, EV_VOLATILE | EV_NOHOOK, value,
1422 		    zfs_nvstore_sethook, zfs_nvstore_unsethook);
1423 		free(value);
1424 	}
1425 	free(name);
1426 	return (rv);
1427 }
1428 
1429 static int
1430 zfs_nvstore_iterate(void *vdev, int (*cb)(void *, void *))
1431 {
1432 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
1433 	spa_t *spa;
1434 	nvlist_t *nv;
1435 	nvp_header_t *nvh;
1436 	int rv;
1437 
1438 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1439 		return (ENOTSUP);
1440 
1441 	if ((spa = spa_find_by_dev(dev)) == NULL)
1442 		return (ENXIO);
1443 
1444 	if (spa->spa_bootenv == NULL)
1445 		return (ENXIO);
1446 
1447 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
1448 	    NULL, &nv, NULL) != 0)
1449 		return (ENOENT);
1450 
1451 	rv = 0;
1452 	nvh = NULL;
1453 	while ((nvh = nvlist_next_nvpair(nv, nvh)) != NULL) {
1454 		rv = cb(vdev, nvh);
1455 		if (rv != 0)
1456 			break;
1457 	}
1458 	return (rv);
1459 }
1460 
1461 nvs_callbacks_t nvstore_zfs_cb = {
1462 	.nvs_getter = zfs_nvstore_getter,
1463 	.nvs_setter = zfs_nvstore_setter,
1464 	.nvs_setter_str = zfs_nvstore_setter_str,
1465 	.nvs_unset = zfs_nvstore_unset,
1466 	.nvs_print = zfs_nvstore_print,
1467 	.nvs_iterate = zfs_nvstore_iterate
1468 };
1469 
1470 int
1471 zfs_attach_nvstore(void *vdev)
1472 {
1473 	struct zfs_devdesc *dev = vdev;
1474 	spa_t *spa;
1475 	uint64_t version;
1476 	int rv;
1477 
1478 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1479 		return (ENOTSUP);
1480 
1481 	if ((spa = spa_find_by_dev(dev)) == NULL)
1482 		return (ENXIO);
1483 
1484 	rv = nvlist_find(spa->spa_bootenv, BOOTENV_VERSION, DATA_TYPE_UINT64,
1485 	    NULL, &version, NULL);
1486 
1487 	if (rv != 0 || version != VB_NVLIST) {
1488 		return (ENXIO);
1489 	}
1490 
1491 	dev = malloc(sizeof (*dev));
1492 	if (dev == NULL)
1493 		return (ENOMEM);
1494 	memcpy(dev, vdev, sizeof (*dev));
1495 
1496 	rv = nvstore_init(spa->spa_name, &nvstore_zfs_cb, dev);
1497 	if (rv != 0)
1498 		free(dev);
1499 	else
1500 		rv = zfs_nvstore_iterate(dev, zfs_nvstore_setenv);
1501 	return (rv);
1502 }
1503 
1504 int
1505 zfs_probe_dev(const char *devname, uint64_t *pool_guid, bool parts_too)
1506 {
1507 	struct ptable *table;
1508 	struct zfs_probe_args pa;
1509 	uint64_t mediasz;
1510 	int ret;
1511 
1512 	if (pool_guid)
1513 		*pool_guid = 0;
1514 	pa.fd = open(devname, O_RDWR);
1515 	if (pa.fd == -1)
1516 		return (ENXIO);
1517 	/* Probe the whole disk */
1518 	ret = zfs_probe(pa.fd, pool_guid);
1519 	if (ret == 0)
1520 		return (0);
1521 	if (!parts_too)
1522 		return (ENXIO);
1523 
1524 	/* Probe each partition */
1525 	ret = ioctl(pa.fd, DIOCGMEDIASIZE, &mediasz);
1526 	if (ret == 0)
1527 		ret = ioctl(pa.fd, DIOCGSECTORSIZE, &pa.secsz);
1528 	if (ret == 0) {
1529 		pa.devname = devname;
1530 		pa.pool_guid = pool_guid;
1531 		table = ptable_open(&pa, mediasz / pa.secsz, pa.secsz,
1532 		    zfs_diskread);
1533 		if (table != NULL) {
1534 			ptable_iterate(table, &pa, zfs_probe_partition);
1535 			ptable_close(table);
1536 		}
1537 	}
1538 	close(pa.fd);
1539 	if (pool_guid && *pool_guid == 0)
1540 		ret = ENXIO;
1541 	return (ret);
1542 }
1543 
1544 /*
1545  * Print information about ZFS pools
1546  */
1547 static int
1548 zfs_dev_print(int verbose)
1549 {
1550 	spa_t *spa;
1551 	char line[80];
1552 	int ret = 0;
1553 
1554 	if (STAILQ_EMPTY(&zfs_pools))
1555 		return (0);
1556 
1557 	printf("%s devices:", zfs_dev.dv_name);
1558 	if ((ret = pager_output("\n")) != 0)
1559 		return (ret);
1560 
1561 	if (verbose) {
1562 		return (spa_all_status());
1563 	}
1564 	STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
1565 		snprintf(line, sizeof(line), "    zfs:%s\n", spa->spa_name);
1566 		ret = pager_output(line);
1567 		if (ret != 0)
1568 			break;
1569 	}
1570 	return (ret);
1571 }
1572 
1573 /*
1574  * Attempt to open the pool described by (dev) for use by (f).
1575  */
1576 static int
1577 zfs_dev_open(struct open_file *f, ...)
1578 {
1579 	va_list		args;
1580 	struct zfs_devdesc	*dev;
1581 	struct zfsmount	*mount;
1582 	spa_t		*spa;
1583 	int		rv;
1584 
1585 	va_start(args, f);
1586 	dev = va_arg(args, struct zfs_devdesc *);
1587 	va_end(args);
1588 
1589 	if ((spa = spa_find_by_dev(dev)) == NULL)
1590 		return (ENXIO);
1591 
1592 	STAILQ_FOREACH(mount, &zfsmount, next) {
1593 		if (spa->spa_guid == mount->spa->spa_guid)
1594 			break;
1595 	}
1596 
1597 	rv = 0;
1598 	/* This device is not set as currdev, mount us private copy. */
1599 	if (mount == NULL)
1600 		rv = zfs_mount(devformat(&dev->dd), NULL, (void **)&mount);
1601 
1602 	if (rv == 0) {
1603 		dev->dd.d_opendata = mount;
1604 	}
1605 	return (rv);
1606 }
1607 
1608 static int
1609 zfs_dev_close(struct open_file *f)
1610 {
1611 	struct devdesc *dev;
1612 	struct zfsmount	*mnt, *mount;
1613 
1614 	dev = f->f_devdata;
1615 	mnt = dev->d_opendata;
1616 
1617 	STAILQ_FOREACH(mount, &zfsmount, next) {
1618 		if (mnt->spa->spa_guid == mount->spa->spa_guid)
1619 			break;
1620 	}
1621 
1622 	/* XXX */
1623 	return (0);
1624 }
1625 
1626 static int
1627 zfs_dev_strategy(void *devdata, int rw, daddr_t dblk, size_t size, char *buf, size_t *rsize)
1628 {
1629 
1630 	return (ENOSYS);
1631 }
1632 
1633 struct devsw zfs_dev = {
1634 	.dv_name = "zfs",
1635 	.dv_type = DEVT_ZFS,
1636 	.dv_init = zfs_dev_init,
1637 	.dv_strategy = zfs_dev_strategy,
1638 	.dv_open = zfs_dev_open,
1639 	.dv_close = zfs_dev_close,
1640 	.dv_ioctl = noioctl,
1641 	.dv_print = zfs_dev_print,
1642 	.dv_cleanup = nullsys,
1643 	.dv_fmtdev = zfs_fmtdev,
1644 	.dv_parsedev = zfs_parsedev,
1645 };
1646 
1647 static int
1648 zfs_parsedev(struct devdesc **idev, const char *devspec, const char **path)
1649 {
1650 	static char	rootname[ZFS_MAXNAMELEN];
1651 	static char	poolname[ZFS_MAXNAMELEN];
1652 	spa_t		*spa;
1653 	const char	*end;
1654 	const char	*np;
1655 	const char	*sep;
1656 	int		rv;
1657 	struct zfs_devdesc *dev;
1658 
1659 	np = devspec + 3;			/* Skip the leading 'zfs' */
1660 	if (*np != ':')
1661 		return (EINVAL);
1662 	np++;
1663 	end = strrchr(np, ':');
1664 	if (end == NULL)
1665 		return (EINVAL);
1666 	sep = strchr(np, '/');
1667 	if (sep == NULL || sep >= end)
1668 		sep = end;
1669 	memcpy(poolname, np, sep - np);
1670 	poolname[sep - np] = '\0';
1671 	if (sep < end) {
1672 		sep++;
1673 		memcpy(rootname, sep, end - sep);
1674 		rootname[end - sep] = '\0';
1675 	}
1676 	else
1677 		rootname[0] = '\0';
1678 
1679 	spa = spa_find_by_name(poolname);
1680 	if (!spa)
1681 		return (ENXIO);
1682 	dev = malloc(sizeof(*dev));
1683 	if (dev == NULL)
1684 		return (ENOMEM);
1685 	dev->pool_guid = spa->spa_guid;
1686 	rv = zfs_lookup_dataset(spa, rootname, &dev->root_guid);
1687 	if (rv != 0) {
1688 		free(dev);
1689 		return (rv);
1690 	}
1691 	if (path != NULL)
1692 		*path = (*end == '\0') ? end : end + 1;
1693 	dev->dd.d_dev = &zfs_dev;
1694 	*idev = &dev->dd;
1695 	return (0);
1696 }
1697 
1698 char *
1699 zfs_fmtdev(struct devdesc *vdev)
1700 {
1701 	static char		rootname[ZFS_MAXNAMELEN];
1702 	static char		buf[2 * ZFS_MAXNAMELEN + 8];
1703 	struct zfs_devdesc	*dev = (struct zfs_devdesc *)vdev;
1704 	spa_t			*spa;
1705 
1706 	buf[0] = '\0';
1707 	if (vdev->d_dev->dv_type != DEVT_ZFS)
1708 		return (buf);
1709 
1710 	/* Do we have any pools? */
1711 	spa = STAILQ_FIRST(&zfs_pools);
1712 	if (spa == NULL)
1713 		return (buf);
1714 
1715 	if (dev->pool_guid == 0)
1716 		dev->pool_guid = spa->spa_guid;
1717 	else
1718 		spa = spa_find_by_guid(dev->pool_guid);
1719 
1720 	if (spa == NULL) {
1721 		printf("ZFS: can't find pool by guid\n");
1722 		return (buf);
1723 	}
1724 	if (dev->root_guid == 0 && zfs_get_root(spa, &dev->root_guid)) {
1725 		printf("ZFS: can't find root filesystem\n");
1726 		return (buf);
1727 	}
1728 	if (zfs_rlookup(spa, dev->root_guid, rootname)) {
1729 		printf("ZFS: can't find filesystem by guid\n");
1730 		return (buf);
1731 	}
1732 
1733 	if (rootname[0] == '\0')
1734 		snprintf(buf, sizeof(buf), "%s:%s:", dev->dd.d_dev->dv_name,
1735 		    spa->spa_name);
1736 	else
1737 		snprintf(buf, sizeof(buf), "%s:%s/%s:", dev->dd.d_dev->dv_name,
1738 		    spa->spa_name, rootname);
1739 	return (buf);
1740 }
1741 
/*
 * Split "<pool>[/<dataset>]" into its pool and dataset parts.
 *
 * name:     the string to split; must not be NULL.
 * poolname: buffer of (size) bytes receiving the NUL terminated pool name.
 * dsnamep:  if not NULL, set to the text after the first '/', or to an
 *           empty string when (name) contains no '/'.  The pointer refers
 *           into (name) (or to a string literal), nothing is allocated.
 *
 * Returns 0 on success or EINVAL if the pool name does not fit.
 */
static int
split_devname(const char *name, char *poolname, size_t size,
    const char **dsnamep)
{
	const char *slash, *rest;
	size_t poollen;

	ASSERT(name != NULL);
	ASSERT(poolname != NULL);

	slash = strchr(name, '/');
	if (slash == NULL) {
		poollen = strlen(name);
		rest = "";
	} else {
		poollen = slash - name;
		rest = slash + 1;
	}

	/* One byte of the buffer is needed for the terminating NUL. */
	if (poollen >= size)
		return (EINVAL);

	strlcpy(poolname, name, poollen + 1);

	if (dsnamep != NULL)
		*dsnamep = rest;

	return (0);
}
1770 
1771 int
1772 zfs_list(const char *name)
1773 {
1774 	static char	poolname[ZFS_MAXNAMELEN];
1775 	uint64_t	objid;
1776 	spa_t		*spa;
1777 	const char	*dsname;
1778 	int		rv;
1779 
1780 	if (split_devname(name, poolname, sizeof(poolname), &dsname) != 0)
1781 		return (EINVAL);
1782 
1783 	spa = spa_find_by_name(poolname);
1784 	if (!spa)
1785 		return (ENXIO);
1786 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1787 	if (rv != 0)
1788 		return (rv);
1789 
1790 	return (zfs_list_dataset(spa, objid));
1791 }
1792 
1793 void
1794 init_zfs_boot_options(const char *currdev_in)
1795 {
1796 	char poolname[ZFS_MAXNAMELEN];
1797 	char *beroot, *currdev;
1798 	spa_t *spa;
1799 	int currdev_len;
1800 	const char *dsname;
1801 
1802 	currdev = NULL;
1803 	currdev_len = strlen(currdev_in);
1804 	if (currdev_len == 0)
1805 		return;
1806 	if (strncmp(currdev_in, "zfs:", 4) != 0)
1807 		return;
1808 	currdev = strdup(currdev_in);
1809 	if (currdev == NULL)
1810 		return;
1811 	/* Remove the trailing : */
1812 	currdev[currdev_len - 1] = '\0';
1813 
1814 	setenv("zfs_be_active", currdev, 1);
1815 	setenv("zfs_be_currpage", "1", 1);
1816 	/* Remove the last element (current bootenv) */
1817 	beroot = strrchr(currdev, '/');
1818 	if (beroot != NULL)
1819 		beroot[0] = '\0';
1820 	beroot = strchr(currdev, ':') + 1;
1821 	setenv("zfs_be_root", beroot, 1);
1822 
1823 	if (split_devname(beroot, poolname, sizeof(poolname), &dsname) != 0)
1824 		return;
1825 
1826 	spa = spa_find_by_name(poolname);
1827 	if (spa == NULL)
1828 		return;
1829 
1830 	zfs_bootenv_initial("bootenvs", spa, beroot, dsname, 0);
1831 	zfs_checkpoints_initial(spa, beroot, dsname);
1832 
1833 	free(currdev);
1834 }
1835 
1836 static void
1837 zfs_checkpoints_initial(spa_t *spa, const char *name, const char *dsname)
1838 {
1839 	char envname[32];
1840 
1841 	if (spa->spa_uberblock_checkpoint.ub_checkpoint_txg != 0) {
1842 		snprintf(envname, sizeof(envname), "zpool_checkpoint");
1843 		setenv(envname, name, 1);
1844 
1845 		spa->spa_uberblock = &spa->spa_uberblock_checkpoint;
1846 		spa->spa_mos = &spa->spa_mos_checkpoint;
1847 
1848 		zfs_bootenv_initial("bootenvs_check", spa, name, dsname, 1);
1849 
1850 		spa->spa_uberblock = &spa->spa_uberblock_master;
1851 		spa->spa_mos = &spa->spa_mos_master;
1852 	}
1853 }
1854 
1855 static void
1856 zfs_bootenv_initial(const char *envprefix, spa_t *spa, const char *rootname,
1857    const char *dsname, int checkpoint)
1858 {
1859 	char		envname[32], envval[256];
1860 	uint64_t	objid;
1861 	int		bootenvs_idx, rv;
1862 
1863 	SLIST_INIT(&zfs_be_head);
1864 	zfs_env_count = 0;
1865 
1866 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1867 	if (rv != 0)
1868 		return;
1869 
1870 	rv = zfs_callback_dataset(spa, objid, zfs_belist_add);
1871 	bootenvs_idx = 0;
1872 	/* Populate the initial environment variables */
1873 	SLIST_FOREACH_SAFE(zfs_be, &zfs_be_head, entries, zfs_be_tmp) {
1874 		/* Enumerate all bootenvs for general usage */
1875 		snprintf(envname, sizeof(envname), "%s[%d]",
1876 		    envprefix, bootenvs_idx);
1877 		snprintf(envval, sizeof(envval), "zfs:%s%s/%s",
1878 		    checkpoint ? "!" : "", rootname, zfs_be->name);
1879 		rv = setenv(envname, envval, 1);
1880 		if (rv != 0)
1881 			break;
1882 		bootenvs_idx++;
1883 	}
1884 	snprintf(envname, sizeof(envname), "%s_count", envprefix);
1885 	snprintf(envval, sizeof(envval), "%d", bootenvs_idx);
1886 	setenv(envname, envval, 1);
1887 
1888 	/* Clean up the SLIST of ZFS BEs */
1889 	while (!SLIST_EMPTY(&zfs_be_head)) {
1890 		zfs_be = SLIST_FIRST(&zfs_be_head);
1891 		SLIST_REMOVE_HEAD(&zfs_be_head, entries);
1892 		free(zfs_be->name);
1893 		free(zfs_be);
1894 	}
1895 }
1896 
1897 int
1898 zfs_bootenv(const char *name)
1899 {
1900 	char		poolname[ZFS_MAXNAMELEN], *root;
1901 	const char	*dsname;
1902 	char		becount[4];
1903 	uint64_t	objid;
1904 	spa_t		*spa;
1905 	int		rv, pages, perpage, currpage;
1906 
1907 	if (name == NULL)
1908 		return (EINVAL);
1909 	if ((root = getenv("zfs_be_root")) == NULL)
1910 		return (EINVAL);
1911 
1912 	if (strcmp(name, root) != 0) {
1913 		if (setenv("zfs_be_root", name, 1) != 0)
1914 			return (ENOMEM);
1915 	}
1916 
1917 	SLIST_INIT(&zfs_be_head);
1918 	zfs_env_count = 0;
1919 
1920 	if (split_devname(name, poolname, sizeof(poolname), &dsname) != 0)
1921 		return (EINVAL);
1922 
1923 	spa = spa_find_by_name(poolname);
1924 	if (!spa)
1925 		return (ENXIO);
1926 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1927 	if (rv != 0)
1928 		return (rv);
1929 	rv = zfs_callback_dataset(spa, objid, zfs_belist_add);
1930 
1931 	/* Calculate and store the number of pages of BEs */
1932 	perpage = (ZFS_BE_LAST - ZFS_BE_FIRST + 1);
1933 	pages = (zfs_env_count / perpage) + ((zfs_env_count % perpage) > 0 ? 1 : 0);
1934 	snprintf(becount, 4, "%d", pages);
1935 	if (setenv("zfs_be_pages", becount, 1) != 0)
1936 		return (ENOMEM);
1937 
1938 	/* Roll over the page counter if it has exceeded the maximum */
1939 	currpage = strtol(getenv("zfs_be_currpage"), NULL, 10);
1940 	if (currpage > pages) {
1941 		if (setenv("zfs_be_currpage", "1", 1) != 0)
1942 			return (ENOMEM);
1943 	}
1944 
1945 	/* Populate the menu environment variables */
1946 	zfs_set_env();
1947 
1948 	/* Clean up the SLIST of ZFS BEs */
1949 	while (!SLIST_EMPTY(&zfs_be_head)) {
1950 		zfs_be = SLIST_FIRST(&zfs_be_head);
1951 		SLIST_REMOVE_HEAD(&zfs_be_head, entries);
1952 		free(zfs_be->name);
1953 		free(zfs_be);
1954 	}
1955 
1956 	return (rv);
1957 }
1958 
1959 int
1960 zfs_belist_add(const char *name, uint64_t value __unused)
1961 {
1962 
1963 	/* Skip special datasets that start with a $ character */
1964 	if (strncmp(name, "$", 1) == 0) {
1965 		return (0);
1966 	}
1967 	/* Add the boot environment to the head of the SLIST */
1968 	zfs_be = malloc(sizeof(struct zfs_be_entry));
1969 	if (zfs_be == NULL) {
1970 		return (ENOMEM);
1971 	}
1972 	zfs_be->name = strdup(name);
1973 	if (zfs_be->name == NULL) {
1974 		free(zfs_be);
1975 		return (ENOMEM);
1976 	}
1977 	SLIST_INSERT_HEAD(&zfs_be_head, zfs_be, entries);
1978 	zfs_env_count++;
1979 
1980 	return (0);
1981 }
1982 
1983 int
1984 zfs_set_env(void)
1985 {
1986 	char envname[32], envval[256];
1987 	char *beroot, *pagenum;
1988 	int rv, page, ctr;
1989 
1990 	beroot = getenv("zfs_be_root");
1991 	if (beroot == NULL) {
1992 		return (1);
1993 	}
1994 
1995 	pagenum = getenv("zfs_be_currpage");
1996 	if (pagenum != NULL) {
1997 		page = strtol(pagenum, NULL, 10);
1998 	} else {
1999 		page = 1;
2000 	}
2001 
2002 	ctr = 1;
2003 	rv = 0;
2004 	zfs_env_index = ZFS_BE_FIRST;
2005 	SLIST_FOREACH_SAFE(zfs_be, &zfs_be_head, entries, zfs_be_tmp) {
2006 		/* Skip to the requested page number */
2007 		if (ctr <= ((ZFS_BE_LAST - ZFS_BE_FIRST + 1) * (page - 1))) {
2008 			ctr++;
2009 			continue;
2010 		}
2011 
2012 		snprintf(envname, sizeof(envname), "bootenvmenu_caption[%d]", zfs_env_index);
2013 		snprintf(envval, sizeof(envval), "%s", zfs_be->name);
2014 		rv = setenv(envname, envval, 1);
2015 		if (rv != 0) {
2016 			break;
2017 		}
2018 
2019 		snprintf(envname, sizeof(envname), "bootenvansi_caption[%d]", zfs_env_index);
2020 		rv = setenv(envname, envval, 1);
2021 		if (rv != 0){
2022 			break;
2023 		}
2024 
2025 		snprintf(envname, sizeof(envname), "bootenvmenu_command[%d]", zfs_env_index);
2026 		rv = setenv(envname, "set_bootenv", 1);
2027 		if (rv != 0){
2028 			break;
2029 		}
2030 
2031 		snprintf(envname, sizeof(envname), "bootenv_root[%d]", zfs_env_index);
2032 		snprintf(envval, sizeof(envval), "zfs:%s/%s", beroot, zfs_be->name);
2033 		rv = setenv(envname, envval, 1);
2034 		if (rv != 0){
2035 			break;
2036 		}
2037 
2038 		zfs_env_index++;
2039 		if (zfs_env_index > ZFS_BE_LAST) {
2040 			break;
2041 		}
2042 
2043 	}
2044 
2045 	for (; zfs_env_index <= ZFS_BE_LAST; zfs_env_index++) {
2046 		snprintf(envname, sizeof(envname), "bootenvmenu_caption[%d]", zfs_env_index);
2047 		(void)unsetenv(envname);
2048 		snprintf(envname, sizeof(envname), "bootenvansi_caption[%d]", zfs_env_index);
2049 		(void)unsetenv(envname);
2050 		snprintf(envname, sizeof(envname), "bootenvmenu_command[%d]", zfs_env_index);
2051 		(void)unsetenv(envname);
2052 		snprintf(envname, sizeof(envname), "bootenv_root[%d]", zfs_env_index);
2053 		(void)unsetenv(envname);
2054 	}
2055 
2056 	return (rv);
2057 }
2058