xref: /freebsd/stand/libsa/zfs/zfs.c (revision bc5304a006238115291e7568583632889dffbab9)
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  *	$FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 /*
33  *	Stand-alone file reading package.
34  */
35 
36 #include <stand.h>
37 #include <sys/disk.h>
38 #include <sys/param.h>
39 #include <sys/time.h>
40 #include <sys/queue.h>
41 #include <part.h>
42 #include <stddef.h>
43 #include <stdarg.h>
44 #include <string.h>
45 #include <bootstrap.h>
46 
47 #include "libzfs.h"
48 
49 #include "zfsimpl.c"
50 
51 /*
 * Define the range of indexes to be populated with ZFS Boot Environments.
 * NOTE(review): the code that consumes these limits is outside this view;
 * confirm there whether ZFS_BE_LAST is meant to be inclusive.
 */
52 #define		ZFS_BE_FIRST	4
53 #define		ZFS_BE_LAST	8
54 
/* File system entry points; they are wired into the zfs_fsops table. */
55 static int	zfs_open(const char *path, struct open_file *f);
56 static int	zfs_close(struct open_file *f);
57 static int	zfs_read(struct open_file *f, void *buf, size_t size, size_t *resid);
58 static off_t	zfs_seek(struct open_file *f, off_t offset, int where);
59 static int	zfs_stat(struct open_file *f, struct stat *sb);
60 static int	zfs_readdir(struct open_file *f, struct dirent *d);
61 
/*
 * Boot environment helpers.
 * NOTE(review): their definitions are not in this part of the file.
 */
62 static void	zfs_bootenv_initial(const char *envname, spa_t *spa,
63 		    const char *name, const char *dsname, int checkpoint);
64 static void	zfs_checkpoints_initial(spa_t *spa, const char *name,
65 		    const char *dsname);
66 
/*
 * Device switch entry for ZFS.  zfs_open() only accepts files whose f_dev
 * points at this object.
 */
67 struct devsw zfs_dev;
68 
/*
 * File system operations table for ZFS.  The slot labels below follow the
 * names of the handlers; NOTE(review): struct fs_ops itself is declared
 * elsewhere, confirm the slot order against it.
 */
69 struct fs_ops zfs_fsops = {
70 	"zfs",		/* file system name */
71 	zfs_open,	/* open */
72 	zfs_close,	/* close */
73 	zfs_read,	/* read */
74 	null_write,	/* write: no ZFS-specific handler is provided */
75 	zfs_seek,	/* seek */
76 	zfs_stat,	/* stat */
77 	zfs_readdir	/* readdir */
78 };
79 
80 /*
 * In-core open file.
 *
 * One instance is allocated (zeroed) by zfs_open() and hung off
 * open_file.f_fsdata.  The zap fields are only used by zfs_readdir().
 */
83 struct file {
84 	off_t		f_seekp;	/* seek pointer; doubles as the directory cursor in zfs_readdir() */
85 	dnode_phys_t	f_dnode;	/* dnode of the file, filled in by zfs_lookup() */
86 	uint64_t	f_zap_type;	/* zap type for readdir, read from offset 0 of the directory */
87 	uint64_t	f_num_leafs;	/* number of fzap leaf blocks */
88 	zap_leaf_phys_t	*f_zap_leaf;	/* zap leaf buffer, malloc'ed by zfs_readdir() for fat zaps */
89 };
90 
/*
 * Boot environment bookkeeping.
 * NOTE(review): none of these objects is referenced in the visible part of
 * the file; the descriptions below are inferred from the names only and
 * should be confirmed against the code that uses them.
 */
91 static int	zfs_env_index;	/* presumably the current position in the BE list */
92 static int	zfs_env_count;	/* presumably the number of known BEs */
93 
/* Singly-linked list of boot environment entries, statically initialised. */
94 SLIST_HEAD(zfs_be_list, zfs_be_entry) zfs_be_head = SLIST_HEAD_INITIALIZER(zfs_be_head);
95 struct zfs_be_list *zfs_be_headp;
/* One list element: a name string plus the list linkage. */
96 struct zfs_be_entry {
97 	char *name;
98 	SLIST_ENTRY(zfs_be_entry) entries;
99 } *zfs_be, *zfs_be_tmp;
100 
101 /*
102  * Open a file.
103  */
104 static int
105 zfs_open(const char *upath, struct open_file *f)
106 {
107 	struct zfsmount *mount = (struct zfsmount *)f->f_devdata;
108 	struct file *fp;
109 	int rc;
110 
111 	if (f->f_dev != &zfs_dev)
112 		return (EINVAL);
113 
114 	/* allocate file system specific data structure */
115 	fp = calloc(1, sizeof(struct file));
116 	if (fp == NULL)
117 		return (ENOMEM);
118 	f->f_fsdata = fp;
119 
120 	rc = zfs_lookup(mount, upath, &fp->f_dnode);
121 	fp->f_seekp = 0;
122 	if (rc) {
123 		f->f_fsdata = NULL;
124 		free(fp);
125 	}
126 	return (rc);
127 }
128 
129 static int
130 zfs_close(struct open_file *f)
131 {
132 	struct file *fp = (struct file *)f->f_fsdata;
133 
134 	dnode_cache_obj = NULL;
135 	f->f_fsdata = NULL;
136 
137 	free(fp);
138 	return (0);
139 }
140 
141 /*
142  * Copy a portion of a file into kernel memory.
143  * Cross block boundaries when necessary.
144  */
145 static int
146 zfs_read(struct open_file *f, void *start, size_t size, size_t *resid	/* out */)
147 {
148 	const spa_t *spa = ((struct zfsmount *)f->f_devdata)->spa;
149 	struct file *fp = (struct file *)f->f_fsdata;
150 	struct stat sb;
151 	size_t n;
152 	int rc;
153 
154 	rc = zfs_stat(f, &sb);
155 	if (rc)
156 		return (rc);
157 	n = size;
158 	if (fp->f_seekp + n > sb.st_size)
159 		n = sb.st_size - fp->f_seekp;
160 
161 	rc = dnode_read(spa, &fp->f_dnode, fp->f_seekp, start, n);
162 	if (rc)
163 		return (rc);
164 
165 	if (0) {
166 	    int i;
167 	    for (i = 0; i < n; i++)
168 		putchar(((char*) start)[i]);
169 	}
170 	fp->f_seekp += n;
171 	if (resid)
172 		*resid = size - n;
173 
174 	return (0);
175 }
176 
177 static off_t
178 zfs_seek(struct open_file *f, off_t offset, int where)
179 {
180 	struct file *fp = (struct file *)f->f_fsdata;
181 
182 	switch (where) {
183 	case SEEK_SET:
184 		fp->f_seekp = offset;
185 		break;
186 	case SEEK_CUR:
187 		fp->f_seekp += offset;
188 		break;
189 	case SEEK_END:
190 	    {
191 		struct stat sb;
192 		int error;
193 
194 		error = zfs_stat(f, &sb);
195 		if (error != 0) {
196 			errno = error;
197 			return (-1);
198 		}
199 		fp->f_seekp = sb.st_size - offset;
200 		break;
201 	    }
202 	default:
203 		errno = EINVAL;
204 		return (-1);
205 	}
206 	return (fp->f_seekp);
207 }
208 
209 static int
210 zfs_stat(struct open_file *f, struct stat *sb)
211 {
212 	const spa_t *spa = ((struct zfsmount *)f->f_devdata)->spa;
213 	struct file *fp = (struct file *)f->f_fsdata;
214 
215 	return (zfs_dnode_stat(spa, &fp->f_dnode, sb));
216 }
217 
218 static int
219 zfs_readdir(struct open_file *f, struct dirent *d)
220 {
221 	const spa_t *spa = ((struct zfsmount *)f->f_devdata)->spa;
222 	struct file *fp = (struct file *)f->f_fsdata;
223 	mzap_ent_phys_t mze;
224 	struct stat sb;
225 	size_t bsize = fp->f_dnode.dn_datablkszsec << SPA_MINBLOCKSHIFT;
226 	int rc;
227 
228 	rc = zfs_stat(f, &sb);
229 	if (rc)
230 		return (rc);
231 	if (!S_ISDIR(sb.st_mode))
232 		return (ENOTDIR);
233 
234 	/*
235 	 * If this is the first read, get the zap type.
236 	 */
237 	if (fp->f_seekp == 0) {
238 		rc = dnode_read(spa, &fp->f_dnode,
239 				0, &fp->f_zap_type, sizeof(fp->f_zap_type));
240 		if (rc)
241 			return (rc);
242 
243 		if (fp->f_zap_type == ZBT_MICRO) {
244 			fp->f_seekp = offsetof(mzap_phys_t, mz_chunk);
245 		} else {
246 			rc = dnode_read(spa, &fp->f_dnode,
247 					offsetof(zap_phys_t, zap_num_leafs),
248 					&fp->f_num_leafs,
249 					sizeof(fp->f_num_leafs));
250 			if (rc)
251 				return (rc);
252 
253 			fp->f_seekp = bsize;
254 			fp->f_zap_leaf = malloc(bsize);
255 			if (fp->f_zap_leaf == NULL)
256 				return (ENOMEM);
257 			rc = dnode_read(spa, &fp->f_dnode,
258 					fp->f_seekp,
259 					fp->f_zap_leaf,
260 					bsize);
261 			if (rc)
262 				return (rc);
263 		}
264 	}
265 
266 	if (fp->f_zap_type == ZBT_MICRO) {
267 	mzap_next:
268 		if (fp->f_seekp >= bsize)
269 			return (ENOENT);
270 
271 		rc = dnode_read(spa, &fp->f_dnode,
272 				fp->f_seekp, &mze, sizeof(mze));
273 		if (rc)
274 			return (rc);
275 		fp->f_seekp += sizeof(mze);
276 
277 		if (!mze.mze_name[0])
278 			goto mzap_next;
279 
280 		d->d_fileno = ZFS_DIRENT_OBJ(mze.mze_value);
281 		d->d_type = ZFS_DIRENT_TYPE(mze.mze_value);
282 		strcpy(d->d_name, mze.mze_name);
283 		d->d_namlen = strlen(d->d_name);
284 		return (0);
285 	} else {
286 		zap_leaf_t zl;
287 		zap_leaf_chunk_t *zc, *nc;
288 		int chunk;
289 		size_t namelen;
290 		char *p;
291 		uint64_t value;
292 
293 		/*
294 		 * Initialise this so we can use the ZAP size
295 		 * calculating macros.
296 		 */
297 		zl.l_bs = ilog2(bsize);
298 		zl.l_phys = fp->f_zap_leaf;
299 
300 		/*
301 		 * Figure out which chunk we are currently looking at
302 		 * and consider seeking to the next leaf. We use the
303 		 * low bits of f_seekp as a simple chunk index.
304 		 */
305 	fzap_next:
306 		chunk = fp->f_seekp & (bsize - 1);
307 		if (chunk == ZAP_LEAF_NUMCHUNKS(&zl)) {
308 			fp->f_seekp = rounddown2(fp->f_seekp, bsize) + bsize;
309 			chunk = 0;
310 
311 			/*
312 			 * Check for EOF and read the new leaf.
313 			 */
314 			if (fp->f_seekp >= bsize * fp->f_num_leafs)
315 				return (ENOENT);
316 
317 			rc = dnode_read(spa, &fp->f_dnode,
318 					fp->f_seekp,
319 					fp->f_zap_leaf,
320 					bsize);
321 			if (rc)
322 				return (rc);
323 		}
324 
325 		zc = &ZAP_LEAF_CHUNK(&zl, chunk);
326 		fp->f_seekp++;
327 		if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
328 			goto fzap_next;
329 
330 		namelen = zc->l_entry.le_name_numints;
331 		if (namelen > sizeof(d->d_name))
332 			namelen = sizeof(d->d_name);
333 
334 		/*
335 		 * Paste the name back together.
336 		 */
337 		nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
338 		p = d->d_name;
339 		while (namelen > 0) {
340 			int len;
341 			len = namelen;
342 			if (len > ZAP_LEAF_ARRAY_BYTES)
343 				len = ZAP_LEAF_ARRAY_BYTES;
344 			memcpy(p, nc->l_array.la_array, len);
345 			p += len;
346 			namelen -= len;
347 			nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
348 		}
349 		d->d_name[sizeof(d->d_name) - 1] = 0;
350 
351 		/*
352 		 * Assume the first eight bytes of the value are
353 		 * a uint64_t.
354 		 */
355 		value = fzap_leaf_value(&zl, zc);
356 
357 		d->d_fileno = ZFS_DIRENT_OBJ(value);
358 		d->d_type = ZFS_DIRENT_TYPE(value);
359 		d->d_namlen = strlen(d->d_name);
360 
361 		return (0);
362 	}
363 }
364 
365 static int
366 vdev_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t bytes)
367 {
368 	int fd, ret;
369 	size_t res, head, tail, total_size, full_sec_size;
370 	unsigned secsz, do_tail_read;
371 	off_t start_sec;
372 	char *outbuf, *bouncebuf;
373 
374 	fd = (uintptr_t) priv;
375 	outbuf = (char *) buf;
376 	bouncebuf = NULL;
377 
378 	ret = ioctl(fd, DIOCGSECTORSIZE, &secsz);
379 	if (ret != 0)
380 		return (ret);
381 
382 	/*
383 	 * Handling reads of arbitrary offset and size - multi-sector case
384 	 * and single-sector case.
385 	 *
386 	 *                        Multi-sector Case
387 	 *                (do_tail_read = true if tail > 0)
388 	 *
389 	 *   |<----------------------total_size--------------------->|
390 	 *   |                                                       |
391 	 *   |<--head-->|<--------------bytes------------>|<--tail-->|
392 	 *   |          |                                 |          |
393 	 *   |          |       |<~full_sec_size~>|       |          |
394 	 *   +------------------+                 +------------------+
395 	 *   |          |0101010|     .  .  .     |0101011|          |
396 	 *   +------------------+                 +------------------+
397 	 *         start_sec                         start_sec + n
398 	 *
399 	 *
400 	 *                      Single-sector Case
401 	 *                    (do_tail_read = false)
402 	 *
403 	 *              |<------total_size = secsz----->|
404 	 *              |                               |
405 	 *              |<-head->|<---bytes--->|<-tail->|
406 	 *              +-------------------------------+
407 	 *              |        |0101010101010|        |
408 	 *              +-------------------------------+
409 	 *                          start_sec
410 	 */
411 	start_sec = offset / secsz;
412 	head = offset % secsz;
413 	total_size = roundup2(head + bytes, secsz);
414 	tail = total_size - (head + bytes);
415 	do_tail_read = ((tail > 0) && (head + bytes > secsz));
416 	full_sec_size = total_size;
417 	if (head > 0)
418 		full_sec_size -= secsz;
419 	if (do_tail_read)
420 		full_sec_size -= secsz;
421 
422 	/* Return of partial sector data requires a bounce buffer. */
423 	if ((head > 0) || do_tail_read || bytes < secsz) {
424 		bouncebuf = malloc(secsz);
425 		if (bouncebuf == NULL) {
426 			printf("vdev_read: out of memory\n");
427 			return (ENOMEM);
428 		}
429 	}
430 
431 	if (lseek(fd, start_sec * secsz, SEEK_SET) == -1) {
432 		ret = errno;
433 		goto error;
434 	}
435 
436 	/* Partial data return from first sector */
437 	if (head > 0) {
438 		res = read(fd, bouncebuf, secsz);
439 		if (res != secsz) {
440 			ret = EIO;
441 			goto error;
442 		}
443 		memcpy(outbuf, bouncebuf + head, min(secsz - head, bytes));
444 		outbuf += min(secsz - head, bytes);
445 	}
446 
447 	/*
448 	 * Full data return from read sectors.
449 	 * Note, there is still corner case where we read
450 	 * from sector boundary, but less than sector size, e.g. reading 512B
451 	 * from 4k sector.
452 	 */
453 	if (full_sec_size > 0) {
454 		if (bytes < full_sec_size) {
455 			res = read(fd, bouncebuf, secsz);
456 			if (res != secsz) {
457 				ret = EIO;
458 				goto error;
459 			}
460 			memcpy(outbuf, bouncebuf, bytes);
461 		} else {
462 			res = read(fd, outbuf, full_sec_size);
463 			if (res != full_sec_size) {
464 				ret = EIO;
465 				goto error;
466 			}
467 			outbuf += full_sec_size;
468 		}
469 	}
470 
471 	/* Partial data return from last sector */
472 	if (do_tail_read) {
473 		res = read(fd, bouncebuf, secsz);
474 		if (res != secsz) {
475 			ret = EIO;
476 			goto error;
477 		}
478 		memcpy(outbuf, bouncebuf, secsz - tail);
479 	}
480 
481 	ret = 0;
482 error:
483 	free(bouncebuf);
484 	return (ret);
485 }
486 
487 static int
488 vdev_write(vdev_t *vdev, off_t offset, void *buf, size_t bytes)
489 {
490 	int fd, ret;
491 	size_t head, tail, total_size, full_sec_size;
492 	unsigned secsz, do_tail_write;
493 	off_t start_sec;
494 	ssize_t res;
495 	char *outbuf, *bouncebuf;
496 
497 	fd = (uintptr_t)vdev->v_priv;
498 	outbuf = (char *)buf;
499 	bouncebuf = NULL;
500 
501 	ret = ioctl(fd, DIOCGSECTORSIZE, &secsz);
502 	if (ret != 0)
503 		return (ret);
504 
505 	start_sec = offset / secsz;
506 	head = offset % secsz;
507 	total_size = roundup2(head + bytes, secsz);
508 	tail = total_size - (head + bytes);
509 	do_tail_write = ((tail > 0) && (head + bytes > secsz));
510 	full_sec_size = total_size;
511 	if (head > 0)
512 		full_sec_size -= secsz;
513 	if (do_tail_write)
514 		full_sec_size -= secsz;
515 
516 	/* Partial sector write requires a bounce buffer. */
517 	if ((head > 0) || do_tail_write || bytes < secsz) {
518 		bouncebuf = malloc(secsz);
519 		if (bouncebuf == NULL) {
520 			printf("vdev_write: out of memory\n");
521 			return (ENOMEM);
522 		}
523 	}
524 
525 	if (lseek(fd, start_sec * secsz, SEEK_SET) == -1) {
526 		ret = errno;
527 		goto error;
528 	}
529 
530 	/* Partial data for first sector */
531 	if (head > 0) {
532 		res = read(fd, bouncebuf, secsz);
533 		if ((unsigned)res != secsz) {
534 			ret = EIO;
535 			goto error;
536 		}
537 		memcpy(bouncebuf + head, outbuf, min(secsz - head, bytes));
538 		(void) lseek(fd, -secsz, SEEK_CUR);
539 		res = write(fd, bouncebuf, secsz);
540 		if ((unsigned)res != secsz) {
541 			ret = EIO;
542 			goto error;
543 		}
544 		outbuf += min(secsz - head, bytes);
545 	}
546 
547 	/*
548 	 * Full data write to sectors.
549 	 * Note, there is still corner case where we write
550 	 * to sector boundary, but less than sector size, e.g. write 512B
551 	 * to 4k sector.
552 	 */
553 	if (full_sec_size > 0) {
554 		if (bytes < full_sec_size) {
555 			res = read(fd, bouncebuf, secsz);
556 			if ((unsigned)res != secsz) {
557 				ret = EIO;
558 				goto error;
559 			}
560 			memcpy(bouncebuf, outbuf, bytes);
561 			(void) lseek(fd, -secsz, SEEK_CUR);
562 			res = write(fd, bouncebuf, secsz);
563 			if ((unsigned)res != secsz) {
564 				ret = EIO;
565 				goto error;
566 			}
567 		} else {
568 			res = write(fd, outbuf, full_sec_size);
569 			if ((unsigned)res != full_sec_size) {
570 				ret = EIO;
571 				goto error;
572 			}
573 			outbuf += full_sec_size;
574 		}
575 	}
576 
577 	/* Partial data write to last sector */
578 	if (do_tail_write) {
579 		res = read(fd, bouncebuf, secsz);
580 		if ((unsigned)res != secsz) {
581 			ret = EIO;
582 			goto error;
583 		}
584 		memcpy(bouncebuf, outbuf, secsz - tail);
585 		(void) lseek(fd, -secsz, SEEK_CUR);
586 		res = write(fd, bouncebuf, secsz);
587 		if ((unsigned)res != secsz) {
588 			ret = EIO;
589 			goto error;
590 		}
591 	}
592 
593 	ret = 0;
594 error:
595 	free(bouncebuf);
596 	return (ret);
597 }
598 
599 static int
600 zfs_dev_init(void)
601 {
602 	spa_t *spa;
603 	spa_t *next;
604 	spa_t *prev;
605 
606 	zfs_init();
607 	if (archsw.arch_zfs_probe == NULL)
608 		return (ENXIO);
609 	archsw.arch_zfs_probe();
610 
611 	prev = NULL;
612 	spa = STAILQ_FIRST(&zfs_pools);
613 	while (spa != NULL) {
614 		next = STAILQ_NEXT(spa, spa_link);
615 		if (zfs_spa_init(spa)) {
616 			if (prev == NULL)
617 				STAILQ_REMOVE_HEAD(&zfs_pools, spa_link);
618 			else
619 				STAILQ_REMOVE_AFTER(&zfs_pools, prev, spa_link);
620 		} else
621 			prev = spa;
622 		spa = next;
623 	}
624 	return (0);
625 }
626 
/*
 * Arguments handed through the partition table iterator to
 * zfs_probe_partition() and zfs_diskread().
 */
627 struct zfs_probe_args {
628 	int		fd;		/* descriptor zfs_diskread() reads from */
629 	const char	*devname;	/* name of the enclosing device; its last character is dropped when partition names are built */
630 	uint64_t	*pool_guid;	/* passed on to zfs_probe(), which fills it in when it points at 0 */
631 	u_int		secsz;		/* sector size; scales block numbers and counts to bytes */
632 };
633 
634 static int
635 zfs_diskread(void *arg, void *buf, size_t blocks, uint64_t offset)
636 {
637 	struct zfs_probe_args *ppa;
638 
639 	ppa = (struct zfs_probe_args *)arg;
640 	return (vdev_read(NULL, (void *)(uintptr_t)ppa->fd,
641 	    offset * ppa->secsz, buf, blocks * ppa->secsz));
642 }
643 
644 static int
645 zfs_probe(int fd, uint64_t *pool_guid)
646 {
647 	spa_t *spa;
648 	int ret;
649 
650 	spa = NULL;
651 	ret = vdev_probe(vdev_read, vdev_write, (void *)(uintptr_t)fd, &spa);
652 	if (ret == 0 && pool_guid != NULL)
653 		if (*pool_guid == 0)
654 			*pool_guid = spa->spa_guid;
655 	return (ret);
656 }
657 
658 static int
659 zfs_probe_partition(void *arg, const char *partname,
660     const struct ptable_entry *part)
661 {
662 	struct zfs_probe_args *ppa, pa;
663 	struct ptable *table;
664 	char devname[32];
665 	int ret;
666 
667 	/* Probe only freebsd-zfs and freebsd partitions */
668 	if (part->type != PART_FREEBSD &&
669 	    part->type != PART_FREEBSD_ZFS)
670 		return (0);
671 
672 	ppa = (struct zfs_probe_args *)arg;
673 	strncpy(devname, ppa->devname, strlen(ppa->devname) - 1);
674 	devname[strlen(ppa->devname) - 1] = '\0';
675 	snprintf(devname, sizeof(devname), "%s%s:", devname, partname);
676 	pa.fd = open(devname, O_RDWR);
677 	if (pa.fd == -1)
678 		return (0);
679 	ret = zfs_probe(pa.fd, ppa->pool_guid);
680 	if (ret == 0)
681 		return (0);
682 	/* Do we have BSD label here? */
683 	if (part->type == PART_FREEBSD) {
684 		pa.devname = devname;
685 		pa.pool_guid = ppa->pool_guid;
686 		pa.secsz = ppa->secsz;
687 		table = ptable_open(&pa, part->end - part->start + 1,
688 		    ppa->secsz, zfs_diskread);
689 		if (table != NULL) {
690 			ptable_iterate(table, &pa, zfs_probe_partition);
691 			ptable_close(table);
692 		}
693 	}
694 	close(pa.fd);
695 	return (0);
696 }
697 
698 /*
699  * Return bootenv nvlist from pool label.
700  */
701 int
702 zfs_get_bootenv(void *vdev, nvlist_t **benvp)
703 {
704 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
705 	nvlist_t *benv = NULL;
706 	vdev_t *vd;
707 	spa_t *spa;
708 
709 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
710 		return (ENOTSUP);
711 
712 	if ((spa = spa_find_by_dev(dev)) == NULL)
713 		return (ENXIO);
714 
715 	if (spa->spa_bootenv == NULL) {
716 		STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children,
717 		    v_childlink) {
718 			benv = vdev_read_bootenv(vd);
719 
720 			if (benv != NULL)
721 				break;
722 		}
723 		spa->spa_bootenv = benv;
724 	} else {
725 		benv = spa->spa_bootenv;
726 	}
727 
728 	if (benv == NULL)
729 		return (ENOENT);
730 
731 	*benvp = benv;
732 	return (0);
733 }
734 
735 /*
736  * Store nvlist to pool label bootenv area. Also updates cached pointer in spa.
737  */
738 int
739 zfs_set_bootenv(void *vdev, nvlist_t *benv)
740 {
741 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
742 	spa_t *spa;
743 	vdev_t *vd;
744 
745 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
746 		return (ENOTSUP);
747 
748 	if ((spa = spa_find_by_dev(dev)) == NULL)
749 		return (ENXIO);
750 
751 	STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children, v_childlink) {
752 		vdev_write_bootenv(vd, benv);
753 	}
754 
755 	spa->spa_bootenv = benv;
756 	return (0);
757 }
758 
759 /*
760  * Get bootonce value by key. The bootonce <key, value> pair is removed
761  * from the bootenv nvlist and the remaining nvlist is committed back to disk.
762  */
763 int
764 zfs_get_bootonce(void *vdev, const char *key, char *buf, size_t size)
765 {
766 	nvlist_t *benv;
767 	char *result = NULL;
768 	int result_size, rv;
769 
770 	if ((rv = zfs_get_bootenv(vdev, &benv)) != 0)
771 		return (rv);
772 
773 	if ((rv = nvlist_find(benv, key, DATA_TYPE_STRING, NULL,
774 	    &result, &result_size)) == 0) {
775 		if (result_size == 0) {
776 			/* ignore empty string */
777 			rv = ENOENT;
778 		} else {
779 			size = MIN((size_t)result_size + 1, size);
780 			strlcpy(buf, result, size);
781 		}
782 		(void) nvlist_remove(benv, key, DATA_TYPE_STRING);
783 		(void) zfs_set_bootenv(vdev, benv);
784 	}
785 
786 	return (rv);
787 }
788 
789 /*
790  * nvstore backend.
791  */
792 
/* Forward declarations; parameter names match the definitions. */
static int zfs_nvstore_setter(void *vdev, int type, const char *name,
    const void *data, size_t size);
static int zfs_nvstore_setter_str(void *vdev, const char *type,
    const char *name, const char *data);
static int zfs_nvstore_unset_impl(void *vdev, const char *name,
    bool unset_env);
static int zfs_nvstore_setenv(void *vdev, void *ptr);
799 
800 /*
801  * nvstore is only present for current rootfs pool.
802  */
803 static int
804 zfs_nvstore_sethook(struct env_var *ev, int flags __unused, const void *value)
805 {
806 	struct zfs_devdesc *dev;
807 	int rv;
808 
809 	archsw.arch_getdev((void **)&dev, NULL, NULL);
810 	if (dev == NULL)
811 		return (ENXIO);
812 
813 	rv = zfs_nvstore_setter_str(dev, NULL, ev->ev_name, value);
814 
815 	free(dev);
816 	return (rv);
817 }
818 
819 /*
820  * nvstore is only present for current rootfs pool.
821  */
822 static int
823 zfs_nvstore_unsethook(struct env_var *ev)
824 {
825 	struct zfs_devdesc *dev;
826 	int rv;
827 
828 	archsw.arch_getdev((void **)&dev, NULL, NULL);
829 	if (dev == NULL)
830 		return (ENXIO);
831 
832 	rv = zfs_nvstore_unset_impl(dev, ev->ev_name, false);
833 
834 	free(dev);
835 	return (rv);
836 }
837 
838 static int
839 zfs_nvstore_getter(void *vdev, const char *name, void **data)
840 {
841 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
842 	spa_t *spa;
843 	nvlist_t *nv;
844 	char *str, **ptr;
845 	int size;
846 	int rv;
847 
848 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
849 		return (ENOTSUP);
850 
851 	if ((spa = spa_find_by_dev(dev)) == NULL)
852 		return (ENXIO);
853 
854 	if (spa->spa_bootenv == NULL)
855 		return (ENXIO);
856 
857 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
858 	    NULL, &nv, NULL) != 0)
859 		return (ENOENT);
860 
861 	rv = nvlist_find(nv, name, DATA_TYPE_STRING, NULL, &str, &size);
862 	if (rv == 0) {
863 		ptr = (char **)data;
864 		asprintf(ptr, "%.*s", size, str);
865 		if (*data == NULL)
866 			rv = ENOMEM;
867 	}
868 	nvlist_destroy(nv);
869 	return (rv);
870 }
871 
872 static int
873 zfs_nvstore_setter(void *vdev, int type, const char *name,
874     const void *data, size_t size)
875 {
876 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
877 	spa_t *spa;
878 	nvlist_t *nv;
879 	int rv;
880 	bool env_set = true;
881 
882 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
883 		return (ENOTSUP);
884 
885 	if ((spa = spa_find_by_dev(dev)) == NULL)
886 		return (ENXIO);
887 
888 	if (spa->spa_bootenv == NULL)
889 		return (ENXIO);
890 
891 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
892 	    NULL, &nv, NULL) != 0) {
893 		nv = nvlist_create(NV_UNIQUE_NAME);
894 		if (nv == NULL)
895 			return (ENOMEM);
896 	}
897 
898 	rv = 0;
899 	switch (type) {
900         case DATA_TYPE_INT8:
901 		if (size != sizeof (int8_t)) {
902 			rv = EINVAL;
903 			break;
904 		}
905 		rv = nvlist_add_int8(nv, name, *(int8_t *)data);
906 		break;
907 
908         case DATA_TYPE_INT16:
909 		if (size != sizeof (int16_t)) {
910 			rv = EINVAL;
911 			break;
912 		}
913 		rv = nvlist_add_int16(nv, name, *(int16_t *)data);
914 		break;
915 
916         case DATA_TYPE_INT32:
917 		if (size != sizeof (int32_t)) {
918 			rv = EINVAL;
919 			break;
920 		}
921 		rv = nvlist_add_int32(nv, name, *(int32_t *)data);
922 		break;
923 
924         case DATA_TYPE_INT64:
925 		if (size != sizeof (int64_t)) {
926 			rv = EINVAL;
927 			break;
928 		}
929 		rv = nvlist_add_int64(nv, name, *(int64_t *)data);
930 		break;
931 
932         case DATA_TYPE_BYTE:
933 		if (size != sizeof (uint8_t)) {
934 			rv = EINVAL;
935 			break;
936 		}
937 		rv = nvlist_add_byte(nv, name, *(int8_t *)data);
938 		break;
939 
940         case DATA_TYPE_UINT8:
941 		if (size != sizeof (uint8_t)) {
942 			rv = EINVAL;
943 			break;
944 		}
945 		rv = nvlist_add_uint8(nv, name, *(int8_t *)data);
946 		break;
947 
948         case DATA_TYPE_UINT16:
949 		if (size != sizeof (uint16_t)) {
950 			rv = EINVAL;
951 			break;
952 		}
953 		rv = nvlist_add_uint16(nv, name, *(uint16_t *)data);
954 		break;
955 
956         case DATA_TYPE_UINT32:
957 		if (size != sizeof (uint32_t)) {
958 			rv = EINVAL;
959 			break;
960 		}
961 		rv = nvlist_add_uint32(nv, name, *(uint32_t *)data);
962 		break;
963 
964         case DATA_TYPE_UINT64:
965 		if (size != sizeof (uint64_t)) {
966 			rv = EINVAL;
967 			break;
968 		}
969 		rv = nvlist_add_uint64(nv, name, *(uint64_t *)data);
970 		break;
971 
972         case DATA_TYPE_STRING:
973 		rv = nvlist_add_string(nv, name, data);
974 		break;
975 
976 	case DATA_TYPE_BOOLEAN_VALUE:
977 		if (size != sizeof (boolean_t)) {
978 			rv = EINVAL;
979 			break;
980 		}
981 		rv = nvlist_add_boolean_value(nv, name, *(boolean_t *)data);
982 		break;
983 
984 	default:
985 		rv = EINVAL;
986 		break;
987 	}
988 
989 	if (rv == 0) {
990 		rv = nvlist_add_nvlist(spa->spa_bootenv, OS_NVSTORE, nv);
991 		if (rv == 0) {
992 			rv = zfs_set_bootenv(vdev, spa->spa_bootenv);
993 		}
994 		if (rv == 0) {
995 			if (env_set) {
996 				rv = zfs_nvstore_setenv(vdev,
997 				    nvpair_find(nv, name));
998 			} else {
999 				env_discard(env_getenv(name));
1000 				rv = 0;
1001 			}
1002 		}
1003 	}
1004 
1005 	nvlist_destroy(nv);
1006 	return (rv);
1007 }
1008 
/*
 * Parse data as a signed integer (base auto-detected by strtoll()) and
 * store it in *ip.  The whole string must be consumed.  Returns 0 on
 * success; EINVAL, with *ip untouched, for an empty string, trailing
 * characters, or when strtoll() sets errno.
 */
static int
get_int64(const char *data, int64_t *ip)
{
	char *stop;
	int64_t parsed;

	errno = 0;
	parsed = strtoll(data, &stop, 0);
	if (*data == '\0' || *stop != '\0' || errno != 0)
		return (EINVAL);

	*ip = parsed;
	return (0);
}
1023 
/*
 * Parse data as an unsigned integer (base auto-detected by strtoull())
 * and store it in *ip.  The whole string must be consumed.  Returns 0 on
 * success; EINVAL, with *ip untouched, for an empty string, trailing
 * characters, or when strtoull() sets errno.
 */
static int
get_uint64(const char *data, uint64_t *ip)
{
	char *stop;
	uint64_t parsed;

	errno = 0;
	parsed = strtoull(data, &stop, 0);
	if (*data == '\0' || *stop != '\0' || errno != 0)
		return (EINVAL);

	*ip = parsed;
	return (0);
}
1038 
1039 /*
1040  * Translate textual data to data type. If type is not set, and we are
1041  * creating new pair, use DATA_TYPE_STRING.
1042  */
1043 static int
1044 zfs_nvstore_setter_str(void *vdev, const char *type, const char *name,
1045     const char *data)
1046 {
1047 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
1048 	spa_t *spa;
1049 	nvlist_t *nv;
1050 	int rv;
1051 	data_type_t dt;
1052 	int64_t val;
1053 	uint64_t uval;
1054 
1055 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1056 		return (ENOTSUP);
1057 
1058 	if ((spa = spa_find_by_dev(dev)) == NULL)
1059 		return (ENXIO);
1060 
1061 	if (spa->spa_bootenv == NULL)
1062 		return (ENXIO);
1063 
1064 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
1065 	    NULL, &nv, NULL) != 0) {
1066 		nv = NULL;
1067 	}
1068 
1069 	if (type == NULL) {
1070 		nvp_header_t *nvh;
1071 
1072 		/*
1073 		 * if there is no existing pair, default to string.
1074 		 * Otherwise, use type from existing pair.
1075 		 */
1076 		nvh = nvpair_find(nv, name);
1077 		if (nvh == NULL) {
1078 			dt = DATA_TYPE_STRING;
1079 		} else {
1080 			nv_string_t *nvp_name;
1081 			nv_pair_data_t *nvp_data;
1082 
1083 			nvp_name = (nv_string_t *)(nvh + 1);
1084 			nvp_data = (nv_pair_data_t *)(&nvp_name->nv_data[0] +
1085 			    NV_ALIGN4(nvp_name->nv_size));
1086 			dt = nvp_data->nv_type;
1087 		}
1088 	} else {
1089 		dt = nvpair_type_from_name(type);
1090 	}
1091 	nvlist_destroy(nv);
1092 
1093 	rv = 0;
1094 	switch (dt) {
1095         case DATA_TYPE_INT8:
1096 		rv = get_int64(data, &val);
1097 		if (rv == 0) {
1098 			int8_t v = val;
1099 
1100 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1101 		}
1102 		break;
1103         case DATA_TYPE_INT16:
1104 		rv = get_int64(data, &val);
1105 		if (rv == 0) {
1106 			int16_t v = val;
1107 
1108 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1109 		}
1110 		break;
1111         case DATA_TYPE_INT32:
1112 		rv = get_int64(data, &val);
1113 		if (rv == 0) {
1114 			int32_t v = val;
1115 
1116 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1117 		}
1118 		break;
1119         case DATA_TYPE_INT64:
1120 		rv = get_int64(data, &val);
1121 		if (rv == 0) {
1122 			rv = zfs_nvstore_setter(vdev, dt, name, &val,
1123 			    sizeof (val));
1124 		}
1125 		break;
1126 
1127         case DATA_TYPE_BYTE:
1128 		rv = get_uint64(data, &uval);
1129 		if (rv == 0) {
1130 			uint8_t v = uval;
1131 
1132 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1133 		}
1134 		break;
1135 
1136         case DATA_TYPE_UINT8:
1137 		rv = get_uint64(data, &uval);
1138 		if (rv == 0) {
1139 			uint8_t v = uval;
1140 
1141 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1142 		}
1143 		break;
1144 
1145         case DATA_TYPE_UINT16:
1146 		rv = get_uint64(data, &uval);
1147 		if (rv == 0) {
1148 			uint16_t v = uval;
1149 
1150 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1151 		}
1152 		break;
1153 
1154         case DATA_TYPE_UINT32:
1155 		rv = get_uint64(data, &uval);
1156 		if (rv == 0) {
1157 			uint32_t v = uval;
1158 
1159 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1160 		}
1161 		break;
1162 
1163         case DATA_TYPE_UINT64:
1164 		rv = get_uint64(data, &uval);
1165 		if (rv == 0) {
1166 			rv = zfs_nvstore_setter(vdev, dt, name, &uval,
1167 			    sizeof (uval));
1168 		}
1169 		break;
1170 
1171         case DATA_TYPE_STRING:
1172 		rv = zfs_nvstore_setter(vdev, dt, name, data, strlen(data) + 1);
1173 		break;
1174 
1175 	case DATA_TYPE_BOOLEAN_VALUE:
1176 		rv = get_int64(data, &val);
1177 		if (rv == 0) {
1178 			boolean_t v = val;
1179 
1180 			rv = zfs_nvstore_setter(vdev, dt, name, &v, sizeof (v));
1181 		}
1182 
1183 	default:
1184 		rv = EINVAL;
1185 	}
1186 	return (rv);
1187 }
1188 
1189 static int
1190 zfs_nvstore_unset_impl(void *vdev, const char *name, bool unset_env)
1191 {
1192 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
1193 	spa_t *spa;
1194 	nvlist_t *nv;
1195 	int rv;
1196 
1197 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1198 		return (ENOTSUP);
1199 
1200 	if ((spa = spa_find_by_dev(dev)) == NULL)
1201 		return (ENXIO);
1202 
1203 	if (spa->spa_bootenv == NULL)
1204 		return (ENXIO);
1205 
1206 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
1207 	    NULL, &nv, NULL) != 0)
1208 		return (ENOENT);
1209 
1210 	rv = nvlist_remove(nv, name, DATA_TYPE_UNKNOWN);
1211 	if (rv == 0) {
1212 		if (nvlist_next_nvpair(nv, NULL) == NULL) {
1213 			rv = nvlist_remove(spa->spa_bootenv, OS_NVSTORE,
1214 			    DATA_TYPE_NVLIST);
1215 		} else {
1216 			rv = nvlist_add_nvlist(spa->spa_bootenv,
1217 			    OS_NVSTORE, nv);
1218 		}
1219 		if (rv == 0)
1220 			rv = zfs_set_bootenv(vdev, spa->spa_bootenv);
1221 	}
1222 
1223 	if (unset_env)
1224 		env_discard(env_getenv(name));
1225 	return (rv);
1226 }
1227 
/*
 * nvstore "unset" callback: remove `name` from the pool's nvstore and ask
 * zfs_nvstore_unset_impl() to discard the loader environment variable too.
 */
static int
zfs_nvstore_unset(void *vdev, const char *name)
{
	const bool discard_env = true;

	return (zfs_nvstore_unset_impl(vdev, name, discard_env));
}
1233 
1234 static int
1235 zfs_nvstore_print(void *vdev __unused, void *ptr)
1236 {
1237 
1238 	nvpair_print(ptr, 0);
1239 	return (0);
1240 }
1241 
1242 /*
1243  * Create environment variable from nvpair.
1244  * set hook will update nvstore with new value, unset hook will remove
1245  * variable from nvstore.
1246  */
1247 static int
1248 zfs_nvstore_setenv(void *vdev __unused, void *ptr)
1249 {
1250 	nvp_header_t *nvh = ptr;
1251 	nv_string_t *nvp_name, *nvp_value;
1252 	nv_pair_data_t *nvp_data;
1253 	char *name, *value;
1254 	int rv = 0;
1255 
1256 	if (nvh == NULL)
1257 		return (ENOENT);
1258 
1259 	nvp_name = (nv_string_t *)(nvh + 1);
1260 	nvp_data = (nv_pair_data_t *)(&nvp_name->nv_data[0] +
1261 	    NV_ALIGN4(nvp_name->nv_size));
1262 
1263 	if ((name = nvstring_get(nvp_name)) == NULL)
1264 		return (ENOMEM);
1265 
1266 	value = NULL;
1267 	switch (nvp_data->nv_type) {
1268 	case DATA_TYPE_BYTE:
1269 	case DATA_TYPE_UINT8:
1270 		(void) asprintf(&value, "%uc",
1271 		    *(unsigned *)&nvp_data->nv_data[0]);
1272 		if (value == NULL)
1273 			rv = ENOMEM;
1274 		break;
1275 
1276 	case DATA_TYPE_INT8:
1277 		(void) asprintf(&value, "%c", *(int *)&nvp_data->nv_data[0]);
1278 		if (value == NULL)
1279 			rv = ENOMEM;
1280 		break;
1281 
1282 	case DATA_TYPE_INT16:
1283 		(void) asprintf(&value, "%hd", *(short *)&nvp_data->nv_data[0]);
1284 		if (value == NULL)
1285 			rv = ENOMEM;
1286 		break;
1287 
1288 	case DATA_TYPE_UINT16:
1289 		(void) asprintf(&value, "%hu",
1290 		    *(unsigned short *)&nvp_data->nv_data[0]);
1291 		if (value == NULL)
1292 			rv = ENOMEM;
1293 		break;
1294 
1295 	case DATA_TYPE_BOOLEAN_VALUE:
1296 	case DATA_TYPE_INT32:
1297 		(void) asprintf(&value, "%d", *(int *)&nvp_data->nv_data[0]);
1298 		if (value == NULL)
1299 			rv = ENOMEM;
1300 		break;
1301 
1302 	case DATA_TYPE_UINT32:
1303 		(void) asprintf(&value, "%u",
1304 		    *(unsigned *)&nvp_data->nv_data[0]);
1305 		if (value == NULL)
1306 			rv = ENOMEM;
1307 		break;
1308 
1309 	case DATA_TYPE_INT64:
1310 		(void) asprintf(&value, "%jd",
1311 		    (intmax_t)*(int64_t *)&nvp_data->nv_data[0]);
1312 		if (value == NULL)
1313 			rv = ENOMEM;
1314 		break;
1315 
1316 	case DATA_TYPE_UINT64:
1317 		(void) asprintf(&value, "%ju",
1318 		    (uintmax_t)*(uint64_t *)&nvp_data->nv_data[0]);
1319 		if (value == NULL)
1320 			rv = ENOMEM;
1321 		break;
1322 
1323 	case DATA_TYPE_STRING:
1324 		nvp_value = (nv_string_t *)&nvp_data->nv_data[0];
1325 		if ((value = nvstring_get(nvp_value)) == NULL) {
1326 			rv = ENOMEM;
1327 			break;
1328 		}
1329 		break;
1330 
1331 	default:
1332 		rv = EINVAL;
1333 		break;
1334 	}
1335 
1336 	if (value != NULL) {
1337 		rv = env_setenv(name, EV_VOLATILE | EV_NOHOOK, value,
1338 		    zfs_nvstore_sethook, zfs_nvstore_unsethook);
1339 		free(value);
1340 	}
1341 	free(name);
1342 	return (rv);
1343 }
1344 
1345 static int
1346 zfs_nvstore_iterate(void *vdev, int (*cb)(void *, void *))
1347 {
1348 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
1349 	spa_t *spa;
1350 	nvlist_t *nv;
1351 	nvp_header_t *nvh;
1352 	int rv;
1353 
1354 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1355 		return (ENOTSUP);
1356 
1357 	if ((spa = spa_find_by_dev(dev)) == NULL)
1358 		return (ENXIO);
1359 
1360 	if (spa->spa_bootenv == NULL)
1361 		return (ENXIO);
1362 
1363 	if (nvlist_find(spa->spa_bootenv, OS_NVSTORE, DATA_TYPE_NVLIST,
1364 	    NULL, &nv, NULL) != 0)
1365 		return (ENOENT);
1366 
1367 	rv = 0;
1368 	nvh = NULL;
1369 	while ((nvh = nvlist_next_nvpair(nv, nvh)) != NULL) {
1370 		rv = cb(vdev, nvh);
1371 		if (rv != 0)
1372 			break;
1373 	}
1374 	return (rv);
1375 }
1376 
1377 nvs_callbacks_t nvstore_zfs_cb = {
1378 	.nvs_getter = zfs_nvstore_getter,
1379 	.nvs_setter = zfs_nvstore_setter,
1380 	.nvs_setter_str = zfs_nvstore_setter_str,
1381 	.nvs_unset = zfs_nvstore_unset,
1382 	.nvs_print = zfs_nvstore_print,
1383 	.nvs_iterate = zfs_nvstore_iterate
1384 };
1385 
1386 int
1387 zfs_attach_nvstore(void *vdev)
1388 {
1389 	struct zfs_devdesc *dev = vdev;
1390 	spa_t *spa;
1391 	uint64_t version;
1392 	int rv;
1393 
1394 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1395 		return (ENOTSUP);
1396 
1397 	if ((spa = spa_find_by_dev(dev)) == NULL)
1398 		return (ENXIO);
1399 
1400 	rv = nvlist_find(spa->spa_bootenv, BOOTENV_VERSION, DATA_TYPE_UINT64,
1401 	    NULL, &version, NULL);
1402 
1403 	if (rv != 0 || version != VB_NVLIST) {
1404 		return (ENXIO);
1405 	}
1406 
1407 	dev = malloc(sizeof (*dev));
1408 	if (dev == NULL)
1409 		return (ENOMEM);
1410 	memcpy(dev, vdev, sizeof (*dev));
1411 
1412 	rv = nvstore_init(spa->spa_name, &nvstore_zfs_cb, dev);
1413 	if (rv != 0)
1414 		free(dev);
1415 	else
1416 		rv = zfs_nvstore_iterate(dev, zfs_nvstore_setenv);
1417 	return (rv);
1418 }
1419 
1420 int
1421 zfs_probe_dev(const char *devname, uint64_t *pool_guid)
1422 {
1423 	struct ptable *table;
1424 	struct zfs_probe_args pa;
1425 	uint64_t mediasz;
1426 	int ret;
1427 
1428 	if (pool_guid)
1429 		*pool_guid = 0;
1430 	pa.fd = open(devname, O_RDWR);
1431 	if (pa.fd == -1)
1432 		return (ENXIO);
1433 	/* Probe the whole disk */
1434 	ret = zfs_probe(pa.fd, pool_guid);
1435 	if (ret == 0)
1436 		return (0);
1437 
1438 	/* Probe each partition */
1439 	ret = ioctl(pa.fd, DIOCGMEDIASIZE, &mediasz);
1440 	if (ret == 0)
1441 		ret = ioctl(pa.fd, DIOCGSECTORSIZE, &pa.secsz);
1442 	if (ret == 0) {
1443 		pa.devname = devname;
1444 		pa.pool_guid = pool_guid;
1445 		table = ptable_open(&pa, mediasz / pa.secsz, pa.secsz,
1446 		    zfs_diskread);
1447 		if (table != NULL) {
1448 			ptable_iterate(table, &pa, zfs_probe_partition);
1449 			ptable_close(table);
1450 		}
1451 	}
1452 	close(pa.fd);
1453 	if (pool_guid && *pool_guid == 0)
1454 		ret = ENXIO;
1455 	return (ret);
1456 }
1457 
1458 /*
1459  * Print information about ZFS pools
1460  */
1461 static int
1462 zfs_dev_print(int verbose)
1463 {
1464 	spa_t *spa;
1465 	char line[80];
1466 	int ret = 0;
1467 
1468 	if (STAILQ_EMPTY(&zfs_pools))
1469 		return (0);
1470 
1471 	printf("%s devices:", zfs_dev.dv_name);
1472 	if ((ret = pager_output("\n")) != 0)
1473 		return (ret);
1474 
1475 	if (verbose) {
1476 		return (spa_all_status());
1477 	}
1478 	STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
1479 		snprintf(line, sizeof(line), "    zfs:%s\n", spa->spa_name);
1480 		ret = pager_output(line);
1481 		if (ret != 0)
1482 			break;
1483 	}
1484 	return (ret);
1485 }
1486 
1487 /*
1488  * Attempt to open the pool described by (dev) for use by (f).
1489  */
1490 static int
1491 zfs_dev_open(struct open_file *f, ...)
1492 {
1493 	va_list		args;
1494 	struct zfs_devdesc	*dev;
1495 	struct zfsmount	*mount;
1496 	spa_t		*spa;
1497 	int		rv;
1498 
1499 	va_start(args, f);
1500 	dev = va_arg(args, struct zfs_devdesc *);
1501 	va_end(args);
1502 
1503 	if ((spa = spa_find_by_dev(dev)) == NULL)
1504 		return (ENXIO);
1505 
1506 	mount = malloc(sizeof(*mount));
1507 	if (mount == NULL)
1508 		rv = ENOMEM;
1509 	else
1510 		rv = zfs_mount(spa, dev->root_guid, mount);
1511 	if (rv != 0) {
1512 		free(mount);
1513 		return (rv);
1514 	}
1515 	if (mount->objset.os_type != DMU_OST_ZFS) {
1516 		printf("Unexpected object set type %ju\n",
1517 		    (uintmax_t)mount->objset.os_type);
1518 		free(mount);
1519 		return (EIO);
1520 	}
1521 	f->f_devdata = mount;
1522 	free(dev);
1523 	return (0);
1524 }
1525 
1526 static int
1527 zfs_dev_close(struct open_file *f)
1528 {
1529 
1530 	free(f->f_devdata);
1531 	f->f_devdata = NULL;
1532 	return (0);
1533 }
1534 
1535 static int
1536 zfs_dev_strategy(void *devdata, int rw, daddr_t dblk, size_t size, char *buf, size_t *rsize)
1537 {
1538 
1539 	return (ENOSYS);
1540 }
1541 
1542 struct devsw zfs_dev = {
1543 	.dv_name = "zfs",
1544 	.dv_type = DEVT_ZFS,
1545 	.dv_init = zfs_dev_init,
1546 	.dv_strategy = zfs_dev_strategy,
1547 	.dv_open = zfs_dev_open,
1548 	.dv_close = zfs_dev_close,
1549 	.dv_ioctl = noioctl,
1550 	.dv_print = zfs_dev_print,
1551 	.dv_cleanup = NULL
1552 };
1553 
1554 int
1555 zfs_parsedev(struct zfs_devdesc *dev, const char *devspec, const char **path)
1556 {
1557 	static char	rootname[ZFS_MAXNAMELEN];
1558 	static char	poolname[ZFS_MAXNAMELEN];
1559 	spa_t		*spa;
1560 	const char	*end;
1561 	const char	*np;
1562 	const char	*sep;
1563 	int		rv;
1564 
1565 	np = devspec;
1566 	if (*np != ':')
1567 		return (EINVAL);
1568 	np++;
1569 	end = strrchr(np, ':');
1570 	if (end == NULL)
1571 		return (EINVAL);
1572 	sep = strchr(np, '/');
1573 	if (sep == NULL || sep >= end)
1574 		sep = end;
1575 	memcpy(poolname, np, sep - np);
1576 	poolname[sep - np] = '\0';
1577 	if (sep < end) {
1578 		sep++;
1579 		memcpy(rootname, sep, end - sep);
1580 		rootname[end - sep] = '\0';
1581 	}
1582 	else
1583 		rootname[0] = '\0';
1584 
1585 	spa = spa_find_by_name(poolname);
1586 	if (!spa)
1587 		return (ENXIO);
1588 	dev->pool_guid = spa->spa_guid;
1589 	rv = zfs_lookup_dataset(spa, rootname, &dev->root_guid);
1590 	if (rv != 0)
1591 		return (rv);
1592 	if (path != NULL)
1593 		*path = (*end == '\0') ? end : end + 1;
1594 	dev->dd.d_dev = &zfs_dev;
1595 	return (0);
1596 }
1597 
1598 char *
1599 zfs_fmtdev(void *vdev)
1600 {
1601 	static char		rootname[ZFS_MAXNAMELEN];
1602 	static char		buf[2 * ZFS_MAXNAMELEN + 8];
1603 	struct zfs_devdesc	*dev = (struct zfs_devdesc *)vdev;
1604 	spa_t			*spa;
1605 
1606 	buf[0] = '\0';
1607 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1608 		return (buf);
1609 
1610 	/* Do we have any pools? */
1611 	spa = STAILQ_FIRST(&zfs_pools);
1612 	if (spa == NULL)
1613 		return (buf);
1614 
1615 	if (dev->pool_guid == 0)
1616 		dev->pool_guid = spa->spa_guid;
1617 	else
1618 		spa = spa_find_by_guid(dev->pool_guid);
1619 
1620 	if (spa == NULL) {
1621 		printf("ZFS: can't find pool by guid\n");
1622 		return (buf);
1623 	}
1624 	if (dev->root_guid == 0 && zfs_get_root(spa, &dev->root_guid)) {
1625 		printf("ZFS: can't find root filesystem\n");
1626 		return (buf);
1627 	}
1628 	if (zfs_rlookup(spa, dev->root_guid, rootname)) {
1629 		printf("ZFS: can't find filesystem by guid\n");
1630 		return (buf);
1631 	}
1632 
1633 	if (rootname[0] == '\0')
1634 		snprintf(buf, sizeof(buf), "%s:%s:", dev->dd.d_dev->dv_name,
1635 		    spa->spa_name);
1636 	else
1637 		snprintf(buf, sizeof(buf), "%s:%s/%s:", dev->dd.d_dev->dv_name,
1638 		    spa->spa_name, rootname);
1639 	return (buf);
1640 }
1641 
/*
 * Split "pool[/dataset]" into its pool and dataset parts.
 *
 * The pool name (everything before the first '/') is copied into
 * `poolname`, a buffer of `size` bytes.  When dsnamep is not NULL it is
 * pointed at the text following the first '/' inside `name`, or at an
 * empty string when there is no '/'.
 *
 * Returns 0 on success or EINVAL when the pool name plus its terminator
 * does not fit into `size` bytes.
 */
static int
split_devname(const char *name, char *poolname, size_t size,
    const char **dsnamep)
{
	const char *slash, *rest;
	size_t poollen;

	ASSERT(name != NULL);
	ASSERT(poolname != NULL);

	slash = strchr(name, '/');
	if (slash == NULL) {
		poollen = strlen(name);
		rest = "";
	} else {
		poollen = slash - name;
		rest = slash + 1;
	}

	if (poollen >= size)
		return (EINVAL);

	memcpy(poolname, name, poollen);
	poolname[poollen] = '\0';

	if (dsnamep != NULL)
		*dsnamep = rest;

	return (0);
}
1670 
1671 int
1672 zfs_list(const char *name)
1673 {
1674 	static char	poolname[ZFS_MAXNAMELEN];
1675 	uint64_t	objid;
1676 	spa_t		*spa;
1677 	const char	*dsname;
1678 	int		rv;
1679 
1680 	if (split_devname(name, poolname, sizeof(poolname), &dsname) != 0)
1681 		return (EINVAL);
1682 
1683 	spa = spa_find_by_name(poolname);
1684 	if (!spa)
1685 		return (ENXIO);
1686 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1687 	if (rv != 0)
1688 		return (rv);
1689 
1690 	return (zfs_list_dataset(spa, objid));
1691 }
1692 
1693 void
1694 init_zfs_boot_options(const char *currdev_in)
1695 {
1696 	char poolname[ZFS_MAXNAMELEN];
1697 	char *beroot, *currdev;
1698 	spa_t *spa;
1699 	int currdev_len;
1700 	const char *dsname;
1701 
1702 	currdev = NULL;
1703 	currdev_len = strlen(currdev_in);
1704 	if (currdev_len == 0)
1705 		return;
1706 	if (strncmp(currdev_in, "zfs:", 4) != 0)
1707 		return;
1708 	currdev = strdup(currdev_in);
1709 	if (currdev == NULL)
1710 		return;
1711 	/* Remove the trailing : */
1712 	currdev[currdev_len - 1] = '\0';
1713 
1714 	setenv("zfs_be_active", currdev, 1);
1715 	setenv("zfs_be_currpage", "1", 1);
1716 	/* Remove the last element (current bootenv) */
1717 	beroot = strrchr(currdev, '/');
1718 	if (beroot != NULL)
1719 		beroot[0] = '\0';
1720 	beroot = strchr(currdev, ':') + 1;
1721 	setenv("zfs_be_root", beroot, 1);
1722 
1723 	if (split_devname(beroot, poolname, sizeof(poolname), &dsname) != 0)
1724 		return;
1725 
1726 	spa = spa_find_by_name(poolname);
1727 	if (spa == NULL)
1728 		return;
1729 
1730 	zfs_bootenv_initial("bootenvs", spa, beroot, dsname, 0);
1731 	zfs_checkpoints_initial(spa, beroot, dsname);
1732 
1733 	free(currdev);
1734 }
1735 
1736 static void
1737 zfs_checkpoints_initial(spa_t *spa, const char *name, const char *dsname)
1738 {
1739 	char envname[32];
1740 
1741 	if (spa->spa_uberblock_checkpoint.ub_checkpoint_txg != 0) {
1742 		snprintf(envname, sizeof(envname), "zpool_checkpoint");
1743 		setenv(envname, name, 1);
1744 
1745 		spa->spa_uberblock = &spa->spa_uberblock_checkpoint;
1746 		spa->spa_mos = &spa->spa_mos_checkpoint;
1747 
1748 		zfs_bootenv_initial("bootenvs_check", spa, name, dsname, 1);
1749 
1750 		spa->spa_uberblock = &spa->spa_uberblock_master;
1751 		spa->spa_mos = &spa->spa_mos_master;
1752 	}
1753 }
1754 
1755 static void
1756 zfs_bootenv_initial(const char *envprefix, spa_t *spa, const char *rootname,
1757    const char *dsname, int checkpoint)
1758 {
1759 	char		envname[32], envval[256];
1760 	uint64_t	objid;
1761 	int		bootenvs_idx, rv;
1762 
1763 	SLIST_INIT(&zfs_be_head);
1764 	zfs_env_count = 0;
1765 
1766 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1767 	if (rv != 0)
1768 		return;
1769 
1770 	rv = zfs_callback_dataset(spa, objid, zfs_belist_add);
1771 	bootenvs_idx = 0;
1772 	/* Populate the initial environment variables */
1773 	SLIST_FOREACH_SAFE(zfs_be, &zfs_be_head, entries, zfs_be_tmp) {
1774 		/* Enumerate all bootenvs for general usage */
1775 		snprintf(envname, sizeof(envname), "%s[%d]",
1776 		    envprefix, bootenvs_idx);
1777 		snprintf(envval, sizeof(envval), "zfs:%s%s/%s",
1778 		    checkpoint ? "!" : "", rootname, zfs_be->name);
1779 		rv = setenv(envname, envval, 1);
1780 		if (rv != 0)
1781 			break;
1782 		bootenvs_idx++;
1783 	}
1784 	snprintf(envname, sizeof(envname), "%s_count", envprefix);
1785 	snprintf(envval, sizeof(envval), "%d", bootenvs_idx);
1786 	setenv(envname, envval, 1);
1787 
1788 	/* Clean up the SLIST of ZFS BEs */
1789 	while (!SLIST_EMPTY(&zfs_be_head)) {
1790 		zfs_be = SLIST_FIRST(&zfs_be_head);
1791 		SLIST_REMOVE_HEAD(&zfs_be_head, entries);
1792 		free(zfs_be->name);
1793 		free(zfs_be);
1794 	}
1795 }
1796 
1797 int
1798 zfs_bootenv(const char *name)
1799 {
1800 	char		poolname[ZFS_MAXNAMELEN], *root;
1801 	const char	*dsname;
1802 	char		becount[4];
1803 	uint64_t	objid;
1804 	spa_t		*spa;
1805 	int		rv, pages, perpage, currpage;
1806 
1807 	if (name == NULL)
1808 		return (EINVAL);
1809 	if ((root = getenv("zfs_be_root")) == NULL)
1810 		return (EINVAL);
1811 
1812 	if (strcmp(name, root) != 0) {
1813 		if (setenv("zfs_be_root", name, 1) != 0)
1814 			return (ENOMEM);
1815 	}
1816 
1817 	SLIST_INIT(&zfs_be_head);
1818 	zfs_env_count = 0;
1819 
1820 	if (split_devname(name, poolname, sizeof(poolname), &dsname) != 0)
1821 		return (EINVAL);
1822 
1823 	spa = spa_find_by_name(poolname);
1824 	if (!spa)
1825 		return (ENXIO);
1826 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1827 	if (rv != 0)
1828 		return (rv);
1829 	rv = zfs_callback_dataset(spa, objid, zfs_belist_add);
1830 
1831 	/* Calculate and store the number of pages of BEs */
1832 	perpage = (ZFS_BE_LAST - ZFS_BE_FIRST + 1);
1833 	pages = (zfs_env_count / perpage) + ((zfs_env_count % perpage) > 0 ? 1 : 0);
1834 	snprintf(becount, 4, "%d", pages);
1835 	if (setenv("zfs_be_pages", becount, 1) != 0)
1836 		return (ENOMEM);
1837 
1838 	/* Roll over the page counter if it has exceeded the maximum */
1839 	currpage = strtol(getenv("zfs_be_currpage"), NULL, 10);
1840 	if (currpage > pages) {
1841 		if (setenv("zfs_be_currpage", "1", 1) != 0)
1842 			return (ENOMEM);
1843 	}
1844 
1845 	/* Populate the menu environment variables */
1846 	zfs_set_env();
1847 
1848 	/* Clean up the SLIST of ZFS BEs */
1849 	while (!SLIST_EMPTY(&zfs_be_head)) {
1850 		zfs_be = SLIST_FIRST(&zfs_be_head);
1851 		SLIST_REMOVE_HEAD(&zfs_be_head, entries);
1852 		free(zfs_be->name);
1853 		free(zfs_be);
1854 	}
1855 
1856 	return (rv);
1857 }
1858 
1859 int
1860 zfs_belist_add(const char *name, uint64_t value __unused)
1861 {
1862 
1863 	/* Skip special datasets that start with a $ character */
1864 	if (strncmp(name, "$", 1) == 0) {
1865 		return (0);
1866 	}
1867 	/* Add the boot environment to the head of the SLIST */
1868 	zfs_be = malloc(sizeof(struct zfs_be_entry));
1869 	if (zfs_be == NULL) {
1870 		return (ENOMEM);
1871 	}
1872 	zfs_be->name = strdup(name);
1873 	if (zfs_be->name == NULL) {
1874 		free(zfs_be);
1875 		return (ENOMEM);
1876 	}
1877 	SLIST_INSERT_HEAD(&zfs_be_head, zfs_be, entries);
1878 	zfs_env_count++;
1879 
1880 	return (0);
1881 }
1882 
1883 int
1884 zfs_set_env(void)
1885 {
1886 	char envname[32], envval[256];
1887 	char *beroot, *pagenum;
1888 	int rv, page, ctr;
1889 
1890 	beroot = getenv("zfs_be_root");
1891 	if (beroot == NULL) {
1892 		return (1);
1893 	}
1894 
1895 	pagenum = getenv("zfs_be_currpage");
1896 	if (pagenum != NULL) {
1897 		page = strtol(pagenum, NULL, 10);
1898 	} else {
1899 		page = 1;
1900 	}
1901 
1902 	ctr = 1;
1903 	rv = 0;
1904 	zfs_env_index = ZFS_BE_FIRST;
1905 	SLIST_FOREACH_SAFE(zfs_be, &zfs_be_head, entries, zfs_be_tmp) {
1906 		/* Skip to the requested page number */
1907 		if (ctr <= ((ZFS_BE_LAST - ZFS_BE_FIRST + 1) * (page - 1))) {
1908 			ctr++;
1909 			continue;
1910 		}
1911 
1912 		snprintf(envname, sizeof(envname), "bootenvmenu_caption[%d]", zfs_env_index);
1913 		snprintf(envval, sizeof(envval), "%s", zfs_be->name);
1914 		rv = setenv(envname, envval, 1);
1915 		if (rv != 0) {
1916 			break;
1917 		}
1918 
1919 		snprintf(envname, sizeof(envname), "bootenvansi_caption[%d]", zfs_env_index);
1920 		rv = setenv(envname, envval, 1);
1921 		if (rv != 0){
1922 			break;
1923 		}
1924 
1925 		snprintf(envname, sizeof(envname), "bootenvmenu_command[%d]", zfs_env_index);
1926 		rv = setenv(envname, "set_bootenv", 1);
1927 		if (rv != 0){
1928 			break;
1929 		}
1930 
1931 		snprintf(envname, sizeof(envname), "bootenv_root[%d]", zfs_env_index);
1932 		snprintf(envval, sizeof(envval), "zfs:%s/%s", beroot, zfs_be->name);
1933 		rv = setenv(envname, envval, 1);
1934 		if (rv != 0){
1935 			break;
1936 		}
1937 
1938 		zfs_env_index++;
1939 		if (zfs_env_index > ZFS_BE_LAST) {
1940 			break;
1941 		}
1942 
1943 	}
1944 
1945 	for (; zfs_env_index <= ZFS_BE_LAST; zfs_env_index++) {
1946 		snprintf(envname, sizeof(envname), "bootenvmenu_caption[%d]", zfs_env_index);
1947 		(void)unsetenv(envname);
1948 		snprintf(envname, sizeof(envname), "bootenvansi_caption[%d]", zfs_env_index);
1949 		(void)unsetenv(envname);
1950 		snprintf(envname, sizeof(envname), "bootenvmenu_command[%d]", zfs_env_index);
1951 		(void)unsetenv(envname);
1952 		snprintf(envname, sizeof(envname), "bootenv_root[%d]", zfs_env_index);
1953 		(void)unsetenv(envname);
1954 	}
1955 
1956 	return (rv);
1957 }
1958