xref: /freebsd/stand/libsa/zfs/zfs.c (revision 9e5787d2284e187abb5b654d924394a65772e004)
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  *
26  *	$FreeBSD$
27  */
28 
29 #include <sys/cdefs.h>
30 __FBSDID("$FreeBSD$");
31 
32 /*
33  *	Stand-alone file reading package.
34  */
35 
36 #include <stand.h>
37 #include <sys/disk.h>
38 #include <sys/param.h>
39 #include <sys/time.h>
40 #include <sys/queue.h>
41 #include <disk.h>
42 #include <part.h>
43 #include <stddef.h>
44 #include <stdarg.h>
45 #include <string.h>
46 #include <bootstrap.h>
47 
48 #include "libzfs.h"
49 
50 #include "zfsimpl.c"
51 
/*
 * Define the range of indexes to be populated with ZFS Boot Environments.
 * Both bounds are inclusive: zfs_bootenv() treats
 * (ZFS_BE_LAST - ZFS_BE_FIRST + 1) entries as one page.
 */
#define		ZFS_BE_FIRST	4
#define		ZFS_BE_LAST	8
55 
/* File system entry points; they are wired into zfs_fsops below. */
static int	zfs_open(const char *path, struct open_file *f);
static int	zfs_close(struct open_file *f);
static int	zfs_read(struct open_file *f, void *buf, size_t size, size_t *resid);
static off_t	zfs_seek(struct open_file *f, off_t offset, int where);
static int	zfs_stat(struct open_file *f, struct stat *sb);
static int	zfs_readdir(struct open_file *f, struct dirent *d);

/*
 * Helpers that publish the boot environments found below a dataset as
 * loader environment variables (see the definitions further down).
 */
static void	zfs_bootenv_initial(const char *envname, spa_t *spa,
		    const char *name, const char *dsname, int checkpoint);
static void	zfs_checkpoints_initial(spa_t *spa, const char *name,
		    const char *dsname);
67 
/* Tentative definition; the initialised definition follows further down. */
struct devsw zfs_dev;

/*
 * File system operations vector for ZFS: the name followed by the open,
 * close, read, write, seek, stat and readdir handlers.  No write handler
 * is implemented in this file; null_write fills that slot.
 */
struct fs_ops zfs_fsops = {
	"zfs",
	zfs_open,
	zfs_close,
	zfs_read,
	null_write,
	zfs_seek,
	zfs_stat,
	zfs_readdir
};
80 
/*
 * In-core open file.
 *
 * One instance is allocated per open file by zfs_open() and hung off
 * open_file::f_fsdata.  For directories, f_seekp doubles as the
 * zfs_readdir() cursor and the f_zap_* / f_num_leafs fields cache the ZAP
 * state between calls.
 */
struct file {
	off_t		f_seekp;	/* seek pointer */
	dnode_phys_t	f_dnode;	/* dnode filled in by zfs_lookup() */
	uint64_t	f_zap_type;	/* zap type for readdir */
	uint64_t	f_num_leafs;	/* number of fzap leaf blocks */
	zap_leaf_phys_t	*f_zap_leaf;	/* zap leaf buffer */
};
91 
/*
 * Boot environment bookkeeping.  zfs_env_count is reset before each
 * enumeration and later used to compute the number of pages;
 * zfs_env_index is not referenced in the part of the file visible here.
 */
static int	zfs_env_index;
static int	zfs_env_count;

/*
 * Singly-linked list of boot environment names collected during an
 * enumeration.  zfs_be and zfs_be_tmp are the shared iteration cursors
 * used when walking and tearing down the list.
 */
SLIST_HEAD(zfs_be_list, zfs_be_entry) zfs_be_head = SLIST_HEAD_INITIALIZER(zfs_be_head);
struct zfs_be_list *zfs_be_headp;
struct zfs_be_entry {
	char *name;			/* heap-allocated; freed with the entry */
	SLIST_ENTRY(zfs_be_entry) entries;
} *zfs_be, *zfs_be_tmp;
101 
102 /*
103  * Open a file.
104  */
105 static int
106 zfs_open(const char *upath, struct open_file *f)
107 {
108 	struct zfsmount *mount = (struct zfsmount *)f->f_devdata;
109 	struct file *fp;
110 	int rc;
111 
112 	if (f->f_dev != &zfs_dev)
113 		return (EINVAL);
114 
115 	/* allocate file system specific data structure */
116 	fp = calloc(1, sizeof(struct file));
117 	if (fp == NULL)
118 		return (ENOMEM);
119 	f->f_fsdata = fp;
120 
121 	rc = zfs_lookup(mount, upath, &fp->f_dnode);
122 	fp->f_seekp = 0;
123 	if (rc) {
124 		f->f_fsdata = NULL;
125 		free(fp);
126 	}
127 	return (rc);
128 }
129 
130 static int
131 zfs_close(struct open_file *f)
132 {
133 	struct file *fp = (struct file *)f->f_fsdata;
134 
135 	dnode_cache_obj = NULL;
136 	f->f_fsdata = NULL;
137 
138 	free(fp);
139 	return (0);
140 }
141 
142 /*
143  * Copy a portion of a file into kernel memory.
144  * Cross block boundaries when necessary.
145  */
146 static int
147 zfs_read(struct open_file *f, void *start, size_t size, size_t *resid	/* out */)
148 {
149 	const spa_t *spa = ((struct zfsmount *)f->f_devdata)->spa;
150 	struct file *fp = (struct file *)f->f_fsdata;
151 	struct stat sb;
152 	size_t n;
153 	int rc;
154 
155 	rc = zfs_stat(f, &sb);
156 	if (rc)
157 		return (rc);
158 	n = size;
159 	if (fp->f_seekp + n > sb.st_size)
160 		n = sb.st_size - fp->f_seekp;
161 
162 	rc = dnode_read(spa, &fp->f_dnode, fp->f_seekp, start, n);
163 	if (rc)
164 		return (rc);
165 
166 	if (0) {
167 	    int i;
168 	    for (i = 0; i < n; i++)
169 		putchar(((char*) start)[i]);
170 	}
171 	fp->f_seekp += n;
172 	if (resid)
173 		*resid = size - n;
174 
175 	return (0);
176 }
177 
178 static off_t
179 zfs_seek(struct open_file *f, off_t offset, int where)
180 {
181 	struct file *fp = (struct file *)f->f_fsdata;
182 
183 	switch (where) {
184 	case SEEK_SET:
185 		fp->f_seekp = offset;
186 		break;
187 	case SEEK_CUR:
188 		fp->f_seekp += offset;
189 		break;
190 	case SEEK_END:
191 	    {
192 		struct stat sb;
193 		int error;
194 
195 		error = zfs_stat(f, &sb);
196 		if (error != 0) {
197 			errno = error;
198 			return (-1);
199 		}
200 		fp->f_seekp = sb.st_size - offset;
201 		break;
202 	    }
203 	default:
204 		errno = EINVAL;
205 		return (-1);
206 	}
207 	return (fp->f_seekp);
208 }
209 
210 static int
211 zfs_stat(struct open_file *f, struct stat *sb)
212 {
213 	const spa_t *spa = ((struct zfsmount *)f->f_devdata)->spa;
214 	struct file *fp = (struct file *)f->f_fsdata;
215 
216 	return (zfs_dnode_stat(spa, &fp->f_dnode, sb));
217 }
218 
219 static int
220 zfs_readdir(struct open_file *f, struct dirent *d)
221 {
222 	const spa_t *spa = ((struct zfsmount *)f->f_devdata)->spa;
223 	struct file *fp = (struct file *)f->f_fsdata;
224 	mzap_ent_phys_t mze;
225 	struct stat sb;
226 	size_t bsize = fp->f_dnode.dn_datablkszsec << SPA_MINBLOCKSHIFT;
227 	int rc;
228 
229 	rc = zfs_stat(f, &sb);
230 	if (rc)
231 		return (rc);
232 	if (!S_ISDIR(sb.st_mode))
233 		return (ENOTDIR);
234 
235 	/*
236 	 * If this is the first read, get the zap type.
237 	 */
238 	if (fp->f_seekp == 0) {
239 		rc = dnode_read(spa, &fp->f_dnode,
240 				0, &fp->f_zap_type, sizeof(fp->f_zap_type));
241 		if (rc)
242 			return (rc);
243 
244 		if (fp->f_zap_type == ZBT_MICRO) {
245 			fp->f_seekp = offsetof(mzap_phys_t, mz_chunk);
246 		} else {
247 			rc = dnode_read(spa, &fp->f_dnode,
248 					offsetof(zap_phys_t, zap_num_leafs),
249 					&fp->f_num_leafs,
250 					sizeof(fp->f_num_leafs));
251 			if (rc)
252 				return (rc);
253 
254 			fp->f_seekp = bsize;
255 			fp->f_zap_leaf = malloc(bsize);
256 			if (fp->f_zap_leaf == NULL)
257 				return (ENOMEM);
258 			rc = dnode_read(spa, &fp->f_dnode,
259 					fp->f_seekp,
260 					fp->f_zap_leaf,
261 					bsize);
262 			if (rc)
263 				return (rc);
264 		}
265 	}
266 
267 	if (fp->f_zap_type == ZBT_MICRO) {
268 	mzap_next:
269 		if (fp->f_seekp >= bsize)
270 			return (ENOENT);
271 
272 		rc = dnode_read(spa, &fp->f_dnode,
273 				fp->f_seekp, &mze, sizeof(mze));
274 		if (rc)
275 			return (rc);
276 		fp->f_seekp += sizeof(mze);
277 
278 		if (!mze.mze_name[0])
279 			goto mzap_next;
280 
281 		d->d_fileno = ZFS_DIRENT_OBJ(mze.mze_value);
282 		d->d_type = ZFS_DIRENT_TYPE(mze.mze_value);
283 		strcpy(d->d_name, mze.mze_name);
284 		d->d_namlen = strlen(d->d_name);
285 		return (0);
286 	} else {
287 		zap_leaf_t zl;
288 		zap_leaf_chunk_t *zc, *nc;
289 		int chunk;
290 		size_t namelen;
291 		char *p;
292 		uint64_t value;
293 
294 		/*
295 		 * Initialise this so we can use the ZAP size
296 		 * calculating macros.
297 		 */
298 		zl.l_bs = ilog2(bsize);
299 		zl.l_phys = fp->f_zap_leaf;
300 
301 		/*
302 		 * Figure out which chunk we are currently looking at
303 		 * and consider seeking to the next leaf. We use the
304 		 * low bits of f_seekp as a simple chunk index.
305 		 */
306 	fzap_next:
307 		chunk = fp->f_seekp & (bsize - 1);
308 		if (chunk == ZAP_LEAF_NUMCHUNKS(&zl)) {
309 			fp->f_seekp = rounddown2(fp->f_seekp, bsize) + bsize;
310 			chunk = 0;
311 
312 			/*
313 			 * Check for EOF and read the new leaf.
314 			 */
315 			if (fp->f_seekp >= bsize * fp->f_num_leafs)
316 				return (ENOENT);
317 
318 			rc = dnode_read(spa, &fp->f_dnode,
319 					fp->f_seekp,
320 					fp->f_zap_leaf,
321 					bsize);
322 			if (rc)
323 				return (rc);
324 		}
325 
326 		zc = &ZAP_LEAF_CHUNK(&zl, chunk);
327 		fp->f_seekp++;
328 		if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
329 			goto fzap_next;
330 
331 		namelen = zc->l_entry.le_name_numints;
332 		if (namelen > sizeof(d->d_name))
333 			namelen = sizeof(d->d_name);
334 
335 		/*
336 		 * Paste the name back together.
337 		 */
338 		nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
339 		p = d->d_name;
340 		while (namelen > 0) {
341 			int len;
342 			len = namelen;
343 			if (len > ZAP_LEAF_ARRAY_BYTES)
344 				len = ZAP_LEAF_ARRAY_BYTES;
345 			memcpy(p, nc->l_array.la_array, len);
346 			p += len;
347 			namelen -= len;
348 			nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
349 		}
350 		d->d_name[sizeof(d->d_name) - 1] = 0;
351 
352 		/*
353 		 * Assume the first eight bytes of the value are
354 		 * a uint64_t.
355 		 */
356 		value = fzap_leaf_value(&zl, zc);
357 
358 		d->d_fileno = ZFS_DIRENT_OBJ(value);
359 		d->d_type = ZFS_DIRENT_TYPE(value);
360 		d->d_namlen = strlen(d->d_name);
361 
362 		return (0);
363 	}
364 }
365 
366 static int
367 vdev_read(vdev_t *vdev, void *priv, off_t offset, void *buf, size_t bytes)
368 {
369 	int fd, ret;
370 	size_t res, head, tail, total_size, full_sec_size;
371 	unsigned secsz, do_tail_read;
372 	off_t start_sec;
373 	char *outbuf, *bouncebuf;
374 
375 	fd = (uintptr_t) priv;
376 	outbuf = (char *) buf;
377 	bouncebuf = NULL;
378 
379 	ret = ioctl(fd, DIOCGSECTORSIZE, &secsz);
380 	if (ret != 0)
381 		return (ret);
382 
383 	/*
384 	 * Handling reads of arbitrary offset and size - multi-sector case
385 	 * and single-sector case.
386 	 *
387 	 *                        Multi-sector Case
388 	 *                (do_tail_read = true if tail > 0)
389 	 *
390 	 *   |<----------------------total_size--------------------->|
391 	 *   |                                                       |
392 	 *   |<--head-->|<--------------bytes------------>|<--tail-->|
393 	 *   |          |                                 |          |
394 	 *   |          |       |<~full_sec_size~>|       |          |
395 	 *   +------------------+                 +------------------+
396 	 *   |          |0101010|     .  .  .     |0101011|          |
397 	 *   +------------------+                 +------------------+
398 	 *         start_sec                         start_sec + n
399 	 *
400 	 *
401 	 *                      Single-sector Case
402 	 *                    (do_tail_read = false)
403 	 *
404 	 *              |<------total_size = secsz----->|
405 	 *              |                               |
406 	 *              |<-head->|<---bytes--->|<-tail->|
407 	 *              +-------------------------------+
408 	 *              |        |0101010101010|        |
409 	 *              +-------------------------------+
410 	 *                          start_sec
411 	 */
412 	start_sec = offset / secsz;
413 	head = offset % secsz;
414 	total_size = roundup2(head + bytes, secsz);
415 	tail = total_size - (head + bytes);
416 	do_tail_read = ((tail > 0) && (head + bytes > secsz));
417 	full_sec_size = total_size;
418 	if (head > 0)
419 		full_sec_size -= secsz;
420 	if (do_tail_read)
421 		full_sec_size -= secsz;
422 
423 	/* Return of partial sector data requires a bounce buffer. */
424 	if ((head > 0) || do_tail_read || bytes < secsz) {
425 		bouncebuf = malloc(secsz);
426 		if (bouncebuf == NULL) {
427 			printf("vdev_read: out of memory\n");
428 			return (ENOMEM);
429 		}
430 	}
431 
432 	if (lseek(fd, start_sec * secsz, SEEK_SET) == -1) {
433 		ret = errno;
434 		goto error;
435 	}
436 
437 	/* Partial data return from first sector */
438 	if (head > 0) {
439 		res = read(fd, bouncebuf, secsz);
440 		if (res != secsz) {
441 			ret = EIO;
442 			goto error;
443 		}
444 		memcpy(outbuf, bouncebuf + head, min(secsz - head, bytes));
445 		outbuf += min(secsz - head, bytes);
446 	}
447 
448 	/*
449 	 * Full data return from read sectors.
450 	 * Note, there is still corner case where we read
451 	 * from sector boundary, but less than sector size, e.g. reading 512B
452 	 * from 4k sector.
453 	 */
454 	if (full_sec_size > 0) {
455 		if (bytes < full_sec_size) {
456 			res = read(fd, bouncebuf, secsz);
457 			if (res != secsz) {
458 				ret = EIO;
459 				goto error;
460 			}
461 			memcpy(outbuf, bouncebuf, bytes);
462 		} else {
463 			res = read(fd, outbuf, full_sec_size);
464 			if (res != full_sec_size) {
465 				ret = EIO;
466 				goto error;
467 			}
468 			outbuf += full_sec_size;
469 		}
470 	}
471 
472 	/* Partial data return from last sector */
473 	if (do_tail_read) {
474 		res = read(fd, bouncebuf, secsz);
475 		if (res != secsz) {
476 			ret = EIO;
477 			goto error;
478 		}
479 		memcpy(outbuf, bouncebuf, secsz - tail);
480 	}
481 
482 	ret = 0;
483 error:
484 	free(bouncebuf);
485 	return (ret);
486 }
487 
488 static int
489 vdev_write(vdev_t *vdev __unused, void *priv, off_t offset, void *buf,
490     size_t bytes)
491 {
492 	int fd, ret;
493 	size_t head, tail, total_size, full_sec_size;
494 	unsigned secsz, do_tail_write;
495 	off_t start_sec;
496 	ssize_t res;
497 	char *outbuf, *bouncebuf;
498 
499 	fd = (uintptr_t)priv;
500 	outbuf = (char *) buf;
501 	bouncebuf = NULL;
502 
503 	ret = ioctl(fd, DIOCGSECTORSIZE, &secsz);
504 	if (ret != 0)
505 		return (ret);
506 
507 	start_sec = offset / secsz;
508 	head = offset % secsz;
509 	total_size = roundup2(head + bytes, secsz);
510 	tail = total_size - (head + bytes);
511 	do_tail_write = ((tail > 0) && (head + bytes > secsz));
512 	full_sec_size = total_size;
513 	if (head > 0)
514 		full_sec_size -= secsz;
515 	if (do_tail_write)
516 		full_sec_size -= secsz;
517 
518 	/* Partial sector write requires a bounce buffer. */
519 	if ((head > 0) || do_tail_write || bytes < secsz) {
520 		bouncebuf = malloc(secsz);
521 		if (bouncebuf == NULL) {
522 			printf("vdev_write: out of memory\n");
523 			return (ENOMEM);
524 		}
525 	}
526 
527 	if (lseek(fd, start_sec * secsz, SEEK_SET) == -1) {
528 		ret = errno;
529 		goto error;
530 	}
531 
532 	/* Partial data for first sector */
533 	if (head > 0) {
534 		res = read(fd, bouncebuf, secsz);
535 		if (res != secsz) {
536 			ret = EIO;
537 			goto error;
538 		}
539 		memcpy(bouncebuf + head, outbuf, min(secsz - head, bytes));
540 		(void) lseek(fd, -secsz, SEEK_CUR);
541 		res = write(fd, bouncebuf, secsz);
542 		if (res != secsz) {
543 			ret = EIO;
544 			goto error;
545 		}
546 		outbuf += min(secsz - head, bytes);
547 	}
548 
549 	/*
550 	 * Full data write to sectors.
551 	 * Note, there is still corner case where we write
552 	 * to sector boundary, but less than sector size, e.g. write 512B
553 	 * to 4k sector.
554 	 */
555 	if (full_sec_size > 0) {
556 		if (bytes < full_sec_size) {
557 			res = read(fd, bouncebuf, secsz);
558 			if (res != secsz) {
559 				ret = EIO;
560 				goto error;
561 			}
562 			memcpy(bouncebuf, outbuf, bytes);
563 			(void) lseek(fd, -secsz, SEEK_CUR);
564 			res = write(fd, bouncebuf, secsz);
565 			if (res != secsz) {
566 				ret = EIO;
567 				goto error;
568 			}
569 		} else {
570 			res = write(fd, outbuf, full_sec_size);
571 			if (res != full_sec_size) {
572 				ret = EIO;
573 				goto error;
574 			}
575 			outbuf += full_sec_size;
576 		}
577 	}
578 
579 	/* Partial data write to last sector */
580 	if (do_tail_write) {
581 		res = read(fd, bouncebuf, secsz);
582 		if (res != secsz) {
583 			ret = EIO;
584 			goto error;
585 		}
586 		memcpy(bouncebuf, outbuf, secsz - tail);
587 		(void) lseek(fd, -secsz, SEEK_CUR);
588 		res = write(fd, bouncebuf, secsz);
589 		if (res != secsz) {
590 			ret = EIO;
591 			goto error;
592 		}
593 	}
594 
595 	ret = 0;
596 error:
597 	free(bouncebuf);
598 	return (ret);
599 }
600 
601 static void
602 vdev_clear_pad2(vdev_t *vdev)
603 {
604 	vdev_t *kid;
605 	vdev_boot_envblock_t *be;
606 	off_t off = offsetof(vdev_label_t, vl_be);
607 	zio_checksum_info_t *ci;
608 	zio_cksum_t cksum;
609 
610 	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
611 		if (kid->v_state != VDEV_STATE_HEALTHY)
612 			continue;
613 		vdev_clear_pad2(kid);
614 	}
615 
616 	if (!STAILQ_EMPTY(&vdev->v_children))
617 		return;
618 
619 	be = calloc(1, sizeof (*be));
620 	if (be == NULL) {
621 		printf("failed to clear be area: out of memory\n");
622 		return;
623 	}
624 
625 	ci = &zio_checksum_table[ZIO_CHECKSUM_LABEL];
626 	be->vbe_zbt.zec_magic = ZEC_MAGIC;
627 	zio_checksum_label_verifier(&be->vbe_zbt.zec_cksum, off);
628 	ci->ci_func[0](be, sizeof (*be), NULL, &cksum);
629 	be->vbe_zbt.zec_cksum = cksum;
630 
631 	if (vdev_write(vdev, vdev->v_read_priv, off, be, VDEV_PAD_SIZE)) {
632 		printf("failed to clear be area of primary vdev: %d\n",
633 		    errno);
634 	}
635 	free(be);
636 }
637 
638 /*
639  * Read the next boot command from pad2.
640  * If any instance of pad2 is set to empty string, or the returned string
641  * values are not the same, we consider next boot not to be set.
642  */
643 static char *
644 vdev_read_pad2(vdev_t *vdev)
645 {
646 	vdev_t *kid;
647 	char *tmp, *result = NULL;
648 	vdev_boot_envblock_t *be;
649 	off_t off = offsetof(vdev_label_t, vl_be);
650 
651 	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
652 		if (kid->v_state != VDEV_STATE_HEALTHY)
653 			continue;
654 		tmp = vdev_read_pad2(kid);
655 		if (tmp == NULL)
656 			continue;
657 
658 		/* The next boot is not set, we are done. */
659 		if (*tmp == '\0') {
660 			free(result);
661 			return (tmp);
662 		}
663 		if (result == NULL) {
664 			result = tmp;
665 			continue;
666 		}
667 		/* Are the next boot strings different? */
668 		if (strcmp(result, tmp) != 0) {
669 			free(tmp);
670 			*result = '\0';
671 			break;
672 		}
673 		free(tmp);
674 	}
675 	if (result != NULL)
676 		return (result);
677 
678 	be = malloc(sizeof (*be));
679 	if (be == NULL)
680 		return (NULL);
681 
682 	if (vdev_read(vdev, vdev->v_read_priv, off, be, sizeof (*be))) {
683 		return (NULL);
684 	}
685 
686 	switch (be->vbe_version) {
687 	case VB_RAW:
688 	case VB_NVLIST:
689 		result = strdup(be->vbe_bootenv);
690 	default:
691 		/* Backward compatibility with initial nextboot feaure. */
692 		result = strdup((char *)be);
693 	}
694 	return (result);
695 }
696 
697 static int
698 zfs_dev_init(void)
699 {
700 	spa_t *spa;
701 	spa_t *next;
702 	spa_t *prev;
703 
704 	zfs_init();
705 	if (archsw.arch_zfs_probe == NULL)
706 		return (ENXIO);
707 	archsw.arch_zfs_probe();
708 
709 	prev = NULL;
710 	spa = STAILQ_FIRST(&zfs_pools);
711 	while (spa != NULL) {
712 		next = STAILQ_NEXT(spa, spa_link);
713 		if (zfs_spa_init(spa)) {
714 			if (prev == NULL)
715 				STAILQ_REMOVE_HEAD(&zfs_pools, spa_link);
716 			else
717 				STAILQ_REMOVE_AFTER(&zfs_pools, prev, spa_link);
718 		} else
719 			prev = spa;
720 		spa = next;
721 	}
722 	return (0);
723 }
724 
/*
 * State shared between zfs_probe_dev(), zfs_probe_partition() and the
 * zfs_diskread() callback while a device is being probed.
 */
struct zfs_probe_args {
	int		fd;		/* open descriptor of the device */
	const char	*devname;	/* name the device was opened with */
	uint64_t	*pool_guid;	/* out: guid of the pool found; may be NULL */
	u_int		secsz;		/* sector size in bytes */
};
731 
732 static int
733 zfs_diskread(void *arg, void *buf, size_t blocks, uint64_t offset)
734 {
735 	struct zfs_probe_args *ppa;
736 
737 	ppa = (struct zfs_probe_args *)arg;
738 	return (vdev_read(NULL, (void *)(uintptr_t)ppa->fd,
739 	    offset * ppa->secsz, buf, blocks * ppa->secsz));
740 }
741 
742 static int
743 zfs_probe(int fd, uint64_t *pool_guid)
744 {
745 	spa_t *spa;
746 	int ret;
747 
748 	spa = NULL;
749 	ret = vdev_probe(vdev_read, (void *)(uintptr_t)fd, &spa);
750 	if (ret == 0 && pool_guid != NULL)
751 		*pool_guid = spa->spa_guid;
752 	return (ret);
753 }
754 
755 static int
756 zfs_probe_partition(void *arg, const char *partname,
757     const struct ptable_entry *part)
758 {
759 	struct zfs_probe_args *ppa, pa;
760 	struct ptable *table;
761 	char devname[32];
762 	int ret;
763 
764 	/* Probe only freebsd-zfs and freebsd partitions */
765 	if (part->type != PART_FREEBSD &&
766 	    part->type != PART_FREEBSD_ZFS)
767 		return (0);
768 
769 	ppa = (struct zfs_probe_args *)arg;
770 	strncpy(devname, ppa->devname, strlen(ppa->devname) - 1);
771 	devname[strlen(ppa->devname) - 1] = '\0';
772 	sprintf(devname, "%s%s:", devname, partname);
773 	pa.fd = open(devname, O_RDWR);
774 	if (pa.fd == -1)
775 		return (0);
776 	ret = zfs_probe(pa.fd, ppa->pool_guid);
777 	if (ret == 0)
778 		return (0);
779 	/* Do we have BSD label here? */
780 	if (part->type == PART_FREEBSD) {
781 		pa.devname = devname;
782 		pa.pool_guid = ppa->pool_guid;
783 		pa.secsz = ppa->secsz;
784 		table = ptable_open(&pa, part->end - part->start + 1,
785 		    ppa->secsz, zfs_diskread);
786 		if (table != NULL) {
787 			ptable_iterate(table, &pa, zfs_probe_partition);
788 			ptable_close(table);
789 		}
790 	}
791 	close(pa.fd);
792 	return (0);
793 }
794 
795 int
796 zfs_nextboot(void *vdev, char *buf, size_t size)
797 {
798 	struct zfs_devdesc *dev = (struct zfs_devdesc *)vdev;
799 	spa_t *spa;
800 	vdev_t *vd;
801 	char *result = NULL;
802 
803 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
804 		return (1);
805 
806 	if (dev->pool_guid == 0)
807 		spa = STAILQ_FIRST(&zfs_pools);
808 	else
809 		spa = spa_find_by_guid(dev->pool_guid);
810 
811 	if (spa == NULL) {
812 		printf("ZFS: can't find pool by guid\n");
813 	return (1);
814 	}
815 
816 	STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children, v_childlink) {
817 		char *tmp = vdev_read_pad2(vd);
818 
819 		/* Continue on error. */
820 		if (tmp == NULL)
821 			continue;
822 		/* Nextboot is not set. */
823 		if (*tmp == '\0') {
824 			free(result);
825 			free(tmp);
826 			return (1);
827 		}
828 		if (result == NULL) {
829 			result = tmp;
830 			continue;
831 		}
832 		free(tmp);
833 	}
834 	if (result == NULL)
835 		return (1);
836 
837 	STAILQ_FOREACH(vd, &spa->spa_root_vdev->v_children, v_childlink) {
838 		vdev_clear_pad2(vd);
839 	}
840 
841 	strlcpy(buf, result, size);
842 	free(result);
843 	return (0);
844 }
845 
846 int
847 zfs_probe_dev(const char *devname, uint64_t *pool_guid)
848 {
849 	struct disk_devdesc *dev;
850 	struct ptable *table;
851 	struct zfs_probe_args pa;
852 	uint64_t mediasz;
853 	int ret;
854 
855 	if (pool_guid)
856 		*pool_guid = 0;
857 	pa.fd = open(devname, O_RDWR);
858 	if (pa.fd == -1)
859 		return (ENXIO);
860 	/*
861 	 * We will not probe the whole disk, we can not boot from such
862 	 * disks and some systems will misreport the disk sizes and will
863 	 * hang while accessing the disk.
864 	 */
865 	if (archsw.arch_getdev((void **)&dev, devname, NULL) == 0) {
866 		int partition = dev->d_partition;
867 		int slice = dev->d_slice;
868 
869 		free(dev);
870 		if (partition != D_PARTNONE && slice != D_SLICENONE) {
871 			ret = zfs_probe(pa.fd, pool_guid);
872 			if (ret == 0)
873 				return (0);
874 		}
875 	}
876 
877 	/* Probe each partition */
878 	ret = ioctl(pa.fd, DIOCGMEDIASIZE, &mediasz);
879 	if (ret == 0)
880 		ret = ioctl(pa.fd, DIOCGSECTORSIZE, &pa.secsz);
881 	if (ret == 0) {
882 		pa.devname = devname;
883 		pa.pool_guid = pool_guid;
884 		table = ptable_open(&pa, mediasz / pa.secsz, pa.secsz,
885 		    zfs_diskread);
886 		if (table != NULL) {
887 			ptable_iterate(table, &pa, zfs_probe_partition);
888 			ptable_close(table);
889 		}
890 	}
891 	close(pa.fd);
892 	if (pool_guid && *pool_guid == 0)
893 		ret = ENXIO;
894 	return (ret);
895 }
896 
897 /*
898  * Print information about ZFS pools
899  */
900 static int
901 zfs_dev_print(int verbose)
902 {
903 	spa_t *spa;
904 	char line[80];
905 	int ret = 0;
906 
907 	if (STAILQ_EMPTY(&zfs_pools))
908 		return (0);
909 
910 	printf("%s devices:", zfs_dev.dv_name);
911 	if ((ret = pager_output("\n")) != 0)
912 		return (ret);
913 
914 	if (verbose) {
915 		return (spa_all_status());
916 	}
917 	STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
918 		snprintf(line, sizeof(line), "    zfs:%s\n", spa->spa_name);
919 		ret = pager_output(line);
920 		if (ret != 0)
921 			break;
922 	}
923 	return (ret);
924 }
925 
926 /*
927  * Attempt to open the pool described by (dev) for use by (f).
928  */
929 static int
930 zfs_dev_open(struct open_file *f, ...)
931 {
932 	va_list		args;
933 	struct zfs_devdesc	*dev;
934 	struct zfsmount	*mount;
935 	spa_t		*spa;
936 	int		rv;
937 
938 	va_start(args, f);
939 	dev = va_arg(args, struct zfs_devdesc *);
940 	va_end(args);
941 
942 	if (dev->pool_guid == 0)
943 		spa = STAILQ_FIRST(&zfs_pools);
944 	else
945 		spa = spa_find_by_guid(dev->pool_guid);
946 	if (!spa)
947 		return (ENXIO);
948 	mount = malloc(sizeof(*mount));
949 	if (mount == NULL)
950 		rv = ENOMEM;
951 	else
952 		rv = zfs_mount(spa, dev->root_guid, mount);
953 	if (rv != 0) {
954 		free(mount);
955 		return (rv);
956 	}
957 	if (mount->objset.os_type != DMU_OST_ZFS) {
958 		printf("Unexpected object set type %ju\n",
959 		    (uintmax_t)mount->objset.os_type);
960 		free(mount);
961 		return (EIO);
962 	}
963 	f->f_devdata = mount;
964 	free(dev);
965 	return (0);
966 }
967 
968 static int
969 zfs_dev_close(struct open_file *f)
970 {
971 
972 	free(f->f_devdata);
973 	f->f_devdata = NULL;
974 	return (0);
975 }
976 
977 static int
978 zfs_dev_strategy(void *devdata, int rw, daddr_t dblk, size_t size, char *buf, size_t *rsize)
979 {
980 
981 	return (ENOSYS);
982 }
983 
/*
 * Device switch entry for ZFS pools.  There is no ioctl support
 * (noioctl), no cleanup hook, and the strategy routine always fails with
 * ENOSYS.
 */
struct devsw zfs_dev = {
	.dv_name = "zfs",
	.dv_type = DEVT_ZFS,
	.dv_init = zfs_dev_init,
	.dv_strategy = zfs_dev_strategy,
	.dv_open = zfs_dev_open,
	.dv_close = zfs_dev_close,
	.dv_ioctl = noioctl,
	.dv_print = zfs_dev_print,
	.dv_cleanup = NULL
};
995 
996 int
997 zfs_parsedev(struct zfs_devdesc *dev, const char *devspec, const char **path)
998 {
999 	static char	rootname[ZFS_MAXNAMELEN];
1000 	static char	poolname[ZFS_MAXNAMELEN];
1001 	spa_t		*spa;
1002 	const char	*end;
1003 	const char	*np;
1004 	const char	*sep;
1005 	int		rv;
1006 
1007 	np = devspec;
1008 	if (*np != ':')
1009 		return (EINVAL);
1010 	np++;
1011 	end = strrchr(np, ':');
1012 	if (end == NULL)
1013 		return (EINVAL);
1014 	sep = strchr(np, '/');
1015 	if (sep == NULL || sep >= end)
1016 		sep = end;
1017 	memcpy(poolname, np, sep - np);
1018 	poolname[sep - np] = '\0';
1019 	if (sep < end) {
1020 		sep++;
1021 		memcpy(rootname, sep, end - sep);
1022 		rootname[end - sep] = '\0';
1023 	}
1024 	else
1025 		rootname[0] = '\0';
1026 
1027 	spa = spa_find_by_name(poolname);
1028 	if (!spa)
1029 		return (ENXIO);
1030 	dev->pool_guid = spa->spa_guid;
1031 	rv = zfs_lookup_dataset(spa, rootname, &dev->root_guid);
1032 	if (rv != 0)
1033 		return (rv);
1034 	if (path != NULL)
1035 		*path = (*end == '\0') ? end : end + 1;
1036 	dev->dd.d_dev = &zfs_dev;
1037 	return (0);
1038 }
1039 
1040 char *
1041 zfs_fmtdev(void *vdev)
1042 {
1043 	static char		rootname[ZFS_MAXNAMELEN];
1044 	static char		buf[2 * ZFS_MAXNAMELEN + 8];
1045 	struct zfs_devdesc	*dev = (struct zfs_devdesc *)vdev;
1046 	spa_t			*spa;
1047 
1048 	buf[0] = '\0';
1049 	if (dev->dd.d_dev->dv_type != DEVT_ZFS)
1050 		return (buf);
1051 
1052 	/* Do we have any pools? */
1053 	spa = STAILQ_FIRST(&zfs_pools);
1054 	if (spa == NULL)
1055 		return (buf);
1056 
1057 	if (dev->pool_guid == 0)
1058 		dev->pool_guid = spa->spa_guid;
1059 	else
1060 		spa = spa_find_by_guid(dev->pool_guid);
1061 
1062 	if (spa == NULL) {
1063 		printf("ZFS: can't find pool by guid\n");
1064 		return (buf);
1065 	}
1066 	if (dev->root_guid == 0 && zfs_get_root(spa, &dev->root_guid)) {
1067 		printf("ZFS: can't find root filesystem\n");
1068 		return (buf);
1069 	}
1070 	if (zfs_rlookup(spa, dev->root_guid, rootname)) {
1071 		printf("ZFS: can't find filesystem by guid\n");
1072 		return (buf);
1073 	}
1074 
1075 	if (rootname[0] == '\0')
1076 		sprintf(buf, "%s:%s:", dev->dd.d_dev->dv_name, spa->spa_name);
1077 	else
1078 		sprintf(buf, "%s:%s/%s:", dev->dd.d_dev->dv_name, spa->spa_name,
1079 		    rootname);
1080 	return (buf);
1081 }
1082 
/*
 * Split "pool[/dataset]" into its two parts.
 *
 * The pool part (everything before the first '/') is copied into
 * poolname, a buffer of size bytes.  If dsnamep is not NULL it is pointed
 * at the dataset part inside name, or at an empty string when there is no
 * '/'.
 *
 * Returns 0 on success, or EINVAL if the pool part, including its
 * terminating NUL, does not fit into poolname.
 */
static int
split_devname(const char *name, char *poolname, size_t size,
    const char **dsnamep)
{
	const char *slash, *rest;
	size_t poollen;

	ASSERT(name != NULL);
	ASSERT(poolname != NULL);

	slash = strchr(name, '/');
	if (slash == NULL) {
		poollen = strlen(name);
		rest = "";
	} else {
		poollen = slash - name;
		rest = slash + 1;
	}

	if (poollen + 1 > size)
		return (EINVAL);

	/* Copies exactly poollen characters and terminates the result. */
	strlcpy(poolname, name, poollen + 1);

	if (dsnamep != NULL)
		*dsnamep = rest;

	return (0);
}
1111 
1112 int
1113 zfs_list(const char *name)
1114 {
1115 	static char	poolname[ZFS_MAXNAMELEN];
1116 	uint64_t	objid;
1117 	spa_t		*spa;
1118 	const char	*dsname;
1119 	int		rv;
1120 
1121 	if (split_devname(name, poolname, sizeof(poolname), &dsname) != 0)
1122 		return (EINVAL);
1123 
1124 	spa = spa_find_by_name(poolname);
1125 	if (!spa)
1126 		return (ENXIO);
1127 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1128 	if (rv != 0)
1129 		return (rv);
1130 
1131 	return (zfs_list_dataset(spa, objid));
1132 }
1133 
1134 void
1135 init_zfs_boot_options(const char *currdev_in)
1136 {
1137 	char poolname[ZFS_MAXNAMELEN];
1138 	char *beroot, *currdev;
1139 	spa_t *spa;
1140 	int currdev_len;
1141 	const char *dsname;
1142 
1143 	currdev = NULL;
1144 	currdev_len = strlen(currdev_in);
1145 	if (currdev_len == 0)
1146 		return;
1147 	if (strncmp(currdev_in, "zfs:", 4) != 0)
1148 		return;
1149 	currdev = strdup(currdev_in);
1150 	if (currdev == NULL)
1151 		return;
1152 	/* Remove the trailing : */
1153 	currdev[currdev_len - 1] = '\0';
1154 
1155 	setenv("zfs_be_active", currdev, 1);
1156 	setenv("zfs_be_currpage", "1", 1);
1157 	/* Remove the last element (current bootenv) */
1158 	beroot = strrchr(currdev, '/');
1159 	if (beroot != NULL)
1160 		beroot[0] = '\0';
1161 	beroot = strchr(currdev, ':') + 1;
1162 	setenv("zfs_be_root", beroot, 1);
1163 
1164 	if (split_devname(beroot, poolname, sizeof(poolname), &dsname) != 0)
1165 		return;
1166 
1167 	spa = spa_find_by_name(poolname);
1168 	if (spa == NULL)
1169 		return;
1170 
1171 	zfs_bootenv_initial("bootenvs", spa, beroot, dsname, 0);
1172 	zfs_checkpoints_initial(spa, beroot, dsname);
1173 
1174 	free(currdev);
1175 }
1176 
1177 static void
1178 zfs_checkpoints_initial(spa_t *spa, const char *name, const char *dsname)
1179 {
1180 	char envname[32];
1181 
1182 	if (spa->spa_uberblock_checkpoint.ub_checkpoint_txg != 0) {
1183 		snprintf(envname, sizeof(envname), "zpool_checkpoint");
1184 		setenv(envname, name, 1);
1185 
1186 		spa->spa_uberblock = &spa->spa_uberblock_checkpoint;
1187 		spa->spa_mos = &spa->spa_mos_checkpoint;
1188 
1189 		zfs_bootenv_initial("bootenvs_check", spa, name, dsname, 1);
1190 
1191 		spa->spa_uberblock = &spa->spa_uberblock_master;
1192 		spa->spa_mos = &spa->spa_mos_master;
1193 	}
1194 }
1195 
1196 static void
1197 zfs_bootenv_initial(const char *envprefix, spa_t *spa, const char *rootname,
1198    const char *dsname, int checkpoint)
1199 {
1200 	char		envname[32], envval[256];
1201 	uint64_t	objid;
1202 	int		bootenvs_idx, rv;
1203 
1204 	SLIST_INIT(&zfs_be_head);
1205 	zfs_env_count = 0;
1206 
1207 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1208 	if (rv != 0)
1209 		return;
1210 
1211 	rv = zfs_callback_dataset(spa, objid, zfs_belist_add);
1212 	bootenvs_idx = 0;
1213 	/* Populate the initial environment variables */
1214 	SLIST_FOREACH_SAFE(zfs_be, &zfs_be_head, entries, zfs_be_tmp) {
1215 		/* Enumerate all bootenvs for general usage */
1216 		snprintf(envname, sizeof(envname), "%s[%d]",
1217 		    envprefix, bootenvs_idx);
1218 		snprintf(envval, sizeof(envval), "zfs:%s%s/%s",
1219 		    checkpoint ? "!" : "", rootname, zfs_be->name);
1220 		rv = setenv(envname, envval, 1);
1221 		if (rv != 0)
1222 			break;
1223 		bootenvs_idx++;
1224 	}
1225 	snprintf(envname, sizeof(envname), "%s_count", envprefix);
1226 	snprintf(envval, sizeof(envval), "%d", bootenvs_idx);
1227 	setenv(envname, envval, 1);
1228 
1229 	/* Clean up the SLIST of ZFS BEs */
1230 	while (!SLIST_EMPTY(&zfs_be_head)) {
1231 		zfs_be = SLIST_FIRST(&zfs_be_head);
1232 		SLIST_REMOVE_HEAD(&zfs_be_head, entries);
1233 		free(zfs_be->name);
1234 		free(zfs_be);
1235 	}
1236 }
1237 
1238 int
1239 zfs_bootenv(const char *name)
1240 {
1241 	char		poolname[ZFS_MAXNAMELEN], *root;
1242 	const char	*dsname;
1243 	char		becount[4];
1244 	uint64_t	objid;
1245 	spa_t		*spa;
1246 	int		rv, pages, perpage, currpage;
1247 
1248 	if (name == NULL)
1249 		return (EINVAL);
1250 	if ((root = getenv("zfs_be_root")) == NULL)
1251 		return (EINVAL);
1252 
1253 	if (strcmp(name, root) != 0) {
1254 		if (setenv("zfs_be_root", name, 1) != 0)
1255 			return (ENOMEM);
1256 	}
1257 
1258 	SLIST_INIT(&zfs_be_head);
1259 	zfs_env_count = 0;
1260 
1261 	if (split_devname(name, poolname, sizeof(poolname), &dsname) != 0)
1262 		return (EINVAL);
1263 
1264 	spa = spa_find_by_name(poolname);
1265 	if (!spa)
1266 		return (ENXIO);
1267 	rv = zfs_lookup_dataset(spa, dsname, &objid);
1268 	if (rv != 0)
1269 		return (rv);
1270 	rv = zfs_callback_dataset(spa, objid, zfs_belist_add);
1271 
1272 	/* Calculate and store the number of pages of BEs */
1273 	perpage = (ZFS_BE_LAST - ZFS_BE_FIRST + 1);
1274 	pages = (zfs_env_count / perpage) + ((zfs_env_count % perpage) > 0 ? 1 : 0);
1275 	snprintf(becount, 4, "%d", pages);
1276 	if (setenv("zfs_be_pages", becount, 1) != 0)
1277 		return (ENOMEM);
1278 
1279 	/* Roll over the page counter if it has exceeded the maximum */
1280 	currpage = strtol(getenv("zfs_be_currpage"), NULL, 10);
1281 	if (currpage > pages) {
1282 		if (setenv("zfs_be_currpage", "1", 1) != 0)
1283 			return (ENOMEM);
1284 	}
1285 
1286 	/* Populate the menu environment variables */
1287 	zfs_set_env();
1288 
1289 	/* Clean up the SLIST of ZFS BEs */
1290 	while (!SLIST_EMPTY(&zfs_be_head)) {
1291 		zfs_be = SLIST_FIRST(&zfs_be_head);
1292 		SLIST_REMOVE_HEAD(&zfs_be_head, entries);
1293 		free(zfs_be->name);
1294 		free(zfs_be);
1295 	}
1296 
1297 	return (rv);
1298 }
1299 
1300 int
1301 zfs_belist_add(const char *name, uint64_t value __unused)
1302 {
1303 
1304 	/* Skip special datasets that start with a $ character */
1305 	if (strncmp(name, "$", 1) == 0) {
1306 		return (0);
1307 	}
1308 	/* Add the boot environment to the head of the SLIST */
1309 	zfs_be = malloc(sizeof(struct zfs_be_entry));
1310 	if (zfs_be == NULL) {
1311 		return (ENOMEM);
1312 	}
1313 	zfs_be->name = strdup(name);
1314 	if (zfs_be->name == NULL) {
1315 		free(zfs_be);
1316 		return (ENOMEM);
1317 	}
1318 	SLIST_INSERT_HEAD(&zfs_be_head, zfs_be, entries);
1319 	zfs_env_count++;
1320 
1321 	return (0);
1322 }
1323 
1324 int
1325 zfs_set_env(void)
1326 {
1327 	char envname[32], envval[256];
1328 	char *beroot, *pagenum;
1329 	int rv, page, ctr;
1330 
1331 	beroot = getenv("zfs_be_root");
1332 	if (beroot == NULL) {
1333 		return (1);
1334 	}
1335 
1336 	pagenum = getenv("zfs_be_currpage");
1337 	if (pagenum != NULL) {
1338 		page = strtol(pagenum, NULL, 10);
1339 	} else {
1340 		page = 1;
1341 	}
1342 
1343 	ctr = 1;
1344 	rv = 0;
1345 	zfs_env_index = ZFS_BE_FIRST;
1346 	SLIST_FOREACH_SAFE(zfs_be, &zfs_be_head, entries, zfs_be_tmp) {
1347 		/* Skip to the requested page number */
1348 		if (ctr <= ((ZFS_BE_LAST - ZFS_BE_FIRST + 1) * (page - 1))) {
1349 			ctr++;
1350 			continue;
1351 		}
1352 
1353 		snprintf(envname, sizeof(envname), "bootenvmenu_caption[%d]", zfs_env_index);
1354 		snprintf(envval, sizeof(envval), "%s", zfs_be->name);
1355 		rv = setenv(envname, envval, 1);
1356 		if (rv != 0) {
1357 			break;
1358 		}
1359 
1360 		snprintf(envname, sizeof(envname), "bootenvansi_caption[%d]", zfs_env_index);
1361 		rv = setenv(envname, envval, 1);
1362 		if (rv != 0){
1363 			break;
1364 		}
1365 
1366 		snprintf(envname, sizeof(envname), "bootenvmenu_command[%d]", zfs_env_index);
1367 		rv = setenv(envname, "set_bootenv", 1);
1368 		if (rv != 0){
1369 			break;
1370 		}
1371 
1372 		snprintf(envname, sizeof(envname), "bootenv_root[%d]", zfs_env_index);
1373 		snprintf(envval, sizeof(envval), "zfs:%s/%s", beroot, zfs_be->name);
1374 		rv = setenv(envname, envval, 1);
1375 		if (rv != 0){
1376 			break;
1377 		}
1378 
1379 		zfs_env_index++;
1380 		if (zfs_env_index > ZFS_BE_LAST) {
1381 			break;
1382 		}
1383 
1384 	}
1385 
1386 	for (; zfs_env_index <= ZFS_BE_LAST; zfs_env_index++) {
1387 		snprintf(envname, sizeof(envname), "bootenvmenu_caption[%d]", zfs_env_index);
1388 		(void)unsetenv(envname);
1389 		snprintf(envname, sizeof(envname), "bootenvansi_caption[%d]", zfs_env_index);
1390 		(void)unsetenv(envname);
1391 		snprintf(envname, sizeof(envname), "bootenvmenu_command[%d]", zfs_env_index);
1392 		(void)unsetenv(envname);
1393 		snprintf(envname, sizeof(envname), "bootenv_root[%d]", zfs_env_index);
1394 		(void)unsetenv(envname);
1395 	}
1396 
1397 	return (rv);
1398 }
1399