xref: /freebsd/stand/libsa/zfs/zfsimpl.c (revision d6eb98610fa65663bf0df4574b7cb2c5c4ffda71)
1 /*-
2  * Copyright (c) 2007 Doug Rabson
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 /*
31  *	Stand-alone ZFS file reader.
32  */
33 
34 #include <sys/endian.h>
35 #include <sys/stat.h>
36 #include <sys/stdint.h>
37 
38 #include "zfsimpl.h"
39 #include "zfssubr.c"
40 
41 
42 struct zfsmount {
43 	const spa_t	*spa;
44 	objset_phys_t	objset;
45 	uint64_t	rootobj;
46 };
47 static struct zfsmount zfsmount __unused;
48 
49 /*
50  * List of all vdevs, chained through v_alllink.
51  */
52 static vdev_list_t zfs_vdevs;
53 
54 /*
55  * List of ZFS features supported for read.
56  */
57 static const char *features_for_read[] = {
58 	"org.illumos:lz4_compress",
59 	"com.delphix:hole_birth",
60 	"com.delphix:extensible_dataset",
61 	"com.delphix:embedded_data",
62 	"org.open-zfs:large_blocks",
63 	"org.illumos:sha512",
64 	"org.illumos:skein",
65 	"org.zfsonlinux:large_dnode",
66 	"com.joyent:multi_vdev_crash_dump",
67 	"com.delphix:spacemap_histogram",
68 	"com.delphix:zpool_checkpoint",
69 	"com.delphix:spacemap_v2",
70 	"com.datto:encryption",
71 	"org.zfsonlinux:allocation_classes",
72 	"com.datto:resilver_defer",
73 	NULL
74 };
75 
76 /*
77  * List of all pools, chained through spa_link.
78  */
79 static spa_list_t zfs_pools;
80 
81 static const dnode_phys_t *dnode_cache_obj;
82 static uint64_t dnode_cache_bn;
83 static char *dnode_cache_buf;
84 static char *zap_scratch;
85 static char *zfs_temp_buf, *zfs_temp_end, *zfs_temp_ptr;
86 
87 #define TEMP_SIZE	(1024 * 1024)
88 
89 static int zio_read(const spa_t *spa, const blkptr_t *bp, void *buf);
90 static int zfs_get_root(const spa_t *spa, uint64_t *objid);
91 static int zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result);
92 static int zap_lookup(const spa_t *spa, const dnode_phys_t *dnode,
93     const char *name, uint64_t integer_size, uint64_t num_integers,
94     void *value);
95 
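/*
 * Initialise the stand-alone reader state: empty vdev and pool lists, a
 * simple bump allocator for temporary buffers (zfs_temp_buf), scratch
 * buffers large enough for one maximum-sized block (dnode and ZAP reads)
 * and the checksum support set up by zfs_init_crc().
 */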
96 static void
97 zfs_init(void)
98 {
99 	STAILQ_INIT(&zfs_vdevs);
100 	STAILQ_INIT(&zfs_pools);
101 
102 	zfs_temp_buf = malloc(TEMP_SIZE);
103 	zfs_temp_end = zfs_temp_buf + TEMP_SIZE;
104 	zfs_temp_ptr = zfs_temp_buf;
105 	dnode_cache_buf = malloc(SPA_MAXBLOCKSIZE);
106 	zap_scratch = malloc(SPA_MAXBLOCKSIZE);
107 
108 	zfs_init_crc();
109 }
110 
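/*
 * zfs_alloc()/zfs_free() implement a trivial stack allocator on top of
 * zfs_temp_buf: allocations must be released in strict LIFO order, which
 * zfs_free() enforces with a panic.
 */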
111 static void *
112 zfs_alloc(size_t size)
113 {
114 	char *ptr;
115 
116 	if (zfs_temp_ptr + size > zfs_temp_end) {
117 		panic("ZFS: out of temporary buffer space");
118 	}
119 	ptr = zfs_temp_ptr;
120 	zfs_temp_ptr += size;
121 
122 	return (ptr);
123 }
124 
125 static void
126 zfs_free(void *ptr, size_t size)
127 {
128 
129 	zfs_temp_ptr -= size;
130 	if (zfs_temp_ptr != ptr) {
131 		panic("ZFS: zfs_alloc()/zfs_free() mismatch");
132 	}
133 }
134 
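/*
 * Minimal XDR decoding helpers for the big-endian (NV_ENCODE_XDR) nvlists
 * stored in vdev labels.  Each helper consumes its encoding from *xdr and
 * advances the pointer.
 */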
135 static int
136 xdr_int(const unsigned char **xdr, int *ip)
137 {
138 	*ip = be32dec(*xdr);
139 	(*xdr) += 4;
140 	return (0);
141 }
142 
143 static int
144 xdr_u_int(const unsigned char **xdr, u_int *ip)
145 {
146 	*ip = be32dec(*xdr);
147 	(*xdr) += 4;
148 	return (0);
149 }
150 
151 static int
152 xdr_uint64_t(const unsigned char **xdr, uint64_t *lp)
153 {
154 	u_int hi, lo;
155 
156 	xdr_u_int(xdr, &hi);
157 	xdr_u_int(xdr, &lo);
158 	*lp = (((uint64_t) hi) << 32) | lo;
159 	return (0);
160 }
161 
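/*
 * Look up a name/type pair in an XDR-encoded nvlist and return its value.
 * Only UINT64, STRING, NVLIST and NVLIST_ARRAY pairs are handled; string
 * and nvlist values are returned as pointers into the encoded buffer.
 */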
162 static int
163 nvlist_find(const unsigned char *nvlist, const char *name, int type,
164 	    int* elementsp, void *valuep)
165 {
166 	const unsigned char *p, *pair;
167 	int junk;
168 	int encoded_size, decoded_size;
169 
170 	p = nvlist;
171 	xdr_int(&p, &junk);
172 	xdr_int(&p, &junk);
173 
174 	pair = p;
175 	xdr_int(&p, &encoded_size);
176 	xdr_int(&p, &decoded_size);
177 	while (encoded_size && decoded_size) {
178 		int namelen, pairtype, elements;
179 		const char *pairname;
180 
181 		xdr_int(&p, &namelen);
182 		pairname = (const char*) p;
183 		p += roundup(namelen, 4);
184 		xdr_int(&p, &pairtype);
185 
186 		if (!memcmp(name, pairname, namelen) && type == pairtype) {
187 			xdr_int(&p, &elements);
188 			if (elementsp)
189 				*elementsp = elements;
190 			if (type == DATA_TYPE_UINT64) {
191 				xdr_uint64_t(&p, (uint64_t *) valuep);
192 				return (0);
193 			} else if (type == DATA_TYPE_STRING) {
194 				int len;
195 				xdr_int(&p, &len);
196 				(*(const char**) valuep) = (const char*) p;
197 				return (0);
198 			} else if (type == DATA_TYPE_NVLIST
199 				   || type == DATA_TYPE_NVLIST_ARRAY) {
200 				(*(const unsigned char**) valuep) =
201 					 (const unsigned char*) p;
202 				return (0);
203 			} else {
204 				return (EIO);
205 			}
206 		} else {
207 			/*
208 			 * Not the pair we are looking for, skip to the next one.
209 			 */
210 			p = pair + encoded_size;
211 		}
212 
213 		pair = p;
214 		xdr_int(&p, &encoded_size);
215 		xdr_int(&p, &decoded_size);
216 	}
217 
218 	return (EIO);
219 }
220 
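/*
 * Walk the pool's features-for-read nvlist and fail with EIO if any
 * feature named there is missing from the features_for_read[] list above.
 */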
221 static int
222 nvlist_check_features_for_read(const unsigned char *nvlist)
223 {
224 	const unsigned char *p, *pair;
225 	int junk;
226 	int encoded_size, decoded_size;
227 	int rc;
228 
229 	rc = 0;
230 
231 	p = nvlist;
232 	xdr_int(&p, &junk);
233 	xdr_int(&p, &junk);
234 
235 	pair = p;
236 	xdr_int(&p, &encoded_size);
237 	xdr_int(&p, &decoded_size);
238 	while (encoded_size && decoded_size) {
239 		int namelen, pairtype;
240 		const char *pairname;
241 		int i, found;
242 
243 		found = 0;
244 
245 		xdr_int(&p, &namelen);
246 		pairname = (const char*) p;
247 		p += roundup(namelen, 4);
248 		xdr_int(&p, &pairtype);
249 
250 		for (i = 0; features_for_read[i] != NULL; i++) {
251 			if (!memcmp(pairname, features_for_read[i], namelen)) {
252 				found = 1;
253 				break;
254 			}
255 		}
256 
257 		if (!found) {
258 			printf("ZFS: unsupported feature: %s\n", pairname);
259 			rc = EIO;
260 		}
261 
262 		p = pair + encoded_size;
263 
264 		pair = p;
265 		xdr_int(&p, &encoded_size);
266 		xdr_int(&p, &decoded_size);
267 	}
268 
269 	return (rc);
270 }
271 
272 /*
273  * Return the next nvlist in an nvlist array.
274  */
275 static const unsigned char *
276 nvlist_next(const unsigned char *nvlist)
277 {
278 	const unsigned char *p, *pair;
279 	int junk;
280 	int encoded_size, decoded_size;
281 
282 	p = nvlist;
283 	xdr_int(&p, &junk);
284 	xdr_int(&p, &junk);
285 
286 	pair = p;
287 	xdr_int(&p, &encoded_size);
288 	xdr_int(&p, &decoded_size);
289 	while (encoded_size && decoded_size) {
290 		p = pair + encoded_size;
291 
292 		pair = p;
293 		xdr_int(&p, &encoded_size);
294 		xdr_int(&p, &decoded_size);
295 	}
296 
297 	return p;
298 }
299 
300 #ifdef TEST
301 
302 static const unsigned char *
303 nvlist_print(const unsigned char *nvlist, unsigned int indent)
304 {
305 	static const char* typenames[] = {
306 		"DATA_TYPE_UNKNOWN",
307 		"DATA_TYPE_BOOLEAN",
308 		"DATA_TYPE_BYTE",
309 		"DATA_TYPE_INT16",
310 		"DATA_TYPE_UINT16",
311 		"DATA_TYPE_INT32",
312 		"DATA_TYPE_UINT32",
313 		"DATA_TYPE_INT64",
314 		"DATA_TYPE_UINT64",
315 		"DATA_TYPE_STRING",
316 		"DATA_TYPE_BYTE_ARRAY",
317 		"DATA_TYPE_INT16_ARRAY",
318 		"DATA_TYPE_UINT16_ARRAY",
319 		"DATA_TYPE_INT32_ARRAY",
320 		"DATA_TYPE_UINT32_ARRAY",
321 		"DATA_TYPE_INT64_ARRAY",
322 		"DATA_TYPE_UINT64_ARRAY",
323 		"DATA_TYPE_STRING_ARRAY",
324 		"DATA_TYPE_HRTIME",
325 		"DATA_TYPE_NVLIST",
326 		"DATA_TYPE_NVLIST_ARRAY",
327 		"DATA_TYPE_BOOLEAN_VALUE",
328 		"DATA_TYPE_INT8",
329 		"DATA_TYPE_UINT8",
330 		"DATA_TYPE_BOOLEAN_ARRAY",
331 		"DATA_TYPE_INT8_ARRAY",
332 		"DATA_TYPE_UINT8_ARRAY"
333 	};
334 
335 	unsigned int i, j;
336 	const unsigned char *p, *pair;
337 	int junk;
338 	int encoded_size, decoded_size;
339 
340 	p = nvlist;
341 	xdr_int(&p, &junk);
342 	xdr_int(&p, &junk);
343 
344 	pair = p;
345 	xdr_int(&p, &encoded_size);
346 	xdr_int(&p, &decoded_size);
347 	while (encoded_size && decoded_size) {
348 		int namelen, pairtype, elements;
349 		const char *pairname;
350 
351 		xdr_int(&p, &namelen);
352 		pairname = (const char*) p;
353 		p += roundup(namelen, 4);
354 		xdr_int(&p, &pairtype);
355 
356 		for (i = 0; i < indent; i++)
357 			printf(" ");
358 		printf("%s %s", typenames[pairtype], pairname);
359 
360 		xdr_int(&p, &elements);
361 		switch (pairtype) {
362 		case DATA_TYPE_UINT64: {
363 			uint64_t val;
364 			xdr_uint64_t(&p, &val);
365 			printf(" = 0x%jx\n", (uintmax_t)val);
366 			break;
367 		}
368 
369 		case DATA_TYPE_STRING: {
370 			int len;
371 			xdr_int(&p, &len);
372 			printf(" = \"%s\"\n", p);
373 			break;
374 		}
375 
376 		case DATA_TYPE_NVLIST:
377 			printf("\n");
378 			nvlist_print(p, indent + 1);
379 			break;
380 
381 		case DATA_TYPE_NVLIST_ARRAY:
382 			for (j = 0; j < elements; j++) {
383 				printf("[%d]\n", j);
384 				p = nvlist_print(p, indent + 1);
385 				if (j != elements - 1) {
386 					for (i = 0; i < indent; i++)
387 						printf(" ");
388 					printf("%s %s", typenames[pairtype], pairname);
389 				}
390 			}
391 			break;
392 
393 		default:
394 			printf("\n");
395 		}
396 
397 		p = pair + encoded_size;
398 
399 		pair = p;
400 		xdr_int(&p, &encoded_size);
401 		xdr_int(&p, &decoded_size);
402 	}
403 
404 	return p;
405 }
406 
407 #endif
408 
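/*
 * Read raw data from a leaf vdev using its v_phys_read routine.  When a
 * block pointer is supplied, its physical size determines the read size
 * and the checksum is verified after the read.
 */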
409 static int
410 vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf,
411     off_t offset, size_t size)
412 {
413 	size_t psize;
414 	int rc;
415 
416 	if (!vdev->v_phys_read)
417 		return (EIO);
418 
419 	if (bp) {
420 		psize = BP_GET_PSIZE(bp);
421 	} else {
422 		psize = size;
423 	}
424 
425 	/*printf("ZFS: reading %zu bytes at 0x%jx to %p\n", psize, (uintmax_t)offset, buf);*/
426 	rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize);
427 	if (rc)
428 		return (rc);
429 	if (bp && zio_checksum_verify(vdev->spa, bp, buf))
430 		return (EIO);
431 
432 	return (0);
433 }
434 
435 static int
436 vdev_disk_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
437     off_t offset, size_t bytes)
438 {
439 
440 	return (vdev_read_phys(vdev, bp, buf,
441 		offset + VDEV_LABEL_START_SIZE, bytes));
442 }
443 
444 
445 static int
446 vdev_mirror_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
447     off_t offset, size_t bytes)
448 {
449 	vdev_t *kid;
450 	int rc;
451 
452 	rc = EIO;
453 	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
454 		if (kid->v_state != VDEV_STATE_HEALTHY)
455 			continue;
456 		rc = kid->v_read(kid, bp, buf, offset, bytes);
457 		if (!rc)
458 			return (0);
459 	}
460 
461 	return (rc);
462 }
463 
464 static int
465 vdev_replacing_read(vdev_t *vdev, const blkptr_t *bp, void *buf,
466     off_t offset, size_t bytes)
467 {
468 	vdev_t *kid;
469 
470 	/*
471 	 * Here we should have two kids:
472 	 * The first is the vdev being replaced; it is the only one we can
473 	 * trust to have valid data, but it might not be present.
474 	 * The second is the vdev we are replacing with.  It is most likely
475 	 * healthy, but may not yet have the data we need, so we won't use it.
476 	 */
477 	kid = STAILQ_FIRST(&vdev->v_children);
478 	if (kid == NULL)
479 		return (EIO);
480 	if (kid->v_state != VDEV_STATE_HEALTHY)
481 		return (EIO);
482 	return (kid->v_read(kid, bp, buf, offset, bytes));
483 }
484 
485 static vdev_t *
486 vdev_find(uint64_t guid)
487 {
488 	vdev_t *vdev;
489 
490 	STAILQ_FOREACH(vdev, &zfs_vdevs, v_alllink)
491 		if (vdev->v_guid == guid)
492 			return (vdev);
493 
494 	return (0);
495 }
496 
497 static vdev_t *
498 vdev_create(uint64_t guid, vdev_read_t *_read)
499 {
500 	vdev_t *vdev;
501 
502 	vdev = malloc(sizeof(vdev_t));
503 	memset(vdev, 0, sizeof(vdev_t));
504 	STAILQ_INIT(&vdev->v_children);
505 	vdev->v_guid = guid;
506 	vdev->v_state = VDEV_STATE_OFFLINE;
507 	vdev->v_read = _read;
508 	vdev->v_phys_read = 0;
509 	vdev->v_read_priv = 0;
510 	STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink);
511 
512 	return (vdev);
513 }
514 
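/*
 * Create or update the in-core vdev described by a config nvlist,
 * recursing into its children.  Only disk, mirror, raidz and replacing
 * vdevs (plus files in test builds) are accepted; the vdev state is
 * refreshed when the label we are reading is newer than the one seen
 * previously.
 */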
515 static int
516 vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev,
517     vdev_t **vdevp, int is_newer)
518 {
519 	int rc;
520 	uint64_t guid, id, ashift, nparity;
521 	const char *type;
522 	const char *path;
523 	vdev_t *vdev, *kid;
524 	const unsigned char *kids;
525 	int nkids, i, is_new;
526 	uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present;
527 
528 	if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
529 	    NULL, &guid)
530 	    || nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id)
531 	    || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING,
532 	    NULL, &type)) {
533 		printf("ZFS: can't find vdev details\n");
534 		return (ENOENT);
535 	}
536 
537 	if (strcmp(type, VDEV_TYPE_MIRROR)
538 	    && strcmp(type, VDEV_TYPE_DISK)
539 #ifdef ZFS_TEST
540 	    && strcmp(type, VDEV_TYPE_FILE)
541 #endif
542 	    && strcmp(type, VDEV_TYPE_RAIDZ)
543 	    && strcmp(type, VDEV_TYPE_REPLACING)) {
544 		printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
545 		return (EIO);
546 	}
547 
548 	is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0;
549 
550 	nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL,
551 			&is_offline);
552 	nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL,
553 			&is_removed);
554 	nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL,
555 			&is_faulted);
556 	nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, NULL,
557 			&is_degraded);
558 	nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, NULL,
559 			&isnt_present);
560 
561 	vdev = vdev_find(guid);
562 	if (!vdev) {
563 		is_new = 1;
564 
565 		if (!strcmp(type, VDEV_TYPE_MIRROR))
566 			vdev = vdev_create(guid, vdev_mirror_read);
567 		else if (!strcmp(type, VDEV_TYPE_RAIDZ))
568 			vdev = vdev_create(guid, vdev_raidz_read);
569 		else if (!strcmp(type, VDEV_TYPE_REPLACING))
570 			vdev = vdev_create(guid, vdev_replacing_read);
571 		else
572 			vdev = vdev_create(guid, vdev_disk_read);
573 
574 		vdev->v_id = id;
575 		vdev->v_top = pvdev != NULL ? pvdev : vdev;
576 		if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT,
577 			DATA_TYPE_UINT64, NULL, &ashift) == 0) {
578 			vdev->v_ashift = ashift;
579 		} else {
580 			vdev->v_ashift = 0;
581 		}
582 		if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY,
583 			DATA_TYPE_UINT64, NULL, &nparity) == 0) {
584 			vdev->v_nparity = nparity;
585 		} else {
586 			vdev->v_nparity = 0;
587 		}
588 		if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH,
589 				DATA_TYPE_STRING, NULL, &path) == 0) {
590 			if (strncmp(path, "/dev/", 5) == 0)
591 				path += 5;
592 			vdev->v_name = strdup(path);
593 		} else {
594 			if (!strcmp(type, "raidz")) {
595 				if (vdev->v_nparity == 1)
596 					vdev->v_name = "raidz1";
597 				else if (vdev->v_nparity == 2)
598 					vdev->v_name = "raidz2";
599 				else if (vdev->v_nparity == 3)
600 					vdev->v_name = "raidz3";
601 				else {
602 					printf("ZFS: can only boot from disk, mirror, raidz1, raidz2 and raidz3 vdevs\n");
603 					return (EIO);
604 				}
605 			} else {
606 				vdev->v_name = strdup(type);
607 			}
608 		}
609 	} else {
610 		is_new = 0;
611 	}
612 
613 	if (is_new || is_newer) {
614 		/*
615 		 * This is either a new vdev or we've already seen this vdev,
616 		 * but from an older vdev label, so let's refresh its state
617 		 * from the newer label.
618 		 */
619 		if (is_offline)
620 			vdev->v_state = VDEV_STATE_OFFLINE;
621 		else if (is_removed)
622 			vdev->v_state = VDEV_STATE_REMOVED;
623 		else if (is_faulted)
624 			vdev->v_state = VDEV_STATE_FAULTED;
625 		else if (is_degraded)
626 			vdev->v_state = VDEV_STATE_DEGRADED;
627 		else if (isnt_present)
628 			vdev->v_state = VDEV_STATE_CANT_OPEN;
629 	}
630 
631 	rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY,
632 	    &nkids, &kids);
633 	/*
634 	 * It's OK if we don't have any kids.
635 	 */
636 	if (rc == 0) {
637 		vdev->v_nchildren = nkids;
638 		for (i = 0; i < nkids; i++) {
639 			rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer);
640 			if (rc)
641 				return (rc);
642 			if (is_new)
643 				STAILQ_INSERT_TAIL(&vdev->v_children, kid,
644 						   v_childlink);
645 			kids = nvlist_next(kids);
646 		}
647 	} else {
648 		vdev->v_nchildren = 0;
649 	}
650 
651 	if (vdevp)
652 		*vdevp = vdev;
653 	return (0);
654 }
655 
656 static void
657 vdev_set_state(vdev_t *vdev)
658 {
659 	vdev_t *kid;
660 	int good_kids;
661 	int bad_kids;
662 
663 	/*
664 	 * A mirror or raidz is healthy if all kids are healthy.  Otherwise
665 	 * a mirror is degraded if at least one kid is still healthy, and a
666 	 * raidz is degraded if no more than nparity kids have failed.
667 	 */
668 	if (STAILQ_FIRST(&vdev->v_children)) {
669 		good_kids = 0;
670 		bad_kids = 0;
671 		STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
672 			if (kid->v_state == VDEV_STATE_HEALTHY)
673 				good_kids++;
674 			else
675 				bad_kids++;
676 		}
677 		if (bad_kids == 0) {
678 			vdev->v_state = VDEV_STATE_HEALTHY;
679 		} else {
680 			if (vdev->v_read == vdev_mirror_read) {
681 				if (good_kids) {
682 					vdev->v_state = VDEV_STATE_DEGRADED;
683 				} else {
684 					vdev->v_state = VDEV_STATE_OFFLINE;
685 				}
686 			} else if (vdev->v_read == vdev_raidz_read) {
687 				if (bad_kids > vdev->v_nparity) {
688 					vdev->v_state = VDEV_STATE_OFFLINE;
689 				} else {
690 					vdev->v_state = VDEV_STATE_DEGRADED;
691 				}
692 			}
693 		}
694 	}
695 }
696 
697 static spa_t *
698 spa_find_by_guid(uint64_t guid)
699 {
700 	spa_t *spa;
701 
702 	STAILQ_FOREACH(spa, &zfs_pools, spa_link)
703 		if (spa->spa_guid == guid)
704 			return (spa);
705 
706 	return (0);
707 }
708 
709 static spa_t *
710 spa_find_by_name(const char *name)
711 {
712 	spa_t *spa;
713 
714 	STAILQ_FOREACH(spa, &zfs_pools, spa_link)
715 		if (!strcmp(spa->spa_name, name))
716 			return (spa);
717 
718 	return (0);
719 }
720 
721 #ifdef BOOT2
722 static spa_t *
723 spa_get_primary(void)
724 {
725 
726 	return (STAILQ_FIRST(&zfs_pools));
727 }
728 
729 static vdev_t *
730 spa_get_primary_vdev(const spa_t *spa)
731 {
732 	vdev_t *vdev;
733 	vdev_t *kid;
734 
735 	if (spa == NULL)
736 		spa = spa_get_primary();
737 	if (spa == NULL)
738 		return (NULL);
739 	vdev = STAILQ_FIRST(&spa->spa_vdevs);
740 	if (vdev == NULL)
741 		return (NULL);
742 	for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL;
743 	     kid = STAILQ_FIRST(&vdev->v_children))
744 		vdev = kid;
745 	return (vdev);
746 }
747 #endif
748 
749 static spa_t *
750 spa_create(uint64_t guid, const char *name)
751 {
752 	spa_t *spa;
753 
754 	if ((spa = calloc(1, sizeof(spa_t))) == NULL)
755 		return (NULL);
756 	if ((spa->spa_name = strdup(name)) == NULL) {
757 		free(spa);
758 		return (NULL);
759 	}
760 	STAILQ_INIT(&spa->spa_vdevs);
761 	spa->spa_guid = guid;
762 	STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link);
763 
764 	return (spa);
765 }
766 
767 static const char *
768 state_name(vdev_state_t state)
769 {
770 	static const char* names[] = {
771 		"UNKNOWN",
772 		"CLOSED",
773 		"OFFLINE",
774 		"REMOVED",
775 		"CANT_OPEN",
776 		"FAULTED",
777 		"DEGRADED",
778 		"ONLINE"
779 	};
780 	return names[state];
781 }
782 
783 #ifdef BOOT2
784 
785 #define pager_printf printf
786 
787 #else
788 
789 static int
790 pager_printf(const char *fmt, ...)
791 {
792 	char line[80];
793 	va_list args;
794 
795 	va_start(args, fmt);
796 	vsprintf(line, fmt, args);
797 	va_end(args);
798 
799 	return (pager_output(line));
800 }
801 
802 #endif
803 
804 #define STATUS_FORMAT	"        %s %s\n"
805 
806 static int
807 print_state(int indent, const char *name, vdev_state_t state)
808 {
809 	char buf[512];
810 	int i;
811 
812 	buf[0] = 0;
813 	for (i = 0; i < indent; i++)
814 		strcat(buf, "  ");
815 	strcat(buf, name);
816 
817 	return (pager_printf(STATUS_FORMAT, buf, state_name(state)));
818 }
819 
820 static int
821 vdev_status(vdev_t *vdev, int indent)
822 {
823 	vdev_t *kid;
824 	int ret;
825 	ret = print_state(indent, vdev->v_name, vdev->v_state);
826 	if (ret != 0)
827 		return (ret);
828 
829 	STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) {
830 		ret = vdev_status(kid, indent + 1);
831 		if (ret != 0)
832 			return (ret);
833 	}
834 	return (ret);
835 }
836 
837 static int
838 spa_status(spa_t *spa)
839 {
840 	static char bootfs[ZFS_MAXNAMELEN];
841 	uint64_t rootid;
842 	vdev_t *vdev;
843 	int good_kids, bad_kids, degraded_kids, ret;
844 	vdev_state_t state;
845 
846 	ret = pager_printf("  pool: %s\n", spa->spa_name);
847 	if (ret != 0)
848 		return (ret);
849 
850 	if (zfs_get_root(spa, &rootid) == 0 &&
851 	    zfs_rlookup(spa, rootid, bootfs) == 0) {
852 		if (bootfs[0] == '\0')
853 			ret = pager_printf("bootfs: %s\n", spa->spa_name);
854 		else
855 			ret = pager_printf("bootfs: %s/%s\n", spa->spa_name,
856 			    bootfs);
857 		if (ret != 0)
858 			return (ret);
859 	}
860 	ret = pager_printf("config:\n\n");
861 	if (ret != 0)
862 		return (ret);
863 	ret = pager_printf(STATUS_FORMAT, "NAME", "STATE");
864 	if (ret != 0)
865 		return (ret);
866 
867 	good_kids = 0;
868 	degraded_kids = 0;
869 	bad_kids = 0;
870 	STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
871 		if (vdev->v_state == VDEV_STATE_HEALTHY)
872 			good_kids++;
873 		else if (vdev->v_state == VDEV_STATE_DEGRADED)
874 			degraded_kids++;
875 		else
876 			bad_kids++;
877 	}
878 
879 	state = VDEV_STATE_CLOSED;
880 	if (good_kids > 0 && (degraded_kids + bad_kids) == 0)
881 		state = VDEV_STATE_HEALTHY;
882 	else if ((good_kids + degraded_kids) > 0)
883 		state = VDEV_STATE_DEGRADED;
884 
885 	ret = print_state(0, spa->spa_name, state);
886 	if (ret != 0)
887 		return (ret);
888 	STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
889 		ret = vdev_status(vdev, 1);
890 		if (ret != 0)
891 			return (ret);
892 	}
893 	return (ret);
894 }
895 
896 static int
897 spa_all_status(void)
898 {
899 	spa_t *spa;
900 	int first = 1, ret = 0;
901 
902 	STAILQ_FOREACH(spa, &zfs_pools, spa_link) {
903 		if (!first) {
904 			ret = pager_printf("\n");
905 			if (ret != 0)
906 				return (ret);
907 		}
908 		first = 0;
909 		ret = spa_status(spa);
910 		if (ret != 0)
911 			return (ret);
912 	}
913 	return (ret);
914 }
915 
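/*
 * Translate an offset within label l into an absolute device offset;
 * labels 0 and 1 sit at the front of the device, labels 2 and 3 at the
 * end.
 */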
916 static uint64_t
917 vdev_label_offset(uint64_t psize, int l, uint64_t offset)
918 {
919 	uint64_t label_offset;
920 
921 	if (l < VDEV_LABELS / 2)
922 		label_offset = 0;
923 	else
924 		label_offset = psize - VDEV_LABELS * sizeof (vdev_label_t);
925 
926 	return (offset + l * sizeof (vdev_label_t) + label_offset);
927 }
928 
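/*
 * Probe a device for a ZFS vdev label.  The label with the highest txg is
 * chosen, its pool version, features and state are validated, the vdev
 * tree is (re)built from the config nvlist, and the best uberblock found
 * across all labels is recorded in the pool.
 */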
929 static int
930 vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap)
931 {
932 	vdev_t vtmp;
933 	vdev_phys_t *vdev_label = (vdev_phys_t *) zap_scratch;
934 	vdev_phys_t *tmp_label;
935 	spa_t *spa;
936 	vdev_t *vdev, *top_vdev, *pool_vdev;
937 	off_t off;
938 	blkptr_t bp;
939 	const unsigned char *nvlist = NULL;
940 	uint64_t val;
941 	uint64_t guid;
942 	uint64_t best_txg = 0;
943 	uint64_t pool_txg, pool_guid;
944 	uint64_t psize;
945 	const char *pool_name;
946 	const unsigned char *vdevs;
947 	const unsigned char *features;
948 	int i, l, rc, is_newer;
949 	char *upbuf;
950 	const struct uberblock *up;
951 
952 	/*
953 	 * Load the vdev label and figure out which
954 	 * uberblock is most current.
955 	 */
956 	memset(&vtmp, 0, sizeof(vtmp));
957 	vtmp.v_phys_read = _read;
958 	vtmp.v_read_priv = read_priv;
959 	psize = P2ALIGN(ldi_get_size(read_priv),
960 	    (uint64_t)sizeof (vdev_label_t));
961 
962 	/* Test for minimum pool size. */
963 	if (psize < SPA_MINDEVSIZE)
964 		return (EIO);
965 
966 	tmp_label = zfs_alloc(sizeof(vdev_phys_t));
967 
968 	for (l = 0; l < VDEV_LABELS; l++) {
969 		off = vdev_label_offset(psize, l,
970 		    offsetof(vdev_label_t, vl_vdev_phys));
971 
972 		BP_ZERO(&bp);
973 		BP_SET_LSIZE(&bp, sizeof(vdev_phys_t));
974 		BP_SET_PSIZE(&bp, sizeof(vdev_phys_t));
975 		BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
976 		BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
977 		DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
978 		ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
979 
980 		if (vdev_read_phys(&vtmp, &bp, tmp_label, off, 0))
981 			continue;
982 
983 		if (tmp_label->vp_nvlist[0] != NV_ENCODE_XDR)
984 			continue;
985 
986 		nvlist = (const unsigned char *) tmp_label->vp_nvlist + 4;
987 		if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG,
988 		    DATA_TYPE_UINT64, NULL, &pool_txg) != 0)
989 			continue;
990 
991 		if (best_txg <= pool_txg) {
992 			best_txg = pool_txg;
993 			memcpy(vdev_label, tmp_label, sizeof (vdev_phys_t));
994 		}
995 	}
996 
997 	zfs_free(tmp_label, sizeof (vdev_phys_t));
998 
999 	if (best_txg == 0)
1000 		return (EIO);
1001 
1002 	if (vdev_label->vp_nvlist[0] != NV_ENCODE_XDR)
1003 		return (EIO);
1004 
1005 	nvlist = (const unsigned char *) vdev_label->vp_nvlist + 4;
1006 
1007 	if (nvlist_find(nvlist, ZPOOL_CONFIG_VERSION, DATA_TYPE_UINT64,
1008 	    NULL, &val) != 0) {
1009 		return (EIO);
1010 	}
1011 
1012 	if (!SPA_VERSION_IS_SUPPORTED(val)) {
1013 		printf("ZFS: unsupported ZFS version %u (should be %u)\n",
1014 		    (unsigned) val, (unsigned) SPA_VERSION);
1015 		return (EIO);
1016 	}
1017 
1018 	/* Check ZFS features for read */
1019 	if (nvlist_find(nvlist, ZPOOL_CONFIG_FEATURES_FOR_READ,
1020 	    DATA_TYPE_NVLIST, NULL, &features) == 0 &&
1021 	    nvlist_check_features_for_read(features) != 0) {
1022 		return (EIO);
1023 	}
1024 
1025 	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_STATE, DATA_TYPE_UINT64,
1026 	    NULL, &val) != 0) {
1027 		return (EIO);
1028 	}
1029 
1030 	if (val == POOL_STATE_DESTROYED) {
1031 		/* We can't boot from a destroyed pool. */
1032 		return (EIO);
1033 	}
1034 
1035 	if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_TXG, DATA_TYPE_UINT64,
1036 	    NULL, &pool_txg) != 0 ||
1037 	    nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64,
1038 	    NULL, &pool_guid) != 0 ||
1039 	    nvlist_find(nvlist, ZPOOL_CONFIG_POOL_NAME, DATA_TYPE_STRING,
1040 	    NULL, &pool_name) != 0) {
1041 		/*
1042 		 * Cache and spare devices end up here - just ignore
1043 		 * them.
1044 		 */
1045 		/*printf("ZFS: can't find pool details\n");*/
1046 		return (EIO);
1047 	}
1048 
1049 	if (nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64,
1050 	    NULL, &val) == 0 && val != 0) {
1051 		return (EIO);
1052 	}
1053 
1054 	/*
1055 	 * Create the pool if this is the first time we've seen it.
1056 	 */
1057 	spa = spa_find_by_guid(pool_guid);
1058 	if (spa == NULL) {
1059 		spa = spa_create(pool_guid, pool_name);
1060 		if (spa == NULL)
1061 			return (ENOMEM);
1062 	}
1063 	if (pool_txg > spa->spa_txg) {
1064 		spa->spa_txg = pool_txg;
1065 		is_newer = 1;
1066 	} else {
1067 		is_newer = 0;
1068 	}
1069 
1070 	/*
1071 	 * Get the vdev tree and create our in-core copy of it.
1072 	 * If we already have a vdev with this guid, this must
1073 	 * be some kind of alias (overlapping slices, dangerously dedicated
1074 	 * disks etc).
1075 	 */
1076 	if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64,
1077 	    NULL, &guid) != 0) {
1078 		return (EIO);
1079 	}
1080 	vdev = vdev_find(guid);
1081 	if (vdev && vdev->v_phys_read)	/* Has this vdev already been inited? */
1082 		return (EIO);
1083 
1084 	if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST,
1085 	    NULL, &vdevs)) {
1086 		return (EIO);
1087 	}
1088 
1089 	rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer);
1090 	if (rc != 0)
1091 		return (rc);
1092 
1093 	/*
1094 	 * Add the toplevel vdev to the pool if it's not already there.
1095 	 */
1096 	STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink)
1097 		if (top_vdev == pool_vdev)
1098 			break;
1099 	if (!pool_vdev && top_vdev) {
1100 		top_vdev->spa = spa;
1101 		STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink);
1102 	}
1103 
1104 	/*
1105 	 * We should already have created an incomplete vdev for this
1106 	 * vdev. Find it and initialise it with our read proc.
1107 	 */
1108 	vdev = vdev_find(guid);
1109 	if (vdev) {
1110 		vdev->v_phys_read = _read;
1111 		vdev->v_read_priv = read_priv;
1112 		vdev->v_state = VDEV_STATE_HEALTHY;
1113 	} else {
1114 		printf("ZFS: inconsistent nvlist contents\n");
1115 		return (EIO);
1116 	}
1117 
1118 	/*
1119 	 * Re-evaluate top-level vdev state.
1120 	 */
1121 	vdev_set_state(top_vdev);
1122 
1123 	/*
1124 	 * OK, we are happy with the pool so far.  Let's find
1125 	 * the best uberblock and then we can actually access
1126 	 * the contents of the pool.
1127 	 */
1128 	upbuf = zfs_alloc(VDEV_UBERBLOCK_SIZE(vdev));
1129 	up = (const struct uberblock *)upbuf;
1130 	for (l = 0; l < VDEV_LABELS; l++) {
1131 		for (i = 0; i < VDEV_UBERBLOCK_COUNT(vdev); i++) {
1132 			off = vdev_label_offset(psize, l,
1133 			    VDEV_UBERBLOCK_OFFSET(vdev, i));
1134 			BP_ZERO(&bp);
1135 			DVA_SET_OFFSET(&bp.blk_dva[0], off);
1136 			BP_SET_LSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1137 			BP_SET_PSIZE(&bp, VDEV_UBERBLOCK_SIZE(vdev));
1138 			BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
1139 			BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
1140 			ZIO_SET_CHECKSUM(&bp.blk_cksum, off, 0, 0, 0);
1141 
1142 			if (vdev_read_phys(vdev, &bp, upbuf, off, 0))
1143 				continue;
1144 
1145 			if (up->ub_magic != UBERBLOCK_MAGIC)
1146 				continue;
1147 			if (up->ub_txg < spa->spa_txg)
1148 				continue;
1149 			if (up->ub_txg > spa->spa_uberblock.ub_txg ||
1150 			    (up->ub_txg == spa->spa_uberblock.ub_txg &&
1151 			    up->ub_timestamp >
1152 			    spa->spa_uberblock.ub_timestamp)) {
1153 				spa->spa_uberblock = *up;
1154 			}
1155 		}
1156 	}
1157 	zfs_free(upbuf, VDEV_UBERBLOCK_SIZE(vdev));
1158 
1159 	vdev->spa = spa;
1160 	if (spap != NULL)
1161 		*spap = spa;
1162 	return (0);
1163 }
1164 
1165 static int
1166 ilog2(int n)
1167 {
1168 	int v;
1169 
1170 	for (v = 0; v < 32; v++)
1171 		if (n == (1 << v))
1172 			return v;
1173 	return -1;
1174 }
1175 
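/*
 * Read a gang block: fetch the gang header through an artificial block
 * pointer, then read and concatenate each constituent block and verify
 * the checksum of the reassembled data.
 */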
1176 static int
1177 zio_read_gang(const spa_t *spa, const blkptr_t *bp, void *buf)
1178 {
1179 	blkptr_t gbh_bp;
1180 	zio_gbh_phys_t zio_gb;
1181 	char *pbuf;
1182 	int i;
1183 
1184 	/* Artificial BP for gang block header. */
1185 	gbh_bp = *bp;
1186 	BP_SET_PSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1187 	BP_SET_LSIZE(&gbh_bp, SPA_GANGBLOCKSIZE);
1188 	BP_SET_CHECKSUM(&gbh_bp, ZIO_CHECKSUM_GANG_HEADER);
1189 	BP_SET_COMPRESS(&gbh_bp, ZIO_COMPRESS_OFF);
1190 	for (i = 0; i < SPA_DVAS_PER_BP; i++)
1191 		DVA_SET_GANG(&gbh_bp.blk_dva[i], 0);
1192 
1193 	/* Read gang header block using the artificial BP. */
1194 	if (zio_read(spa, &gbh_bp, &zio_gb))
1195 		return (EIO);
1196 
1197 	pbuf = buf;
1198 	for (i = 0; i < SPA_GBH_NBLKPTRS; i++) {
1199 		blkptr_t *gbp = &zio_gb.zg_blkptr[i];
1200 
1201 		if (BP_IS_HOLE(gbp))
1202 			continue;
1203 		if (zio_read(spa, gbp, pbuf))
1204 			return (EIO);
1205 		pbuf += BP_GET_PSIZE(gbp);
1206 	}
1207 
1208 	if (zio_checksum_verify(spa, bp, buf))
1209 		return (EIO);
1210 	return (0);
1211 }
1212 
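/*
 * Read the data referenced by a block pointer, handling embedded data,
 * gang blocks and compression, and trying each DVA in turn until one of
 * the copies can be read successfully.
 */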
1213 static int
1214 zio_read(const spa_t *spa, const blkptr_t *bp, void *buf)
1215 {
1216 	int cpfunc = BP_GET_COMPRESS(bp);
1217 	uint64_t align, size;
1218 	void *pbuf;
1219 	int i, error;
1220 
1221 	/*
1222 	 * Process data embedded in block pointer
1223 	 */
1224 	if (BP_IS_EMBEDDED(bp)) {
1225 		ASSERT(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
1226 
1227 		size = BPE_GET_PSIZE(bp);
1228 		ASSERT(size <= BPE_PAYLOAD_SIZE);
1229 
1230 		if (cpfunc != ZIO_COMPRESS_OFF)
1231 			pbuf = zfs_alloc(size);
1232 		else
1233 			pbuf = buf;
1234 
1235 		decode_embedded_bp_compressed(bp, pbuf);
1236 		error = 0;
1237 
1238 		if (cpfunc != ZIO_COMPRESS_OFF) {
1239 			error = zio_decompress_data(cpfunc, pbuf,
1240 			    size, buf, BP_GET_LSIZE(bp));
1241 			zfs_free(pbuf, size);
1242 		}
1243 		if (error != 0)
1244 			printf("ZFS: i/o error - unable to decompress block pointer data, error %d\n",
1245 			    error);
1246 		return (error);
1247 	}
1248 
1249 	error = EIO;
1250 
1251 	for (i = 0; i < SPA_DVAS_PER_BP; i++) {
1252 		const dva_t *dva = &bp->blk_dva[i];
1253 		vdev_t *vdev;
1254 		int vdevid;
1255 		off_t offset;
1256 
1257 		if (!dva->dva_word[0] && !dva->dva_word[1])
1258 			continue;
1259 
1260 		vdevid = DVA_GET_VDEV(dva);
1261 		offset = DVA_GET_OFFSET(dva);
1262 		STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) {
1263 			if (vdev->v_id == vdevid)
1264 				break;
1265 		}
1266 		if (!vdev || !vdev->v_read)
1267 			continue;
1268 
1269 		size = BP_GET_PSIZE(bp);
1270 		if (vdev->v_read == vdev_raidz_read) {
1271 			align = 1ULL << vdev->v_top->v_ashift;
1272 			if (P2PHASE(size, align) != 0)
1273 				size = P2ROUNDUP(size, align);
1274 		}
1275 		if (size != BP_GET_PSIZE(bp) || cpfunc != ZIO_COMPRESS_OFF)
1276 			pbuf = zfs_alloc(size);
1277 		else
1278 			pbuf = buf;
1279 
1280 		if (DVA_GET_GANG(dva))
1281 			error = zio_read_gang(spa, bp, pbuf);
1282 		else
1283 			error = vdev->v_read(vdev, bp, pbuf, offset, size);
1284 		if (error == 0) {
1285 			if (cpfunc != ZIO_COMPRESS_OFF)
1286 				error = zio_decompress_data(cpfunc, pbuf,
1287 				    BP_GET_PSIZE(bp), buf, BP_GET_LSIZE(bp));
1288 			else if (size != BP_GET_PSIZE(bp))
1289 				bcopy(pbuf, buf, BP_GET_PSIZE(bp));
1290 		}
1291 		if (buf != pbuf)
1292 			zfs_free(pbuf, size);
1293 		if (error == 0)
1294 			break;
1295 	}
1296 	if (error != 0)
1297 		printf("ZFS: i/o error - all block copies unavailable\n");
1298 	return (error);
1299 }
1300 
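/*
 * Read an arbitrary byte range from a dnode, walking the indirect block
 * tree level by level.  The most recently read data block is cached in
 * dnode_cache_buf to speed up sequential reads.
 */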
1301 static int
1302 dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, void *buf, size_t buflen)
1303 {
1304 	int ibshift = dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
1305 	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1306 	int nlevels = dnode->dn_nlevels;
1307 	int i, rc;
1308 
1309 	if (bsize > SPA_MAXBLOCKSIZE) {
1310 		printf("ZFS: I/O error - blocks larger than %llu are not "
1311 		    "supported\n", SPA_MAXBLOCKSIZE);
1312 		return (EIO);
1313 	}
1314 
1315 	/*
1316 	 * Note: bsize may not be a power of two here so we need to do an
1317 	 * actual divide rather than a bitshift.
1318 	 */
1319 	while (buflen > 0) {
1320 		uint64_t bn = offset / bsize;
1321 		int boff = offset % bsize;
1322 		int ibn;
1323 		const blkptr_t *indbp;
1324 		blkptr_t bp;
1325 
1326 		if (bn > dnode->dn_maxblkid)
1327 			return (EIO);
1328 
1329 		if (dnode == dnode_cache_obj && bn == dnode_cache_bn)
1330 			goto cached;
1331 
1332 		indbp = dnode->dn_blkptr;
1333 		for (i = 0; i < nlevels; i++) {
1334 			/*
1335 			 * Copy the bp from the indirect array so that
1336 			 * we can re-use the scratch buffer for multi-level
1337 			 * objects.
1338 			 */
1339 			ibn = bn >> ((nlevels - i - 1) * ibshift);
1340 			ibn &= ((1 << ibshift) - 1);
1341 			bp = indbp[ibn];
1342 			if (BP_IS_HOLE(&bp)) {
1343 				memset(dnode_cache_buf, 0, bsize);
1344 				break;
1345 			}
1346 			rc = zio_read(spa, &bp, dnode_cache_buf);
1347 			if (rc)
1348 				return (rc);
1349 			indbp = (const blkptr_t *) dnode_cache_buf;
1350 		}
1351 		dnode_cache_obj = dnode;
1352 		dnode_cache_bn = bn;
1353 	cached:
1354 
1355 		/*
1356 		 * The buffer contains our data block. Copy what we
1357 		 * need from it and loop.
1358 		 */
1359 		i = bsize - boff;
1360 		if (i > buflen) i = buflen;
1361 		memcpy(buf, &dnode_cache_buf[boff], i);
1362 		buf = ((char*) buf) + i;
1363 		offset += i;
1364 		buflen -= i;
1365 	}
1366 
1367 	return (0);
1368 }
1369 
1370 /*
1371  * Lookup a value in a microzap directory. Assumes that the zap
1372  * scratch buffer contains the directory contents.
1373  */
1374 static int
1375 mzap_lookup(const dnode_phys_t *dnode, const char *name, uint64_t *value)
1376 {
1377 	const mzap_phys_t *mz;
1378 	const mzap_ent_phys_t *mze;
1379 	size_t size;
1380 	int chunks, i;
1381 
1382 	/*
1383 	 * Microzap objects use exactly one block. Read the whole
1384 	 * thing.
1385 	 */
1386 	size = dnode->dn_datablkszsec * 512;
1387 
1388 	mz = (const mzap_phys_t *) zap_scratch;
1389 	chunks = size / MZAP_ENT_LEN - 1;
1390 
1391 	for (i = 0; i < chunks; i++) {
1392 		mze = &mz->mz_chunk[i];
1393 		if (!strcmp(mze->mze_name, name)) {
1394 			*value = mze->mze_value;
1395 			return (0);
1396 		}
1397 	}
1398 
1399 	return (ENOENT);
1400 }
1401 
1402 /*
1403  * Compare a name with a zap leaf entry. Return non-zero if the name
1404  * matches.
1405  */
1406 static int
1407 fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *name)
1408 {
1409 	size_t namelen;
1410 	const zap_leaf_chunk_t *nc;
1411 	const char *p;
1412 
1413 	namelen = zc->l_entry.le_name_numints;
1414 
1415 	nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1416 	p = name;
1417 	while (namelen > 0) {
1418 		size_t len;
1419 		len = namelen;
1420 		if (len > ZAP_LEAF_ARRAY_BYTES)
1421 			len = ZAP_LEAF_ARRAY_BYTES;
1422 		if (memcmp(p, nc->l_array.la_array, len))
1423 			return (0);
1424 		p += len;
1425 		namelen -= len;
1426 		nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1427 	}
1428 
1429 	return 1;
1430 }
1431 
1432 /*
1433  * Extract a uint64_t value from a zap leaf entry.
1434  */
1435 static uint64_t
1436 fzap_leaf_value(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc)
1437 {
1438 	const zap_leaf_chunk_t *vc;
1439 	int i;
1440 	uint64_t value;
1441 	const uint8_t *p;
1442 
1443 	vc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_value_chunk);
1444 	for (i = 0, value = 0, p = vc->l_array.la_array; i < 8; i++) {
1445 		value = (value << 8) | p[i];
1446 	}
1447 
1448 	return value;
1449 }
1450 
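/*
 * Store a value into an integer of the given byte width (1, 2, 4 or 8)
 * at addr.
 */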
1451 static void
1452 stv(int len, void *addr, uint64_t value)
1453 {
1454 	switch (len) {
1455 	case 1:
1456 		*(uint8_t *)addr = value;
1457 		return;
1458 	case 2:
1459 		*(uint16_t *)addr = value;
1460 		return;
1461 	case 4:
1462 		*(uint32_t *)addr = value;
1463 		return;
1464 	case 8:
1465 		*(uint64_t *)addr = value;
1466 		return;
1467 	}
1468 }
1469 
1470 /*
1471  * Extract an array from a zap leaf entry.
1472  */
1473 static void
1474 fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc,
1475     uint64_t integer_size, uint64_t num_integers, void *buf)
1476 {
1477 	uint64_t array_int_len = zc->l_entry.le_value_intlen;
1478 	uint64_t value = 0;
1479 	uint64_t *u64 = buf;
1480 	char *p = buf;
1481 	int len = MIN(zc->l_entry.le_value_numints, num_integers);
1482 	int chunk = zc->l_entry.le_value_chunk;
1483 	int byten = 0;
1484 
1485 	if (integer_size == 8 && len == 1) {
1486 		*u64 = fzap_leaf_value(zl, zc);
1487 		return;
1488 	}
1489 
1490 	while (len > 0) {
1491 		struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(zl, chunk).l_array;
1492 		int i;
1493 
1494 		ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(zl));
1495 		for (i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) {
1496 			value = (value << 8) | la->la_array[i];
1497 			byten++;
1498 			if (byten == array_int_len) {
1499 				stv(integer_size, p, value);
1500 				byten = 0;
1501 				len--;
1502 				if (len == 0)
1503 					return;
1504 				p += integer_size;
1505 			}
1506 		}
1507 		chunk = la->la_next;
1508 	}
1509 }
1510 
1511 /*
1512  * Lookup a value in a fatzap directory. Assumes that the zap scratch
1513  * buffer contains the directory header.
1514  */
1515 static int
1516 fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
1517     uint64_t integer_size, uint64_t num_integers, void *value)
1518 {
1519 	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1520 	zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1521 	fat_zap_t z;
1522 	uint64_t *ptrtbl;
1523 	uint64_t hash;
1524 	int rc;
1525 
1526 	if (zh.zap_magic != ZAP_MAGIC)
1527 		return (EIO);
1528 
1529 	z.zap_block_shift = ilog2(bsize);
1530 	z.zap_phys = (zap_phys_t *) zap_scratch;
1531 
1532 	/*
1533 	 * Figure out where the pointer table is and read it in if necessary.
1534 	 */
1535 	if (zh.zap_ptrtbl.zt_blk) {
1536 		rc = dnode_read(spa, dnode, zh.zap_ptrtbl.zt_blk * bsize,
1537 			       zap_scratch, bsize);
1538 		if (rc)
1539 			return (rc);
1540 		ptrtbl = (uint64_t *) zap_scratch;
1541 	} else {
1542 		ptrtbl = &ZAP_EMBEDDED_PTRTBL_ENT(&z, 0);
1543 	}
1544 
1545 	hash = zap_hash(zh.zap_salt, name);
1546 
1547 	zap_leaf_t zl;
1548 	zl.l_bs = z.zap_block_shift;
1549 
1550 	off_t off = ptrtbl[hash >> (64 - zh.zap_ptrtbl.zt_shift)] << zl.l_bs;
1551 	zap_leaf_chunk_t *zc;
1552 
1553 	rc = dnode_read(spa, dnode, off, zap_scratch, bsize);
1554 	if (rc)
1555 		return (rc);
1556 
1557 	zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1558 
1559 	/*
1560 	 * Make sure this chunk matches our hash.
1561 	 */
1562 	if (zl.l_phys->l_hdr.lh_prefix_len > 0
1563 	    && zl.l_phys->l_hdr.lh_prefix
1564 	    != hash >> (64 - zl.l_phys->l_hdr.lh_prefix_len))
1565 		return (ENOENT);
1566 
1567 	/*
1568 	 * Hash within the chunk to find our entry.
1569 	 */
1570 	int shift = (64 - ZAP_LEAF_HASH_SHIFT(&zl) - zl.l_phys->l_hdr.lh_prefix_len);
1571 	int h = (hash >> shift) & ((1 << ZAP_LEAF_HASH_SHIFT(&zl)) - 1);
1572 	h = zl.l_phys->l_hash[h];
1573 	if (h == 0xffff)
1574 		return (ENOENT);
1575 	zc = &ZAP_LEAF_CHUNK(&zl, h);
1576 	while (zc->l_entry.le_hash != hash) {
1577 		if (zc->l_entry.le_next == 0xffff) {
1578 			zc = NULL;
1579 			break;
1580 		}
1581 		zc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_next);
1582 	}
1583 	if (zc != NULL && fzap_name_equal(&zl, zc, name)) {
1584 		if (zc->l_entry.le_value_intlen * zc->l_entry.le_value_numints >
1585 		    integer_size * num_integers)
1586 			return (E2BIG);
1587 		fzap_leaf_array(&zl, zc, integer_size, num_integers, value);
1588 		return (0);
1589 	}
1590 
1591 	return (ENOENT);
1592 }
1593 
1594 /*
1595  * Lookup a name in a zap object and return its value as a uint64_t.
1596  */
1597 static int
1598 zap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name,
1599     uint64_t integer_size, uint64_t num_integers, void *value)
1600 {
1601 	int rc;
1602 	uint64_t zap_type;
1603 	size_t size = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1604 
1605 	rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1606 	if (rc)
1607 		return (rc);
1608 
1609 	zap_type = *(uint64_t *) zap_scratch;
1610 	if (zap_type == ZBT_MICRO)
1611 		return mzap_lookup(dnode, name, value);
1612 	else if (zap_type == ZBT_HEADER) {
1613 		return fzap_lookup(spa, dnode, name, integer_size,
1614 		    num_integers, value);
1615 	}
1616 	printf("ZFS: invalid zap_type=%d\n", (int)zap_type);
1617 	return (EIO);
1618 }
1619 
1620 /*
1621  * List a microzap directory. Assumes that the zap scratch buffer contains
1622  * the directory contents.
1623  */
1624 static int
1625 mzap_list(const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t))
1626 {
1627 	const mzap_phys_t *mz;
1628 	const mzap_ent_phys_t *mze;
1629 	size_t size;
1630 	int chunks, i, rc;
1631 
1632 	/*
1633 	 * Microzap objects use exactly one block. Read the whole
1634 	 * thing.
1635 	 */
1636 	size = dnode->dn_datablkszsec * 512;
1637 	mz = (const mzap_phys_t *) zap_scratch;
1638 	chunks = size / MZAP_ENT_LEN - 1;
1639 
1640 	for (i = 0; i < chunks; i++) {
1641 		mze = &mz->mz_chunk[i];
1642 		if (mze->mze_name[0]) {
1643 			rc = callback(mze->mze_name, mze->mze_value);
1644 			if (rc != 0)
1645 				return (rc);
1646 		}
1647 	}
1648 
1649 	return (0);
1650 }
1651 
1652 /*
1653  * List a fatzap directory. Assumes that the zap scratch buffer contains
1654  * the directory header.
1655  */
1656 static int
1657 fzap_list(const spa_t *spa, const dnode_phys_t *dnode, int (*callback)(const char *, uint64_t))
1658 {
1659 	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1660 	zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1661 	fat_zap_t z;
1662 	int i, j, rc;
1663 
1664 	if (zh.zap_magic != ZAP_MAGIC)
1665 		return (EIO);
1666 
1667 	z.zap_block_shift = ilog2(bsize);
1668 	z.zap_phys = (zap_phys_t *) zap_scratch;
1669 
1670 	/*
1671 	 * This assumes that the leaf blocks start at block 1. The
1672 	 * documentation isn't exactly clear on this.
1673 	 */
1674 	zap_leaf_t zl;
1675 	zl.l_bs = z.zap_block_shift;
1676 	for (i = 0; i < zh.zap_num_leafs; i++) {
1677 		off_t off = (i + 1) << zl.l_bs;
1678 		char name[256], *p;
1679 		uint64_t value;
1680 
1681 		if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1682 			return (EIO);
1683 
1684 		zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1685 
1686 		for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1687 			zap_leaf_chunk_t *zc, *nc;
1688 			int namelen;
1689 
1690 			zc = &ZAP_LEAF_CHUNK(&zl, j);
1691 			if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1692 				continue;
1693 			namelen = zc->l_entry.le_name_numints;
1694 			if (namelen > sizeof(name))
1695 				namelen = sizeof(name);
1696 
1697 			/*
1698 			 * Paste the name back together.
1699 			 */
1700 			nc = &ZAP_LEAF_CHUNK(&zl, zc->l_entry.le_name_chunk);
1701 			p = name;
1702 			while (namelen > 0) {
1703 				int len;
1704 				len = namelen;
1705 				if (len > ZAP_LEAF_ARRAY_BYTES)
1706 					len = ZAP_LEAF_ARRAY_BYTES;
1707 				memcpy(p, nc->l_array.la_array, len);
1708 				p += len;
1709 				namelen -= len;
1710 				nc = &ZAP_LEAF_CHUNK(&zl, nc->l_array.la_next);
1711 			}
1712 
1713 			/*
1714 			 * Assume the first eight bytes of the value are
1715 			 * a uint64_t.
1716 			 */
1717 			value = fzap_leaf_value(&zl, zc);
1718 
1719 			//printf("%s 0x%jx\n", name, (uintmax_t)value);
1720 			rc = callback((const char *)name, value);
1721 			if (rc != 0)
1722 				return (rc);
1723 		}
1724 	}
1725 
1726 	return (0);
1727 }
1728 
1729 static int zfs_printf(const char *name, uint64_t value __unused)
1730 {
1731 
1732 	printf("%s\n", name);
1733 
1734 	return (0);
1735 }
1736 
1737 /*
1738  * List a zap directory.
1739  */
1740 static int
1741 zap_list(const spa_t *spa, const dnode_phys_t *dnode)
1742 {
1743 	uint64_t zap_type;
1744 	size_t size = dnode->dn_datablkszsec * 512;
1745 
1746 	if (dnode_read(spa, dnode, 0, zap_scratch, size))
1747 		return (EIO);
1748 
1749 	zap_type = *(uint64_t *) zap_scratch;
1750 	if (zap_type == ZBT_MICRO)
1751 		return mzap_list(dnode, zfs_printf);
1752 	else
1753 		return fzap_list(spa, dnode, zfs_printf);
1754 }
1755 
1756 static int
1757 objset_get_dnode(const spa_t *spa, const objset_phys_t *os, uint64_t objnum, dnode_phys_t *dnode)
1758 {
1759 	off_t offset;
1760 
1761 	offset = objnum * sizeof(dnode_phys_t);
1762 	return dnode_read(spa, &os->os_meta_dnode, offset,
1763 		dnode, sizeof(dnode_phys_t));
1764 }
1765 
1766 static int
1767 mzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1768 {
1769 	const mzap_phys_t *mz;
1770 	const mzap_ent_phys_t *mze;
1771 	size_t size;
1772 	int chunks, i;
1773 
1774 	/*
1775 	 * Microzap objects use exactly one block. Read the whole
1776 	 * thing.
1777 	 */
1778 	size = dnode->dn_datablkszsec * 512;
1779 
1780 	mz = (const mzap_phys_t *) zap_scratch;
1781 	chunks = size / MZAP_ENT_LEN - 1;
1782 
1783 	for (i = 0; i < chunks; i++) {
1784 		mze = &mz->mz_chunk[i];
1785 		if (value == mze->mze_value) {
1786 			strcpy(name, mze->mze_name);
1787 			return (0);
1788 		}
1789 	}
1790 
1791 	return (ENOENT);
1792 }
1793 
1794 static void
1795 fzap_name_copy(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, char *name)
1796 {
1797 	size_t namelen;
1798 	const zap_leaf_chunk_t *nc;
1799 	char *p;
1800 
1801 	namelen = zc->l_entry.le_name_numints;
1802 
1803 	nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk);
1804 	p = name;
1805 	while (namelen > 0) {
1806 		size_t len;
1807 		len = namelen;
1808 		if (len > ZAP_LEAF_ARRAY_BYTES)
1809 			len = ZAP_LEAF_ARRAY_BYTES;
1810 		memcpy(p, nc->l_array.la_array, len);
1811 		p += len;
1812 		namelen -= len;
1813 		nc = &ZAP_LEAF_CHUNK(zl, nc->l_array.la_next);
1814 	}
1815 
1816 	*p = '\0';
1817 }
1818 
1819 static int
1820 fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1821 {
1822 	int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT;
1823 	zap_phys_t zh = *(zap_phys_t *) zap_scratch;
1824 	fat_zap_t z;
1825 	int i, j;
1826 
1827 	if (zh.zap_magic != ZAP_MAGIC)
1828 		return (EIO);
1829 
1830 	z.zap_block_shift = ilog2(bsize);
1831 	z.zap_phys = (zap_phys_t *) zap_scratch;
1832 
1833 	/*
1834 	 * This assumes that the leaf blocks start at block 1. The
1835 	 * documentation isn't exactly clear on this.
1836 	 */
1837 	zap_leaf_t zl;
1838 	zl.l_bs = z.zap_block_shift;
1839 	for (i = 0; i < zh.zap_num_leafs; i++) {
1840 		off_t off = (i + 1) << zl.l_bs;
1841 
1842 		if (dnode_read(spa, dnode, off, zap_scratch, bsize))
1843 			return (EIO);
1844 
1845 		zl.l_phys = (zap_leaf_phys_t *) zap_scratch;
1846 
1847 		for (j = 0; j < ZAP_LEAF_NUMCHUNKS(&zl); j++) {
1848 			zap_leaf_chunk_t *zc;
1849 
1850 			zc = &ZAP_LEAF_CHUNK(&zl, j);
1851 			if (zc->l_entry.le_type != ZAP_CHUNK_ENTRY)
1852 				continue;
1853 			if (zc->l_entry.le_value_intlen != 8 ||
1854 			    zc->l_entry.le_value_numints != 1)
1855 				continue;
1856 
1857 			if (fzap_leaf_value(&zl, zc) == value) {
1858 				fzap_name_copy(&zl, zc, name);
1859 				return (0);
1860 			}
1861 		}
1862 	}
1863 
1864 	return (ENOENT);
1865 }
1866 
1867 static int
1868 zap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value)
1869 {
1870 	int rc;
1871 	uint64_t zap_type;
1872 	size_t size = dnode->dn_datablkszsec * 512;
1873 
1874 	rc = dnode_read(spa, dnode, 0, zap_scratch, size);
1875 	if (rc)
1876 		return (rc);
1877 
1878 	zap_type = *(uint64_t *) zap_scratch;
1879 	if (zap_type == ZBT_MICRO)
1880 		return mzap_rlookup(spa, dnode, name, value);
1881 	else
1882 		return fzap_rlookup(spa, dnode, name, value);
1883 }
1884 
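/*
 * Reconstruct the full dataset name for an object number by walking the
 * dd_parent_obj links towards the pool root and reverse-looking-up each
 * path component in the parent's child directory ZAP.
 */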
1885 static int
1886 zfs_rlookup(const spa_t *spa, uint64_t objnum, char *result)
1887 {
1888 	char name[256];
1889 	char component[256];
1890 	uint64_t dir_obj, parent_obj, child_dir_zapobj;
1891 	dnode_phys_t child_dir_zap, dataset, dir, parent;
1892 	dsl_dir_phys_t *dd;
1893 	dsl_dataset_phys_t *ds;
1894 	char *p;
1895 	int len;
1896 
1897 	p = &name[sizeof(name) - 1];
1898 	*p = '\0';
1899 
1900 	if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
1901 		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
1902 		return (EIO);
1903 	}
1904 	ds = (dsl_dataset_phys_t *)&dataset.dn_bonus;
1905 	dir_obj = ds->ds_dir_obj;
1906 
1907 	for (;;) {
1908 		if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir) != 0)
1909 			return (EIO);
1910 		dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1911 
1912 		/* Actual loop condition. */
1913 		parent_obj  = dd->dd_parent_obj;
1914 		if (parent_obj == 0)
1915 			break;
1916 
1917 		if (objset_get_dnode(spa, &spa->spa_mos, parent_obj, &parent) != 0)
1918 			return (EIO);
1919 		dd = (dsl_dir_phys_t *)&parent.dn_bonus;
1920 		child_dir_zapobj = dd->dd_child_dir_zapobj;
1921 		if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1922 			return (EIO);
1923 		if (zap_rlookup(spa, &child_dir_zap, component, dir_obj) != 0)
1924 			return (EIO);
1925 
1926 		len = strlen(component);
1927 		p -= len;
1928 		memcpy(p, component, len);
1929 		--p;
1930 		*p = '/';
1931 
1932 		/* Actual loop iteration. */
1933 		dir_obj = parent_obj;
1934 	}
1935 
1936 	if (*p != '\0')
1937 		++p;
1938 	strcpy(result, p);
1939 
1940 	return (0);
1941 }
1942 
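/*
 * Resolve a slash-separated dataset name (relative to the pool root) to
 * the object number of its head dataset by descending the DSL directory
 * hierarchy one component at a time.
 */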
1943 static int
1944 zfs_lookup_dataset(const spa_t *spa, const char *name, uint64_t *objnum)
1945 {
1946 	char element[256];
1947 	uint64_t dir_obj, child_dir_zapobj;
1948 	dnode_phys_t child_dir_zap, dir;
1949 	dsl_dir_phys_t *dd;
1950 	const char *p, *q;
1951 
1952 	if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir))
1953 		return (EIO);
1954 	if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (dir_obj),
1955 	    1, &dir_obj))
1956 		return (EIO);
1957 
1958 	p = name;
1959 	for (;;) {
1960 		if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir))
1961 			return (EIO);
1962 		dd = (dsl_dir_phys_t *)&dir.dn_bonus;
1963 
1964 		while (*p == '/')
1965 			p++;
1966 		/* Actual loop condition #1. */
1967 		if (*p == '\0')
1968 			break;
1969 
1970 		q = strchr(p, '/');
1971 		if (q) {
1972 			memcpy(element, p, q - p);
1973 			element[q - p] = '\0';
1974 			p = q + 1;
1975 		} else {
1976 			strcpy(element, p);
1977 			p += strlen(p);
1978 		}
1979 
1980 		child_dir_zapobj = dd->dd_child_dir_zapobj;
1981 		if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0)
1982 			return (EIO);
1983 
1984 		/* Actual loop condition #2. */
1985 		if (zap_lookup(spa, &child_dir_zap, element, sizeof (dir_obj),
1986 		    1, &dir_obj) != 0)
1987 			return (ENOENT);
1988 	}
1989 
1990 	*objnum = dd->dd_head_dataset_obj;
1991 	return (0);
1992 }
1993 
1994 #ifndef BOOT2
1995 static int
1996 zfs_list_dataset(const spa_t *spa, uint64_t objnum/*, int pos, char *entry*/)
1997 {
1998 	uint64_t dir_obj, child_dir_zapobj;
1999 	dnode_phys_t child_dir_zap, dir, dataset;
2000 	dsl_dataset_phys_t *ds;
2001 	dsl_dir_phys_t *dd;
2002 
2003 	if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
2004 		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2005 		return (EIO);
2006 	}
2007 	ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
2008 	dir_obj = ds->ds_dir_obj;
2009 
2010 	if (objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir)) {
2011 		printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
2012 		return (EIO);
2013 	}
2014 	dd = (dsl_dir_phys_t *)&dir.dn_bonus;
2015 
2016 	child_dir_zapobj = dd->dd_child_dir_zapobj;
2017 	if (objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap) != 0) {
2018 		printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
2019 		return (EIO);
2020 	}
2021 
2022 	return (zap_list(spa, &child_dir_zap) != 0);
2023 }
2024 
2025 int
2026 zfs_callback_dataset(const spa_t *spa, uint64_t objnum, int (*callback)(const char *, uint64_t))
2027 {
2028 	uint64_t dir_obj, child_dir_zapobj, zap_type;
2029 	dnode_phys_t child_dir_zap, dir, dataset;
2030 	dsl_dataset_phys_t *ds;
2031 	dsl_dir_phys_t *dd;
2032 	int err;
2033 
2034 	err = objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset);
2035 	if (err != 0) {
2036 		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2037 		return (err);
2038 	}
2039 	ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
2040 	dir_obj = ds->ds_dir_obj;
2041 
2042 	err = objset_get_dnode(spa, &spa->spa_mos, dir_obj, &dir);
2043 	if (err != 0) {
2044 		printf("ZFS: can't find dirobj %ju\n", (uintmax_t)dir_obj);
2045 		return (err);
2046 	}
2047 	dd = (dsl_dir_phys_t *)&dir.dn_bonus;
2048 
2049 	child_dir_zapobj = dd->dd_child_dir_zapobj;
2050 	err = objset_get_dnode(spa, &spa->spa_mos, child_dir_zapobj, &child_dir_zap);
2051 	if (err != 0) {
2052 		printf("ZFS: can't find child zap %ju\n", (uintmax_t)dir_obj);
2053 		return (err);
2054 	}
2055 
2056 	err = dnode_read(spa, &child_dir_zap, 0, zap_scratch, child_dir_zap.dn_datablkszsec * 512);
2057 	if (err != 0)
2058 		return (err);
2059 
2060 	zap_type = *(uint64_t *) zap_scratch;
2061 	if (zap_type == ZBT_MICRO)
2062 		return mzap_list(&child_dir_zap, callback);
2063 	else
2064 		return fzap_list(spa, &child_dir_zap, callback);
2065 }
2066 #endif
2067 
2068 /*
2069  * Find the object set given the object number of its dataset object
2070  * and return its details in *objset
2071  */
2072 static int
2073 zfs_mount_dataset(const spa_t *spa, uint64_t objnum, objset_phys_t *objset)
2074 {
2075 	dnode_phys_t dataset;
2076 	dsl_dataset_phys_t *ds;
2077 
2078 	if (objset_get_dnode(spa, &spa->spa_mos, objnum, &dataset)) {
2079 		printf("ZFS: can't find dataset %ju\n", (uintmax_t)objnum);
2080 		return (EIO);
2081 	}
2082 
2083 	ds = (dsl_dataset_phys_t *) &dataset.dn_bonus;
2084 	if (zio_read(spa, &ds->ds_bp, objset)) {
2085 		printf("ZFS: can't read object set for dataset %ju\n",
2086 		    (uintmax_t)objnum);
2087 		return (EIO);
2088 	}
2089 
2090 	return (0);
2091 }
2092 
2093 /*
2094  * Find the dataset pointed to by the bootfs pool property, or the root
2095  * dataset if the property is not set, and return its object number in *objid.
2096  */
2097 static int
2098 zfs_get_root(const spa_t *spa, uint64_t *objid)
2099 {
2100 	dnode_phys_t dir, propdir;
2101 	uint64_t props, bootfs, root;
2102 
2103 	*objid = 0;
2104 
2105 	/*
2106 	 * Start with the MOS directory object.
2107 	 */
2108 	if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT, &dir)) {
2109 		printf("ZFS: can't read MOS object directory\n");
2110 		return (EIO);
2111 	}
2112 
2113 	/*
2114 	 * Lookup the pool_props and see if we can find a bootfs.
2115 	 */
2116 	if (zap_lookup(spa, &dir, DMU_POOL_PROPS, sizeof (props), 1, &props) == 0
2117 	     && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0
2118 	     && zap_lookup(spa, &propdir, "bootfs", sizeof (bootfs), 1, &bootfs) == 0
2119 	     && bootfs != 0)
2120 	{
2121 		*objid = bootfs;
2122 		return (0);
2123 	}
2124 	/*
2125 	 * Lookup the root dataset directory
2126 	 */
2127 	if (zap_lookup(spa, &dir, DMU_POOL_ROOT_DATASET, sizeof (root), 1, &root)
2128 	    || objset_get_dnode(spa, &spa->spa_mos, root, &dir)) {
2129 		printf("ZFS: can't find root dsl_dir\n");
2130 		return (EIO);
2131 	}
2132 
2133 	/*
2134 	 * The dataset directory's bonus buffer records its head dataset
2135 	 * object; return that object number.
2136 	 */
2137 	dsl_dir_phys_t *dd = (dsl_dir_phys_t *) &dir.dn_bonus;
2138 	*objid = dd->dd_head_dataset_obj;
2139 	return (0);
2140 }
2141 
2142 static int
2143 zfs_mount(const spa_t *spa, uint64_t rootobj, struct zfsmount *mount)
2144 {
2145 
2146 	mount->spa = spa;
2147 
2148 	/*
2149 	 * Find the root object set if not explicitly provided
2150 	 */
2151 	if (rootobj == 0 && zfs_get_root(spa, &rootobj)) {
2152 		printf("ZFS: can't find root filesystem\n");
2153 		return (EIO);
2154 	}
2155 
2156 	if (zfs_mount_dataset(spa, rootobj, &mount->objset)) {
2157 		printf("ZFS: can't open root filesystem\n");
2158 		return (EIO);
2159 	}
2160 
2161 	mount->rootobj = rootobj;
2162 
2163 	return (0);
2164 }
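
/*
 * Illustrative sketch only (not called from this file): given an spa_t
 * pointer 'spa' obtained from pool discovery, a consumer might combine
 * zfs_mount(), zfs_lookup() and dnode_read() roughly as follows to read
 * the start of a file from the pool's root filesystem.
 *
 *	struct zfsmount mnt;
 *	dnode_phys_t dn;
 *	char buf[128];
 *
 *	if (zfs_mount(spa, 0, &mnt) == 0 &&
 *	    zfs_lookup(&mnt, "/boot/loader.conf", &dn) == 0)
 *		(void) dnode_read(spa, &dn, 0, buf, sizeof(buf));
 */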
2165 
2166 /*
2167  * ZAP-list callback: verify that an active feature is supported for read.
2168  */
2169 static int
2170 check_feature(const char *name, uint64_t value)
2171 {
2172 	int i;
2173 
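	/*
	 * A feature with a zero refcount is enabled but not in use, so it
	 * does not affect our ability to read the pool.
	 */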
2174 	if (value == 0)
2175 		return (0);
2176 	if (name[0] == '\0')
2177 		return (0);
2178 
2179 	for (i = 0; features_for_read[i] != NULL; i++) {
2180 		if (strcmp(name, features_for_read[i]) == 0)
2181 			return (0);
2182 	}
2183 	printf("ZFS: unsupported feature: %s\n", name);
2184 	return (EIO);
2185 }
2186 
2187 /*
2188  * Checks whether the MOS features that are active are supported.
2189  */
2190 static int
2191 check_mos_features(const spa_t *spa)
2192 {
2193 	dnode_phys_t dir;
2194 	uint64_t objnum, zap_type;
2195 	size_t size;
2196 	int rc;
2197 
2198 	if ((rc = objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT,
2199 	    &dir)) != 0)
2200 		return (rc);
2201 	if ((rc = zap_lookup(spa, &dir, DMU_POOL_FEATURES_FOR_READ,
2202 	    sizeof (objnum), 1, &objnum)) != 0) {
2203 		/*
2204 		 * This is an older pool without feature support.  Since we
2205 		 * have already validated the label, just return success.
2206 		 */
2207 		return (0);
2208 	}
2209 
2210 	if ((rc = objset_get_dnode(spa, &spa->spa_mos, objnum, &dir)) != 0)
2211 		return (rc);
2212 
2213 	if (dir.dn_type != DMU_OTN_ZAP_METADATA)
2214 		return (EIO);
2215 
2216 	size = dir.dn_datablkszsec * 512;
2217 	if (dnode_read(spa, &dir, 0, zap_scratch, size))
2218 		return (EIO);
2219 
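	/*
	 * The first 64-bit word of a ZAP block identifies its format, so
	 * dispatch to the micro or fat ZAP walker accordingly.
	 */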
2220 	zap_type = *(uint64_t *) zap_scratch;
2221 	if (zap_type == ZBT_MICRO)
2222 		rc = mzap_list(&dir, check_feature);
2223 	else
2224 		rc = fzap_list(spa, &dir, check_feature);
2225 
2226 	return (rc);
2227 }
2228 
2229 static int
2230 zfs_spa_init(spa_t *spa)
2231 {
2232 	dnode_phys_t dir;
2233 	int rc;
2234 
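	/*
	 * The active uberblock's root block pointer leads to the meta
	 * object set (MOS), which holds all pool-wide metadata.
	 */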
2235 	if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) {
2236 		printf("ZFS: can't read MOS of pool %s\n", spa->spa_name);
2237 		return (EIO);
2238 	}
2239 	if (spa->spa_mos.os_type != DMU_OST_META) {
2240 		printf("ZFS: corrupted MOS of pool %s\n", spa->spa_name);
2241 		return (EIO);
2242 	}
2243 
2244 	if (objset_get_dnode(spa, &spa->spa_mos, DMU_POOL_DIRECTORY_OBJECT,
2245 	    &dir)) {
2246 		printf("ZFS: failed to read pool %s directory object\n",
2247 		    spa->spa_name);
2248 		return (EIO);
2249 	}
2250 	/* This lookup is allowed to fail; older pools do not store a salt. */
2251 	rc = zap_lookup(spa, &dir, DMU_POOL_CHECKSUM_SALT, 1,
2252 	    sizeof (spa->spa_cksum_salt.zcs_bytes),
2253 	    spa->spa_cksum_salt.zcs_bytes);
2254 
2255 	rc = check_mos_features(spa);
2256 	if (rc != 0) {
2257 		printf("ZFS: pool %s is not supported\n", spa->spa_name);
2258 	}
2259 
2260 	return (rc);
2261 }
2262 
2263 static int
2264 zfs_dnode_stat(const spa_t *spa, dnode_phys_t *dn, struct stat *sb)
2265 {
2266 
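	/*
	 * File attributes live in the dnode's bonus buffer: older pools
	 * store a fixed znode_phys_t there, while pools using system
	 * attributes (SA) store a variable layout that may spill into a
	 * separate spill block.
	 */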
2267 	if (dn->dn_bonustype != DMU_OT_SA) {
2268 		znode_phys_t *zp = (znode_phys_t *)dn->dn_bonus;
2269 
2270 		sb->st_mode = zp->zp_mode;
2271 		sb->st_uid = zp->zp_uid;
2272 		sb->st_gid = zp->zp_gid;
2273 		sb->st_size = zp->zp_size;
2274 	} else {
2275 		sa_hdr_phys_t *sahdrp;
2276 		int hdrsize;
2277 		size_t size = 0;
2278 		void *buf = NULL;
2279 
2280 		if (dn->dn_bonuslen != 0)
2281 			sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
2282 		else {
2283 			if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0) {
2284 				blkptr_t *bp = DN_SPILL_BLKPTR(dn);
2285 				int error;
2286 
2287 				size = BP_GET_LSIZE(bp);
2288 				buf = zfs_alloc(size);
2289 				error = zio_read(spa, bp, buf);
2290 				if (error != 0) {
2291 					zfs_free(buf, size);
2292 					return (error);
2293 				}
2294 				sahdrp = buf;
2295 			} else {
2296 				return (EIO);
2297 			}
2298 		}
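		/*
		 * The loader relies on the standard SA layout, with the
		 * common attributes at fixed offsets past the SA header.
		 */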
2299 		hdrsize = SA_HDR_SIZE(sahdrp);
2300 		sb->st_mode = *(uint64_t *)((char *)sahdrp + hdrsize +
2301 		    SA_MODE_OFFSET);
2302 		sb->st_uid = *(uint64_t *)((char *)sahdrp + hdrsize +
2303 		    SA_UID_OFFSET);
2304 		sb->st_gid = *(uint64_t *)((char *)sahdrp + hdrsize +
2305 		    SA_GID_OFFSET);
2306 		sb->st_size = *(uint64_t *)((char *)sahdrp + hdrsize +
2307 		    SA_SIZE_OFFSET);
2308 		if (buf != NULL)
2309 			zfs_free(buf, size);
2310 	}
2311 
2312 	return (0);
2313 }
2314 
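/*
 * Copy the target of a symbolic link into 'path'.  Short targets live
 * directly in the dnode's bonus buffer (after the SA header or the
 * znode_phys_t); longer ones are read from the spill block or from the
 * object's data.
 */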
2315 static int
2316 zfs_dnode_readlink(const spa_t *spa, dnode_phys_t *dn, char *path, size_t psize)
2317 {
2318 	int rc = 0;
2319 
2320 	if (dn->dn_bonustype == DMU_OT_SA) {
2321 		sa_hdr_phys_t *sahdrp = NULL;
2322 		size_t size = 0;
2323 		void *buf = NULL;
2324 		int hdrsize;
2325 		char *p;
2326 
2327 		if (dn->dn_bonuslen != 0)
2328 			sahdrp = (sa_hdr_phys_t *)DN_BONUS(dn);
2329 		else {
2330 			blkptr_t *bp;
2331 
2332 			if ((dn->dn_flags & DNODE_FLAG_SPILL_BLKPTR) == 0)
2333 				return (EIO);
2334 			bp = DN_SPILL_BLKPTR(dn);
2335 
2336 			size = BP_GET_LSIZE(bp);
2337 			buf = zfs_alloc(size);
2338 			rc = zio_read(spa, bp, buf);
2339 			if (rc != 0) {
2340 				zfs_free(buf, size);
2341 				return (rc);
2342 			}
2343 			sahdrp = buf;
2344 		}
2345 		hdrsize = SA_HDR_SIZE(sahdrp);
2346 		p = (char *)((uintptr_t)sahdrp + hdrsize + SA_SYMLINK_OFFSET);
2347 		memcpy(path, p, psize);
2348 		if (buf != NULL)
2349 			zfs_free(buf, size);
2350 		return (0);
2351 	}
2352 	/*
2353 	 * The second test exists purely to silence a bogus compiler
2354 	 * warning about accessing past the end of dn_bonus.
2355 	 */
2356 	if (psize + sizeof(znode_phys_t) <= dn->dn_bonuslen &&
2357 	    sizeof(znode_phys_t) <= sizeof(dn->dn_bonus)) {
2358 		memcpy(path, &dn->dn_bonus[sizeof(znode_phys_t)], psize);
2359 	} else {
2360 		rc = dnode_read(spa, dn, 0, path, psize);
2361 	}
2362 	return (rc);
2363 }
2364 
2365 struct obj_list {
2366 	uint64_t		objnum;
2367 	STAILQ_ENTRY(obj_list)	entry;
2368 };
2369 
2370 /*
2371  * Lookup a file and return its dnode.
2372  */
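/*
 * Resolution keeps a stack (on_cache) of the directory object numbers
 * visited so far, so that ".." components and absolute symlink targets
 * can unwind to the parent directory or back to the root.
 */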
2373 static int
2374 zfs_lookup(const struct zfsmount *mount, const char *upath, dnode_phys_t *dnode)
2375 {
2376 	int rc;
2377 	uint64_t objnum;
2378 	const spa_t *spa;
2379 	dnode_phys_t dn;
2380 	const char *p, *q;
2381 	char element[256];
2382 	char path[1024];
2383 	int symlinks_followed = 0;
2384 	struct stat sb;
2385 	struct obj_list *entry, *tentry;
2386 	STAILQ_HEAD(, obj_list) on_cache = STAILQ_HEAD_INITIALIZER(on_cache);
2387 
2388 	spa = mount->spa;
2389 	if (mount->objset.os_type != DMU_OST_ZFS) {
2390 		printf("ZFS: unexpected object set type %ju\n",
2391 		    (uintmax_t)mount->objset.os_type);
2392 		return (EIO);
2393 	}
2394 
2395 	if ((entry = malloc(sizeof(struct obj_list))) == NULL)
2396 		return (ENOMEM);
2397 
2398 	/*
2399 	 * Get the root directory dnode.
2400 	 */
2401 	rc = objset_get_dnode(spa, &mount->objset, MASTER_NODE_OBJ, &dn);
2402 	if (rc) {
2403 		free(entry);
2404 		return (rc);
2405 	}
2406 
2407 	rc = zap_lookup(spa, &dn, ZFS_ROOT_OBJ, sizeof (objnum), 1, &objnum);
2408 	if (rc) {
2409 		free(entry);
2410 		return (rc);
2411 	}
2412 	entry->objnum = objnum;
2413 	STAILQ_INSERT_HEAD(&on_cache, entry, entry);
2414 
2415 	rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2416 	if (rc != 0)
2417 		goto done;
2418 
2419 	p = upath;
2420 	while (p && *p) {
2421 		rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2422 		if (rc != 0)
2423 			goto done;
2424 
2425 		while (*p == '/')
2426 			p++;
2427 		if (*p == '\0')
2428 			break;
2429 		q = p;
2430 		while (*q != '\0' && *q != '/')
2431 			q++;
2432 
2433 		/* skip dot */
2434 		if (p + 1 == q && p[0] == '.') {
2435 			p++;
2436 			continue;
2437 		}
2438 		/* double dot */
2439 		if (p + 2 == q && p[0] == '.' && p[1] == '.') {
2440 			p += 2;
2441 			if (STAILQ_FIRST(&on_cache) ==
2442 			    STAILQ_LAST(&on_cache, obj_list, entry)) {
2443 				rc = ENOENT;
2444 				goto done;
2445 			}
2446 			entry = STAILQ_FIRST(&on_cache);
2447 			STAILQ_REMOVE_HEAD(&on_cache, entry);
2448 			free(entry);
2449 			objnum = (STAILQ_FIRST(&on_cache))->objnum;
2450 			continue;
2451 		}
2452 		if (q - p + 1 > sizeof(element)) {
2453 			rc = ENAMETOOLONG;
2454 			goto done;
2455 		}
2456 		memcpy(element, p, q - p);
2457 		element[q - p] = 0;
2458 		p = q;
2459 
2460 		if ((rc = zfs_dnode_stat(spa, &dn, &sb)) != 0)
2461 			goto done;
2462 		if (!S_ISDIR(sb.st_mode)) {
2463 			rc = ENOTDIR;
2464 			goto done;
2465 		}
2466 
2467 		rc = zap_lookup(spa, &dn, element, sizeof (objnum), 1, &objnum);
2468 		if (rc)
2469 			goto done;
2470 		objnum = ZFS_DIRENT_OBJ(objnum);
2471 
2472 		if ((entry = malloc(sizeof(struct obj_list))) == NULL) {
2473 			rc = ENOMEM;
2474 			goto done;
2475 		}
2476 		entry->objnum = objnum;
2477 		STAILQ_INSERT_HEAD(&on_cache, entry, entry);
2478 		rc = objset_get_dnode(spa, &mount->objset, objnum, &dn);
2479 		if (rc)
2480 			goto done;
2481 
2482 		/*
2483 		 * Check for symlink.
2484 		 */
2485 		rc = zfs_dnode_stat(spa, &dn, &sb);
2486 		if (rc)
2487 			goto done;
2488 		if (S_ISLNK(sb.st_mode)) {
2489 			if (symlinks_followed > 10) {
2490 				rc = EMLINK;
2491 				goto done;
2492 			}
2493 			symlinks_followed++;
2494 
2495 			/*
2496 			 * Read the link value and copy the tail of our
2497 			 * current path onto the end.
2498 			 */
2499 			if (sb.st_size + strlen(p) + 1 > sizeof(path)) {
2500 				rc = ENAMETOOLONG;
2501 				goto done;
2502 			}
2503 			strcpy(&path[sb.st_size], p);
2504 
2505 			rc = zfs_dnode_readlink(spa, &dn, path, sb.st_size);
2506 			if (rc != 0)
2507 				goto done;
2508 
2509 			/*
2510 			 * Restart with the new path, starting either at
2511 			 * the root or at the parent, depending on whether
2512 			 * or not the link is relative.
2513 			 */
2514 			p = path;
2515 			if (*p == '/') {
2516 				while (STAILQ_FIRST(&on_cache) !=
2517 				    STAILQ_LAST(&on_cache, obj_list, entry)) {
2518 					entry = STAILQ_FIRST(&on_cache);
2519 					STAILQ_REMOVE_HEAD(&on_cache, entry);
2520 					free(entry);
2521 				}
2522 			} else {
2523 				entry = STAILQ_FIRST(&on_cache);
2524 				STAILQ_REMOVE_HEAD(&on_cache, entry);
2525 				free(entry);
2526 			}
2527 			objnum = (STAILQ_FIRST(&on_cache))->objnum;
2528 		}
2529 	}
2530 
2531 	*dnode = dn;
2532 done:
2533 	STAILQ_FOREACH_SAFE(entry, &on_cache, entry, tentry)
2534 		free(entry);
2535 	return (rc);
2536 }
2537