xref: /freebsd/stand/i386/zfsboot/zfsboot.c (revision b9f654b163bce26de79705e77b872427c9f2afa1)
1 /*-
2  * Copyright (c) 1998 Robert Nordier
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms are freely
6  * permitted provided that the above copyright notice and this
7  * paragraph and the following disclaimer are duplicated in all
8  * such forms.
9  *
10  * This software is provided "AS IS" and without any express or
11  * implied warranties, including, without limitation, the implied
12  * warranties of merchantability and fitness for a particular
13  * purpose.
14  */
15 
16 #include <sys/cdefs.h>
17 __FBSDID("$FreeBSD$");
18 
19 #include "stand.h"
20 
21 #include <sys/param.h>
22 #include <sys/errno.h>
23 #include <sys/diskmbr.h>
24 #ifdef GPT
25 #include <sys/gpt.h>
26 #endif
27 #include <sys/reboot.h>
28 #include <sys/queue.h>
29 
30 #include <machine/bootinfo.h>
31 #include <machine/elf.h>
32 #include <machine/pc/bios.h>
33 
34 #include <stdarg.h>
35 #include <stddef.h>
36 
37 #include <a.out.h>
38 
39 #include <btxv86.h>
40 
41 #include "lib.h"
42 #include "rbx.h"
43 #include "drv.h"
44 #include "edd.h"
45 #include "cons.h"
46 #include "bootargs.h"
47 #include "paths.h"
48 
49 #include "libzfs.h"
50 
/*
 * ARGS: physical address of the argument area; main() reads the BIOS boot
 *       drive number from the byte at ARGS and a slice byte from ARGS + 1.
 * NOPT: number of entries in optstr[] and flags[].
 * NDEV: number of entries in dev_maj[].
 */
#define ARGS			0x900
#define NOPT			14
#define NDEV			3

/*
 * BIOS_NUMDRIVES: physical address of the byte main() uses as the number of
 *                 hard drives to probe.
 * DRV_HARD:       bit OR'ed into a unit number to form a hard disk BIOS
 *                 drive number.
 * DRV_MASK:       mask that extracts the unit number from a BIOS drive number.
 */
#define BIOS_NUMDRIVES		0x475
#define DRV_HARD		0x80
#define DRV_MASK		0x7f

/* Device types stored in dsk.type; main() uses them to index dev_maj[]. */
#define TYPE_AD			0
#define TYPE_DA			1
#define TYPE_MAXHARD		TYPE_DA
#define TYPE_FD			2

/* Byte alignment vdev_read() applies to disk reads so GELI can decrypt. */
#define DEV_GELIBOOT_BSIZE	4096
65 
66 extern uint32_t _end;
67 
68 #ifdef GPT
69 static const uuid_t freebsd_zfs_uuid = GPT_ENT_TYPE_FREEBSD_ZFS;
70 #endif
/*
 * Boot option letters accepted by parse_cmd() after a '-'.  optstr[i] toggles
 * the RBX_* bit named by flags[i].  The array is exactly NOPT characters long
 * and therefore has no terminating NUL; parse_cmd() bounds its search by NOPT.
 */
static const char optstr[NOPT] = "DhaCcdgmnpqrsv"; /* Also 'P', 'S' */
static const unsigned char flags[NOPT] = {
    RBX_DUAL,
    RBX_SERIAL,
    RBX_ASKNAME,
    RBX_CDROM,
    RBX_CONFIG,
    RBX_KDB,
    RBX_GDB,
    RBX_MUTE,
    RBX_NOINTR,
    RBX_PAUSE,
    RBX_QUIET,
    RBX_DFLTROOT,
    RBX_SINGLE,
    RBX_VERBOSE
};
/* Accumulated option bits; load() passes (opts & RBX_MASK) to __exec(). */
uint32_t opts;
89 
/*
 * Paths to try loading before falling back to the boot2 prompt.
 *
 * /boot/zfsloader must be tried before /boot/loader in order to remain
 * backward compatible with ZFS boot environments where /boot/loader exists
 * but does not have ZFS support, which was the case before FreeBSD 12.
 *
 * If no loader is found, try to load a kernel directly instead.
 *
 * Each len is sizeof() of the path literal, so it counts the terminating
 * NUL; main() copies exactly len bytes into kname[].
 */
static const struct string {
    const char *p;
    size_t len;
} loadpath[] = {
    { PATH_LOADER_ZFS, sizeof(PATH_LOADER_ZFS) },
    { PATH_LOADER, sizeof(PATH_LOADER) },
    { PATH_KERNEL, sizeof(PATH_KERNEL) },
};
107 
/* Major numbers handed to MAKEBOOTDEV(), indexed by dsk.type (TYPE_*). */
static const unsigned char dev_maj[NDEV] = {30, 4, 2};

/* Command line being parsed; parse_cmd() modifies it in place. */
static char cmd[512];
/* Pristine copy of cmd[] that main() prints after parsing. */
static char cmddup[512];
/* Path of the loader or kernel that load() reads. */
static char kname[1024];
/* Name filled in by zfs_rlookup() for the "Default:" prompt line. */
static char rootname[256];
/* Serial console speed; parse_cmd() passes 115200 / comspeed to sio_init(). */
static int comspeed = SIOSPD;
static struct bootinfo bootinfo;
static uint32_t bootdev;
static struct zfs_boot_args zfsargs;

/* Memory layout discovered by bios_getmem(); the sizes are in bytes. */
vm_offset_t	high_heap_base;
uint32_t	bios_basemem, bios_extmem, high_heap_size;

/* Buffer that receives one int 0x15/0xe820 memory map entry per call. */
static struct bios_smap smap;

/*
 * The minimum amount of memory to reserve in bios_extmem for the heap.
 */
#define	HEAP_MIN		(64 * 1024 * 1024)

/* Heap bounds chosen in main() and handed to setheap(). */
static char *heap_next;
static char *heap_end;
131 
/*
 * Buffers that must not span a 64k boundary.
 *
 * main() points dmadat at the first 64k-aligned address after _end.
 */
#define READ_BUF_SIZE		8192
struct dmadat {
	char rdbuf[READ_BUF_SIZE];	/* for reading large things */
	char secbuf[READ_BUF_SIZE];	/* for MBR/disklabel */
};
static struct dmadat *dmadat;
139 
140 void exit(int);
141 void reboot(void);
142 static void load(void);
143 static int parse_cmd(void);
144 static void bios_getmem(void);
145 int main(void);
146 
147 #ifdef LOADER_GELI_SUPPORT
148 #include "geliboot.h"
149 static char gelipw[GELI_PW_MAXLEN];
150 #endif
151 
/*
 * Per-device state passed as the private pointer to vdev_read() and
 * vdev_write(): the BIOS disk description plus, when GELI support is
 * compiled in, the GELI device that vdev_read() decrypts through when it is
 * not NULL.
 */
struct zfsdsk {
	struct dsk       dsk;
#ifdef LOADER_GELI_SUPPORT
	struct geli_dev *gdev;
#endif
};
158 
159 #include "zfsimpl.c"
160 
161 /*
162  * Read from a dnode (which must be from a ZPL filesystem).
163  */
164 static int
165 zfs_read(spa_t *spa, const dnode_phys_t *dnode, off_t *offp, void *start, size_t size)
166 {
167 	const znode_phys_t *zp = (const znode_phys_t *) dnode->dn_bonus;
168 	size_t n;
169 	int rc;
170 
171 	n = size;
172 	if (*offp + n > zp->zp_size)
173 		n = zp->zp_size - *offp;
174 
175 	rc = dnode_read(spa, dnode, *offp, start, n);
176 	if (rc)
177 		return (-1);
178 	*offp += n;
179 
180 	return (n);
181 }
182 
/*
 * Current ZFS pool: the one files are looked up in.  zfs_mount_ds() switches
 * it when a command names another pool.
 */
static spa_t *spa;
/* Pool returned by spa_get_primary() in main(); reported to the loader. */
static spa_t *primary_spa;
/* Vdev of the primary pool whose pad2 area holds the nextboot command. */
static vdev_t *primary_vdev;
189 
190 /*
191  * A wrapper for dskread that doesn't have to worry about whether the
192  * buffer pointer crosses a 64k boundary.
193  */
194 static int
195 vdev_read(void *xvdev, void *priv, off_t off, void *buf, size_t bytes)
196 {
197 	char *p;
198 	daddr_t lba, alignlba;
199 	off_t diff;
200 	unsigned int nb, alignnb;
201 	struct zfsdsk *zdsk = (struct zfsdsk *) priv;
202 
203 	if ((off & (DEV_BSIZE - 1)) || (bytes & (DEV_BSIZE - 1)))
204 		return -1;
205 
206 	p = buf;
207 	lba = off / DEV_BSIZE;
208 	lba += zdsk->dsk.start;
209 	/*
210 	 * Align reads to 4k else 4k sector GELIs will not decrypt.
211 	 * Round LBA down to nearest multiple of DEV_GELIBOOT_BSIZE bytes.
212 	 */
213 	alignlba = rounddown2(off, DEV_GELIBOOT_BSIZE) / DEV_BSIZE;
214 	/*
215 	 * The read must be aligned to DEV_GELIBOOT_BSIZE bytes relative to the
216 	 * start of the GELI partition, not the start of the actual disk.
217 	 */
218 	alignlba += zdsk->dsk.start;
219 	diff = (lba - alignlba) * DEV_BSIZE;
220 
221 	while (bytes > 0) {
222 		nb = bytes / DEV_BSIZE;
223 		/*
224 		 * Ensure that the read size plus the leading offset does not
225 		 * exceed the size of the read buffer.
226 		 */
227 		if (nb > (READ_BUF_SIZE - diff) / DEV_BSIZE)
228 			nb = (READ_BUF_SIZE - diff) / DEV_BSIZE;
229 		/*
230 		 * Round the number of blocks to read up to the nearest multiple
231 		 * of DEV_GELIBOOT_BSIZE.
232 		 */
233 		alignnb = roundup2(nb * DEV_BSIZE + diff, DEV_GELIBOOT_BSIZE)
234 		    / DEV_BSIZE;
235 
236 		if (zdsk->dsk.size > 0 && alignlba + alignnb >
237 		    zdsk->dsk.size + zdsk->dsk.start) {
238 			printf("Shortening read at %lld from %d to %lld\n",
239 			    alignlba, alignnb,
240 			    (zdsk->dsk.size + zdsk->dsk.start) - alignlba);
241 			alignnb = (zdsk->dsk.size + zdsk->dsk.start) - alignlba;
242 		}
243 
244 		if (drvread(&zdsk->dsk, dmadat->rdbuf, alignlba, alignnb))
245 			return -1;
246 #ifdef LOADER_GELI_SUPPORT
247 		/* decrypt */
248 		if (zdsk->gdev != NULL) {
249 			if (geli_read(zdsk->gdev, ((alignlba - zdsk->dsk.start) *
250 			    DEV_BSIZE), dmadat->rdbuf, alignnb * DEV_BSIZE))
251 				return (-1);
252 		}
253 #endif
254 		memcpy(p, dmadat->rdbuf + diff, nb * DEV_BSIZE);
255 		p += nb * DEV_BSIZE;
256 		lba += nb;
257 		alignlba += alignnb;
258 		bytes -= nb * DEV_BSIZE;
259 		/* Don't need the leading offset after the first block. */
260 		diff = 0;
261 	}
262 
263 	return 0;
264 }
265 /* Match the signature exactly due to signature madness */
266 static int
267 vdev_read2(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes)
268 {
269 	return vdev_read(vdev, priv, off, buf, bytes);
270 }
271 
272 
273 static int
274 vdev_write(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes)
275 {
276 	char *p;
277 	daddr_t lba;
278 	unsigned int nb;
279 	struct zfsdsk *zdsk = (struct zfsdsk *) priv;
280 
281 	if ((off & (DEV_BSIZE - 1)) || (bytes & (DEV_BSIZE - 1)))
282 		return -1;
283 
284 	p = buf;
285 	lba = off / DEV_BSIZE;
286 	lba += zdsk->dsk.start;
287 	while (bytes > 0) {
288 		nb = bytes / DEV_BSIZE;
289 		if (nb > READ_BUF_SIZE / DEV_BSIZE)
290 			nb = READ_BUF_SIZE / DEV_BSIZE;
291 		memcpy(dmadat->rdbuf, p, nb * DEV_BSIZE);
292 		if (drvwrite(&zdsk->dsk, dmadat->rdbuf, lba, nb))
293 			return -1;
294 		p += nb * DEV_BSIZE;
295 		lba += nb;
296 		bytes -= nb * DEV_BSIZE;
297 	}
298 
299 	return 0;
300 }
301 
302 static int
303 xfsread(const dnode_phys_t *dnode, off_t *offp, void *buf, size_t nbyte)
304 {
305     if ((size_t)zfs_read(spa, dnode, offp, buf, nbyte) != nbyte) {
306 	printf("Invalid format\n");
307 	return -1;
308     }
309     return 0;
310 }
311 
312 /*
313  * Read Pad2 (formerly "Boot Block Header") area of the first
314  * vdev label of the given vdev.
315  */
316 static int
317 vdev_read_pad2(vdev_t *vdev, char *buf, size_t size)
318 {
319 	blkptr_t bp;
320 	char *tmp = zap_scratch;
321 	off_t off = offsetof(vdev_label_t, vl_pad2);
322 
323 	if (size > VDEV_PAD_SIZE)
324 		size = VDEV_PAD_SIZE;
325 
326 	BP_ZERO(&bp);
327 	BP_SET_LSIZE(&bp, VDEV_PAD_SIZE);
328 	BP_SET_PSIZE(&bp, VDEV_PAD_SIZE);
329 	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
330 	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
331 	DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
332 	if (vdev_read_phys(vdev, &bp, tmp, off, 0))
333 		return (EIO);
334 	memcpy(buf, tmp, size);
335 	return (0);
336 }
337 
338 static int
339 vdev_clear_pad2(vdev_t *vdev)
340 {
341 	char *zeroes = zap_scratch;
342 	uint64_t *end;
343 	off_t off = offsetof(vdev_label_t, vl_pad2);
344 
345 	memset(zeroes, 0, VDEV_PAD_SIZE);
346 	end = (uint64_t *)(zeroes + VDEV_PAD_SIZE);
347 	/* ZIO_CHECKSUM_LABEL magic and pre-calcualted checksum for all zeros */
348 	end[-5] = 0x0210da7ab10c7a11;
349 	end[-4] = 0x97f48f807f6e2a3f;
350 	end[-3] = 0xaf909f1658aacefc;
351 	end[-2] = 0xcbd1ea57ff6db48b;
352 	end[-1] = 0x6ec692db0d465fab;
353 	if (vdev_write(vdev, vdev->v_read_priv, off, zeroes, VDEV_PAD_SIZE))
354 		return (EIO);
355 	return (0);
356 }
357 
358 static void
359 bios_getmem(void)
360 {
361     uint64_t size;
362 
363     /* Parse system memory map */
364     v86.ebx = 0;
365     do {
366 	v86.ctl = V86_FLAGS;
367 	v86.addr = 0x15;		/* int 0x15 function 0xe820*/
368 	v86.eax = 0xe820;
369 	v86.ecx = sizeof(struct bios_smap);
370 	v86.edx = SMAP_SIG;
371 	v86.es = VTOPSEG(&smap);
372 	v86.edi = VTOPOFF(&smap);
373 	v86int();
374 	if (V86_CY(v86.efl) || (v86.eax != SMAP_SIG))
375 	    break;
376 	/* look for a low-memory segment that's large enough */
377 	if ((smap.type == SMAP_TYPE_MEMORY) && (smap.base == 0) &&
378 	    (smap.length >= (512 * 1024)))
379 	    bios_basemem = smap.length;
380 	/* look for the first segment in 'extended' memory */
381 	if ((smap.type == SMAP_TYPE_MEMORY) && (smap.base == 0x100000)) {
382 	    bios_extmem = smap.length;
383 	}
384 
385 	/*
386 	 * Look for the largest segment in 'extended' memory beyond
387 	 * 1MB but below 4GB.
388 	 */
389 	if ((smap.type == SMAP_TYPE_MEMORY) && (smap.base > 0x100000) &&
390 	    (smap.base < 0x100000000ull)) {
391 	    size = smap.length;
392 
393 	    /*
394 	     * If this segment crosses the 4GB boundary, truncate it.
395 	     */
396 	    if (smap.base + size > 0x100000000ull)
397 		size = 0x100000000ull - smap.base;
398 
399 	    if (size > high_heap_size) {
400 		high_heap_size = size;
401 		high_heap_base = smap.base;
402 	    }
403 	}
404     } while (v86.ebx != 0);
405 
406     /* Fall back to the old compatibility function for base memory */
407     if (bios_basemem == 0) {
408 	v86.ctl = 0;
409 	v86.addr = 0x12;		/* int 0x12 */
410 	v86int();
411 
412 	bios_basemem = (v86.eax & 0xffff) * 1024;
413     }
414 
415     /* Fall back through several compatibility functions for extended memory */
416     if (bios_extmem == 0) {
417 	v86.ctl = V86_FLAGS;
418 	v86.addr = 0x15;		/* int 0x15 function 0xe801*/
419 	v86.eax = 0xe801;
420 	v86int();
421 	if (!V86_CY(v86.efl)) {
422 	    bios_extmem = ((v86.ecx & 0xffff) + ((v86.edx & 0xffff) * 64)) * 1024;
423 	}
424     }
425     if (bios_extmem == 0) {
426 	v86.ctl = 0;
427 	v86.addr = 0x15;		/* int 0x15 function 0x88*/
428 	v86.eax = 0x8800;
429 	v86int();
430 	bios_extmem = (v86.eax & 0xffff) * 1024;
431     }
432 
433     /*
434      * If we have extended memory and did not find a suitable heap
435      * region in the SMAP, use the last 3MB of 'extended' memory as a
436      * high heap candidate.
437      */
438     if (bios_extmem >= HEAP_MIN && high_heap_size < HEAP_MIN) {
439 	high_heap_size = HEAP_MIN;
440 	high_heap_base = bios_extmem + 0x100000 - HEAP_MIN;
441     }
442 }
443 
444 /*
445  * Try to detect a device supported by the legacy int13 BIOS
446  */
447 static int
448 int13probe(int drive)
449 {
450     v86.ctl = V86_FLAGS;
451     v86.addr = 0x13;
452     v86.eax = 0x800;
453     v86.edx = drive;
454     v86int();
455 
456     if (!V86_CY(v86.efl) &&				/* carry clear */
457 	((v86.edx & 0xff) != (drive & DRV_MASK))) {	/* unit # OK */
458 	if ((v86.ecx & 0x3f) == 0) {			/* absurd sector size */
459 		return(0);				/* skip device */
460 	}
461 	return (1);
462     }
463     return(0);
464 }
465 
466 /*
467  * We call this when we find a ZFS vdev - ZFS consumes the dsk
468  * structure so we must make a new one.
469  */
470 static struct zfsdsk *
471 copy_dsk(struct zfsdsk *zdsk)
472 {
473     struct zfsdsk *newdsk;
474 
475     newdsk = malloc(sizeof(struct zfsdsk));
476     *newdsk = *zdsk;
477     return (newdsk);
478 }
479 
/*
 * Get disk size from GPT.
 *
 * Reads LBA 1 and, if it holds a plausible GPT header, returns the alternate
 * header LBA plus one.  Returns 0 when the sector cannot be read, the header
 * does not validate, or GPT support is not compiled in.
 */
static uint64_t
drvsize_gpt(struct dsk *dskp)
{
#ifdef GPT
	struct gpt_hdr hdr;
	char *sec = dmadat->secbuf;

	if (drvread(dskp, sec, 1, 1))
		return (0);
	memcpy(&hdr, sec, sizeof(hdr));

	/* Signature, self-reference and revision must all check out. */
	if (memcmp(hdr.hdr_sig, GPT_HDR_SIG, sizeof(hdr.hdr_sig)) != 0)
		return (0);
	if (hdr.hdr_lba_self != 1 || hdr.hdr_revision < 0x00010000)
		return (0);

	/* Entries must be big enough and pack evenly into a sector. */
	if (hdr.hdr_entsz < sizeof(struct gpt_ent) ||
	    DEV_BSIZE % hdr.hdr_entsz != 0)
		return (0);

	return (hdr.hdr_lba_alt + 1);
#else
	return (0);
#endif
}
506 
507 /*
508  * Get disk size from eax=0x800 and 0x4800. We need to probe both
509  * because 0x4800 may not be available and we would like to get more
510  * or less correct disk size - if it is possible at all.
511  * Note we do not really want to touch drv.c because that code is shared
512  * with boot2 and we can not afford to grow that code.
513  */
514 static uint64_t
515 drvsize_ext(struct zfsdsk *zdsk)
516 {
517 	struct dsk *dskp;
518 	uint64_t size, tmp;
519 	int cyl, hds, sec;
520 
521 	dskp = &zdsk->dsk;
522 
523 	/* Try to read disk size from GPT */
524 	size = drvsize_gpt(dskp);
525 	if (size != 0)
526 		return (size);
527 
528 	v86.ctl = V86_FLAGS;
529 	v86.addr = 0x13;
530 	v86.eax = 0x800;
531 	v86.edx = dskp->drive;
532 	v86int();
533 
534 	/* Don't error out if we get bad sector number, try EDD as well */
535 	if (V86_CY(v86.efl) ||	/* carry set */
536 	    (v86.edx & 0xff) <= (unsigned)(dskp->drive & 0x7f)) /* unit # bad */
537 		return (0);
538 	cyl = ((v86.ecx & 0xc0) << 2) + ((v86.ecx & 0xff00) >> 8) + 1;
539 	/* Convert max head # -> # of heads */
540 	hds = ((v86.edx & 0xff00) >> 8) + 1;
541 	sec = v86.ecx & 0x3f;
542 
543 	size = (uint64_t)cyl * hds * sec;
544 
545 	/* Determine if we can use EDD with this device. */
546 	v86.ctl = V86_FLAGS;
547 	v86.addr = 0x13;
548 	v86.eax = 0x4100;
549 	v86.edx = dskp->drive;
550 	v86.ebx = 0x55aa;
551 	v86int();
552 	if (V86_CY(v86.efl) ||  /* carry set */
553 	    (v86.ebx & 0xffff) != 0xaa55 || /* signature */
554 	    (v86.ecx & EDD_INTERFACE_FIXED_DISK) == 0)
555 		return (size);
556 
557 	tmp = drvsize(dskp);
558 	if (tmp > size)
559 		size = tmp;
560 
561 	return (size);
562 }
563 
564 /*
565  * The "layered" ioctl to read disk/partition size. Unfortunately
566  * the zfsboot case is hardest, because we do not have full software
567  * stack available, so we need to do some manual work here.
568  */
569 uint64_t
570 ldi_get_size(void *priv)
571 {
572 	struct zfsdsk *zdsk = priv;
573 	uint64_t size = zdsk->dsk.size;
574 
575 	if (zdsk->dsk.start == 0)
576 		size = drvsize_ext(zdsk);
577 
578 	return (size * DEV_BSIZE);
579 }
580 
581 static void
582 probe_drive(struct zfsdsk *zdsk)
583 {
584 #ifdef GPT
585     struct gpt_hdr hdr;
586     struct gpt_ent *ent;
587     unsigned part, entries_per_sec;
588     daddr_t slba;
589 #endif
590 #if defined(GPT) || defined(LOADER_GELI_SUPPORT)
591     daddr_t elba;
592 #endif
593 
594     struct dos_partition *dp;
595     char *sec;
596     unsigned i;
597 
598 #ifdef LOADER_GELI_SUPPORT
599     /*
600      * Taste the disk, if it is GELI encrypted, decrypt it then dig out the
601      * partition table and probe each slice/partition in turn for a vdev or
602      * GELI encrypted vdev.
603      */
604     elba = drvsize_ext(zdsk);
605     if (elba > 0) {
606 	elba--;
607     }
608     zdsk->gdev = geli_taste(vdev_read, zdsk, elba, "disk%u:0:");
609     if ((zdsk->gdev != NULL) && (geli_havekey(zdsk->gdev) == 0))
610 	    geli_passphrase(zdsk->gdev, gelipw);
611 #endif /* LOADER_GELI_SUPPORT */
612 
613     sec = dmadat->secbuf;
614     zdsk->dsk.start = 0;
615 
616 #ifdef GPT
617     /*
618      * First check for GPT.
619      */
620     if (drvread(&zdsk->dsk, sec, 1, 1)) {
621 	return;
622     }
623     memcpy(&hdr, sec, sizeof(hdr));
624     if (memcmp(hdr.hdr_sig, GPT_HDR_SIG, sizeof(hdr.hdr_sig)) != 0 ||
625 	hdr.hdr_lba_self != 1 || hdr.hdr_revision < 0x00010000 ||
626 	hdr.hdr_entsz < sizeof(*ent) || DEV_BSIZE % hdr.hdr_entsz != 0) {
627 	goto trymbr;
628     }
629 
630     /*
631      * Probe all GPT partitions for the presence of ZFS pools. We
632      * return the spa_t for the first we find (if requested). This
633      * will have the effect of booting from the first pool on the
634      * disk.
635      *
636      * If no vdev is found, GELI decrypting the device and try again
637      */
638     entries_per_sec = DEV_BSIZE / hdr.hdr_entsz;
639     slba = hdr.hdr_lba_table;
640     elba = slba + hdr.hdr_entries / entries_per_sec;
641     while (slba < elba) {
642 	zdsk->dsk.start = 0;
643 	if (drvread(&zdsk->dsk, sec, slba, 1))
644 	    return;
645 	for (part = 0; part < entries_per_sec; part++) {
646 	    ent = (struct gpt_ent *)(sec + part * hdr.hdr_entsz);
647 	    if (memcmp(&ent->ent_type, &freebsd_zfs_uuid,
648 		     sizeof(uuid_t)) == 0) {
649 		zdsk->dsk.start = ent->ent_lba_start;
650 		zdsk->dsk.size = ent->ent_lba_end - ent->ent_lba_start + 1;
651 		zdsk->dsk.slice = part + 1;
652 		zdsk->dsk.part = 255;
653 		if (vdev_probe(vdev_read2, zdsk, NULL) == 0) {
654 		    /*
655 		     * This slice had a vdev. We need a new dsk
656 		     * structure now since the vdev now owns this one.
657 		     */
658 		    zdsk = copy_dsk(zdsk);
659 		}
660 #ifdef LOADER_GELI_SUPPORT
661 		else if ((zdsk->gdev = geli_taste(vdev_read, zdsk,
662 		    ent->ent_lba_end - ent->ent_lba_start, "disk%up%u:",
663 		    zdsk->dsk.unit, zdsk->dsk.slice)) != NULL) {
664 		    if (geli_havekey(zdsk->gdev) == 0 ||
665 			geli_passphrase(zdsk->gdev, gelipw) == 0) {
666 			/*
667 			 * This slice has GELI, check it for ZFS.
668 			 */
669 			if (vdev_probe(vdev_read2, zdsk, NULL) == 0) {
670 			    /*
671 			     * This slice had a vdev. We need a new dsk
672 			     * structure now since the vdev now owns this one.
673 			     */
674 			    zdsk = copy_dsk(zdsk);
675 			}
676 			break;
677 		    }
678 		}
679 #endif /* LOADER_GELI_SUPPORT */
680 	    }
681 	}
682 	slba++;
683     }
684     return;
685 trymbr:
686 #endif /* GPT */
687 
688     if (drvread(&zdsk->dsk, sec, DOSBBSECTOR, 1))
689 	return;
690     dp = (void *)(sec + DOSPARTOFF);
691 
692     for (i = 0; i < NDOSPART; i++) {
693 	if (!dp[i].dp_typ)
694 	    continue;
695 	zdsk->dsk.start = dp[i].dp_start;
696 	zdsk->dsk.size = dp[i].dp_size;
697 	zdsk->dsk.slice = i + 1;
698 	if (vdev_probe(vdev_read2, zdsk, NULL) == 0) {
699 	    zdsk = copy_dsk(zdsk);
700 	}
701 #ifdef LOADER_GELI_SUPPORT
702 	else if ((zdsk->gdev = geli_taste(vdev_read, zdsk, dp[i].dp_size -
703 		 dp[i].dp_start, "disk%us%u:")) != NULL) {
704 	    if (geli_havekey(zdsk->gdev) == 0 ||
705 		geli_passphrase(zdsk->gdev, gelipw) == 0) {
706 		/*
707 		 * This slice has GELI, check it for ZFS.
708 		 */
709 		if (vdev_probe(vdev_read2, zdsk, NULL) == 0) {
710 		    /*
711 		     * This slice had a vdev. We need a new dsk
712 		     * structure now since the vdev now owns this one.
713 		     */
714 		    zdsk = copy_dsk(zdsk);
715 		}
716 		break;
717 	    }
718 	}
719 #endif /* LOADER_GELI_SUPPORT */
720     }
721 }
722 
723 int
724 main(void)
725 {
726     dnode_phys_t dn;
727     off_t off;
728     struct zfsdsk *zdsk;
729     int autoboot, i;
730     int nextboot;
731     int rc;
732 
733     dmadat = (void *)(roundup2(__base + (int32_t)&_end, 0x10000) - __base);
734 
735     bios_getmem();
736 
737     if (high_heap_size > 0) {
738 	heap_end = PTOV(high_heap_base + high_heap_size);
739 	heap_next = PTOV(high_heap_base);
740     } else {
741 	heap_next = (char *)dmadat + sizeof(*dmadat);
742 	heap_end = (char *)PTOV(bios_basemem);
743     }
744     setheap(heap_next, heap_end);
745 
746     zdsk = calloc(1, sizeof(struct zfsdsk));
747     zdsk->dsk.drive = *(uint8_t *)PTOV(ARGS);
748     zdsk->dsk.type = zdsk->dsk.drive & DRV_HARD ? TYPE_AD : TYPE_FD;
749     zdsk->dsk.unit = zdsk->dsk.drive & DRV_MASK;
750     zdsk->dsk.slice = *(uint8_t *)PTOV(ARGS + 1) + 1;
751     zdsk->dsk.part = 0;
752     zdsk->dsk.start = 0;
753     zdsk->dsk.size = drvsize_ext(zdsk);
754 
755     bootinfo.bi_version = BOOTINFO_VERSION;
756     bootinfo.bi_size = sizeof(bootinfo);
757     bootinfo.bi_basemem = bios_basemem / 1024;
758     bootinfo.bi_extmem = bios_extmem / 1024;
759     bootinfo.bi_memsizes_valid++;
760     bootinfo.bi_bios_dev = zdsk->dsk.drive;
761 
762     bootdev = MAKEBOOTDEV(dev_maj[zdsk->dsk.type],
763 			  zdsk->dsk.slice, zdsk->dsk.unit, zdsk->dsk.part);
764 
765     /* Process configuration file */
766 
767     autoboot = 1;
768 
769     zfs_init();
770 
771     /*
772      * Probe the boot drive first - we will try to boot from whatever
773      * pool we find on that drive.
774      */
775     probe_drive(zdsk);
776 
777     /*
778      * Probe the rest of the drives that the bios knows about. This
779      * will find any other available pools and it may fill in missing
780      * vdevs for the boot pool.
781      */
782 #ifndef VIRTUALBOX
783     for (i = 0; i < *(unsigned char *)PTOV(BIOS_NUMDRIVES); i++)
784 #else
785     for (i = 0; i < MAXBDDEV; i++)
786 #endif
787     {
788 	if ((i | DRV_HARD) == *(uint8_t *)PTOV(ARGS))
789 	    continue;
790 
791 	if (!int13probe(i | DRV_HARD))
792 	    break;
793 
794 	zdsk = calloc(1, sizeof(struct zfsdsk));
795 	zdsk->dsk.drive = i | DRV_HARD;
796 	zdsk->dsk.type = zdsk->dsk.drive & TYPE_AD;
797 	zdsk->dsk.unit = i;
798 	zdsk->dsk.slice = 0;
799 	zdsk->dsk.part = 0;
800 	zdsk->dsk.start = 0;
801 	zdsk->dsk.size = drvsize_ext(zdsk);
802 	probe_drive(zdsk);
803     }
804 
805     /*
806      * The first discovered pool, if any, is the pool.
807      */
808     spa = spa_get_primary();
809     if (!spa) {
810 	printf("%s: No ZFS pools located, can't boot\n", BOOTPROG);
811 	for (;;)
812 	    ;
813     }
814 
815     primary_spa = spa;
816     primary_vdev = spa_get_primary_vdev(spa);
817 
818     nextboot = 0;
819     rc  = vdev_read_pad2(primary_vdev, cmd, sizeof(cmd));
820     if (vdev_clear_pad2(primary_vdev))
821 	printf("failed to clear pad2 area of primary vdev\n");
822     if (rc == 0) {
823 	if (*cmd) {
824 	    /*
825 	     * We could find an old-style ZFS Boot Block header here.
826 	     * Simply ignore it.
827 	     */
828 	    if (*(uint64_t *)cmd != 0x2f5b007b10c) {
829 		/*
830 		 * Note that parse() is destructive to cmd[] and we also want
831 		 * to honor RBX_QUIET option that could be present in cmd[].
832 		 */
833 		nextboot = 1;
834 		memcpy(cmddup, cmd, sizeof(cmd));
835 		if (parse_cmd()) {
836 		    printf("failed to parse pad2 area of primary vdev\n");
837 		    reboot();
838 		}
839 		if (!OPT_CHECK(RBX_QUIET))
840 		    printf("zfs nextboot: %s\n", cmddup);
841 	    }
842 	    /* Do not process this command twice */
843 	    *cmd = 0;
844 	}
845     } else
846 	printf("failed to read pad2 area of primary vdev\n");
847 
848     /* Mount ZFS only if it's not already mounted via nextboot parsing. */
849     if (zfsmount.spa == NULL &&
850 	(zfs_spa_init(spa) != 0 || zfs_mount(spa, 0, &zfsmount) != 0)) {
851 	printf("%s: failed to mount default pool %s\n",
852 	    BOOTPROG, spa->spa_name);
853 	autoboot = 0;
854     } else if (zfs_lookup(&zfsmount, PATH_CONFIG, &dn) == 0 ||
855         zfs_lookup(&zfsmount, PATH_DOTCONFIG, &dn) == 0) {
856 	off = 0;
857 	zfs_read(spa, &dn, &off, cmd, sizeof(cmd));
858     }
859 
860     if (*cmd) {
861 	/*
862 	 * Note that parse_cmd() is destructive to cmd[] and we also want
863 	 * to honor RBX_QUIET option that could be present in cmd[].
864 	 */
865 	memcpy(cmddup, cmd, sizeof(cmd));
866 	if (parse_cmd())
867 	    autoboot = 0;
868 	if (!OPT_CHECK(RBX_QUIET))
869 	    printf("%s: %s\n", PATH_CONFIG, cmddup);
870 	/* Do not process this command twice */
871 	*cmd = 0;
872     }
873 
874     /* Do not risk waiting at the prompt forever. */
875     if (nextboot && !autoboot)
876 	reboot();
877 
878     if (autoboot && !*kname) {
879 	/*
880 	 * Iterate through the list of loader and kernel paths, trying to load.
881 	 * If interrupted by a keypress, or in case of failure, drop the user
882 	 * to the boot2 prompt.
883 	 */
884 	for (i = 0; i < nitems(loadpath); i++) {
885 	    memcpy(kname, loadpath[i].p, loadpath[i].len);
886 	    if (keyhit(3))
887 		break;
888 	    load();
889 	}
890     }
891 
892     /* Present the user with the boot2 prompt. */
893 
894     for (;;) {
895 	if (!autoboot || !OPT_CHECK(RBX_QUIET)) {
896 	    printf("\nFreeBSD/x86 boot\n");
897 	    if (zfs_rlookup(spa, zfsmount.rootobj, rootname) != 0)
898 		printf("Default: %s/<0x%llx>:%s\n"
899 		       "boot: ",
900 		       spa->spa_name, zfsmount.rootobj, kname);
901 	    else if (rootname[0] != '\0')
902 		printf("Default: %s/%s:%s\n"
903 		       "boot: ",
904 		       spa->spa_name, rootname, kname);
905 	    else
906 		printf("Default: %s:%s\n"
907 		       "boot: ",
908 		       spa->spa_name, kname);
909 	}
910 	if (ioctrl & IO_SERIAL)
911 	    sio_flush();
912 	if (!autoboot || keyhit(5))
913 	    getstr(cmd, sizeof(cmd));
914 	else if (!autoboot || !OPT_CHECK(RBX_QUIET))
915 	    putchar('\n');
916 	autoboot = 0;
917 	if (parse_cmd())
918 	    putchar('\a');
919 	else
920 	    load();
921     }
922 }
923 
/*
 * XXX - Needed for btxld to link the boot2 binary; do not remove.
 *
 * Hands the exit code straight to __exit().
 */
void
exit(int x)
{

	__exit(x);
}
930 
/*
 * Leave the boot program by calling __exit() with a code of 0.
 */
void
reboot(void)
{

	__exit(0);
}
936 
937 static void
938 load(void)
939 {
940     union {
941 	struct exec ex;
942 	Elf32_Ehdr eh;
943     } hdr;
944     static Elf32_Phdr ep[2];
945     static Elf32_Shdr es[2];
946     caddr_t p;
947     dnode_phys_t dn;
948     off_t off;
949     uint32_t addr, x;
950     int fmt, i, j;
951 
952     if (zfs_lookup(&zfsmount, kname, &dn)) {
953 	printf("\nCan't find %s\n", kname);
954 	return;
955     }
956     off = 0;
957     if (xfsread(&dn, &off, &hdr, sizeof(hdr)))
958 	return;
959     if (N_GETMAGIC(hdr.ex) == ZMAGIC)
960 	fmt = 0;
961     else if (IS_ELF(hdr.eh))
962 	fmt = 1;
963     else {
964 	printf("Invalid %s\n", "format");
965 	return;
966     }
967     if (fmt == 0) {
968 	addr = hdr.ex.a_entry & 0xffffff;
969 	p = PTOV(addr);
970 	off = PAGE_SIZE;
971 	if (xfsread(&dn, &off, p, hdr.ex.a_text))
972 	    return;
973 	p += roundup2(hdr.ex.a_text, PAGE_SIZE);
974 	if (xfsread(&dn, &off, p, hdr.ex.a_data))
975 	    return;
976 	p += hdr.ex.a_data + roundup2(hdr.ex.a_bss, PAGE_SIZE);
977 	bootinfo.bi_symtab = VTOP(p);
978 	memcpy(p, &hdr.ex.a_syms, sizeof(hdr.ex.a_syms));
979 	p += sizeof(hdr.ex.a_syms);
980 	if (hdr.ex.a_syms) {
981 	    if (xfsread(&dn, &off, p, hdr.ex.a_syms))
982 		return;
983 	    p += hdr.ex.a_syms;
984 	    if (xfsread(&dn, &off, p, sizeof(int)))
985 		return;
986 	    x = *(uint32_t *)p;
987 	    p += sizeof(int);
988 	    x -= sizeof(int);
989 	    if (xfsread(&dn, &off, p, x))
990 		return;
991 	    p += x;
992 	}
993     } else {
994 	off = hdr.eh.e_phoff;
995 	for (j = i = 0; i < hdr.eh.e_phnum && j < 2; i++) {
996 	    if (xfsread(&dn, &off, ep + j, sizeof(ep[0])))
997 		return;
998 	    if (ep[j].p_type == PT_LOAD)
999 		j++;
1000 	}
1001 	for (i = 0; i < 2; i++) {
1002 	    p = PTOV(ep[i].p_paddr & 0xffffff);
1003 	    off = ep[i].p_offset;
1004 	    if (xfsread(&dn, &off, p, ep[i].p_filesz))
1005 		return;
1006 	}
1007 	p += roundup2(ep[1].p_memsz, PAGE_SIZE);
1008 	bootinfo.bi_symtab = VTOP(p);
1009 	if (hdr.eh.e_shnum == hdr.eh.e_shstrndx + 3) {
1010 	    off = hdr.eh.e_shoff + sizeof(es[0]) *
1011 		(hdr.eh.e_shstrndx + 1);
1012 	    if (xfsread(&dn, &off, &es, sizeof(es)))
1013 		return;
1014 	    for (i = 0; i < 2; i++) {
1015 		memcpy(p, &es[i].sh_size, sizeof(es[i].sh_size));
1016 		p += sizeof(es[i].sh_size);
1017 		off = es[i].sh_offset;
1018 		if (xfsread(&dn, &off, p, es[i].sh_size))
1019 		    return;
1020 		p += es[i].sh_size;
1021 	    }
1022 	}
1023 	addr = hdr.eh.e_entry & 0xffffff;
1024     }
1025     bootinfo.bi_esymtab = VTOP(p);
1026     bootinfo.bi_kernelname = VTOP(kname);
1027     zfsargs.size = sizeof(zfsargs);
1028     zfsargs.pool = zfsmount.spa->spa_guid;
1029     zfsargs.root = zfsmount.rootobj;
1030     zfsargs.primary_pool = primary_spa->spa_guid;
1031 #ifdef LOADER_GELI_SUPPORT
1032     explicit_bzero(gelipw, sizeof(gelipw));
1033     export_geli_boot_data(&zfsargs.gelidata);
1034 #endif
1035     if (primary_vdev != NULL)
1036 	zfsargs.primary_vdev = primary_vdev->v_guid;
1037     else
1038 	printf("failed to detect primary vdev\n");
1039     /*
1040      * Note that the zfsargs struct is passed by value, not by pointer.  Code in
1041      * btxldr.S copies the values from the entry stack to a fixed location
1042      * within loader(8) at startup due to the presence of KARGS_FLAGS_EXTARG.
1043      */
1044     __exec((caddr_t)addr, RB_BOOTINFO | (opts & RBX_MASK),
1045 	   bootdev,
1046 	   KARGS_FLAGS_ZFS | KARGS_FLAGS_EXTARG,
1047 	   (uint32_t) spa->spa_guid,
1048 	   (uint32_t) (spa->spa_guid >> 32),
1049 	   VTOP(&bootinfo),
1050 	   zfsargs);
1051 }
1052 
1053 static int
1054 zfs_mount_ds(char *dsname)
1055 {
1056     uint64_t newroot;
1057     spa_t *newspa;
1058     char *q;
1059 
1060     q = strchr(dsname, '/');
1061     if (q)
1062 	*q++ = '\0';
1063     newspa = spa_find_by_name(dsname);
1064     if (newspa == NULL) {
1065 	printf("\nCan't find ZFS pool %s\n", dsname);
1066 	return -1;
1067     }
1068 
1069     if (zfs_spa_init(newspa))
1070 	return -1;
1071 
1072     newroot = 0;
1073     if (q) {
1074 	if (zfs_lookup_dataset(newspa, q, &newroot)) {
1075 	    printf("\nCan't find dataset %s in ZFS pool %s\n",
1076 		    q, newspa->spa_name);
1077 	    return -1;
1078 	}
1079     }
1080     if (zfs_mount(newspa, newroot, &zfsmount)) {
1081 	printf("\nCan't mount ZFS dataset\n");
1082 	return -1;
1083     }
1084     spa = newspa;
1085     return (0);
1086 }
1087 
1088 static int
1089 parse_cmd(void)
1090 {
1091     char *arg = cmd;
1092     char *ep, *p, *q;
1093     const char *cp;
1094     int c, i, j;
1095 
1096     while ((c = *arg++)) {
1097 	if (c == ' ' || c == '\t' || c == '\n')
1098 	    continue;
1099 	for (p = arg; *p && *p != '\n' && *p != ' ' && *p != '\t'; p++);
1100 	ep = p;
1101 	if (*p)
1102 	    *p++ = 0;
1103 	if (c == '-') {
1104 	    while ((c = *arg++)) {
1105 		if (c == 'P') {
1106 		    if (*(uint8_t *)PTOV(0x496) & 0x10) {
1107 			cp = "yes";
1108 		    } else {
1109 			opts |= OPT_SET(RBX_DUAL) | OPT_SET(RBX_SERIAL);
1110 			cp = "no";
1111 		    }
1112 		    printf("Keyboard: %s\n", cp);
1113 		    continue;
1114 		} else if (c == 'S') {
1115 		    j = 0;
1116 		    while ((unsigned int)(i = *arg++ - '0') <= 9)
1117 			j = j * 10 + i;
1118 		    if (j > 0 && i == -'0') {
1119 			comspeed = j;
1120 			break;
1121 		    }
1122 		    /* Fall through to error below ('S' not in optstr[]). */
1123 		}
1124 		for (i = 0; c != optstr[i]; i++)
1125 		    if (i == NOPT - 1)
1126 			return -1;
1127 		opts ^= OPT_SET(flags[i]);
1128 	    }
1129 	    ioctrl = OPT_CHECK(RBX_DUAL) ? (IO_SERIAL|IO_KEYBOARD) :
1130 		     OPT_CHECK(RBX_SERIAL) ? IO_SERIAL : IO_KEYBOARD;
1131 	    if (ioctrl & IO_SERIAL) {
1132 	        if (sio_init(115200 / comspeed) != 0)
1133 		    ioctrl &= ~IO_SERIAL;
1134 	    }
1135 	} if (c == '?') {
1136 	    dnode_phys_t dn;
1137 
1138 	    if (zfs_lookup(&zfsmount, arg, &dn) == 0) {
1139 		zap_list(spa, &dn);
1140 	    }
1141 	    return -1;
1142 	} else {
1143 	    arg--;
1144 
1145 	    /*
1146 	     * Report pool status if the comment is 'status'. Lets
1147 	     * hope no-one wants to load /status as a kernel.
1148 	     */
1149 	    if (!strcmp(arg, "status")) {
1150 		spa_all_status();
1151 		return -1;
1152 	    }
1153 
1154 	    /*
1155 	     * If there is "zfs:" prefix simply ignore it.
1156 	     */
1157 	    if (strncmp(arg, "zfs:", 4) == 0)
1158 		arg += 4;
1159 
1160 	    /*
1161 	     * If there is a colon, switch pools.
1162 	     */
1163 	    q = strchr(arg, ':');
1164 	    if (q) {
1165 		*q++ = '\0';
1166 		if (zfs_mount_ds(arg) != 0)
1167 		    return -1;
1168 		arg = q;
1169 	    }
1170 	    if ((i = ep - arg)) {
1171 		if ((size_t)i >= sizeof(kname))
1172 		    return -1;
1173 		memcpy(kname, arg, i + 1);
1174 	    }
1175 	}
1176 	arg = p;
1177     }
1178     return 0;
1179 }
1180