xref: /freebsd/stand/i386/zfsboot/zfsboot.c (revision 6e778a7efdc0e804471750157f6bacd1ef7d1580)
1 /*-
2  * Copyright (c) 1998 Robert Nordier
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms are freely
6  * permitted provided that the above copyright notice and this
7  * paragraph and the following disclaimer are duplicated in all
8  * such forms.
9  *
10  * This software is provided "AS IS" and without any express or
11  * implied warranties, including, without limitation, the implied
12  * warranties of merchantability and fitness for a particular
13  * purpose.
14  */
15 
16 #include <sys/cdefs.h>
17 __FBSDID("$FreeBSD$");
18 
19 #include <sys/param.h>
20 #include <sys/errno.h>
21 #include <sys/diskmbr.h>
22 #ifdef GPT
23 #include <sys/gpt.h>
24 #endif
25 #include <sys/reboot.h>
26 #include <sys/queue.h>
27 
28 #include <machine/bootinfo.h>
29 #include <machine/elf.h>
30 #include <machine/pc/bios.h>
31 
32 #include <stdarg.h>
33 #include <stddef.h>
34 
35 #include <a.out.h>
36 
37 #include <btxv86.h>
38 
39 /* Forward declared to avoid warnings -- these shouldn't be needed */
40 int strcasecmp(const char *s1, const char *s2);
41 void explicit_bzero(void *b, size_t len);
42 
43 #include "lib.h"
44 #include "rbx.h"
45 #include "drv.h"
46 #include "edd.h"
47 #include "util.h"
48 #include "cons.h"
49 #include "bootargs.h"
50 #include "paths.h"
51 
52 #include "libzfs.h"
53 
/*
 * Boot-time constants.
 *
 * ARGS is the physical address main() reads the boot drive number from
 * (the byte at ARGS + 1 is used as the slice).  NOPT and NDEV are the
 * element counts of the optstr[]/flags[] and dev_maj[] tables.
 */
#define ARGS			0x900
#define NOPT			14
#define NDEV			3

/*
 * BIOS_NUMDRIVES is the physical address main() reads the number of
 * drives to probe from.  DRV_HARD is or'ed into a unit number to form
 * a hard disk drive number; DRV_MASK extracts the unit again.
 */
#define BIOS_NUMDRIVES		0x475
#define DRV_HARD		0x80
#define DRV_MASK		0x7f

/* Device types; these values index dev_maj[]. */
#define TYPE_AD			0
#define TYPE_DA			1
#define TYPE_MAXHARD		TYPE_DA
#define TYPE_FD			2

/* Byte alignment vdev_read() applies to disk reads for the sake of GELI. */
#define DEV_GELIBOOT_BSIZE	4096
68 
69 extern uint32_t _end;
70 
71 #ifdef GPT
72 static const uuid_t freebsd_zfs_uuid = GPT_ENT_TYPE_FREEBSD_ZFS;
73 #endif
74 static const char optstr[NOPT] = "DhaCcdgmnpqrsv"; /* Also 'P', 'S' */
75 static const unsigned char flags[NOPT] = {
76     RBX_DUAL,
77     RBX_SERIAL,
78     RBX_ASKNAME,
79     RBX_CDROM,
80     RBX_CONFIG,
81     RBX_KDB,
82     RBX_GDB,
83     RBX_MUTE,
84     RBX_NOINTR,
85     RBX_PAUSE,
86     RBX_QUIET,
87     RBX_DFLTROOT,
88     RBX_SINGLE,
89     RBX_VERBOSE
90 };
91 uint32_t opts;
92 
93 static const unsigned char dev_maj[NDEV] = {30, 4, 2};
94 
95 static char cmd[512];
96 static char cmddup[512];
97 static char kname[1024];
98 static char rootname[256];
99 static int comspeed = SIOSPD;
100 static struct bootinfo bootinfo;
101 static uint32_t bootdev;
102 static struct zfs_boot_args zfsargs;
103 
104 vm_offset_t	high_heap_base;
105 uint32_t	bios_basemem, bios_extmem, high_heap_size;
106 
107 static struct bios_smap smap;
108 
/*
 * The minimum amount of memory to reserve in bios_extmem for the heap.
 */
#define	HEAP_MIN		(64 * 1024 * 1024)

/* Bounds of the region malloc() hands out: [heap_next, heap_end). */
static char *heap_next;
static char *heap_end;

/*
 * Bounce buffers for BIOS disk transfers.  They must not span a 64k
 * boundary.
 */
#define READ_BUF_SIZE		8192
struct dmadat {
	/* for reading large things */
	char rdbuf[READ_BUF_SIZE];
	/* for MBR/disklabel */
	char secbuf[READ_BUF_SIZE];
};
static struct dmadat *dmadat;
124 
/* Externally visible entry points defined in this file. */
int main(void);
void exit(int x);
void reboot(void);
void *malloc(size_t n);
void free(void *ptr);

/* File-local helpers that are used before their definition. */
static void load(void);
static int parse_cmd(void);
static void bios_getmem(void);
133 
134 void *
135 malloc(size_t n)
136 {
137 	char *p = heap_next;
138 	if (p + n > heap_end) {
139 		printf("malloc failure\n");
140 		for (;;)
141 		    ;
142 		/* NOTREACHED */
143 		return (0);
144 	}
145 	heap_next += n;
146 	return (p);
147 }
148 
/*
 * Nothing is ever returned to the boot heap; this exists only so that
 * code calling free() links.  The argument is ignored.
 */
void
free(void *ptr)
{
	(void)ptr;
}
155 
/*
 * Return a copy of the NUL-terminated string s in storage obtained
 * from malloc().
 */
static char *
strdup(const char *s)
{
	char *copy;

	copy = malloc(strlen(s) + 1);
	/* strcpy() returns its destination, i.e. the new copy. */
	return (strcpy(copy, s));
}
163 
164 #ifdef LOADER_GELI_SUPPORT
165 #include "geliboot.c"
166 static char gelipw[GELI_PW_MAXLEN];
167 static struct keybuf *gelibuf;
168 #endif
169 
170 #include "zfsimpl.c"
171 
172 /*
173  * Read from a dnode (which must be from a ZPL filesystem).
174  */
175 static int
176 zfs_read(spa_t *spa, const dnode_phys_t *dnode, off_t *offp, void *start, size_t size)
177 {
178 	const znode_phys_t *zp = (const znode_phys_t *) dnode->dn_bonus;
179 	size_t n;
180 	int rc;
181 
182 	n = size;
183 	if (*offp + n > zp->zp_size)
184 		n = zp->zp_size - *offp;
185 
186 	rc = dnode_read(spa, dnode, *offp, start, n);
187 	if (rc)
188 		return (-1);
189 	*offp += n;
190 
191 	return (n);
192 }
193 
194 /*
195  * Current ZFS pool
196  */
197 static spa_t *spa;
198 static spa_t *primary_spa;
199 static vdev_t *primary_vdev;
200 
201 /*
202  * A wrapper for dskread that doesn't have to worry about whether the
203  * buffer pointer crosses a 64k boundary.
204  */
205 static int
206 vdev_read(void *xvdev, void *priv, off_t off, void *buf, size_t bytes)
207 {
208 	char *p;
209 	daddr_t lba, alignlba;
210 	off_t diff;
211 	unsigned int nb, alignnb;
212 	struct dsk *dsk = (struct dsk *) priv;
213 
214 	if ((off & (DEV_BSIZE - 1)) || (bytes & (DEV_BSIZE - 1)))
215 		return -1;
216 
217 	p = buf;
218 	lba = off / DEV_BSIZE;
219 	lba += dsk->start;
220 	/*
221 	 * Align reads to 4k else 4k sector GELIs will not decrypt.
222 	 * Round LBA down to nearest multiple of DEV_GELIBOOT_BSIZE bytes.
223 	 */
224 	alignlba = rounddown2(off, DEV_GELIBOOT_BSIZE) / DEV_BSIZE;
225 	/*
226 	 * The read must be aligned to DEV_GELIBOOT_BSIZE bytes relative to the
227 	 * start of the GELI partition, not the start of the actual disk.
228 	 */
229 	alignlba += dsk->start;
230 	diff = (lba - alignlba) * DEV_BSIZE;
231 
232 	while (bytes > 0) {
233 		nb = bytes / DEV_BSIZE;
234 		/*
235 		 * Ensure that the read size plus the leading offset does not
236 		 * exceed the size of the read buffer.
237 		 */
238 		if (nb > (READ_BUF_SIZE - diff) / DEV_BSIZE)
239 			nb = (READ_BUF_SIZE - diff) / DEV_BSIZE;
240 		/*
241 		 * Round the number of blocks to read up to the nearest multiple
242 		 * of DEV_GELIBOOT_BSIZE.
243 		 */
244 		alignnb = roundup2(nb * DEV_BSIZE + diff, DEV_GELIBOOT_BSIZE)
245 		    / DEV_BSIZE;
246 
247 		if (drvread(dsk, dmadat->rdbuf, alignlba, alignnb))
248 			return -1;
249 #ifdef LOADER_GELI_SUPPORT
250 		/* decrypt */
251 		if (is_geli(dsk) == 0) {
252 			if (geli_read(dsk, ((alignlba - dsk->start) *
253 			    DEV_BSIZE), dmadat->rdbuf, alignnb * DEV_BSIZE))
254 				return (-1);
255 		}
256 #endif
257 		memcpy(p, dmadat->rdbuf + diff, nb * DEV_BSIZE);
258 		p += nb * DEV_BSIZE;
259 		lba += nb;
260 		alignlba += alignnb;
261 		bytes -= nb * DEV_BSIZE;
262 		/* Don't need the leading offset after the first block. */
263 		diff = 0;
264 	}
265 
266 	return 0;
267 }
268 /* Match the signature exactly due to signature madness */
269 static int
270 vdev_read2(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes)
271 {
272 	return vdev_read(vdev, priv, off, buf, bytes);
273 }
274 
275 
276 static int
277 vdev_write(vdev_t *vdev, void *priv, off_t off, void *buf, size_t bytes)
278 {
279 	char *p;
280 	daddr_t lba;
281 	unsigned int nb;
282 	struct dsk *dsk = (struct dsk *) priv;
283 
284 	if ((off & (DEV_BSIZE - 1)) || (bytes & (DEV_BSIZE - 1)))
285 		return -1;
286 
287 	p = buf;
288 	lba = off / DEV_BSIZE;
289 	lba += dsk->start;
290 	while (bytes > 0) {
291 		nb = bytes / DEV_BSIZE;
292 		if (nb > READ_BUF_SIZE / DEV_BSIZE)
293 			nb = READ_BUF_SIZE / DEV_BSIZE;
294 		memcpy(dmadat->rdbuf, p, nb * DEV_BSIZE);
295 		if (drvwrite(dsk, dmadat->rdbuf, lba, nb))
296 			return -1;
297 		p += nb * DEV_BSIZE;
298 		lba += nb;
299 		bytes -= nb * DEV_BSIZE;
300 	}
301 
302 	return 0;
303 }
304 
305 static int
306 xfsread(const dnode_phys_t *dnode, off_t *offp, void *buf, size_t nbyte)
307 {
308     if ((size_t)zfs_read(spa, dnode, offp, buf, nbyte) != nbyte) {
309 	printf("Invalid format\n");
310 	return -1;
311     }
312     return 0;
313 }
314 
315 /*
316  * Read Pad2 (formerly "Boot Block Header") area of the first
317  * vdev label of the given vdev.
318  */
319 static int
320 vdev_read_pad2(vdev_t *vdev, char *buf, size_t size)
321 {
322 	blkptr_t bp;
323 	char *tmp = zap_scratch;
324 	off_t off = offsetof(vdev_label_t, vl_pad2);
325 
326 	if (size > VDEV_PAD_SIZE)
327 		size = VDEV_PAD_SIZE;
328 
329 	BP_ZERO(&bp);
330 	BP_SET_LSIZE(&bp, VDEV_PAD_SIZE);
331 	BP_SET_PSIZE(&bp, VDEV_PAD_SIZE);
332 	BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_LABEL);
333 	BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
334 	DVA_SET_OFFSET(BP_IDENTITY(&bp), off);
335 	if (vdev_read_phys(vdev, &bp, tmp, off, 0))
336 		return (EIO);
337 	memcpy(buf, tmp, size);
338 	return (0);
339 }
340 
341 static int
342 vdev_clear_pad2(vdev_t *vdev)
343 {
344 	char *zeroes = zap_scratch;
345 	uint64_t *end;
346 	off_t off = offsetof(vdev_label_t, vl_pad2);
347 
348 	memset(zeroes, 0, VDEV_PAD_SIZE);
349 	end = (uint64_t *)(zeroes + VDEV_PAD_SIZE);
350 	/* ZIO_CHECKSUM_LABEL magic and pre-calcualted checksum for all zeros */
351 	end[-5] = 0x0210da7ab10c7a11;
352 	end[-4] = 0x97f48f807f6e2a3f;
353 	end[-3] = 0xaf909f1658aacefc;
354 	end[-2] = 0xcbd1ea57ff6db48b;
355 	end[-1] = 0x6ec692db0d465fab;
356 	if (vdev_write(vdev, vdev->v_read_priv, off, zeroes, VDEV_PAD_SIZE))
357 		return (EIO);
358 	return (0);
359 }
360 
361 static void
362 bios_getmem(void)
363 {
364     uint64_t size;
365 
366     /* Parse system memory map */
367     v86.ebx = 0;
368     do {
369 	v86.ctl = V86_FLAGS;
370 	v86.addr = 0x15;		/* int 0x15 function 0xe820*/
371 	v86.eax = 0xe820;
372 	v86.ecx = sizeof(struct bios_smap);
373 	v86.edx = SMAP_SIG;
374 	v86.es = VTOPSEG(&smap);
375 	v86.edi = VTOPOFF(&smap);
376 	v86int();
377 	if (V86_CY(v86.efl) || (v86.eax != SMAP_SIG))
378 	    break;
379 	/* look for a low-memory segment that's large enough */
380 	if ((smap.type == SMAP_TYPE_MEMORY) && (smap.base == 0) &&
381 	    (smap.length >= (512 * 1024)))
382 	    bios_basemem = smap.length;
383 	/* look for the first segment in 'extended' memory */
384 	if ((smap.type == SMAP_TYPE_MEMORY) && (smap.base == 0x100000)) {
385 	    bios_extmem = smap.length;
386 	}
387 
388 	/*
389 	 * Look for the largest segment in 'extended' memory beyond
390 	 * 1MB but below 4GB.
391 	 */
392 	if ((smap.type == SMAP_TYPE_MEMORY) && (smap.base > 0x100000) &&
393 	    (smap.base < 0x100000000ull)) {
394 	    size = smap.length;
395 
396 	    /*
397 	     * If this segment crosses the 4GB boundary, truncate it.
398 	     */
399 	    if (smap.base + size > 0x100000000ull)
400 		size = 0x100000000ull - smap.base;
401 
402 	    if (size > high_heap_size) {
403 		high_heap_size = size;
404 		high_heap_base = smap.base;
405 	    }
406 	}
407     } while (v86.ebx != 0);
408 
409     /* Fall back to the old compatibility function for base memory */
410     if (bios_basemem == 0) {
411 	v86.ctl = 0;
412 	v86.addr = 0x12;		/* int 0x12 */
413 	v86int();
414 
415 	bios_basemem = (v86.eax & 0xffff) * 1024;
416     }
417 
418     /* Fall back through several compatibility functions for extended memory */
419     if (bios_extmem == 0) {
420 	v86.ctl = V86_FLAGS;
421 	v86.addr = 0x15;		/* int 0x15 function 0xe801*/
422 	v86.eax = 0xe801;
423 	v86int();
424 	if (!V86_CY(v86.efl)) {
425 	    bios_extmem = ((v86.ecx & 0xffff) + ((v86.edx & 0xffff) * 64)) * 1024;
426 	}
427     }
428     if (bios_extmem == 0) {
429 	v86.ctl = 0;
430 	v86.addr = 0x15;		/* int 0x15 function 0x88*/
431 	v86.eax = 0x8800;
432 	v86int();
433 	bios_extmem = (v86.eax & 0xffff) * 1024;
434     }
435 
436     /*
437      * If we have extended memory and did not find a suitable heap
438      * region in the SMAP, use the last 3MB of 'extended' memory as a
439      * high heap candidate.
440      */
441     if (bios_extmem >= HEAP_MIN && high_heap_size < HEAP_MIN) {
442 	high_heap_size = HEAP_MIN;
443 	high_heap_base = bios_extmem + 0x100000 - HEAP_MIN;
444     }
445 }
446 
447 /*
448  * Try to detect a device supported by the legacy int13 BIOS
449  */
450 static int
451 int13probe(int drive)
452 {
453     v86.ctl = V86_FLAGS;
454     v86.addr = 0x13;
455     v86.eax = 0x800;
456     v86.edx = drive;
457     v86int();
458 
459     if (!V86_CY(v86.efl) &&				/* carry clear */
460 	((v86.edx & 0xff) != (drive & DRV_MASK))) {	/* unit # OK */
461 	if ((v86.ecx & 0x3f) == 0) {			/* absurd sector size */
462 		return(0);				/* skip device */
463 	}
464 	return (1);
465     }
466     return(0);
467 }
468 
469 /*
470  * We call this when we find a ZFS vdev - ZFS consumes the dsk
471  * structure so we must make a new one.
472  */
473 static struct dsk *
474 copy_dsk(struct dsk *dsk)
475 {
476     struct dsk *newdsk;
477 
478     newdsk = malloc(sizeof(struct dsk));
479     *newdsk = *dsk;
480     return (newdsk);
481 }
482 
483 /*
484  * Get disk size from eax=0x800 and 0x4800. We need to probe both
485  * because 0x4800 may not be available and we would like to get more
486  * or less correct disk size - if it is possible at all.
487  * Note we do not really want to touch drv.c because that code is shared
488  * with boot2 and we can not afford to grow that code.
489  */
490 static uint64_t
491 drvsize_ext(struct dsk *dskp)
492 {
493 	uint64_t size, tmp;
494 	int cyl, hds, sec;
495 
496 	v86.ctl = V86_FLAGS;
497 	v86.addr = 0x13;
498 	v86.eax = 0x800;
499 	v86.edx = dskp->drive;
500 	v86int();
501 
502 	/* Don't error out if we get bad sector number, try EDD as well */
503 	if (V86_CY(v86.efl) ||	/* carry set */
504 	    (v86.edx & 0xff) <= (unsigned)(dskp->drive & 0x7f)) /* unit # bad */
505 		return (0);
506 
507 	cyl = ((v86.ecx & 0xc0) << 2) + ((v86.ecx & 0xff00) >> 8) + 1;
508 	/* Convert max head # -> # of heads */
509 	hds = ((v86.edx & 0xff00) >> 8) + 1;
510 	sec = v86.ecx & 0x3f;
511 
512 	size = (uint64_t)cyl * hds * sec;
513 
514 	/* Determine if we can use EDD with this device. */
515 	v86.ctl = V86_FLAGS;
516 	v86.addr = 0x13;
517 	v86.eax = 0x4100;
518 	v86.edx = dskp->drive;
519 	v86.ebx = 0x55aa;
520 	v86int();
521 	if (V86_CY(v86.efl) ||  /* carry set */
522 	    (v86.ebx & 0xffff) != 0xaa55 || /* signature */
523 	    (v86.ecx & EDD_INTERFACE_FIXED_DISK) == 0)
524 		return (size);
525 
526 	tmp = drvsize(dskp);
527 	if (tmp > size)
528 		size = tmp;
529 
530 	return (size);
531 }
532 
533 /*
534  * The "layered" ioctl to read disk/partition size. Unfortunately
535  * the zfsboot case is hardest, because we do not have full software
536  * stack available, so we need to do some manual work here.
537  */
538 uint64_t
539 ldi_get_size(void *priv)
540 {
541 	struct dsk *dskp = priv;
542 	uint64_t size = dskp->size;
543 
544 	if (dskp->start == 0)
545 		size = drvsize_ext(dskp);
546 
547 	return (size * DEV_BSIZE);
548 }
549 
550 static void
551 probe_drive(struct dsk *dsk)
552 {
553 #ifdef GPT
554     struct gpt_hdr hdr;
555     struct gpt_ent *ent;
556     unsigned part, entries_per_sec;
557     daddr_t slba;
558 #endif
559 #if defined(GPT) || defined(LOADER_GELI_SUPPORT)
560     daddr_t elba;
561 #endif
562 
563     struct dos_partition *dp;
564     char *sec;
565     unsigned i;
566 
567     /*
568      * If we find a vdev on the whole disk, stop here.
569      */
570     if (vdev_probe(vdev_read2, dsk, NULL) == 0)
571 	return;
572 
573 #ifdef LOADER_GELI_SUPPORT
574     /*
575      * Taste the disk, if it is GELI encrypted, decrypt it and check to see if
576      * it is a usable vdev then. Otherwise dig
577      * out the partition table and probe each slice/partition
578      * in turn for a vdev or GELI encrypted vdev.
579      */
580     elba = drvsize_ext(dsk);
581     if (elba > 0) {
582 	elba--;
583     }
584     if (geli_taste(vdev_read, dsk, elba) == 0) {
585 	if (geli_havekey(dsk) == 0 || geli_passphrase(gelipw, dsk->unit,
586 	  ':', 0, dsk) == 0) {
587 	    if (vdev_probe(vdev_read2, dsk, NULL) == 0) {
588 		return;
589 	    }
590 	}
591     }
592 #endif /* LOADER_GELI_SUPPORT */
593 
594     sec = dmadat->secbuf;
595     dsk->start = 0;
596 
597 #ifdef GPT
598     /*
599      * First check for GPT.
600      */
601     if (drvread(dsk, sec, 1, 1)) {
602 	return;
603     }
604     memcpy(&hdr, sec, sizeof(hdr));
605     if (memcmp(hdr.hdr_sig, GPT_HDR_SIG, sizeof(hdr.hdr_sig)) != 0 ||
606 	hdr.hdr_lba_self != 1 || hdr.hdr_revision < 0x00010000 ||
607 	hdr.hdr_entsz < sizeof(*ent) || DEV_BSIZE % hdr.hdr_entsz != 0) {
608 	goto trymbr;
609     }
610 
611     /*
612      * Probe all GPT partitions for the presence of ZFS pools. We
613      * return the spa_t for the first we find (if requested). This
614      * will have the effect of booting from the first pool on the
615      * disk.
616      *
617      * If no vdev is found, GELI decrypting the device and try again
618      */
619     entries_per_sec = DEV_BSIZE / hdr.hdr_entsz;
620     slba = hdr.hdr_lba_table;
621     elba = slba + hdr.hdr_entries / entries_per_sec;
622     while (slba < elba) {
623 	dsk->start = 0;
624 	if (drvread(dsk, sec, slba, 1))
625 	    return;
626 	for (part = 0; part < entries_per_sec; part++) {
627 	    ent = (struct gpt_ent *)(sec + part * hdr.hdr_entsz);
628 	    if (memcmp(&ent->ent_type, &freebsd_zfs_uuid,
629 		     sizeof(uuid_t)) == 0) {
630 		dsk->start = ent->ent_lba_start;
631 		dsk->size = ent->ent_lba_end - ent->ent_lba_start + 1;
632 		dsk->slice = part + 1;
633 		dsk->part = 255;
634 		if (vdev_probe(vdev_read2, dsk, NULL) == 0) {
635 		    /*
636 		     * This slice had a vdev. We need a new dsk
637 		     * structure now since the vdev now owns this one.
638 		     */
639 		    dsk = copy_dsk(dsk);
640 		}
641 #ifdef LOADER_GELI_SUPPORT
642 		else if (geli_taste(vdev_read, dsk, ent->ent_lba_end -
643 			 ent->ent_lba_start) == 0) {
644 		    if (geli_havekey(dsk) == 0 || geli_passphrase(gelipw,
645 		      dsk->unit, 'p', dsk->slice, dsk) == 0) {
646 			/*
647 			 * This slice has GELI, check it for ZFS.
648 			 */
649 			if (vdev_probe(vdev_read2, dsk, NULL) == 0) {
650 			    /*
651 			     * This slice had a vdev. We need a new dsk
652 			     * structure now since the vdev now owns this one.
653 			     */
654 			    dsk = copy_dsk(dsk);
655 			}
656 			break;
657 		    }
658 		}
659 #endif /* LOADER_GELI_SUPPORT */
660 	    }
661 	}
662 	slba++;
663     }
664     return;
665 trymbr:
666 #endif /* GPT */
667 
668     if (drvread(dsk, sec, DOSBBSECTOR, 1))
669 	return;
670     dp = (void *)(sec + DOSPARTOFF);
671 
672     for (i = 0; i < NDOSPART; i++) {
673 	if (!dp[i].dp_typ)
674 	    continue;
675 	dsk->start = dp[i].dp_start;
676 	dsk->size = dp[i].dp_size;
677 	dsk->slice = i + 1;
678 	if (vdev_probe(vdev_read2, dsk, NULL) == 0) {
679 	    dsk = copy_dsk(dsk);
680 	}
681 #ifdef LOADER_GELI_SUPPORT
682 	else if (geli_taste(vdev_read, dsk, dp[i].dp_size -
683 		 dp[i].dp_start) == 0) {
684 	    if (geli_havekey(dsk) == 0 || geli_passphrase(gelipw, dsk->unit,
685 	      's', i, dsk) == 0) {
686 		/*
687 		 * This slice has GELI, check it for ZFS.
688 		 */
689 		if (vdev_probe(vdev_read2, dsk, NULL) == 0) {
690 		    /*
691 		     * This slice had a vdev. We need a new dsk
692 		     * structure now since the vdev now owns this one.
693 		     */
694 		    dsk = copy_dsk(dsk);
695 		}
696 		break;
697 	    }
698 	}
699 #endif /* LOADER_GELI_SUPPORT */
700     }
701 }
702 
703 int
704 main(void)
705 {
706     dnode_phys_t dn;
707     off_t off;
708     struct dsk *dsk;
709     int autoboot, i;
710     int nextboot;
711     int rc;
712 
713     dmadat = (void *)(roundup2(__base + (int32_t)&_end, 0x10000) - __base);
714 
715     bios_getmem();
716 
717     if (high_heap_size > 0) {
718 	heap_end = PTOV(high_heap_base + high_heap_size);
719 	heap_next = PTOV(high_heap_base);
720     } else {
721 	heap_next = (char *)dmadat + sizeof(*dmadat);
722 	heap_end = (char *)PTOV(bios_basemem);
723     }
724 
725     dsk = malloc(sizeof(struct dsk));
726     dsk->drive = *(uint8_t *)PTOV(ARGS);
727     dsk->type = dsk->drive & DRV_HARD ? TYPE_AD : TYPE_FD;
728     dsk->unit = dsk->drive & DRV_MASK;
729     dsk->slice = *(uint8_t *)PTOV(ARGS + 1) + 1;
730     dsk->part = 0;
731     dsk->start = 0;
732     dsk->size = 0;
733 
734     bootinfo.bi_version = BOOTINFO_VERSION;
735     bootinfo.bi_size = sizeof(bootinfo);
736     bootinfo.bi_basemem = bios_basemem / 1024;
737     bootinfo.bi_extmem = bios_extmem / 1024;
738     bootinfo.bi_memsizes_valid++;
739     bootinfo.bi_bios_dev = dsk->drive;
740 
741     bootdev = MAKEBOOTDEV(dev_maj[dsk->type],
742 			  dsk->slice, dsk->unit, dsk->part);
743 
744     /* Process configuration file */
745 
746     autoboot = 1;
747 
748 #ifdef LOADER_GELI_SUPPORT
749     geli_init();
750 #endif
751     zfs_init();
752 
753     /*
754      * Probe the boot drive first - we will try to boot from whatever
755      * pool we find on that drive.
756      */
757     probe_drive(dsk);
758 
759     /*
760      * Probe the rest of the drives that the bios knows about. This
761      * will find any other available pools and it may fill in missing
762      * vdevs for the boot pool.
763      */
764 #ifndef VIRTUALBOX
765     for (i = 0; i < *(unsigned char *)PTOV(BIOS_NUMDRIVES); i++)
766 #else
767     for (i = 0; i < MAXBDDEV; i++)
768 #endif
769     {
770 	if ((i | DRV_HARD) == *(uint8_t *)PTOV(ARGS))
771 	    continue;
772 
773 	if (!int13probe(i | DRV_HARD))
774 	    break;
775 
776 	dsk = malloc(sizeof(struct dsk));
777 	dsk->drive = i | DRV_HARD;
778 	dsk->type = dsk->drive & TYPE_AD;
779 	dsk->unit = i;
780 	dsk->slice = 0;
781 	dsk->part = 0;
782 	dsk->start = 0;
783 	dsk->size = 0;
784 	probe_drive(dsk);
785     }
786 
787     /*
788      * The first discovered pool, if any, is the pool.
789      */
790     spa = spa_get_primary();
791     if (!spa) {
792 	printf("%s: No ZFS pools located, can't boot\n", BOOTPROG);
793 	for (;;)
794 	    ;
795     }
796 
797     primary_spa = spa;
798     primary_vdev = spa_get_primary_vdev(spa);
799 
800     nextboot = 0;
801     rc  = vdev_read_pad2(primary_vdev, cmd, sizeof(cmd));
802     if (vdev_clear_pad2(primary_vdev))
803 	printf("failed to clear pad2 area of primary vdev\n");
804     if (rc == 0) {
805 	if (*cmd) {
806 	    /*
807 	     * We could find an old-style ZFS Boot Block header here.
808 	     * Simply ignore it.
809 	     */
810 	    if (*(uint64_t *)cmd != 0x2f5b007b10c) {
811 		/*
812 		 * Note that parse() is destructive to cmd[] and we also want
813 		 * to honor RBX_QUIET option that could be present in cmd[].
814 		 */
815 		nextboot = 1;
816 		memcpy(cmddup, cmd, sizeof(cmd));
817 		if (parse_cmd()) {
818 		    printf("failed to parse pad2 area of primary vdev\n");
819 		    reboot();
820 		}
821 		if (!OPT_CHECK(RBX_QUIET))
822 		    printf("zfs nextboot: %s\n", cmddup);
823 	    }
824 	    /* Do not process this command twice */
825 	    *cmd = 0;
826 	}
827     } else
828 	printf("failed to read pad2 area of primary vdev\n");
829 
830     /* Mount ZFS only if it's not already mounted via nextboot parsing. */
831     if (zfsmount.spa == NULL &&
832 	(zfs_spa_init(spa) != 0 || zfs_mount(spa, 0, &zfsmount) != 0)) {
833 	printf("%s: failed to mount default pool %s\n",
834 	    BOOTPROG, spa->spa_name);
835 	autoboot = 0;
836     } else if (zfs_lookup(&zfsmount, PATH_CONFIG, &dn) == 0 ||
837         zfs_lookup(&zfsmount, PATH_DOTCONFIG, &dn) == 0) {
838 	off = 0;
839 	zfs_read(spa, &dn, &off, cmd, sizeof(cmd));
840     }
841 
842     if (*cmd) {
843 	/*
844 	 * Note that parse_cmd() is destructive to cmd[] and we also want
845 	 * to honor RBX_QUIET option that could be present in cmd[].
846 	 */
847 	memcpy(cmddup, cmd, sizeof(cmd));
848 	if (parse_cmd())
849 	    autoboot = 0;
850 	if (!OPT_CHECK(RBX_QUIET))
851 	    printf("%s: %s\n", PATH_CONFIG, cmddup);
852 	/* Do not process this command twice */
853 	*cmd = 0;
854     }
855 
856     /* Do not risk waiting at the prompt forever. */
857     if (nextboot && !autoboot)
858 	reboot();
859 
860     /*
861      * Try to exec /boot/loader. If interrupted by a keypress,
862      * or in case of failure, try to load a kernel directly instead.
863      */
864 
865     if (autoboot && !*kname) {
866 	memcpy(kname, PATH_LOADER_ZFS, sizeof(PATH_LOADER_ZFS));
867 	if (!keyhit(3)) {
868 	    load();
869 	    memcpy(kname, PATH_KERNEL, sizeof(PATH_KERNEL));
870 	}
871     }
872 
873     /* Present the user with the boot2 prompt. */
874 
875     for (;;) {
876 	if (!autoboot || !OPT_CHECK(RBX_QUIET)) {
877 	    printf("\nFreeBSD/x86 boot\n");
878 	    if (zfs_rlookup(spa, zfsmount.rootobj, rootname) != 0)
879 		printf("Default: %s/<0x%llx>:%s\n"
880 		       "boot: ",
881 		       spa->spa_name, zfsmount.rootobj, kname);
882 	    else if (rootname[0] != '\0')
883 		printf("Default: %s/%s:%s\n"
884 		       "boot: ",
885 		       spa->spa_name, rootname, kname);
886 	    else
887 		printf("Default: %s:%s\n"
888 		       "boot: ",
889 		       spa->spa_name, kname);
890 	}
891 	if (ioctrl & IO_SERIAL)
892 	    sio_flush();
893 	if (!autoboot || keyhit(5))
894 	    getstr(cmd, sizeof(cmd));
895 	else if (!autoboot || !OPT_CHECK(RBX_QUIET))
896 	    putchar('\n');
897 	autoboot = 0;
898 	if (parse_cmd())
899 	    putchar('\a');
900 	else
901 	    load();
902     }
903 }
904 
/*
 * Leave the boot program through __exit() with the given status.
 *
 * XXX - Needed for btxld to link the boot2 binary; do not remove.
 */
void
exit(int x)
{
	__exit(x);
}
911 
/*
 * Give up on this boot attempt: implemented as __exit() with status 0.
 */
void
reboot(void)
{
	__exit(0);
}
917 
918 static void
919 load(void)
920 {
921     union {
922 	struct exec ex;
923 	Elf32_Ehdr eh;
924     } hdr;
925     static Elf32_Phdr ep[2];
926     static Elf32_Shdr es[2];
927     caddr_t p;
928     dnode_phys_t dn;
929     off_t off;
930     uint32_t addr, x;
931     int fmt, i, j;
932 
933     if (zfs_lookup(&zfsmount, kname, &dn)) {
934 	printf("\nCan't find %s\n", kname);
935 	return;
936     }
937     off = 0;
938     if (xfsread(&dn, &off, &hdr, sizeof(hdr)))
939 	return;
940     if (N_GETMAGIC(hdr.ex) == ZMAGIC)
941 	fmt = 0;
942     else if (IS_ELF(hdr.eh))
943 	fmt = 1;
944     else {
945 	printf("Invalid %s\n", "format");
946 	return;
947     }
948     if (fmt == 0) {
949 	addr = hdr.ex.a_entry & 0xffffff;
950 	p = PTOV(addr);
951 	off = PAGE_SIZE;
952 	if (xfsread(&dn, &off, p, hdr.ex.a_text))
953 	    return;
954 	p += roundup2(hdr.ex.a_text, PAGE_SIZE);
955 	if (xfsread(&dn, &off, p, hdr.ex.a_data))
956 	    return;
957 	p += hdr.ex.a_data + roundup2(hdr.ex.a_bss, PAGE_SIZE);
958 	bootinfo.bi_symtab = VTOP(p);
959 	memcpy(p, &hdr.ex.a_syms, sizeof(hdr.ex.a_syms));
960 	p += sizeof(hdr.ex.a_syms);
961 	if (hdr.ex.a_syms) {
962 	    if (xfsread(&dn, &off, p, hdr.ex.a_syms))
963 		return;
964 	    p += hdr.ex.a_syms;
965 	    if (xfsread(&dn, &off, p, sizeof(int)))
966 		return;
967 	    x = *(uint32_t *)p;
968 	    p += sizeof(int);
969 	    x -= sizeof(int);
970 	    if (xfsread(&dn, &off, p, x))
971 		return;
972 	    p += x;
973 	}
974     } else {
975 	off = hdr.eh.e_phoff;
976 	for (j = i = 0; i < hdr.eh.e_phnum && j < 2; i++) {
977 	    if (xfsread(&dn, &off, ep + j, sizeof(ep[0])))
978 		return;
979 	    if (ep[j].p_type == PT_LOAD)
980 		j++;
981 	}
982 	for (i = 0; i < 2; i++) {
983 	    p = PTOV(ep[i].p_paddr & 0xffffff);
984 	    off = ep[i].p_offset;
985 	    if (xfsread(&dn, &off, p, ep[i].p_filesz))
986 		return;
987 	}
988 	p += roundup2(ep[1].p_memsz, PAGE_SIZE);
989 	bootinfo.bi_symtab = VTOP(p);
990 	if (hdr.eh.e_shnum == hdr.eh.e_shstrndx + 3) {
991 	    off = hdr.eh.e_shoff + sizeof(es[0]) *
992 		(hdr.eh.e_shstrndx + 1);
993 	    if (xfsread(&dn, &off, &es, sizeof(es)))
994 		return;
995 	    for (i = 0; i < 2; i++) {
996 		memcpy(p, &es[i].sh_size, sizeof(es[i].sh_size));
997 		p += sizeof(es[i].sh_size);
998 		off = es[i].sh_offset;
999 		if (xfsread(&dn, &off, p, es[i].sh_size))
1000 		    return;
1001 		p += es[i].sh_size;
1002 	    }
1003 	}
1004 	addr = hdr.eh.e_entry & 0xffffff;
1005     }
1006     bootinfo.bi_esymtab = VTOP(p);
1007     bootinfo.bi_kernelname = VTOP(kname);
1008     zfsargs.size = sizeof(zfsargs);
1009     zfsargs.pool = zfsmount.spa->spa_guid;
1010     zfsargs.root = zfsmount.rootobj;
1011     zfsargs.primary_pool = primary_spa->spa_guid;
1012 #ifdef LOADER_GELI_SUPPORT
1013     explicit_bzero(gelipw, sizeof(gelipw));
1014     gelibuf = malloc(sizeof(struct keybuf) + (GELI_MAX_KEYS * sizeof(struct keybuf_ent)));
1015     geli_fill_keybuf(gelibuf);
1016     zfsargs.notapw = '\0';
1017     zfsargs.keybuf_sentinel = KEYBUF_SENTINEL;
1018     zfsargs.keybuf = gelibuf;
1019 #else
1020     zfsargs.gelipw[0] = '\0';
1021 #endif
1022     if (primary_vdev != NULL)
1023 	zfsargs.primary_vdev = primary_vdev->v_guid;
1024     else
1025 	printf("failed to detect primary vdev\n");
1026     __exec((caddr_t)addr, RB_BOOTINFO | (opts & RBX_MASK),
1027 	   bootdev,
1028 	   KARGS_FLAGS_ZFS | KARGS_FLAGS_EXTARG,
1029 	   (uint32_t) spa->spa_guid,
1030 	   (uint32_t) (spa->spa_guid >> 32),
1031 	   VTOP(&bootinfo),
1032 	   zfsargs);
1033 }
1034 
1035 static int
1036 zfs_mount_ds(char *dsname)
1037 {
1038     uint64_t newroot;
1039     spa_t *newspa;
1040     char *q;
1041 
1042     q = strchr(dsname, '/');
1043     if (q)
1044 	*q++ = '\0';
1045     newspa = spa_find_by_name(dsname);
1046     if (newspa == NULL) {
1047 	printf("\nCan't find ZFS pool %s\n", dsname);
1048 	return -1;
1049     }
1050 
1051     if (zfs_spa_init(newspa))
1052 	return -1;
1053 
1054     newroot = 0;
1055     if (q) {
1056 	if (zfs_lookup_dataset(newspa, q, &newroot)) {
1057 	    printf("\nCan't find dataset %s in ZFS pool %s\n",
1058 		    q, newspa->spa_name);
1059 	    return -1;
1060 	}
1061     }
1062     if (zfs_mount(newspa, newroot, &zfsmount)) {
1063 	printf("\nCan't mount ZFS dataset\n");
1064 	return -1;
1065     }
1066     spa = newspa;
1067     return (0);
1068 }
1069 
1070 static int
1071 parse_cmd(void)
1072 {
1073     char *arg = cmd;
1074     char *ep, *p, *q;
1075     const char *cp;
1076     int c, i, j;
1077 
1078     while ((c = *arg++)) {
1079 	if (c == ' ' || c == '\t' || c == '\n')
1080 	    continue;
1081 	for (p = arg; *p && *p != '\n' && *p != ' ' && *p != '\t'; p++);
1082 	ep = p;
1083 	if (*p)
1084 	    *p++ = 0;
1085 	if (c == '-') {
1086 	    while ((c = *arg++)) {
1087 		if (c == 'P') {
1088 		    if (*(uint8_t *)PTOV(0x496) & 0x10) {
1089 			cp = "yes";
1090 		    } else {
1091 			opts |= OPT_SET(RBX_DUAL) | OPT_SET(RBX_SERIAL);
1092 			cp = "no";
1093 		    }
1094 		    printf("Keyboard: %s\n", cp);
1095 		    continue;
1096 		} else if (c == 'S') {
1097 		    j = 0;
1098 		    while ((unsigned int)(i = *arg++ - '0') <= 9)
1099 			j = j * 10 + i;
1100 		    if (j > 0 && i == -'0') {
1101 			comspeed = j;
1102 			break;
1103 		    }
1104 		    /* Fall through to error below ('S' not in optstr[]). */
1105 		}
1106 		for (i = 0; c != optstr[i]; i++)
1107 		    if (i == NOPT - 1)
1108 			return -1;
1109 		opts ^= OPT_SET(flags[i]);
1110 	    }
1111 	    ioctrl = OPT_CHECK(RBX_DUAL) ? (IO_SERIAL|IO_KEYBOARD) :
1112 		     OPT_CHECK(RBX_SERIAL) ? IO_SERIAL : IO_KEYBOARD;
1113 	    if (ioctrl & IO_SERIAL) {
1114 	        if (sio_init(115200 / comspeed) != 0)
1115 		    ioctrl &= ~IO_SERIAL;
1116 	    }
1117 	} if (c == '?') {
1118 	    dnode_phys_t dn;
1119 
1120 	    if (zfs_lookup(&zfsmount, arg, &dn) == 0) {
1121 		zap_list(spa, &dn);
1122 	    }
1123 	    return -1;
1124 	} else {
1125 	    arg--;
1126 
1127 	    /*
1128 	     * Report pool status if the comment is 'status'. Lets
1129 	     * hope no-one wants to load /status as a kernel.
1130 	     */
1131 	    if (!strcmp(arg, "status")) {
1132 		spa_all_status();
1133 		return -1;
1134 	    }
1135 
1136 	    /*
1137 	     * If there is "zfs:" prefix simply ignore it.
1138 	     */
1139 	    if (strncmp(arg, "zfs:", 4) == 0)
1140 		arg += 4;
1141 
1142 	    /*
1143 	     * If there is a colon, switch pools.
1144 	     */
1145 	    q = strchr(arg, ':');
1146 	    if (q) {
1147 		*q++ = '\0';
1148 		if (zfs_mount_ds(arg) != 0)
1149 		    return -1;
1150 		arg = q;
1151 	    }
1152 	    if ((i = ep - arg)) {
1153 		if ((size_t)i >= sizeof(kname))
1154 		    return -1;
1155 		memcpy(kname, arg, i + 1);
1156 	    }
1157 	}
1158 	arg = p;
1159     }
1160     return 0;
1161 }
1162