xref: /freebsd/sys/geom/eli/g_eli.c (revision 2e620256bd76c449c835c604e404483437743011)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2005-2019 Pawel Jakub Dawidek <pawel@dawidek.net>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/cons.h>
33 #include <sys/kenv.h>
34 #include <sys/kernel.h>
35 #include <sys/linker.h>
36 #include <sys/module.h>
37 #include <sys/lock.h>
38 #include <sys/mutex.h>
39 #include <sys/bio.h>
40 #include <sys/sbuf.h>
41 #include <sys/sysctl.h>
42 #include <sys/malloc.h>
43 #include <sys/eventhandler.h>
44 #include <sys/kthread.h>
45 #include <sys/proc.h>
46 #include <sys/sched.h>
47 #include <sys/smp.h>
48 #include <sys/uio.h>
49 #include <sys/vnode.h>
50 
51 #include <machine/vmparam.h>
52 
53 #include <vm/uma.h>
54 #include <vm/vm.h>
55 #include <vm/swap_pager.h>
56 
57 #include <geom/geom.h>
58 #include <geom/geom_dbg.h>
59 #include <geom/eli/g_eli.h>
60 #include <geom/eli/pkcs5v2.h>
61 
62 #include <crypto/intake.h>
63 
/* Declare the "geom_eli" kernel feature. */
FEATURE(geom_eli, "GEOM crypto module");

/* malloc(9) type used for the GELI allocations made in this file. */
MALLOC_DEFINE(M_ELI, "eli_data", "GEOM_ELI Data");

/*
 * Knobs exported under the kern.geom.eli sysctl node.  Each variable is
 * defined immediately before the sysctl that exposes it.
 */
SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, eli, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "GEOM_ELI stuff");
static int g_eli_version = G_ELI_VERSION;
SYSCTL_INT(_kern_geom_eli, OID_AUTO, version, CTLFLAG_RD, &g_eli_version, 0,
    "GELI version");
int g_eli_debug = 0;
SYSCTL_INT(_kern_geom_eli, OID_AUTO, debug, CTLFLAG_RWTUN, &g_eli_debug, 0,
    "Debug level");
static u_int g_eli_tries = 3;
SYSCTL_UINT(_kern_geom_eli, OID_AUTO, tries, CTLFLAG_RWTUN, &g_eli_tries, 0,
    "Number of tries for entering the passphrase");
static u_int g_eli_visible_passphrase = GETS_NOECHO;
SYSCTL_UINT(_kern_geom_eli, OID_AUTO, visible_passphrase, CTLFLAG_RWTUN,
    &g_eli_visible_passphrase, 0,
    "Visibility of passphrase prompt (0 = invisible, 1 = visible, 2 = asterisk)");
u_int g_eli_overwrites = G_ELI_OVERWRITES;
SYSCTL_UINT(_kern_geom_eli, OID_AUTO, overwrites, CTLFLAG_RWTUN, &g_eli_overwrites,
    0, "Number of times on-disk keys should be overwritten when destroying them");
/* A value of 0 makes g_eli_create() use mp_ncpus threads. */
static u_int g_eli_threads = 0;
SYSCTL_UINT(_kern_geom_eli, OID_AUTO, threads, CTLFLAG_RWTUN, &g_eli_threads, 0,
    "Number of threads doing crypto work");
u_int g_eli_batch = 0;
SYSCTL_UINT(_kern_geom_eli, OID_AUTO, batch, CTLFLAG_RWTUN, &g_eli_batch, 0,
    "Use crypto operations batching");
/* Changed through sysctl_g_eli_minbufs() so an existing zone is updated too. */
static u_int g_eli_minbufs = 16;
static int sysctl_g_eli_minbufs(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_kern_geom_eli, OID_AUTO, minbufs, CTLTYPE_UINT | CTLFLAG_RW |
    CTLFLAG_MPSAFE, NULL, 0, sysctl_g_eli_minbufs, "IU",
    "Number of GELI bufs reserved for swap transactions");
/* Selects M_WAITOK over M_NOWAIT for the malloc(9) path in g_eli_alloc_data(). */
static bool g_eli_blocking_malloc = false;
SYSCTL_BOOL(_kern_geom_eli, OID_AUTO, blocking_malloc, CTLFLAG_RWTUN,
    &g_eli_blocking_malloc, 0, "Use blocking malloc calls for GELI buffers");
static bool g_eli_unmapped_io = true;
SYSCTL_BOOL(_kern_geom_eli, OID_AUTO, unmapped_io, CTLFLAG_RDTUN,
    &g_eli_unmapped_io, 0, "Enable support for unmapped I/O");

static struct sx g_eli_umalock;	/* Controls changes to UMA zone. */
SX_SYSINIT(g_eli_umalock, &g_eli_umalock, "GELI UMA");
/* Zone of fixed-size data buffers; NULL while no zone exists. */
static uma_zone_t g_eli_uma = NULL;
/* Item size of the zone, computed in g_eli_init_uma(). */
static int g_eli_alloc_sz;
/* Buffers currently allocated from the zone and not yet freed. */
static volatile int g_eli_umaoutstanding;
/* Incremented by g_eli_init_uma(), decremented by g_eli_fini_uma(). */
static volatile int g_eli_devs;
111 
112 /*
113  * Control the number of reserved entries in the GELI zone.
114  * If the GELI zone has already been allocated, update the zone. Otherwise,
115  * simply update the variable for use the next time the zone is created.
116  */
117 static int
118 sysctl_g_eli_minbufs(SYSCTL_HANDLER_ARGS)
119 {
120 	int error;
121 	u_int new;
122 
123 	new = g_eli_minbufs;
124 	error = sysctl_handle_int(oidp, &new, 0, req);
125 	if (error != 0 || req->newptr == NULL)
126 		return (error);
127 	sx_xlock(&g_eli_umalock);
128 	if (g_eli_uma != NULL) {
129 		if (new != g_eli_minbufs)
130 			uma_zone_reserve(g_eli_uma, new);
131 		if (new > g_eli_minbufs)
132 			uma_prealloc(g_eli_uma, new - g_eli_minbufs);
133 	}
134 	if (new != g_eli_minbufs)
135 		g_eli_minbufs = new;
136 	sx_xunlock(&g_eli_umalock);
137 	return (0);
138 }
139 
140 /*
141  * Passphrase cached during boot, in order to be more user-friendly if
142  * there are multiple providers using the same passphrase.
143  */
144 static char cached_passphrase[256];
145 static u_int g_eli_boot_passcache = 1;
146 TUNABLE_INT("kern.geom.eli.boot_passcache", &g_eli_boot_passcache);
147 SYSCTL_UINT(_kern_geom_eli, OID_AUTO, boot_passcache, CTLFLAG_RD,
148     &g_eli_boot_passcache, 0,
149     "Passphrases are cached during boot process for possible reuse");
150 static void
151 fetch_loader_passphrase(void * dummy)
152 {
153 	char * env_passphrase;
154 
155 	KASSERT(dynamic_kenv, ("need dynamic kenv"));
156 
157 	if ((env_passphrase = kern_getenv("kern.geom.eli.passphrase")) != NULL) {
158 		/* Extract passphrase from the environment. */
159 		strlcpy(cached_passphrase, env_passphrase,
160 		    sizeof(cached_passphrase));
161 		freeenv(env_passphrase);
162 
163 		/* Wipe the passphrase from the environment. */
164 		kern_unsetenv("kern.geom.eli.passphrase");
165 	}
166 }
167 SYSINIT(geli_fetch_loader_passphrase, SI_SUB_KMEM + 1, SI_ORDER_ANY,
168     fetch_loader_passphrase, NULL);
169 
170 static void
171 zero_boot_passcache(void)
172 {
173 
174         explicit_bzero(cached_passphrase, sizeof(cached_passphrase));
175 }
176 
177 static void
178 zero_geli_intake_keys(void)
179 {
180         struct keybuf *keybuf;
181         int i;
182 
183         if ((keybuf = get_keybuf()) != NULL) {
184                 /* Scan the key buffer, clear all GELI keys. */
185                 for (i = 0; i < keybuf->kb_nents; i++) {
186                          if (keybuf->kb_ents[i].ke_type == KEYBUF_TYPE_GELI) {
187                                  explicit_bzero(keybuf->kb_ents[i].ke_data,
188                                      sizeof(keybuf->kb_ents[i].ke_data));
189                                  keybuf->kb_ents[i].ke_type = KEYBUF_TYPE_NONE;
190                          }
191                 }
192         }
193 }
194 
195 static void
196 zero_intake_passcache(void *dummy)
197 {
198         zero_boot_passcache();
199         zero_geli_intake_keys();
200 }
201 EVENTHANDLER_DEFINE(mountroot, zero_intake_passcache, NULL, 0);
202 
/*
 * NOTE(review): presumably the tag of a pre-sync event handler; the
 * registration site is not in this part of the file.
 */
static eventhandler_tag g_eli_pre_sync = NULL;

/* Forward declaration: used by g_eli_resize() before its definition. */
static int g_eli_read_metadata_offset(struct g_class *mp, struct g_provider *pp,
    off_t offset, struct g_eli_metadata *md);

/* Class methods referenced by g_eli_class below. */
static int g_eli_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static void g_eli_init(struct g_class *mp);
static void g_eli_fini(struct g_class *mp);

static g_taste_t g_eli_taste;
static g_dumpconf_t g_eli_dumpconf;

/* GEOM class descriptor binding the ELI class name to its methods. */
struct g_class g_eli_class = {
	.name = G_ELI_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_eli_config,
	.taste = g_eli_taste,
	.destroy_geom = g_eli_destroy_geom,
	.init = g_eli_init,
	.fini = g_eli_fini
};
225 
226 /*
227  * Code paths:
228  * BIO_READ:
229  *	g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
230  * BIO_WRITE:
231  *	g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
232  */
233 
234 /*
235  * EAGAIN from crypto(9) means, that we were probably balanced to another crypto
236  * accelerator or something like this.
237  * The function updates the SID and rerun the operation.
238  */
239 int
240 g_eli_crypto_rerun(struct cryptop *crp)
241 {
242 	struct g_eli_softc *sc;
243 	struct g_eli_worker *wr;
244 	struct bio *bp;
245 	int error;
246 
247 	bp = (struct bio *)crp->crp_opaque;
248 	sc = bp->bio_to->geom->softc;
249 	LIST_FOREACH(wr, &sc->sc_workers, w_next) {
250 		if (wr->w_number == G_ELI_WORKER(bp->bio_pflags))
251 			break;
252 	}
253 	KASSERT(wr != NULL, ("Invalid worker (%u).",
254 	    G_ELI_WORKER(bp->bio_pflags)));
255 	G_ELI_DEBUG(1, "Rerunning crypto %s request (sid: %p -> %p).",
256 	    bp->bio_cmd == BIO_READ ? "READ" : "WRITE", wr->w_sid,
257 	    crp->crp_session);
258 	wr->w_sid = crp->crp_session;
259 	crp->crp_etype = 0;
260 	error = crypto_dispatch(crp);
261 	if (error == 0)
262 		return (0);
263 	G_ELI_DEBUG(1, "%s: crypto_dispatch() returned %d.", __func__, error);
264 	crp->crp_etype = error;
265 	return (error);
266 }
267 
268 static void
269 g_eli_getattr_done(struct bio *bp)
270 {
271 	if (bp->bio_error == 0 &&
272 	    !strcmp(bp->bio_attribute, "GEOM::physpath")) {
273 		strlcat(bp->bio_data, "/eli", bp->bio_length);
274 	}
275 	g_std_done(bp);
276 }
277 
278 /*
279  * The function is called afer reading encrypted data from the provider.
280  *
281  * g_eli_start -> g_eli_crypto_read -> g_io_request -> G_ELI_READ_DONE -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
282  */
283 void
284 g_eli_read_done(struct bio *bp)
285 {
286 	struct g_eli_softc *sc;
287 	struct bio *pbp;
288 
289 	G_ELI_LOGREQ(2, bp, "Request done.");
290 	pbp = bp->bio_parent;
291 	if (pbp->bio_error == 0 && bp->bio_error != 0)
292 		pbp->bio_error = bp->bio_error;
293 	g_destroy_bio(bp);
294 	/*
295 	 * Do we have all sectors already?
296 	 */
297 	pbp->bio_inbed++;
298 	if (pbp->bio_inbed < pbp->bio_children)
299 		return;
300 	sc = pbp->bio_to->geom->softc;
301 	if (pbp->bio_error != 0) {
302 		G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__,
303 		    pbp->bio_error);
304 		pbp->bio_completed = 0;
305 		g_eli_free_data(pbp);
306 		g_io_deliver(pbp, pbp->bio_error);
307 		if (sc != NULL)
308 			atomic_subtract_int(&sc->sc_inflight, 1);
309 		return;
310 	}
311 	mtx_lock(&sc->sc_queue_mtx);
312 	bioq_insert_tail(&sc->sc_queue, pbp);
313 	mtx_unlock(&sc->sc_queue_mtx);
314 	wakeup(sc);
315 }
316 
317 /*
318  * The function is called after we encrypt and write data.
319  *
320  * g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> G_ELI_WRITE_DONE -> g_io_deliver
321  */
322 void
323 g_eli_write_done(struct bio *bp)
324 {
325 	struct g_eli_softc *sc;
326 	struct bio *pbp;
327 
328 	G_ELI_LOGREQ(2, bp, "Request done.");
329 	pbp = bp->bio_parent;
330 	if (pbp->bio_error == 0 && bp->bio_error != 0)
331 		pbp->bio_error = bp->bio_error;
332 	g_destroy_bio(bp);
333 	/*
334 	 * Do we have all sectors already?
335 	 */
336 	pbp->bio_inbed++;
337 	if (pbp->bio_inbed < pbp->bio_children)
338 		return;
339 	sc = pbp->bio_to->geom->softc;
340 	g_eli_free_data(pbp);
341 	if (pbp->bio_error != 0) {
342 		G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__,
343 		    pbp->bio_error);
344 		pbp->bio_completed = 0;
345 	} else
346 		pbp->bio_completed = pbp->bio_length;
347 
348 	/*
349 	 * Write is finished, send it up.
350 	 */
351 	g_io_deliver(pbp, pbp->bio_error);
352 	if (sc != NULL)
353 		atomic_subtract_int(&sc->sc_inflight, 1);
354 }
355 
356 /*
357  * This function should never be called, but GEOM made as it set ->orphan()
358  * method for every geom.
359  */
360 static void
361 g_eli_orphan_spoil_assert(struct g_consumer *cp)
362 {
363 
364 	panic("Function %s() called for %s.", __func__, cp->geom->name);
365 }
366 
367 static void
368 g_eli_orphan(struct g_consumer *cp)
369 {
370 	struct g_eli_softc *sc;
371 
372 	g_topology_assert();
373 	sc = cp->geom->softc;
374 	if (sc == NULL)
375 		return;
376 	g_eli_destroy(sc, TRUE);
377 }
378 
379 static void
380 g_eli_resize(struct g_consumer *cp)
381 {
382 	struct g_eli_softc *sc;
383 	struct g_provider *epp, *pp;
384 	off_t oldsize;
385 
386 	g_topology_assert();
387 	sc = cp->geom->softc;
388 	if (sc == NULL)
389 		return;
390 
391 	if ((sc->sc_flags & G_ELI_FLAG_AUTORESIZE) == 0) {
392 		G_ELI_DEBUG(0, "Autoresize is turned off, old size: %jd.",
393 		    (intmax_t)sc->sc_provsize);
394 		return;
395 	}
396 
397 	pp = cp->provider;
398 
399 	if ((sc->sc_flags & G_ELI_FLAG_ONETIME) == 0) {
400 		struct g_eli_metadata md;
401 		u_char *sector;
402 		int error;
403 
404 		sector = NULL;
405 
406 		error = g_eli_read_metadata_offset(cp->geom->class, pp,
407 		    sc->sc_provsize - pp->sectorsize, &md);
408 		if (error != 0) {
409 			G_ELI_DEBUG(0, "Cannot read metadata from %s (error=%d).",
410 			    pp->name, error);
411 			goto iofail;
412 		}
413 
414 		md.md_provsize = pp->mediasize;
415 
416 		sector = malloc(pp->sectorsize, M_ELI, M_WAITOK | M_ZERO);
417 		eli_metadata_encode(&md, sector);
418 		error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector,
419 		    pp->sectorsize);
420 		if (error != 0) {
421 			G_ELI_DEBUG(0, "Cannot store metadata on %s (error=%d).",
422 			    pp->name, error);
423 			goto iofail;
424 		}
425 		explicit_bzero(sector, pp->sectorsize);
426 		error = g_write_data(cp, sc->sc_provsize - pp->sectorsize,
427 		    sector, pp->sectorsize);
428 		if (error != 0) {
429 			G_ELI_DEBUG(0, "Cannot clear old metadata from %s (error=%d).",
430 			    pp->name, error);
431 			goto iofail;
432 		}
433 iofail:
434 		explicit_bzero(&md, sizeof(md));
435 		zfree(sector, M_ELI);
436 	}
437 
438 	oldsize = sc->sc_mediasize;
439 	sc->sc_mediasize = eli_mediasize(sc, pp->mediasize, pp->sectorsize);
440 	g_eli_key_resize(sc);
441 	sc->sc_provsize = pp->mediasize;
442 
443 	epp = LIST_FIRST(&sc->sc_geom->provider);
444 	g_resize_provider(epp, sc->sc_mediasize);
445 	G_ELI_DEBUG(0, "Device %s size changed from %jd to %jd.", epp->name,
446 	    (intmax_t)oldsize, (intmax_t)sc->sc_mediasize);
447 }
448 
449 /*
450  * BIO_READ:
451  *	G_ELI_START -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
452  * BIO_WRITE:
453  *	G_ELI_START -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
454  */
455 static void
456 g_eli_start(struct bio *bp)
457 {
458 	struct g_eli_softc *sc;
459 	struct g_consumer *cp;
460 	struct bio *cbp;
461 
462 	sc = bp->bio_to->geom->softc;
463 	KASSERT(sc != NULL,
464 	    ("Provider's error should be set (error=%d)(device=%s).",
465 	    bp->bio_to->error, bp->bio_to->name));
466 	G_ELI_LOGREQ(2, bp, "Request received.");
467 
468 	switch (bp->bio_cmd) {
469 	case BIO_READ:
470 	case BIO_WRITE:
471 	case BIO_GETATTR:
472 	case BIO_FLUSH:
473 	case BIO_ZONE:
474 	case BIO_SPEEDUP:
475 		break;
476 	case BIO_DELETE:
477 		/*
478 		 * If the user hasn't set the NODELETE flag, we just pass
479 		 * it down the stack and let the layers beneath us do (or
480 		 * not) whatever they do with it.  If they have, we
481 		 * reject it.  A possible extension would be an
482 		 * additional flag to take it as a hint to shred the data
483 		 * with [multiple?] overwrites.
484 		 */
485 		if (!(sc->sc_flags & G_ELI_FLAG_NODELETE))
486 			break;
487 	default:
488 		g_io_deliver(bp, EOPNOTSUPP);
489 		return;
490 	}
491 	cbp = g_clone_bio(bp);
492 	if (cbp == NULL) {
493 		g_io_deliver(bp, ENOMEM);
494 		return;
495 	}
496 	bp->bio_driver1 = cbp;
497 	bp->bio_pflags = 0;
498 	G_ELI_SET_NEW_BIO(bp->bio_pflags);
499 	switch (bp->bio_cmd) {
500 	case BIO_READ:
501 		if (!(sc->sc_flags & G_ELI_FLAG_AUTH)) {
502 			g_eli_crypto_read(sc, bp, 0);
503 			break;
504 		}
505 		/* FALLTHROUGH */
506 	case BIO_WRITE:
507 		mtx_lock(&sc->sc_queue_mtx);
508 		bioq_insert_tail(&sc->sc_queue, bp);
509 		mtx_unlock(&sc->sc_queue_mtx);
510 		wakeup(sc);
511 		break;
512 	case BIO_GETATTR:
513 	case BIO_FLUSH:
514 	case BIO_DELETE:
515 	case BIO_SPEEDUP:
516 	case BIO_ZONE:
517 		if (bp->bio_cmd == BIO_GETATTR)
518 			cbp->bio_done = g_eli_getattr_done;
519 		else
520 			cbp->bio_done = g_std_done;
521 		cp = LIST_FIRST(&sc->sc_geom->consumer);
522 		cbp->bio_to = cp->provider;
523 		G_ELI_LOGREQ(2, cbp, "Sending request.");
524 		g_io_request(cbp, cp);
525 		break;
526 	}
527 }
528 
529 static int
530 g_eli_newsession(struct g_eli_worker *wr)
531 {
532 	struct g_eli_softc *sc;
533 	struct crypto_session_params csp;
534 	uint32_t caps;
535 	int error, new_crypto;
536 	void *key;
537 
538 	sc = wr->w_softc;
539 
540 	memset(&csp, 0, sizeof(csp));
541 	csp.csp_mode = CSP_MODE_CIPHER;
542 	csp.csp_cipher_alg = sc->sc_ealgo;
543 	csp.csp_ivlen = g_eli_ivlen(sc->sc_ealgo);
544 	csp.csp_cipher_klen = sc->sc_ekeylen / 8;
545 	if (sc->sc_ealgo == CRYPTO_AES_XTS)
546 		csp.csp_cipher_klen <<= 1;
547 	if ((sc->sc_flags & G_ELI_FLAG_FIRST_KEY) != 0) {
548 		key = g_eli_key_hold(sc, 0,
549 		    LIST_FIRST(&sc->sc_geom->consumer)->provider->sectorsize);
550 		csp.csp_cipher_key = key;
551 	} else {
552 		key = NULL;
553 		csp.csp_cipher_key = sc->sc_ekey;
554 	}
555 	if (sc->sc_flags & G_ELI_FLAG_AUTH) {
556 		csp.csp_mode = CSP_MODE_ETA;
557 		csp.csp_auth_alg = sc->sc_aalgo;
558 		csp.csp_auth_klen = G_ELI_AUTH_SECKEYLEN;
559 	}
560 
561 	switch (sc->sc_crypto) {
562 	case G_ELI_CRYPTO_SW_ACCEL:
563 	case G_ELI_CRYPTO_SW:
564 		error = crypto_newsession(&wr->w_sid, &csp,
565 		    CRYPTOCAP_F_SOFTWARE);
566 		break;
567 	case G_ELI_CRYPTO_HW:
568 		error = crypto_newsession(&wr->w_sid, &csp,
569 		    CRYPTOCAP_F_HARDWARE);
570 		break;
571 	case G_ELI_CRYPTO_UNKNOWN:
572 		error = crypto_newsession(&wr->w_sid, &csp,
573 		    CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE);
574 		if (error == 0) {
575 			caps = crypto_ses2caps(wr->w_sid);
576 			if (caps & CRYPTOCAP_F_HARDWARE)
577 				new_crypto = G_ELI_CRYPTO_HW;
578 			else if (caps & CRYPTOCAP_F_ACCEL_SOFTWARE)
579 				new_crypto = G_ELI_CRYPTO_SW_ACCEL;
580 			else
581 				new_crypto = G_ELI_CRYPTO_SW;
582 			mtx_lock(&sc->sc_queue_mtx);
583 			if (sc->sc_crypto == G_ELI_CRYPTO_UNKNOWN)
584 				sc->sc_crypto = new_crypto;
585 			mtx_unlock(&sc->sc_queue_mtx);
586 		}
587 		break;
588 	default:
589 		panic("%s: invalid condition", __func__);
590 	}
591 
592 	if ((sc->sc_flags & G_ELI_FLAG_FIRST_KEY) != 0) {
593 		if (error)
594 			g_eli_key_drop(sc, key);
595 		else
596 			wr->w_first_key = key;
597 	}
598 
599 	return (error);
600 }
601 
602 static void
603 g_eli_freesession(struct g_eli_worker *wr)
604 {
605 	struct g_eli_softc *sc;
606 
607 	crypto_freesession(wr->w_sid);
608 	if (wr->w_first_key != NULL) {
609 		sc = wr->w_softc;
610 		g_eli_key_drop(sc, wr->w_first_key);
611 		wr->w_first_key = NULL;
612 	}
613 }
614 
615 static void
616 g_eli_cancel(struct g_eli_softc *sc)
617 {
618 	struct bio *bp;
619 
620 	mtx_assert(&sc->sc_queue_mtx, MA_OWNED);
621 
622 	while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) {
623 		KASSERT(G_ELI_IS_NEW_BIO(bp->bio_pflags),
624 		    ("Not new bio when canceling (bp=%p).", bp));
625 		g_io_deliver(bp, ENXIO);
626 	}
627 }
628 
629 static struct bio *
630 g_eli_takefirst(struct g_eli_softc *sc)
631 {
632 	struct bio *bp;
633 
634 	mtx_assert(&sc->sc_queue_mtx, MA_OWNED);
635 
636 	if (!(sc->sc_flags & G_ELI_FLAG_SUSPEND))
637 		return (bioq_takefirst(&sc->sc_queue));
638 	/*
639 	 * Device suspended, so we skip new I/O requests.
640 	 */
641 	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
642 		if (!G_ELI_IS_NEW_BIO(bp->bio_pflags))
643 			break;
644 	}
645 	if (bp != NULL)
646 		bioq_remove(&sc->sc_queue, bp);
647 	return (bp);
648 }
649 
/*
 * This is the main function for kernel worker thread when we don't have
 * hardware acceleration and we have to do cryptography in software.
 * Dedicated thread is needed, so we don't slow down g_up/g_down GEOM
 * threads with crypto work.
 *
 * arg is the struct g_eli_worker describing this thread.  The function
 * loops forever taking requests off sc->sc_queue; it only leaves through
 * kproc_exit() once the device is flagged for destruction and no request
 * can be taken from the queue.
 */
static void
g_eli_worker(void *arg)
{
	struct g_eli_softc *sc;
	struct g_eli_worker *wr;
	struct bio *bp;
	int error __diagused;

	wr = arg;
	sc = wr->w_softc;
#ifdef EARLY_AP_STARTUP
	MPASS(!sc->sc_cpubind || smp_started);
#elif defined(SMP)
	/* Before sched_bind() to a CPU, wait for all CPUs to go on-line. */
	if (sc->sc_cpubind) {
		while (!smp_started)
			tsleep(wr, 0, "geli:smp", hz / 4);
	}
#endif
	/* Set the thread priority and, if requested, pin it to one CPU. */
	thread_lock(curthread);
	sched_prio(curthread, PUSER);
	if (sc->sc_cpubind)
		sched_bind(curthread, wr->w_number % mp_ncpus);
	thread_unlock(curthread);

	G_ELI_DEBUG(1, "Thread %s started.", curthread->td_proc->p_comm);

	for (;;) {
		mtx_lock(&sc->sc_queue_mtx);
again:
		/* Reached with sc_queue_mtx held. */
		bp = g_eli_takefirst(sc);
		if (bp == NULL) {
			if (sc->sc_flags & G_ELI_FLAG_DESTROY) {
				/*
				 * Device is going away: fail whatever is
				 * still queued, unlink and free this worker,
				 * wake sleepers on sc_workers and exit.
				 */
				g_eli_cancel(sc);
				LIST_REMOVE(wr, w_next);
				g_eli_freesession(wr);
				free(wr, M_ELI);
				G_ELI_DEBUG(1, "Thread %s exiting.",
				    curthread->td_proc->p_comm);
				wakeup(&sc->sc_workers);
				mtx_unlock(&sc->sc_queue_mtx);
				kproc_exit(0);
			}
			/*
			 * Every path through this loop body ends in
			 * "goto again", so it behaves like a conditional.
			 */
			while (sc->sc_flags & G_ELI_FLAG_SUSPEND) {
				if (sc->sc_inflight > 0) {
					G_ELI_DEBUG(0, "inflight=%d",
					    sc->sc_inflight);
					/*
					 * We still have inflight BIOs, so
					 * sleep and retry.
					 */
					msleep(sc, &sc->sc_queue_mtx, PRIBIO,
					    "geli:inf", hz / 5);
					goto again;
				}
				/*
				 * Suspend requested, mark the worker as
				 * suspended and go to sleep.
				 */
				if (wr->w_active) {
					g_eli_freesession(wr);
					wr->w_active = FALSE;
				}
				wakeup(&sc->sc_workers);
				msleep(sc, &sc->sc_queue_mtx, PRIBIO,
				    "geli:suspend", 0);
				/*
				 * Woken up: if the suspend flag is gone,
				 * open a fresh session before going back to
				 * work.
				 */
				if (!wr->w_active &&
				    !(sc->sc_flags & G_ELI_FLAG_SUSPEND)) {
					error = g_eli_newsession(wr);
					KASSERT(error == 0,
					    ("g_eli_newsession() failed on resume (error=%d)",
					    error));
					wr->w_active = TRUE;
				}
				goto again;
			}
			/*
			 * Nothing to do: sleep until woken.  PDROP leaves
			 * the mutex released; the loop top takes it again.
			 */
			msleep(sc, &sc->sc_queue_mtx, PDROP, "geli:w", 0);
			continue;
		}
		/* A new request counts as in flight once a worker takes it. */
		if (G_ELI_IS_NEW_BIO(bp->bio_pflags))
			atomic_add_int(&sc->sc_inflight, 1);
		mtx_unlock(&sc->sc_queue_mtx);
		if (G_ELI_IS_NEW_BIO(bp->bio_pflags)) {
			G_ELI_SETWORKER(bp->bio_pflags, 0);
			/* New request: reads start with I/O, writes with crypto. */
			if (sc->sc_flags & G_ELI_FLAG_AUTH) {
				if (bp->bio_cmd == BIO_READ)
					g_eli_auth_read(sc, bp);
				else
					g_eli_auth_run(wr, bp);
			} else {
				if (bp->bio_cmd == BIO_READ)
					g_eli_crypto_read(sc, bp, 1);
				else
					g_eli_crypto_run(wr, bp);
			}
		} else {
			/* Request already in progress: run the crypto step. */
			if (sc->sc_flags & G_ELI_FLAG_AUTH)
				g_eli_auth_run(wr, bp);
			else
				g_eli_crypto_run(wr, bp);
		}
	}
}
759 
760 static int
761 g_eli_read_metadata_offset(struct g_class *mp, struct g_provider *pp,
762     off_t offset, struct g_eli_metadata *md)
763 {
764 	struct g_geom *gp;
765 	struct g_consumer *cp;
766 	u_char *buf = NULL;
767 	int error;
768 
769 	g_topology_assert();
770 
771 	gp = g_new_geomf(mp, "eli:taste");
772 	gp->start = g_eli_start;
773 	gp->access = g_std_access;
774 	/*
775 	 * g_eli_read_metadata() is always called from the event thread.
776 	 * Our geom is created and destroyed in the same event, so there
777 	 * could be no orphan nor spoil event in the meantime.
778 	 */
779 	gp->orphan = g_eli_orphan_spoil_assert;
780 	gp->spoiled = g_eli_orphan_spoil_assert;
781 	cp = g_new_consumer(gp);
782 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
783 	error = g_attach(cp, pp);
784 	if (error != 0)
785 		goto end;
786 	error = g_access(cp, 1, 0, 0);
787 	if (error != 0)
788 		goto end;
789 	g_topology_unlock();
790 	buf = g_read_data(cp, offset, pp->sectorsize, &error);
791 	g_topology_lock();
792 	if (buf == NULL)
793 		goto end;
794 	error = eli_metadata_decode(buf, md);
795 	if (error != 0)
796 		goto end;
797 	/* Metadata was read and decoded successfully. */
798 end:
799 	g_free(buf);
800 	if (cp->provider != NULL) {
801 		if (cp->acr == 1)
802 			g_access(cp, -1, 0, 0);
803 		g_detach(cp);
804 	}
805 	g_destroy_consumer(cp);
806 	g_destroy_geom(gp);
807 	return (error);
808 }
809 
810 int
811 g_eli_read_metadata(struct g_class *mp, struct g_provider *pp,
812     struct g_eli_metadata *md)
813 {
814 
815 	return (g_eli_read_metadata_offset(mp, pp,
816 	    pp->mediasize - pp->sectorsize, md));
817 }
818 
819 /*
820  * The function is called when we had last close on provider and user requested
821  * to close it when this situation occur.
822  */
823 static void
824 g_eli_last_close(void *arg, int flags __unused)
825 {
826 	struct g_geom *gp;
827 	char gpname[64];
828 	int error __diagused;
829 
830 	g_topology_assert();
831 	gp = arg;
832 	strlcpy(gpname, gp->name, sizeof(gpname));
833 	error = g_eli_destroy(gp->softc, TRUE);
834 	KASSERT(error == 0, ("Cannot detach %s on last close (error=%d).",
835 	    gpname, error));
836 	G_ELI_DEBUG(0, "Detached %s on last close.", gpname);
837 }
838 
839 int
840 g_eli_access(struct g_provider *pp, int dr, int dw, int de)
841 {
842 	struct g_eli_softc *sc;
843 	struct g_geom *gp;
844 
845 	gp = pp->geom;
846 	sc = gp->softc;
847 
848 	if (dw > 0) {
849 		if (sc->sc_flags & G_ELI_FLAG_RO) {
850 			/* Deny write attempts. */
851 			return (EROFS);
852 		}
853 		/* Someone is opening us for write, we need to remember that. */
854 		sc->sc_flags |= G_ELI_FLAG_WOPEN;
855 		return (0);
856 	}
857 	/* Is this the last close? */
858 	if (pp->acr + dr > 0 || pp->acw + dw > 0 || pp->ace + de > 0)
859 		return (0);
860 
861 	/*
862 	 * Automatically detach on last close if requested.
863 	 */
864 	if ((sc->sc_flags & G_ELI_FLAG_RW_DETACH) ||
865 	    (sc->sc_flags & G_ELI_FLAG_WOPEN)) {
866 		g_post_event(g_eli_last_close, gp, M_WAITOK, NULL);
867 	}
868 	return (0);
869 }
870 
/*
 * Report whether the given CPU is set in hlt_cpus_mask.  Kernels built
 * without SMP always report 0.
 */
static int
g_eli_cpu_is_disabled(int cpu)
{
	int disabled;

	disabled = 0;
#ifdef SMP
	disabled = CPU_ISSET(cpu, &hlt_cpus_mask);
#endif
	return (disabled);
}
880 
881 static void
882 g_eli_init_uma(void)
883 {
884 
885 	atomic_add_int(&g_eli_devs, 1);
886 	sx_xlock(&g_eli_umalock);
887 	if (g_eli_uma == NULL) {
888 		/*
889 		 * Calculate the maximum-sized swap buffer we are
890 		 * likely to see.
891 		 */
892 		g_eli_alloc_sz = roundup2((PAGE_SIZE + sizeof(int) +
893                     G_ELI_AUTH_SECKEYLEN) * nsw_cluster_max +
894                     sizeof(uintptr_t), PAGE_SIZE);
895 
896 		g_eli_uma = uma_zcreate("GELI buffers", g_eli_alloc_sz,
897 		    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
898 
899 		/* Reserve and pre-allocate pages, as appropriate. */
900 		uma_zone_reserve(g_eli_uma, g_eli_minbufs);
901 		uma_prealloc(g_eli_uma, g_eli_minbufs);
902 	}
903 	sx_xunlock(&g_eli_umalock);
904 }
905 
906 /*
907  * Try to destroy the UMA pool. This will do nothing if there are existing
908  * GELI devices or existing UMA allocations.
909  */
910 static void
911 g_eli_destroy_uma(void)
912 {
913 	uma_zone_t oldzone;
914 
915 	sx_xlock(&g_eli_umalock);
916 	/* Ensure we really should be destroying this. */
917 	if (atomic_load_int(&g_eli_devs) == 0 &&
918 	    atomic_load_int(&g_eli_umaoutstanding) == 0) {
919 		oldzone = g_eli_uma;
920 		g_eli_uma = NULL;
921 	} else
922 		oldzone = NULL;
923 	sx_xunlock(&g_eli_umalock);
924 
925 	if (oldzone != NULL)
926 		uma_zdestroy(oldzone);
927 }
928 
929 static void
930 g_eli_fini_uma(void)
931 {
932 
933 	/*
934 	 * If this is the last outstanding GELI device, try to
935 	 * destroy the UMA pool.
936 	 */
937 	if (atomic_fetchadd_int(&g_eli_devs, -1) == 1)
938 		g_eli_destroy_uma();
939 }
940 
941 /*
942  * Allocate a data buffer. If the size fits within our swap-sized buffers,
943  * try to allocate a swap-sized buffer from the UMA pool. Otherwise, fall
944  * back to using malloc.
945  *
946  * Swap-related requests are special: they can only use the UMA pool, they
947  * use M_USE_RESERVE to let them dip farther into system resources, and
948  * they always use M_NOWAIT to prevent swap operations from deadlocking.
949  */
950 bool
951 g_eli_alloc_data(struct bio *bp, int sz)
952 {
953 
954 	KASSERT(sz <= g_eli_alloc_sz || (bp->bio_flags & BIO_SWAP) == 0,
955 	    ("BIO_SWAP request for %d bytes exceeds the precalculated buffer"
956 	    " size (%d)", sz, g_eli_alloc_sz));
957 	if (sz <= g_eli_alloc_sz) {
958 		bp->bio_driver2 = uma_zalloc(g_eli_uma, M_NOWAIT |
959 		    ((bp->bio_flags & BIO_SWAP) != 0 ? M_USE_RESERVE : 0));
960 		if (bp->bio_driver2 != NULL) {
961 			bp->bio_pflags |= G_ELI_UMA_ALLOC;
962 			atomic_add_int(&g_eli_umaoutstanding, 1);
963 		}
964 		if (bp->bio_driver2 != NULL || (bp->bio_flags & BIO_SWAP) != 0)
965 			return (bp->bio_driver2 != NULL);
966 	}
967 	bp->bio_pflags &= ~(G_ELI_UMA_ALLOC);
968 	bp->bio_driver2 = malloc(sz, M_ELI, g_eli_blocking_malloc ? M_WAITOK :
969 	    M_NOWAIT);
970 	return (bp->bio_driver2 != NULL);
971 }
972 
973 /*
974  * Free a buffer from bp->bio_driver2 which was allocated with
975  * g_eli_alloc_data(). This function makes sure that the memory is freed
976  * to the correct place.
977  *
978  * Additionally, if this function frees the last outstanding UMA request
979  * and there are no open GELI devices, this will destroy the UMA pool.
980  */
981 void
982 g_eli_free_data(struct bio *bp)
983 {
984 
985 	/*
986 	 * Mimic the free(9) behavior of allowing a NULL pointer to be
987 	 * freed.
988 	 */
989 	if (bp->bio_driver2 == NULL)
990 		return;
991 
992 	if ((bp->bio_pflags & G_ELI_UMA_ALLOC) != 0) {
993 		uma_zfree(g_eli_uma, bp->bio_driver2);
994 		if (atomic_fetchadd_int(&g_eli_umaoutstanding, -1) == 1 &&
995 		    atomic_load_int(&g_eli_devs) == 0)
996 			g_eli_destroy_uma();
997 	} else
998 		free(bp->bio_driver2, M_ELI);
999 	bp->bio_driver2 = NULL;
1000 }
1001 
1002 struct g_geom *
1003 g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
1004     const struct g_eli_metadata *md, const u_char *mkey, int nkey)
1005 {
1006 	struct g_eli_softc *sc;
1007 	struct g_eli_worker *wr;
1008 	struct g_geom *gp;
1009 	struct g_provider *pp;
1010 	struct g_consumer *cp;
1011 	struct g_geom_alias *gap;
1012 	u_int i, threads;
1013 	int dcw, error;
1014 
1015 	G_ELI_DEBUG(1, "Creating device %s%s.", bpp->name, G_ELI_SUFFIX);
1016 	KASSERT(eli_metadata_crypto_supported(md),
1017 	    ("%s: unsupported crypto for %s", __func__, bpp->name));
1018 
1019 	gp = g_new_geomf(mp, "%s%s", bpp->name, G_ELI_SUFFIX);
1020 	sc = malloc(sizeof(*sc), M_ELI, M_WAITOK | M_ZERO);
1021 	gp->start = g_eli_start;
1022 	/*
1023 	 * Spoiling can happen even though we have the provider open
1024 	 * exclusively, e.g. through media change events.
1025 	 */
1026 	gp->spoiled = g_eli_orphan;
1027 	gp->orphan = g_eli_orphan;
1028 	gp->resize = g_eli_resize;
1029 	gp->dumpconf = g_eli_dumpconf;
1030 	/*
1031 	 * If detach-on-last-close feature is not enabled and we don't operate
1032 	 * on read-only provider, we can simply use g_std_access().
1033 	 */
1034 	if (md->md_flags & (G_ELI_FLAG_WO_DETACH | G_ELI_FLAG_RO))
1035 		gp->access = g_eli_access;
1036 	else
1037 		gp->access = g_std_access;
1038 
1039 	eli_metadata_softc(sc, md, bpp->sectorsize, bpp->mediasize);
1040 	sc->sc_nkey = nkey;
1041 
1042 	gp->softc = sc;
1043 	sc->sc_geom = gp;
1044 
1045 	bioq_init(&sc->sc_queue);
1046 	mtx_init(&sc->sc_queue_mtx, "geli:queue", NULL, MTX_DEF);
1047 	mtx_init(&sc->sc_ekeys_lock, "geli:ekeys", NULL, MTX_DEF);
1048 
1049 	pp = NULL;
1050 	cp = g_new_consumer(gp);
1051 	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
1052 	error = g_attach(cp, bpp);
1053 	if (error != 0) {
1054 		if (req != NULL) {
1055 			gctl_error(req, "Cannot attach to %s (error=%d).",
1056 			    bpp->name, error);
1057 		} else {
1058 			G_ELI_DEBUG(1, "Cannot attach to %s (error=%d).",
1059 			    bpp->name, error);
1060 		}
1061 		goto failed;
1062 	}
1063 	/*
1064 	 * Keep provider open all the time, so we can run critical tasks,
1065 	 * like Master Keys deletion, without wondering if we can open
1066 	 * provider or not.
1067 	 * We don't open provider for writing only when user requested read-only
1068 	 * access.
1069 	 */
1070 	dcw = (sc->sc_flags & G_ELI_FLAG_RO) ? 0 : 1;
1071 	error = g_access(cp, 1, dcw, 1);
1072 	if (error != 0) {
1073 		if (req != NULL) {
1074 			gctl_error(req, "Cannot access %s (error=%d).",
1075 			    bpp->name, error);
1076 		} else {
1077 			G_ELI_DEBUG(1, "Cannot access %s (error=%d).",
1078 			    bpp->name, error);
1079 		}
1080 		goto failed;
1081 	}
1082 
1083 	/*
1084 	 * Remember the keys in our softc structure.
1085 	 */
1086 	g_eli_mkey_propagate(sc, mkey);
1087 
1088 	LIST_INIT(&sc->sc_workers);
1089 
1090 	threads = g_eli_threads;
1091 	if (threads == 0)
1092 		threads = mp_ncpus;
1093 	sc->sc_cpubind = (mp_ncpus > 1 && threads == mp_ncpus);
1094 	g_eli_init_uma();
1095 	for (i = 0; i < threads; i++) {
1096 		if (g_eli_cpu_is_disabled(i)) {
1097 			G_ELI_DEBUG(1, "%s: CPU %u disabled, skipping.",
1098 			    bpp->name, i);
1099 			continue;
1100 		}
1101 		wr = malloc(sizeof(*wr), M_ELI, M_WAITOK | M_ZERO);
1102 		wr->w_softc = sc;
1103 		wr->w_number = i;
1104 		wr->w_active = TRUE;
1105 
1106 		error = g_eli_newsession(wr);
1107 		if (error != 0) {
1108 			free(wr, M_ELI);
1109 			if (req != NULL) {
1110 				gctl_error(req, "Cannot set up crypto session "
1111 				    "for %s (error=%d).", bpp->name, error);
1112 			} else {
1113 				G_ELI_DEBUG(1, "Cannot set up crypto session "
1114 				    "for %s (error=%d).", bpp->name, error);
1115 			}
1116 			goto failed;
1117 		}
1118 
1119 		error = kproc_create(g_eli_worker, wr, &wr->w_proc, 0, 0,
1120 		    "g_eli[%u] %s", i, bpp->name);
1121 		if (error != 0) {
1122 			g_eli_freesession(wr);
1123 			free(wr, M_ELI);
1124 			if (req != NULL) {
1125 				gctl_error(req, "Cannot create kernel thread "
1126 				    "for %s (error=%d).", bpp->name, error);
1127 			} else {
1128 				G_ELI_DEBUG(1, "Cannot create kernel thread "
1129 				    "for %s (error=%d).", bpp->name, error);
1130 			}
1131 			goto failed;
1132 		}
1133 		LIST_INSERT_HEAD(&sc->sc_workers, wr, w_next);
1134 	}
1135 
1136 	/*
1137 	 * Create decrypted provider.
1138 	 */
1139 	pp = g_new_providerf(gp, "%s%s", bpp->name, G_ELI_SUFFIX);
1140 	pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
1141 	if (g_eli_unmapped_io && CRYPTO_HAS_VMPAGE) {
1142 		/*
1143 		 * On DMAP architectures we can use unmapped I/O.  But don't
1144 		 * use it with data integrity verification.  That code hasn't
1145 		 * been written yet.
1146 		 */
1147 		 if ((sc->sc_flags & G_ELI_FLAG_AUTH) == 0)
1148 			pp->flags |= G_PF_ACCEPT_UNMAPPED;
1149 	}
1150 	pp->mediasize = sc->sc_mediasize;
1151 	pp->sectorsize = sc->sc_sectorsize;
1152 	LIST_FOREACH(gap, &bpp->aliases, ga_next)
1153 		g_provider_add_alias(pp, "%s%s", gap->ga_alias, G_ELI_SUFFIX);
1154 
1155 	g_error_provider(pp, 0);
1156 
1157 	G_ELI_DEBUG(0, "Device %s created.", pp->name);
1158 	G_ELI_DEBUG(0, "Encryption: %s %u", g_eli_algo2str(sc->sc_ealgo),
1159 	    sc->sc_ekeylen);
1160 	if (sc->sc_flags & G_ELI_FLAG_AUTH)
1161 		G_ELI_DEBUG(0, " Integrity: %s", g_eli_algo2str(sc->sc_aalgo));
1162 	G_ELI_DEBUG(0, "    Crypto: %s",
1163 	    sc->sc_crypto == G_ELI_CRYPTO_SW_ACCEL ? "accelerated software" :
1164 	    sc->sc_crypto == G_ELI_CRYPTO_SW ? "software" : "hardware");
1165 	return (gp);
1166 failed:
1167 	mtx_lock(&sc->sc_queue_mtx);
1168 	sc->sc_flags |= G_ELI_FLAG_DESTROY;
1169 	wakeup(sc);
1170 	/*
1171 	 * Wait for kernel threads self destruction.
1172 	 */
1173 	while (!LIST_EMPTY(&sc->sc_workers)) {
1174 		msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO,
1175 		    "geli:destroy", 0);
1176 	}
1177 	mtx_destroy(&sc->sc_queue_mtx);
1178 	if (cp->provider != NULL) {
1179 		if (cp->acr == 1)
1180 			g_access(cp, -1, -dcw, -1);
1181 		g_detach(cp);
1182 	}
1183 	g_destroy_consumer(cp);
1184 	g_destroy_geom(gp);
1185 	g_eli_key_destroy(sc);
1186 	g_eli_fini_uma();
1187 	zfree(sc, M_ELI);
1188 	return (NULL);
1189 }
1190 
1191 int
1192 g_eli_destroy(struct g_eli_softc *sc, boolean_t force)
1193 {
1194 	struct g_geom *gp;
1195 	struct g_provider *pp;
1196 
1197 	g_topology_assert();
1198 
1199 	if (sc == NULL)
1200 		return (ENXIO);
1201 
1202 	gp = sc->sc_geom;
1203 	pp = LIST_FIRST(&gp->provider);
1204 	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
1205 		if (force) {
1206 			G_ELI_DEBUG(1, "Device %s is still open, so it "
1207 			    "cannot be definitely removed.", pp->name);
1208 			sc->sc_flags |= G_ELI_FLAG_RW_DETACH;
1209 			gp->access = g_eli_access;
1210 			g_wither_provider(pp, ENXIO);
1211 			return (EBUSY);
1212 		} else {
1213 			G_ELI_DEBUG(1,
1214 			    "Device %s is still open (r%dw%de%d).", pp->name,
1215 			    pp->acr, pp->acw, pp->ace);
1216 			return (EBUSY);
1217 		}
1218 	}
1219 
1220 	mtx_lock(&sc->sc_queue_mtx);
1221 	sc->sc_flags |= G_ELI_FLAG_DESTROY;
1222 	wakeup(sc);
1223 	while (!LIST_EMPTY(&sc->sc_workers)) {
1224 		msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO,
1225 		    "geli:destroy", 0);
1226 	}
1227 	mtx_destroy(&sc->sc_queue_mtx);
1228 	gp->softc = NULL;
1229 	g_eli_key_destroy(sc);
1230 	g_eli_fini_uma();
1231 	zfree(sc, M_ELI);
1232 
1233 	G_ELI_DEBUG(0, "Device %s destroyed.", gp->name);
1234 	g_wither_geom_close(gp, ENXIO);
1235 
1236 	return (0);
1237 }
1238 
1239 static int
1240 g_eli_destroy_geom(struct gctl_req *req __unused,
1241     struct g_class *mp __unused, struct g_geom *gp)
1242 {
1243 	struct g_eli_softc *sc;
1244 
1245 	sc = gp->softc;
1246 	return (g_eli_destroy(sc, FALSE));
1247 }
1248 
1249 static int
1250 g_eli_keyfiles_load(struct hmac_ctx *ctx, const char *provider)
1251 {
1252 	u_char *keyfile, *data;
1253 	char *file, name[64];
1254 	size_t size;
1255 	int i;
1256 
1257 	for (i = 0; ; i++) {
1258 		snprintf(name, sizeof(name), "%s:geli_keyfile%d", provider, i);
1259 		keyfile = preload_search_by_type(name);
1260 		if (keyfile == NULL && i == 0) {
1261 			/*
1262 			 * If there is only one keyfile, allow simpler name.
1263 			 */
1264 			snprintf(name, sizeof(name), "%s:geli_keyfile", provider);
1265 			keyfile = preload_search_by_type(name);
1266 		}
1267 		if (keyfile == NULL)
1268 			return (i);	/* Return number of loaded keyfiles. */
1269 		data = preload_fetch_addr(keyfile);
1270 		if (data == NULL) {
1271 			G_ELI_DEBUG(0, "Cannot find key file data for %s.",
1272 			    name);
1273 			return (0);
1274 		}
1275 		size = preload_fetch_size(keyfile);
1276 		if (size == 0) {
1277 			G_ELI_DEBUG(0, "Cannot find key file size for %s.",
1278 			    name);
1279 			return (0);
1280 		}
1281 		file = preload_search_info(keyfile, MODINFO_NAME);
1282 		if (file == NULL) {
1283 			G_ELI_DEBUG(0, "Cannot find key file name for %s.",
1284 			    name);
1285 			return (0);
1286 		}
1287 		G_ELI_DEBUG(1, "Loaded keyfile %s for %s (type: %s).", file,
1288 		    provider, name);
1289 		g_eli_crypto_hmac_update(ctx, data, size);
1290 	}
1291 }
1292 
1293 static void
1294 g_eli_keyfiles_clear(const char *provider)
1295 {
1296 	u_char *keyfile, *data;
1297 	char name[64];
1298 	size_t size;
1299 	int i;
1300 
1301 	for (i = 0; ; i++) {
1302 		snprintf(name, sizeof(name), "%s:geli_keyfile%d", provider, i);
1303 		keyfile = preload_search_by_type(name);
1304 		if (keyfile == NULL)
1305 			return;
1306 		data = preload_fetch_addr(keyfile);
1307 		size = preload_fetch_size(keyfile);
1308 		if (data != NULL && size != 0)
1309 			explicit_bzero(data, size);
1310 	}
1311 }
1312 
1313 /*
1314  * Tasting is only made on boot.
1315  * We detect providers which should be attached before root is mounted.
1316  */
1317 static struct g_geom *
1318 g_eli_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
1319 {
1320 	struct g_eli_metadata md;
1321 	struct g_geom *gp;
1322 	struct hmac_ctx ctx;
1323 	char passphrase[256];
1324 	u_char key[G_ELI_USERKEYLEN], mkey[G_ELI_DATAIVKEYLEN];
1325 	u_int i, nkey, nkeyfiles, tries, showpass;
1326 	int error;
1327         struct keybuf *keybuf;
1328 
1329 	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
1330 	g_topology_assert();
1331 
1332 	if (root_mounted() || g_eli_tries == 0)
1333 		return (NULL);
1334 
1335 	G_ELI_DEBUG(3, "Tasting %s.", pp->name);
1336 
1337 	error = g_eli_read_metadata(mp, pp, &md);
1338 	if (error != 0)
1339 		return (NULL);
1340 	gp = NULL;
1341 
1342 	if (strcmp(md.md_magic, G_ELI_MAGIC) != 0)
1343 		return (NULL);
1344 	if (md.md_version > G_ELI_VERSION) {
1345 		printf("geom_eli.ko module is too old to handle %s.\n",
1346 		    pp->name);
1347 		return (NULL);
1348 	}
1349 	if (md.md_provsize != pp->mediasize)
1350 		return (NULL);
1351 	/* Should we attach it on boot? */
1352 	if (!(md.md_flags & G_ELI_FLAG_BOOT) &&
1353 	    !(md.md_flags & G_ELI_FLAG_GELIBOOT))
1354 		return (NULL);
1355 	if (md.md_keys == 0x00) {
1356 		G_ELI_DEBUG(0, "No valid keys on %s.", pp->name);
1357 		return (NULL);
1358 	}
1359 	if (!eli_metadata_crypto_supported(&md)) {
1360 		G_ELI_DEBUG(0, "%s uses invalid or unsupported algorithms\n",
1361 		    pp->name);
1362 		return (NULL);
1363 	}
1364 	if (md.md_iterations == -1) {
1365 		/* If there is no passphrase, we try only once. */
1366 		tries = 1;
1367 	} else {
1368 		/* Ask for the passphrase no more than g_eli_tries times. */
1369 		tries = g_eli_tries;
1370 	}
1371 
1372         if ((keybuf = get_keybuf()) != NULL) {
1373                 /* Scan the key buffer, try all GELI keys. */
1374                 for (i = 0; i < keybuf->kb_nents; i++) {
1375                          if (keybuf->kb_ents[i].ke_type == KEYBUF_TYPE_GELI) {
1376                                  memcpy(key, keybuf->kb_ents[i].ke_data,
1377                                      sizeof(key));
1378 
1379                                  if (g_eli_mkey_decrypt_any(&md, key,
1380                                      mkey, &nkey) == 0 ) {
1381                                          explicit_bzero(key, sizeof(key));
1382                                          goto have_key;
1383                                  }
1384                          }
1385                 }
1386         }
1387 
1388         for (i = 0; i <= tries; i++) {
1389                 g_eli_crypto_hmac_init(&ctx, NULL, 0);
1390 
1391                 /*
1392                  * Load all key files.
1393                  */
1394                 nkeyfiles = g_eli_keyfiles_load(&ctx, pp->name);
1395 
1396                 if (nkeyfiles == 0 && md.md_iterations == -1) {
1397                         /*
1398                          * No key files and no passphrase, something is
1399                          * definitely wrong here.
1400                          * geli(8) doesn't allow for such situation, so assume
1401                          * that there was really no passphrase and in that case
1402                          * key files are no properly defined in loader.conf.
1403                          */
1404                         G_ELI_DEBUG(0,
1405                             "Found no key files in loader.conf for %s.",
1406                             pp->name);
1407                         return (NULL);
1408                 }
1409 
1410                 /* Ask for the passphrase if defined. */
1411                 if (md.md_iterations >= 0) {
1412                         /* Try first with cached passphrase. */
1413                         if (i == 0) {
1414                                 if (!g_eli_boot_passcache)
1415                                         continue;
1416                                 memcpy(passphrase, cached_passphrase,
1417                                     sizeof(passphrase));
1418                         } else {
1419                                 printf("Enter passphrase for %s: ", pp->name);
1420 				showpass = g_eli_visible_passphrase;
1421 				if ((md.md_flags & G_ELI_FLAG_GELIDISPLAYPASS) != 0)
1422 					showpass = GETS_ECHOPASS;
1423                                 cngets(passphrase, sizeof(passphrase),
1424 				    showpass);
1425                                 memcpy(cached_passphrase, passphrase,
1426                                     sizeof(passphrase));
1427                         }
1428                 }
1429 
1430                 /*
1431                  * Prepare Derived-Key from the user passphrase.
1432                  */
1433                 if (md.md_iterations == 0) {
1434                         g_eli_crypto_hmac_update(&ctx, md.md_salt,
1435                             sizeof(md.md_salt));
1436                         g_eli_crypto_hmac_update(&ctx, passphrase,
1437                             strlen(passphrase));
1438                         explicit_bzero(passphrase, sizeof(passphrase));
1439                 } else if (md.md_iterations > 0) {
1440                         u_char dkey[G_ELI_USERKEYLEN];
1441 
1442                         pkcs5v2_genkey(dkey, sizeof(dkey), md.md_salt,
1443                             sizeof(md.md_salt), passphrase, md.md_iterations);
1444                         explicit_bzero(passphrase, sizeof(passphrase));
1445                         g_eli_crypto_hmac_update(&ctx, dkey, sizeof(dkey));
1446                         explicit_bzero(dkey, sizeof(dkey));
1447                 }
1448 
1449                 g_eli_crypto_hmac_final(&ctx, key, 0);
1450 
1451                 /*
1452                  * Decrypt Master-Key.
1453                  */
1454                 error = g_eli_mkey_decrypt_any(&md, key, mkey, &nkey);
1455                 explicit_bzero(key, sizeof(key));
1456                 if (error == -1) {
1457                         if (i == tries) {
1458                                 G_ELI_DEBUG(0,
1459                                     "Wrong key for %s. No tries left.",
1460                                     pp->name);
1461                                 g_eli_keyfiles_clear(pp->name);
1462                                 return (NULL);
1463                         }
1464                         if (i > 0) {
1465                                 G_ELI_DEBUG(0,
1466                                     "Wrong key for %s. Tries left: %u.",
1467                                     pp->name, tries - i);
1468                         }
1469                         /* Try again. */
1470                         continue;
1471                 } else if (error > 0) {
1472                         G_ELI_DEBUG(0,
1473                             "Cannot decrypt Master Key for %s (error=%d).",
1474                             pp->name, error);
1475                         g_eli_keyfiles_clear(pp->name);
1476                         return (NULL);
1477                 }
1478                 g_eli_keyfiles_clear(pp->name);
1479                 G_ELI_DEBUG(1, "Using Master Key %u for %s.", nkey, pp->name);
1480                 break;
1481         }
1482 have_key:
1483 
1484 	/*
1485 	 * We have correct key, let's attach provider.
1486 	 */
1487 	gp = g_eli_create(NULL, mp, pp, &md, mkey, nkey);
1488 	explicit_bzero(mkey, sizeof(mkey));
1489 	explicit_bzero(&md, sizeof(md));
1490 	if (gp == NULL) {
1491 		G_ELI_DEBUG(0, "Cannot create device %s%s.", pp->name,
1492 		    G_ELI_SUFFIX);
1493 		return (NULL);
1494 	}
1495 	return (gp);
1496 }
1497 
1498 static void
1499 g_eli_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
1500     struct g_consumer *cp, struct g_provider *pp)
1501 {
1502 	struct g_eli_softc *sc;
1503 
1504 	g_topology_assert();
1505 	sc = gp->softc;
1506 	if (sc == NULL)
1507 		return;
1508 	if (pp != NULL || cp != NULL)
1509 		return;	/* Nothing here. */
1510 
1511 	sbuf_printf(sb, "%s<KeysTotal>%ju</KeysTotal>\n", indent,
1512 	    (uintmax_t)sc->sc_ekeys_total);
1513 	sbuf_printf(sb, "%s<KeysAllocated>%ju</KeysAllocated>\n", indent,
1514 	    (uintmax_t)sc->sc_ekeys_allocated);
1515 	sbuf_printf(sb, "%s<Flags>", indent);
1516 	if (sc->sc_flags == 0)
1517 		sbuf_cat(sb, "NONE");
1518 	else {
1519 		int first = 1;
1520 
1521 #define ADD_FLAG(flag, name)	do {					\
1522 	if (sc->sc_flags & (flag)) {					\
1523 		if (!first)						\
1524 			sbuf_cat(sb, ", ");				\
1525 		else							\
1526 			first = 0;					\
1527 		sbuf_cat(sb, name);					\
1528 	}								\
1529 } while (0)
1530 		ADD_FLAG(G_ELI_FLAG_SUSPEND, "SUSPEND");
1531 		ADD_FLAG(G_ELI_FLAG_SINGLE_KEY, "SINGLE-KEY");
1532 		ADD_FLAG(G_ELI_FLAG_NATIVE_BYTE_ORDER, "NATIVE-BYTE-ORDER");
1533 		ADD_FLAG(G_ELI_FLAG_ONETIME, "ONETIME");
1534 		ADD_FLAG(G_ELI_FLAG_BOOT, "BOOT");
1535 		ADD_FLAG(G_ELI_FLAG_WO_DETACH, "W-DETACH");
1536 		ADD_FLAG(G_ELI_FLAG_RW_DETACH, "RW-DETACH");
1537 		ADD_FLAG(G_ELI_FLAG_AUTH, "AUTH");
1538 		ADD_FLAG(G_ELI_FLAG_WOPEN, "W-OPEN");
1539 		ADD_FLAG(G_ELI_FLAG_DESTROY, "DESTROY");
1540 		ADD_FLAG(G_ELI_FLAG_RO, "READ-ONLY");
1541 		ADD_FLAG(G_ELI_FLAG_NODELETE, "NODELETE");
1542 		ADD_FLAG(G_ELI_FLAG_GELIBOOT, "GELIBOOT");
1543 		ADD_FLAG(G_ELI_FLAG_GELIDISPLAYPASS, "GELIDISPLAYPASS");
1544 		ADD_FLAG(G_ELI_FLAG_AUTORESIZE, "AUTORESIZE");
1545 #undef  ADD_FLAG
1546 	}
1547 	sbuf_cat(sb, "</Flags>\n");
1548 
1549 	if (!(sc->sc_flags & G_ELI_FLAG_ONETIME)) {
1550 		sbuf_printf(sb, "%s<UsedKey>%u</UsedKey>\n", indent,
1551 		    sc->sc_nkey);
1552 	}
1553 	sbuf_printf(sb, "%s<Version>%u</Version>\n", indent, sc->sc_version);
1554 	sbuf_printf(sb, "%s<Crypto>", indent);
1555 	switch (sc->sc_crypto) {
1556 	case G_ELI_CRYPTO_HW:
1557 		sbuf_cat(sb, "hardware");
1558 		break;
1559 	case G_ELI_CRYPTO_SW:
1560 		sbuf_cat(sb, "software");
1561 		break;
1562 	case G_ELI_CRYPTO_SW_ACCEL:
1563 		sbuf_cat(sb, "accelerated software");
1564 		break;
1565 	default:
1566 		sbuf_cat(sb, "UNKNOWN");
1567 		break;
1568 	}
1569 	sbuf_cat(sb, "</Crypto>\n");
1570 	if (sc->sc_flags & G_ELI_FLAG_AUTH) {
1571 		sbuf_printf(sb,
1572 		    "%s<AuthenticationAlgorithm>%s</AuthenticationAlgorithm>\n",
1573 		    indent, g_eli_algo2str(sc->sc_aalgo));
1574 	}
1575 	sbuf_printf(sb, "%s<KeyLength>%u</KeyLength>\n", indent,
1576 	    sc->sc_ekeylen);
1577 	sbuf_printf(sb, "%s<EncryptionAlgorithm>%s</EncryptionAlgorithm>\n",
1578 	    indent, g_eli_algo2str(sc->sc_ealgo));
1579 	sbuf_printf(sb, "%s<State>%s</State>\n", indent,
1580 	    (sc->sc_flags & G_ELI_FLAG_SUSPEND) ? "SUSPENDED" : "ACTIVE");
1581 }
1582 
1583 static void
1584 g_eli_shutdown_pre_sync(void *arg, int howto)
1585 {
1586 	struct g_class *mp;
1587 	struct g_geom *gp, *gp2;
1588 	struct g_provider *pp;
1589 	struct g_eli_softc *sc;
1590 
1591 	mp = arg;
1592 	g_topology_lock();
1593 	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
1594 		sc = gp->softc;
1595 		if (sc == NULL)
1596 			continue;
1597 		pp = LIST_FIRST(&gp->provider);
1598 		KASSERT(pp != NULL, ("No provider? gp=%p (%s)", gp, gp->name));
1599 		if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0 ||
1600 		    SCHEDULER_STOPPED())
1601 		{
1602 			sc->sc_flags |= G_ELI_FLAG_RW_DETACH;
1603 			gp->access = g_eli_access;
1604 		} else {
1605 			(void) g_eli_destroy(sc, TRUE);
1606 		}
1607 	}
1608 	g_topology_unlock();
1609 }
1610 
1611 static void
1612 g_eli_init(struct g_class *mp)
1613 {
1614 
1615 	g_eli_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync,
1616 	    g_eli_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST);
1617 	if (g_eli_pre_sync == NULL)
1618 		G_ELI_DEBUG(0, "Warning! Cannot register shutdown event.");
1619 }
1620 
1621 static void
1622 g_eli_fini(struct g_class *mp)
1623 {
1624 
1625 	if (g_eli_pre_sync != NULL)
1626 		EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_eli_pre_sync);
1627 }
1628 
1629 DECLARE_GEOM_CLASS(g_eli_class, g_eli);
1630 MODULE_DEPEND(g_eli, crypto, 1, 1, 1);
1631 MODULE_VERSION(geom_eli, 0);
1632