/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2005-2019 Pawel Jakub Dawidek <pawel@dawidek.net>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cons.h>
#include <sys/kenv.h>
#include <sys/kernel.h>
#include <sys/linker.h>
#include <sys/module.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/bio.h>
#include <sys/sbuf.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/eventhandler.h>
#include <sys/kthread.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/uio.h>
#include <sys/vnode.h>

#include <machine/vmparam.h>

#include <vm/uma.h>
#include <vm/vm.h>
#include <vm/swap_pager.h>

#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include <geom/eli/g_eli.h>
#include <geom/eli/pkcs5v2.h>

#include <crypto/intake.h>

FEATURE(geom_eli, "GEOM crypto module");

MALLOC_DEFINE(M_ELI, "eli data", "GEOM_ELI Data");

SYSCTL_DECL(_kern_geom);
SYSCTL_NODE(_kern_geom, OID_AUTO, eli, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "GEOM_ELI stuff");
static int g_eli_version = G_ELI_VERSION;
SYSCTL_INT(_kern_geom_eli, OID_AUTO, version, CTLFLAG_RD, &g_eli_version, 0,
    "GELI version");
int g_eli_debug = 0;
SYSCTL_INT(_kern_geom_eli, OID_AUTO, debug, CTLFLAG_RWTUN, &g_eli_debug, 0,
    "Debug level");
static u_int g_eli_tries = 3;
SYSCTL_UINT(_kern_geom_eli, OID_AUTO, tries, CTLFLAG_RWTUN, &g_eli_tries, 0,
    "Number of tries for entering the passphrase");
static u_int g_eli_visible_passphrase = GETS_NOECHO;
SYSCTL_UINT(_kern_geom_eli, OID_AUTO, visible_passphrase, CTLFLAG_RWTUN,
    &g_eli_visible_passphrase, 0,
    "Visibility of passphrase prompt (0 = invisible, 1 = visible, 2 = asterisk)");
u_int g_eli_overwrites = G_ELI_OVERWRITES;
SYSCTL_UINT(_kern_geom_eli, OID_AUTO, overwrites, CTLFLAG_RWTUN, &g_eli_overwrites,
    0, "Number of times on-disk keys should be overwritten when destroying them");
static u_int g_eli_threads = 0;
SYSCTL_UINT(_kern_geom_eli, OID_AUTO, threads, CTLFLAG_RWTUN, &g_eli_threads, 0,
    "Number of threads doing crypto work");
u_int g_eli_batch = 0;
SYSCTL_UINT(_kern_geom_eli, OID_AUTO, batch, CTLFLAG_RWTUN, &g_eli_batch, 0,
    "Use crypto operations batching");
static u_int g_eli_minbufs = 16;
static int sysctl_g_eli_minbufs(SYSCTL_HANDLER_ARGS);
SYSCTL_PROC(_kern_geom_eli, OID_AUTO, minbufs, CTLTYPE_UINT | CTLFLAG_RW |
    CTLFLAG_MPSAFE, NULL, 0, sysctl_g_eli_minbufs, "IU",
    "Number of GELI bufs reserved for swap transactions");
static struct sx g_eli_umalock;	/* Controls changes to UMA zone. */
SX_SYSINIT(g_eli_umalock, &g_eli_umalock, "GELI UMA");
static uma_zone_t g_eli_uma = NULL;
static int g_eli_alloc_sz;
static volatile int g_eli_umaoutstanding;
static volatile int g_eli_devs;
static bool g_eli_blocking_malloc = false;
SYSCTL_BOOL(_kern_geom_eli, OID_AUTO, blocking_malloc, CTLFLAG_RWTUN,
    &g_eli_blocking_malloc, 0, "Use blocking malloc calls for GELI buffers");
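
/*
 * Most of the sysctls above are CTLFLAG_RWTUN: they may be set from
 * loader.conf(5) and changed at runtime with sysctl(8).  For example
 * (values are illustrative only):
 *
 *	kern.geom.eli.threads=2		# /boot/loader.conf
 *	# sysctl kern.geom.eli.batch=1	# at runtime
 */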

/*
 * Control the number of reserved entries in the GELI zone.
 * If the GELI zone has already been allocated, update the zone. Otherwise,
 * simply update the variable for use the next time the zone is created.
 */
static int
sysctl_g_eli_minbufs(SYSCTL_HANDLER_ARGS)
{
	int error;
	u_int new;

	new = g_eli_minbufs;
	error = sysctl_handle_int(oidp, &new, 0, req);
	if (error != 0 || req->newptr == NULL)
		return (error);
	sx_xlock(&g_eli_umalock);
	if (g_eli_uma != NULL) {
		if (new != g_eli_minbufs)
			uma_zone_reserve(g_eli_uma, new);
		if (new > g_eli_minbufs)
			uma_prealloc(g_eli_uma, new - g_eli_minbufs);
	}
	if (new != g_eli_minbufs)
		g_eli_minbufs = new;
	sx_xunlock(&g_eli_umalock);
	return (0);
}
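
/*
 * For example, reserving more buffers for swap traffic could look like
 * this (the value is illustrative only):
 *
 *	# sysctl kern.geom.eli.minbufs=32
 *
 * If the zone already exists, the handler above grows the reservation
 * immediately; otherwise the new value takes effect when the zone is
 * created for the first GELI device.
 */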

/*
 * Passphrase cached during boot, in order to be more user-friendly if
 * there are multiple providers using the same passphrase.
 */
static char cached_passphrase[256];
static u_int g_eli_boot_passcache = 1;
TUNABLE_INT("kern.geom.eli.boot_passcache", &g_eli_boot_passcache);
SYSCTL_UINT(_kern_geom_eli, OID_AUTO, boot_passcache, CTLFLAG_RD,
    &g_eli_boot_passcache, 0,
    "Passphrases are cached during boot process for possible reuse");
static void
fetch_loader_passphrase(void * dummy)
{
	char * env_passphrase;

	KASSERT(dynamic_kenv, ("need dynamic kenv"));

	if ((env_passphrase = kern_getenv("kern.geom.eli.passphrase")) != NULL) {
		/* Extract passphrase from the environment. */
		strlcpy(cached_passphrase, env_passphrase,
		    sizeof(cached_passphrase));
		freeenv(env_passphrase);

		/* Wipe the passphrase from the environment. */
		kern_unsetenv("kern.geom.eli.passphrase");
	}
}
SYSINIT(geli_fetch_loader_passphrase, SI_SUB_KMEM + 1, SI_ORDER_ANY,
    fetch_loader_passphrase, NULL);
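
/*
 * The passphrase may be handed to the kernel by the loader, e.g. via
 * loader.conf(5) (an illustrative fragment):
 *
 *	kern.geom.eli.passphrase="my secret passphrase"
 *
 * It is copied into cached_passphrase above and immediately removed from
 * the kernel environment, so it cannot be read back later with kenv(1).
 */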

static void
zero_boot_passcache(void)
{

	explicit_bzero(cached_passphrase, sizeof(cached_passphrase));
}

static void
zero_geli_intake_keys(void)
{
	struct keybuf *keybuf;
	int i;

	if ((keybuf = get_keybuf()) != NULL) {
		/* Scan the key buffer, clear all GELI keys. */
		for (i = 0; i < keybuf->kb_nents; i++) {
			if (keybuf->kb_ents[i].ke_type == KEYBUF_TYPE_GELI) {
				explicit_bzero(keybuf->kb_ents[i].ke_data,
				    sizeof(keybuf->kb_ents[i].ke_data));
				keybuf->kb_ents[i].ke_type = KEYBUF_TYPE_NONE;
			}
		}
	}
}

static void
zero_intake_passcache(void *dummy)
{
	zero_boot_passcache();
	zero_geli_intake_keys();
}
EVENTHANDLER_DEFINE(mountroot, zero_intake_passcache, NULL, 0);

static eventhandler_tag g_eli_pre_sync = NULL;

static int g_eli_read_metadata_offset(struct g_class *mp, struct g_provider *pp,
    off_t offset, struct g_eli_metadata *md);

static int g_eli_destroy_geom(struct gctl_req *req, struct g_class *mp,
    struct g_geom *gp);
static void g_eli_init(struct g_class *mp);
static void g_eli_fini(struct g_class *mp);

static g_taste_t g_eli_taste;
static g_dumpconf_t g_eli_dumpconf;

struct g_class g_eli_class = {
	.name = G_ELI_CLASS_NAME,
	.version = G_VERSION,
	.ctlreq = g_eli_config,
	.taste = g_eli_taste,
	.destroy_geom = g_eli_destroy_geom,
	.init = g_eli_init,
	.fini = g_eli_fini
};

/*
 * Code paths:
 * BIO_READ:
 *	g_eli_start -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
 * BIO_WRITE:
 *	g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
 */

/*
 * EAGAIN from crypto(9) means that we were probably rebalanced to another
 * crypto accelerator or something similar.
 * The function updates the session ID and reruns the operation.
 */
int
g_eli_crypto_rerun(struct cryptop *crp)
{
	struct g_eli_softc *sc;
	struct g_eli_worker *wr;
	struct bio *bp;
	int error;

	bp = (struct bio *)crp->crp_opaque;
	sc = bp->bio_to->geom->softc;
	LIST_FOREACH(wr, &sc->sc_workers, w_next) {
		if (wr->w_number == G_ELI_WORKER(bp->bio_pflags))
			break;
	}
	KASSERT(wr != NULL, ("Invalid worker (%u).",
	    G_ELI_WORKER(bp->bio_pflags)));
	G_ELI_DEBUG(1, "Rerunning crypto %s request (sid: %p -> %p).",
	    bp->bio_cmd == BIO_READ ? "READ" : "WRITE", wr->w_sid,
	    crp->crp_session);
	wr->w_sid = crp->crp_session;
	crp->crp_etype = 0;
	error = crypto_dispatch(crp);
	if (error == 0)
		return (0);
	G_ELI_DEBUG(1, "%s: crypto_dispatch() returned %d.", __func__, error);
	crp->crp_etype = error;
	return (error);
}

static void
g_eli_getattr_done(struct bio *bp)
{
	if (bp->bio_error == 0 &&
	    !strcmp(bp->bio_attribute, "GEOM::physpath")) {
		strlcat(bp->bio_data, "/eli", bp->bio_length);
	}
	g_std_done(bp);
}
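
/*
 * GEOM::physpath handling above: whatever physical path the backing
 * provider reports gets "/eli" appended, so a hypothetical path of
 * "enc@p1" would be reported as "enc@p1/eli" for the decrypted provider.
 */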

/*
 * This function is called after reading encrypted data from the provider.
 *
 * g_eli_start -> g_eli_crypto_read -> g_io_request -> G_ELI_READ_DONE -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
 */
void
g_eli_read_done(struct bio *bp)
{
	struct g_eli_softc *sc;
	struct bio *pbp;

	G_ELI_LOGREQ(2, bp, "Request done.");
	pbp = bp->bio_parent;
	if (pbp->bio_error == 0 && bp->bio_error != 0)
		pbp->bio_error = bp->bio_error;
	g_destroy_bio(bp);
	/*
	 * Do we have all sectors already?
	 */
	pbp->bio_inbed++;
	if (pbp->bio_inbed < pbp->bio_children)
		return;
	sc = pbp->bio_to->geom->softc;
	if (pbp->bio_error != 0) {
		G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__,
		    pbp->bio_error);
		pbp->bio_completed = 0;
		g_eli_free_data(pbp);
		g_io_deliver(pbp, pbp->bio_error);
		if (sc != NULL)
			atomic_subtract_int(&sc->sc_inflight, 1);
		return;
	}
	mtx_lock(&sc->sc_queue_mtx);
	bioq_insert_tail(&sc->sc_queue, pbp);
	mtx_unlock(&sc->sc_queue_mtx);
	wakeup(sc);
}

/*
 * This function is called after we encrypt and write data.
 *
 * g_eli_start -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> G_ELI_WRITE_DONE -> g_io_deliver
 */
void
g_eli_write_done(struct bio *bp)
{
	struct g_eli_softc *sc;
	struct bio *pbp;

	G_ELI_LOGREQ(2, bp, "Request done.");
	pbp = bp->bio_parent;
	if (pbp->bio_error == 0 && bp->bio_error != 0)
		pbp->bio_error = bp->bio_error;
	g_destroy_bio(bp);
	/*
	 * Do we have all sectors already?
	 */
	pbp->bio_inbed++;
	if (pbp->bio_inbed < pbp->bio_children)
		return;
	sc = pbp->bio_to->geom->softc;
	g_eli_free_data(pbp);
	if (pbp->bio_error != 0) {
		G_ELI_LOGREQ(0, pbp, "%s() failed (error=%d)", __func__,
		    pbp->bio_error);
		pbp->bio_completed = 0;
	} else
		pbp->bio_completed = pbp->bio_length;

	/*
	 * Write is finished, send it up.
	 */
	g_io_deliver(pbp, pbp->bio_error);
	if (sc != NULL)
		atomic_subtract_int(&sc->sc_inflight, 1);
}

/*
 * This function should never be called, but GEOM requires that an
 * ->orphan() method be set for every geom.
 */
static void
g_eli_orphan_spoil_assert(struct g_consumer *cp)
{

	panic("Function %s() called for %s.", __func__, cp->geom->name);
}

static void
g_eli_orphan(struct g_consumer *cp)
{
	struct g_eli_softc *sc;

	g_topology_assert();
	sc = cp->geom->softc;
	if (sc == NULL)
		return;
	g_eli_destroy(sc, TRUE);
}

static void
g_eli_resize(struct g_consumer *cp)
{
	struct g_eli_softc *sc;
	struct g_provider *epp, *pp;
	off_t oldsize;

	g_topology_assert();
	sc = cp->geom->softc;
	if (sc == NULL)
		return;

	if ((sc->sc_flags & G_ELI_FLAG_AUTORESIZE) == 0) {
		G_ELI_DEBUG(0, "Autoresize is turned off, old size: %jd.",
		    (intmax_t)sc->sc_provsize);
		return;
	}

	pp = cp->provider;

	if ((sc->sc_flags & G_ELI_FLAG_ONETIME) == 0) {
		struct g_eli_metadata md;
		u_char *sector;
		int error;

		sector = NULL;

		error = g_eli_read_metadata_offset(cp->geom->class, pp,
		    sc->sc_provsize - pp->sectorsize, &md);
		if (error != 0) {
			G_ELI_DEBUG(0, "Cannot read metadata from %s (error=%d).",
			    pp->name, error);
			goto iofail;
		}

		md.md_provsize = pp->mediasize;

		sector = malloc(pp->sectorsize, M_ELI, M_WAITOK | M_ZERO);
		eli_metadata_encode(&md, sector);
		error = g_write_data(cp, pp->mediasize - pp->sectorsize, sector,
		    pp->sectorsize);
		if (error != 0) {
			G_ELI_DEBUG(0, "Cannot store metadata on %s (error=%d).",
			    pp->name, error);
			goto iofail;
		}
		explicit_bzero(sector, pp->sectorsize);
		error = g_write_data(cp, sc->sc_provsize - pp->sectorsize,
		    sector, pp->sectorsize);
		if (error != 0) {
			G_ELI_DEBUG(0, "Cannot clear old metadata from %s (error=%d).",
			    pp->name, error);
			goto iofail;
		}
iofail:
		explicit_bzero(&md, sizeof(md));
		zfree(sector, M_ELI);
	}

	oldsize = sc->sc_mediasize;
	sc->sc_mediasize = eli_mediasize(sc, pp->mediasize, pp->sectorsize);
	g_eli_key_resize(sc);
	sc->sc_provsize = pp->mediasize;

	epp = LIST_FIRST(&sc->sc_geom->provider);
	g_resize_provider(epp, sc->sc_mediasize);
	G_ELI_DEBUG(0, "Device %s size changed from %jd to %jd.", epp->name,
	    (intmax_t)oldsize, (intmax_t)sc->sc_mediasize);
}

/*
 * BIO_READ:
 *	G_ELI_START -> g_eli_crypto_read -> g_io_request -> g_eli_read_done -> g_eli_crypto_run -> g_eli_crypto_read_done -> g_io_deliver
 * BIO_WRITE:
 *	G_ELI_START -> g_eli_crypto_run -> g_eli_crypto_write_done -> g_io_request -> g_eli_write_done -> g_io_deliver
 */
static void
g_eli_start(struct bio *bp)
{
	struct g_eli_softc *sc;
	struct g_consumer *cp;
	struct bio *cbp;

	sc = bp->bio_to->geom->softc;
	KASSERT(sc != NULL,
	    ("Provider's error should be set (error=%d)(device=%s).",
	    bp->bio_to->error, bp->bio_to->name));
	G_ELI_LOGREQ(2, bp, "Request received.");

	switch (bp->bio_cmd) {
	case BIO_READ:
	case BIO_WRITE:
	case BIO_GETATTR:
	case BIO_FLUSH:
	case BIO_ZONE:
	case BIO_SPEEDUP:
		break;
	case BIO_DELETE:
		/*
		 * If the user hasn't set the NODELETE flag, we just pass
		 * it down the stack and let the layers beneath us do (or
		 * not) whatever they do with it.  If they have, we
		 * reject it.  A possible extension would be an
		 * additional flag to take it as a hint to shred the data
		 * with [multiple?] overwrites.
		 */
		if (!(sc->sc_flags & G_ELI_FLAG_NODELETE))
			break;
	default:
		g_io_deliver(bp, EOPNOTSUPP);
		return;
	}
	cbp = g_clone_bio(bp);
	if (cbp == NULL) {
		g_io_deliver(bp, ENOMEM);
		return;
	}
	bp->bio_driver1 = cbp;
	bp->bio_pflags = 0;
	G_ELI_SET_NEW_BIO(bp->bio_pflags);
	switch (bp->bio_cmd) {
	case BIO_READ:
		if (!(sc->sc_flags & G_ELI_FLAG_AUTH)) {
			g_eli_crypto_read(sc, bp, 0);
			break;
		}
		/* FALLTHROUGH */
	case BIO_WRITE:
		mtx_lock(&sc->sc_queue_mtx);
		bioq_insert_tail(&sc->sc_queue, bp);
		mtx_unlock(&sc->sc_queue_mtx);
		wakeup(sc);
		break;
	case BIO_GETATTR:
	case BIO_FLUSH:
	case BIO_DELETE:
	case BIO_SPEEDUP:
	case BIO_ZONE:
		if (bp->bio_cmd == BIO_GETATTR)
			cbp->bio_done = g_eli_getattr_done;
		else
			cbp->bio_done = g_std_done;
		cp = LIST_FIRST(&sc->sc_geom->consumer);
		cbp->bio_to = cp->provider;
		G_ELI_LOGREQ(2, cbp, "Sending request.");
		g_io_request(cbp, cp);
		break;
	}
}

static int
g_eli_newsession(struct g_eli_worker *wr)
{
	struct g_eli_softc *sc;
	struct crypto_session_params csp;
	uint32_t caps;
	int error, new_crypto;
	void *key;

	sc = wr->w_softc;

	memset(&csp, 0, sizeof(csp));
	csp.csp_mode = CSP_MODE_CIPHER;
	csp.csp_cipher_alg = sc->sc_ealgo;
	csp.csp_ivlen = g_eli_ivlen(sc->sc_ealgo);
	csp.csp_cipher_klen = sc->sc_ekeylen / 8;
	if (sc->sc_ealgo == CRYPTO_AES_XTS)
		csp.csp_cipher_klen <<= 1;
	if ((sc->sc_flags & G_ELI_FLAG_FIRST_KEY) != 0) {
		key = g_eli_key_hold(sc, 0,
		    LIST_FIRST(&sc->sc_geom->consumer)->provider->sectorsize);
		csp.csp_cipher_key = key;
	} else {
		key = NULL;
		csp.csp_cipher_key = sc->sc_ekey;
	}
	if (sc->sc_flags & G_ELI_FLAG_AUTH) {
		csp.csp_mode = CSP_MODE_ETA;
		csp.csp_auth_alg = sc->sc_aalgo;
		csp.csp_auth_klen = G_ELI_AUTH_SECKEYLEN;
	}

	switch (sc->sc_crypto) {
	case G_ELI_CRYPTO_SW_ACCEL:
	case G_ELI_CRYPTO_SW:
		error = crypto_newsession(&wr->w_sid, &csp,
		    CRYPTOCAP_F_SOFTWARE);
		break;
	case G_ELI_CRYPTO_HW:
		error = crypto_newsession(&wr->w_sid, &csp,
		    CRYPTOCAP_F_HARDWARE);
		break;
	case G_ELI_CRYPTO_UNKNOWN:
		error = crypto_newsession(&wr->w_sid, &csp,
		    CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE);
		if (error == 0) {
			caps = crypto_ses2caps(wr->w_sid);
			if (caps & CRYPTOCAP_F_HARDWARE)
				new_crypto = G_ELI_CRYPTO_HW;
			else if (caps & CRYPTOCAP_F_ACCEL_SOFTWARE)
				new_crypto = G_ELI_CRYPTO_SW_ACCEL;
			else
				new_crypto = G_ELI_CRYPTO_SW;
			mtx_lock(&sc->sc_queue_mtx);
			if (sc->sc_crypto == G_ELI_CRYPTO_UNKNOWN)
				sc->sc_crypto = new_crypto;
			mtx_unlock(&sc->sc_queue_mtx);
		}
		break;
	default:
		panic("%s: invalid condition", __func__);
	}

	if ((sc->sc_flags & G_ELI_FLAG_FIRST_KEY) != 0) {
		if (error)
			g_eli_key_drop(sc, key);
		else
			wr->w_first_key = key;
	}

	return (error);
}

static void
g_eli_freesession(struct g_eli_worker *wr)
{
	struct g_eli_softc *sc;

	crypto_freesession(wr->w_sid);
	if (wr->w_first_key != NULL) {
		sc = wr->w_softc;
		g_eli_key_drop(sc, wr->w_first_key);
		wr->w_first_key = NULL;
	}
}

static void
g_eli_cancel(struct g_eli_softc *sc)
{
	struct bio *bp;

	mtx_assert(&sc->sc_queue_mtx, MA_OWNED);

	while ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) {
		KASSERT(G_ELI_IS_NEW_BIO(bp->bio_pflags),
		    ("Not new bio when canceling (bp=%p).", bp));
		g_io_deliver(bp, ENXIO);
	}
}

static struct bio *
g_eli_takefirst(struct g_eli_softc *sc)
{
	struct bio *bp;

	mtx_assert(&sc->sc_queue_mtx, MA_OWNED);

	if (!(sc->sc_flags & G_ELI_FLAG_SUSPEND))
		return (bioq_takefirst(&sc->sc_queue));
	/*
	 * Device suspended, so we skip new I/O requests.
	 */
	TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) {
		if (!G_ELI_IS_NEW_BIO(bp->bio_pflags))
			break;
	}
	if (bp != NULL)
		bioq_remove(&sc->sc_queue, bp);
	return (bp);
}

/*
 * This is the main function of the kernel worker thread, used when we
 * don't have hardware acceleration and have to do cryptography in
 * software.  A dedicated thread is needed so we don't slow down the
 * g_up/g_down GEOM threads with crypto work.
 */
static void
g_eli_worker(void *arg)
{
	struct g_eli_softc *sc;
	struct g_eli_worker *wr;
	struct bio *bp;
	int error;

	wr = arg;
	sc = wr->w_softc;
#ifdef EARLY_AP_STARTUP
	MPASS(!sc->sc_cpubind || smp_started);
#elif defined(SMP)
	/* Before sched_bind() to a CPU, wait for all CPUs to go on-line. */
	if (sc->sc_cpubind) {
		while (!smp_started)
			tsleep(wr, 0, "geli:smp", hz / 4);
	}
#endif
	thread_lock(curthread);
	sched_prio(curthread, PUSER);
	if (sc->sc_cpubind)
		sched_bind(curthread, wr->w_number % mp_ncpus);
	thread_unlock(curthread);

	G_ELI_DEBUG(1, "Thread %s started.", curthread->td_proc->p_comm);

	for (;;) {
		mtx_lock(&sc->sc_queue_mtx);
again:
		bp = g_eli_takefirst(sc);
		if (bp == NULL) {
			if (sc->sc_flags & G_ELI_FLAG_DESTROY) {
				g_eli_cancel(sc);
				LIST_REMOVE(wr, w_next);
				g_eli_freesession(wr);
				free(wr, M_ELI);
				G_ELI_DEBUG(1, "Thread %s exiting.",
				    curthread->td_proc->p_comm);
				wakeup(&sc->sc_workers);
				mtx_unlock(&sc->sc_queue_mtx);
				kproc_exit(0);
			}
			while (sc->sc_flags & G_ELI_FLAG_SUSPEND) {
				if (sc->sc_inflight > 0) {
					G_ELI_DEBUG(0, "inflight=%d",
					    sc->sc_inflight);
					/*
					 * We still have inflight BIOs, so
					 * sleep and retry.
					 */
					msleep(sc, &sc->sc_queue_mtx, PRIBIO,
					    "geli:inf", hz / 5);
					goto again;
				}
				/*
				 * Suspend requested, mark the worker as
				 * suspended and go to sleep.
				 */
				if (wr->w_active) {
					g_eli_freesession(wr);
					wr->w_active = FALSE;
				}
				wakeup(&sc->sc_workers);
				msleep(sc, &sc->sc_queue_mtx, PRIBIO,
				    "geli:suspend", 0);
				if (!wr->w_active &&
				    !(sc->sc_flags & G_ELI_FLAG_SUSPEND)) {
					error = g_eli_newsession(wr);
					KASSERT(error == 0,
					    ("g_eli_newsession() failed on resume (error=%d)",
					    error));
					wr->w_active = TRUE;
				}
				goto again;
			}
			msleep(sc, &sc->sc_queue_mtx, PDROP, "geli:w", 0);
			continue;
		}
		if (G_ELI_IS_NEW_BIO(bp->bio_pflags))
			atomic_add_int(&sc->sc_inflight, 1);
		mtx_unlock(&sc->sc_queue_mtx);
		if (G_ELI_IS_NEW_BIO(bp->bio_pflags)) {
			G_ELI_SETWORKER(bp->bio_pflags, 0);
			if (sc->sc_flags & G_ELI_FLAG_AUTH) {
				if (bp->bio_cmd == BIO_READ)
					g_eli_auth_read(sc, bp);
				else
					g_eli_auth_run(wr, bp);
			} else {
				if (bp->bio_cmd == BIO_READ)
					g_eli_crypto_read(sc, bp, 1);
				else
					g_eli_crypto_run(wr, bp);
			}
		} else {
			if (sc->sc_flags & G_ELI_FLAG_AUTH)
				g_eli_auth_run(wr, bp);
			else
				g_eli_crypto_run(wr, bp);
		}
	}
}

static int
g_eli_read_metadata_offset(struct g_class *mp, struct g_provider *pp,
    off_t offset, struct g_eli_metadata *md)
{
	struct g_geom *gp;
	struct g_consumer *cp;
	u_char *buf = NULL;
	int error;

	g_topology_assert();

	gp = g_new_geomf(mp, "eli:taste");
	gp->start = g_eli_start;
	gp->access = g_std_access;
	/*
	 * g_eli_read_metadata() is always called from the event thread.
	 * Our geom is created and destroyed in the same event, so no
	 * orphan or spoil event can occur in the meantime.
	 */
	gp->orphan = g_eli_orphan_spoil_assert;
	gp->spoiled = g_eli_orphan_spoil_assert;
	cp = g_new_consumer(gp);
	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	error = g_attach(cp, pp);
	if (error != 0)
		goto end;
	error = g_access(cp, 1, 0, 0);
	if (error != 0)
		goto end;
	g_topology_unlock();
	buf = g_read_data(cp, offset, pp->sectorsize, &error);
	g_topology_lock();
	if (buf == NULL)
		goto end;
	error = eli_metadata_decode(buf, md);
	if (error != 0)
		goto end;
	/* Metadata was read and decoded successfully. */
end:
	if (buf != NULL)
		g_free(buf);
	if (cp->provider != NULL) {
		if (cp->acr == 1)
			g_access(cp, -1, 0, 0);
		g_detach(cp);
	}
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	return (error);
}

int
g_eli_read_metadata(struct g_class *mp, struct g_provider *pp,
    struct g_eli_metadata *md)
{

	return (g_eli_read_metadata_offset(mp, pp,
	    pp->mediasize - pp->sectorsize, md));
}

/*
 * This function is called on the last close of the provider, if the user
 * requested that it be detached when this situation occurs.
 */
static void
g_eli_last_close(void *arg, int flags __unused)
{
	struct g_geom *gp;
	char gpname[64];
	int error;

	g_topology_assert();
	gp = arg;
	strlcpy(gpname, gp->name, sizeof(gpname));
	error = g_eli_destroy(gp->softc, TRUE);
	KASSERT(error == 0, ("Cannot detach %s on last close (error=%d).",
	    gpname, error));
	G_ELI_DEBUG(0, "Detached %s on last close.", gpname);
}

int
g_eli_access(struct g_provider *pp, int dr, int dw, int de)
{
	struct g_eli_softc *sc;
	struct g_geom *gp;

	gp = pp->geom;
	sc = gp->softc;

	if (dw > 0) {
		if (sc->sc_flags & G_ELI_FLAG_RO) {
			/* Deny write attempts. */
			return (EROFS);
		}
		/* Someone is opening us for write, we need to remember that. */
		sc->sc_flags |= G_ELI_FLAG_WOPEN;
		return (0);
	}
	/* Is this the last close? */
	if (pp->acr + dr > 0 || pp->acw + dw > 0 || pp->ace + de > 0)
		return (0);

	/*
	 * Automatically detach on last close if requested.
	 */
	if ((sc->sc_flags & G_ELI_FLAG_RW_DETACH) ||
	    (sc->sc_flags & G_ELI_FLAG_WOPEN)) {
		g_post_event(g_eli_last_close, gp, M_WAITOK, NULL);
	}
	return (0);
}

static int
g_eli_cpu_is_disabled(int cpu)
{
#ifdef SMP
	return (CPU_ISSET(cpu, &hlt_cpus_mask));
#else
	return (0);
#endif
}

static void
g_eli_init_uma(void)
{

	atomic_add_int(&g_eli_devs, 1);
	sx_xlock(&g_eli_umalock);
	if (g_eli_uma == NULL) {
		/*
		 * Calculate the maximum-sized swap buffer we are
		 * likely to see.
		 */
		g_eli_alloc_sz = roundup2((PAGE_SIZE + sizeof(int) +
		    G_ELI_AUTH_SECKEYLEN) * nsw_cluster_max +
		    sizeof(uintptr_t), PAGE_SIZE);

		/*
		 * Create the zone, setting UMA_ZONE_NOFREE so we won't
		 * drain the zone in a memory shortage.
		 */
		g_eli_uma = uma_zcreate("GELI buffers", g_eli_alloc_sz,
		    NULL, NULL, NULL, NULL,
		    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);

		/* Reserve and pre-allocate pages, as appropriate. */
		uma_zone_reserve(g_eli_uma, g_eli_minbufs);
		uma_prealloc(g_eli_uma, g_eli_minbufs);
	}
	sx_xunlock(&g_eli_umalock);
}
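
/*
 * As a worked example of the g_eli_alloc_sz computation above, with
 * hypothetical values of PAGE_SIZE = 4096, nsw_cluster_max = 32,
 * G_ELI_AUTH_SECKEYLEN = 32, sizeof(int) = 4 and sizeof(uintptr_t) = 8:
 *
 *	(4096 + 4 + 32) * 32 + 8 = 132232
 *	roundup2(132232, 4096)   = 135168	(33 pages per buffer)
 */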

/*
 * Try to destroy the UMA pool. This will do nothing if there are existing
 * GELI devices or existing UMA allocations.
 */
static void
g_eli_destroy_uma(void)
{
	uma_zone_t oldzone;

	sx_xlock(&g_eli_umalock);
	/* Ensure we really should be destroying this. */
	if (atomic_load_int(&g_eli_devs) == 0 &&
	    atomic_load_int(&g_eli_umaoutstanding) == 0) {
		oldzone = g_eli_uma;
		g_eli_uma = NULL;
	} else
		oldzone = NULL;
	sx_xunlock(&g_eli_umalock);

	if (oldzone != NULL)
		uma_zdestroy(oldzone);
}

static void
g_eli_fini_uma(void)
{

	/*
	 * If this is the last outstanding GELI device, try to
	 * destroy the UMA pool.
	 */
	if (atomic_fetchadd_int(&g_eli_devs, -1) == 1)
		g_eli_destroy_uma();
}

/*
 * Allocate a data buffer. If the size fits within our swap-sized buffers,
 * try to allocate a swap-sized buffer from the UMA pool. Otherwise, fall
 * back to using malloc.
 *
 * Swap-related requests are special: they can only use the UMA pool, they
 * use M_USE_RESERVE to let them dip farther into system resources, and
 * they always use M_NOWAIT to prevent swap operations from deadlocking.
 */
bool
g_eli_alloc_data(struct bio *bp, int sz)
{

	KASSERT(sz <= g_eli_alloc_sz || (bp->bio_flags & BIO_SWAP) == 0,
	    ("BIO_SWAP request for %d bytes exceeds the precalculated buffer"
	    " size (%d)", sz, g_eli_alloc_sz));
	if (sz <= g_eli_alloc_sz) {
		bp->bio_driver2 = uma_zalloc(g_eli_uma, M_NOWAIT |
		    ((bp->bio_flags & BIO_SWAP) != 0 ? M_USE_RESERVE : 0));
		if (bp->bio_driver2 != NULL) {
			bp->bio_pflags |= G_ELI_UMA_ALLOC;
			atomic_add_int(&g_eli_umaoutstanding, 1);
		}
		if (bp->bio_driver2 != NULL || (bp->bio_flags & BIO_SWAP) != 0)
			return (bp->bio_driver2 != NULL);
	}
	bp->bio_pflags &= ~(G_ELI_UMA_ALLOC);
	bp->bio_driver2 = malloc(sz, M_ELI, g_eli_blocking_malloc ? M_WAITOK :
	    M_NOWAIT);
	return (bp->bio_driver2 != NULL);
}
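
/*
 * A sketch of the expected caller pattern for the allocator above (error
 * handling only, not a complete request path):
 *
 *	if (!g_eli_alloc_data(bp, sz)) {
 *		g_io_deliver(bp, ENOMEM);
 *		return;
 *	}
 *	...use bp->bio_driver2...
 *	g_eli_free_data(bp);
 */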

/*
 * Free a buffer from bp->bio_driver2 which was allocated with
 * g_eli_alloc_data(). This function makes sure that the memory is freed
 * to the correct place.
 *
 * Additionally, if this function frees the last outstanding UMA request
 * and there are no open GELI devices, this will destroy the UMA pool.
 */
void
g_eli_free_data(struct bio *bp)
{

	/*
	 * Mimic the free(9) behavior of allowing a NULL pointer to be
	 * freed.
	 */
	if (bp->bio_driver2 == NULL)
		return;

	if ((bp->bio_pflags & G_ELI_UMA_ALLOC) != 0) {
		uma_zfree(g_eli_uma, bp->bio_driver2);
		if (atomic_fetchadd_int(&g_eli_umaoutstanding, -1) == 1 &&
		    atomic_load_int(&g_eli_devs) == 0)
			g_eli_destroy_uma();
	} else
		free(bp->bio_driver2, M_ELI);
	bp->bio_driver2 = NULL;
}

struct g_geom *
g_eli_create(struct gctl_req *req, struct g_class *mp, struct g_provider *bpp,
    const struct g_eli_metadata *md, const u_char *mkey, int nkey)
{
	struct g_eli_softc *sc;
	struct g_eli_worker *wr;
	struct g_geom *gp;
	struct g_provider *pp;
	struct g_consumer *cp;
	struct g_geom_alias *gap;
	u_int i, threads;
	int dcw, error;

	G_ELI_DEBUG(1, "Creating device %s%s.", bpp->name, G_ELI_SUFFIX);
	KASSERT(eli_metadata_crypto_supported(md),
	    ("%s: unsupported crypto for %s", __func__, bpp->name));

	gp = g_new_geomf(mp, "%s%s", bpp->name, G_ELI_SUFFIX);
	sc = malloc(sizeof(*sc), M_ELI, M_WAITOK | M_ZERO);
	gp->start = g_eli_start;
	/*
	 * Spoiling can happen even though we have the provider open
	 * exclusively, e.g. through media change events.
	 */
	gp->spoiled = g_eli_orphan;
	gp->orphan = g_eli_orphan;
	gp->resize = g_eli_resize;
	gp->dumpconf = g_eli_dumpconf;
	/*
	 * If the detach-on-last-close feature is not enabled and we don't
	 * operate on a read-only provider, we can simply use g_std_access().
	 */
	if (md->md_flags & (G_ELI_FLAG_WO_DETACH | G_ELI_FLAG_RO))
		gp->access = g_eli_access;
	else
		gp->access = g_std_access;

	eli_metadata_softc(sc, md, bpp->sectorsize, bpp->mediasize);
	sc->sc_nkey = nkey;

	gp->softc = sc;
	sc->sc_geom = gp;

	bioq_init(&sc->sc_queue);
	mtx_init(&sc->sc_queue_mtx, "geli:queue", NULL, MTX_DEF);
	mtx_init(&sc->sc_ekeys_lock, "geli:ekeys", NULL, MTX_DEF);

	pp = NULL;
	cp = g_new_consumer(gp);
	cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
	error = g_attach(cp, bpp);
	if (error != 0) {
		if (req != NULL) {
			gctl_error(req, "Cannot attach to %s (error=%d).",
			    bpp->name, error);
		} else {
			G_ELI_DEBUG(1, "Cannot attach to %s (error=%d).",
			    bpp->name, error);
		}
		goto failed;
	}
	/*
	 * Keep the provider open all the time, so we can run critical
	 * tasks, like Master Key deletion, without wondering whether we
	 * can open the provider or not.
	 * We only skip opening the provider for writing when the user
	 * requested read-only access.
	 */
	dcw = (sc->sc_flags & G_ELI_FLAG_RO) ? 0 : 1;
	error = g_access(cp, 1, dcw, 1);
	if (error != 0) {
		if (req != NULL) {
			gctl_error(req, "Cannot access %s (error=%d).",
			    bpp->name, error);
		} else {
			G_ELI_DEBUG(1, "Cannot access %s (error=%d).",
			    bpp->name, error);
		}
		goto failed;
	}

	/*
	 * Remember the keys in our softc structure.
	 */
	g_eli_mkey_propagate(sc, mkey);

	LIST_INIT(&sc->sc_workers);

	threads = g_eli_threads;
	if (threads == 0)
		threads = mp_ncpus;
	sc->sc_cpubind = (mp_ncpus > 1 && threads == mp_ncpus);
	g_eli_init_uma();
	for (i = 0; i < threads; i++) {
		if (g_eli_cpu_is_disabled(i)) {
			G_ELI_DEBUG(1, "%s: CPU %u disabled, skipping.",
			    bpp->name, i);
			continue;
		}
		wr = malloc(sizeof(*wr), M_ELI, M_WAITOK | M_ZERO);
		wr->w_softc = sc;
		wr->w_number = i;
		wr->w_active = TRUE;

		error = g_eli_newsession(wr);
		if (error != 0) {
			free(wr, M_ELI);
			if (req != NULL) {
				gctl_error(req, "Cannot set up crypto session "
				    "for %s (error=%d).", bpp->name, error);
			} else {
				G_ELI_DEBUG(1, "Cannot set up crypto session "
				    "for %s (error=%d).", bpp->name, error);
			}
			goto failed;
		}

		error = kproc_create(g_eli_worker, wr, &wr->w_proc, 0, 0,
		    "g_eli[%u] %s", i, bpp->name);
		if (error != 0) {
			g_eli_freesession(wr);
			free(wr, M_ELI);
			if (req != NULL) {
				gctl_error(req, "Cannot create kernel thread "
				    "for %s (error=%d).", bpp->name, error);
			} else {
				G_ELI_DEBUG(1, "Cannot create kernel thread "
				    "for %s (error=%d).", bpp->name, error);
			}
			goto failed;
		}
		LIST_INSERT_HEAD(&sc->sc_workers, wr, w_next);
	}

	/*
	 * Create the decrypted provider.
	 */
	pp = g_new_providerf(gp, "%s%s", bpp->name, G_ELI_SUFFIX);
	pp->flags |= G_PF_DIRECT_SEND | G_PF_DIRECT_RECEIVE;
	if (CRYPTO_HAS_VMPAGE) {
		/*
		 * On DMAP architectures we can use unmapped I/O.  But don't
		 * use it with data integrity verification.  That code hasn't
		 * been written yet.
		 */
		if ((sc->sc_flags & G_ELI_FLAG_AUTH) == 0)
			pp->flags |= G_PF_ACCEPT_UNMAPPED;
	}
	pp->mediasize = sc->sc_mediasize;
	pp->sectorsize = sc->sc_sectorsize;
	LIST_FOREACH(gap, &bpp->aliases, ga_next)
		g_provider_add_alias(pp, "%s%s", gap->ga_alias, G_ELI_SUFFIX);

	g_error_provider(pp, 0);

	G_ELI_DEBUG(0, "Device %s created.", pp->name);
	G_ELI_DEBUG(0, "Encryption: %s %u", g_eli_algo2str(sc->sc_ealgo),
	    sc->sc_ekeylen);
	if (sc->sc_flags & G_ELI_FLAG_AUTH)
		G_ELI_DEBUG(0, " Integrity: %s", g_eli_algo2str(sc->sc_aalgo));
	G_ELI_DEBUG(0, "    Crypto: %s",
	    sc->sc_crypto == G_ELI_CRYPTO_SW_ACCEL ? "accelerated software" :
	    sc->sc_crypto == G_ELI_CRYPTO_SW ? "software" : "hardware");
	return (gp);
failed:
	mtx_lock(&sc->sc_queue_mtx);
	sc->sc_flags |= G_ELI_FLAG_DESTROY;
	wakeup(sc);
	/*
	 * Wait for the kernel threads to self-destruct.
	 */
	while (!LIST_EMPTY(&sc->sc_workers)) {
		msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO,
		    "geli:destroy", 0);
	}
	mtx_destroy(&sc->sc_queue_mtx);
	if (cp->provider != NULL) {
		if (cp->acr == 1)
			g_access(cp, -1, -dcw, -1);
		g_detach(cp);
	}
	g_destroy_consumer(cp);
	g_destroy_geom(gp);
	g_eli_key_destroy(sc);
	g_eli_fini_uma();
	zfree(sc, M_ELI);
	return (NULL);
}

int
g_eli_destroy(struct g_eli_softc *sc, boolean_t force)
{
	struct g_geom *gp;
	struct g_provider *pp;

	g_topology_assert();

	if (sc == NULL)
		return (ENXIO);

	gp = sc->sc_geom;
	pp = LIST_FIRST(&gp->provider);
	if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) {
		if (force) {
			G_ELI_DEBUG(1, "Device %s is still open, so it "
			    "cannot be definitively removed.", pp->name);
			sc->sc_flags |= G_ELI_FLAG_RW_DETACH;
			gp->access = g_eli_access;
			g_wither_provider(pp, ENXIO);
			return (EBUSY);
		} else {
			G_ELI_DEBUG(1,
			    "Device %s is still open (r%dw%de%d).", pp->name,
			    pp->acr, pp->acw, pp->ace);
			return (EBUSY);
		}
	}

	mtx_lock(&sc->sc_queue_mtx);
	sc->sc_flags |= G_ELI_FLAG_DESTROY;
	wakeup(sc);
	while (!LIST_EMPTY(&sc->sc_workers)) {
		msleep(&sc->sc_workers, &sc->sc_queue_mtx, PRIBIO,
		    "geli:destroy", 0);
	}
	mtx_destroy(&sc->sc_queue_mtx);
	gp->softc = NULL;
	g_eli_key_destroy(sc);
	g_eli_fini_uma();
	zfree(sc, M_ELI);

	G_ELI_DEBUG(0, "Device %s destroyed.", gp->name);
	g_wither_geom_close(gp, ENXIO);

	return (0);
}

static int
g_eli_destroy_geom(struct gctl_req *req __unused,
    struct g_class *mp __unused, struct g_geom *gp)
{
	struct g_eli_softc *sc;

	sc = gp->softc;
	return (g_eli_destroy(sc, FALSE));
}

static int
g_eli_keyfiles_load(struct hmac_ctx *ctx, const char *provider)
{
	u_char *keyfile, *data;
	char *file, name[64];
	size_t size;
	int i;

	for (i = 0; ; i++) {
		snprintf(name, sizeof(name), "%s:geli_keyfile%d", provider, i);
		keyfile = preload_search_by_type(name);
		if (keyfile == NULL && i == 0) {
			/*
			 * If there is only one keyfile, allow a simpler name.
			 */
			snprintf(name, sizeof(name), "%s:geli_keyfile", provider);
			keyfile = preload_search_by_type(name);
		}
		if (keyfile == NULL)
			return (i);	/* Return number of loaded keyfiles. */
		data = preload_fetch_addr(keyfile);
		if (data == NULL) {
			G_ELI_DEBUG(0, "Cannot find key file data for %s.",
			    name);
			return (0);
		}
		size = preload_fetch_size(keyfile);
		if (size == 0) {
			G_ELI_DEBUG(0, "Cannot find key file size for %s.",
			    name);
			return (0);
		}
		file = preload_search_info(keyfile, MODINFO_NAME);
		if (file == NULL) {
			G_ELI_DEBUG(0, "Cannot find key file name for %s.",
			    name);
			return (0);
		}
		G_ELI_DEBUG(1, "Loaded keyfile %s for %s (type: %s).", file,
		    provider, name);
		g_eli_crypto_hmac_update(ctx, data, size);
	}
}
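
/*
 * Key files are preloaded by the loader.  A hypothetical loader.conf(5)
 * fragment matching the "%s:geli_keyfile%d" naming used above:
 *
 *	geli_ada0p3_keyfile0_load="YES"
 *	geli_ada0p3_keyfile0_type="ada0p3:geli_keyfile0"
 *	geli_ada0p3_keyfile0_name="/boot/keys/ada0p3.key"
 */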

static void
g_eli_keyfiles_clear(const char *provider)
{
	u_char *keyfile, *data;
	char name[64];
	size_t size;
	int i;

	for (i = 0; ; i++) {
		snprintf(name, sizeof(name), "%s:geli_keyfile%d", provider, i);
		keyfile = preload_search_by_type(name);
		if (keyfile == NULL)
			return;
		data = preload_fetch_addr(keyfile);
		size = preload_fetch_size(keyfile);
		if (data != NULL && size != 0)
			explicit_bzero(data, size);
	}
}

/*
 * Tasting is only performed on boot.
 * We detect providers which should be attached before root is mounted.
 */
static struct g_geom *
g_eli_taste(struct g_class *mp, struct g_provider *pp, int flags __unused)
{
	struct g_eli_metadata md;
	struct g_geom *gp;
	struct hmac_ctx ctx;
	char passphrase[256];
	u_char key[G_ELI_USERKEYLEN], mkey[G_ELI_DATAIVKEYLEN];
	u_int i, nkey, nkeyfiles, tries, showpass;
	int error;
	struct keybuf *keybuf;

	g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name);
	g_topology_assert();

	if (root_mounted() || g_eli_tries == 0)
		return (NULL);

	G_ELI_DEBUG(3, "Tasting %s.", pp->name);

	error = g_eli_read_metadata(mp, pp, &md);
	if (error != 0)
		return (NULL);
	gp = NULL;

	if (strcmp(md.md_magic, G_ELI_MAGIC) != 0)
		return (NULL);
	if (md.md_version > G_ELI_VERSION) {
		printf("geom_eli.ko module is too old to handle %s.\n",
		    pp->name);
		return (NULL);
	}
	if (md.md_provsize != pp->mediasize)
		return (NULL);
	/* Should we attach it on boot? */
	if (!(md.md_flags & G_ELI_FLAG_BOOT) &&
	    !(md.md_flags & G_ELI_FLAG_GELIBOOT))
		return (NULL);
	if (md.md_keys == 0x00) {
		G_ELI_DEBUG(0, "No valid keys on %s.", pp->name);
		return (NULL);
	}
	if (!eli_metadata_crypto_supported(&md)) {
		G_ELI_DEBUG(0, "%s uses invalid or unsupported algorithms\n",
		    pp->name);
		return (NULL);
	}
	if (md.md_iterations == -1) {
		/* If there is no passphrase, we try only once. */
		tries = 1;
	} else {
		/* Ask for the passphrase no more than g_eli_tries times. */
		tries = g_eli_tries;
	}

	if ((keybuf = get_keybuf()) != NULL) {
		/* Scan the key buffer, try all GELI keys. */
		for (i = 0; i < keybuf->kb_nents; i++) {
			if (keybuf->kb_ents[i].ke_type == KEYBUF_TYPE_GELI) {
				memcpy(key, keybuf->kb_ents[i].ke_data,
				    sizeof(key));

				if (g_eli_mkey_decrypt_any(&md, key,
				    mkey, &nkey) == 0) {
					explicit_bzero(key, sizeof(key));
					goto have_key;
				}
			}
		}
	}

	for (i = 0; i <= tries; i++) {
		g_eli_crypto_hmac_init(&ctx, NULL, 0);

		/*
		 * Load all key files.
		 */
		nkeyfiles = g_eli_keyfiles_load(&ctx, pp->name);

		if (nkeyfiles == 0 && md.md_iterations == -1) {
			/*
			 * No key files and no passphrase, something is
			 * definitely wrong here.
			 * geli(8) doesn't allow for such a situation, so
			 * assume that there really was no passphrase and
			 * that the key files are not properly defined in
			 * loader.conf.
			 */
			G_ELI_DEBUG(0,
			    "Found no key files in loader.conf for %s.",
			    pp->name);
			return (NULL);
		}

		/* Ask for the passphrase if defined. */
		if (md.md_iterations >= 0) {
			/* Try first with cached passphrase. */
			if (i == 0) {
				if (!g_eli_boot_passcache)
					continue;
				memcpy(passphrase, cached_passphrase,
				    sizeof(passphrase));
			} else {
				printf("Enter passphrase for %s: ", pp->name);
				showpass = g_eli_visible_passphrase;
				if ((md.md_flags & G_ELI_FLAG_GELIDISPLAYPASS) != 0)
					showpass = GETS_ECHOPASS;
				cngets(passphrase, sizeof(passphrase),
				    showpass);
				memcpy(cached_passphrase, passphrase,
				    sizeof(passphrase));
			}
		}

		/*
		 * Prepare Derived-Key from the user passphrase.
		 */
		if (md.md_iterations == 0) {
			g_eli_crypto_hmac_update(&ctx, md.md_salt,
			    sizeof(md.md_salt));
			g_eli_crypto_hmac_update(&ctx, passphrase,
			    strlen(passphrase));
			explicit_bzero(passphrase, sizeof(passphrase));
		} else if (md.md_iterations > 0) {
			u_char dkey[G_ELI_USERKEYLEN];

			pkcs5v2_genkey(dkey, sizeof(dkey), md.md_salt,
			    sizeof(md.md_salt), passphrase, md.md_iterations);
			explicit_bzero(passphrase, sizeof(passphrase));
			g_eli_crypto_hmac_update(&ctx, dkey, sizeof(dkey));
			explicit_bzero(dkey, sizeof(dkey));
		}

		g_eli_crypto_hmac_final(&ctx, key, 0);

		/*
		 * Decrypt Master-Key.
		 */
		error = g_eli_mkey_decrypt_any(&md, key, mkey, &nkey);
		explicit_bzero(key, sizeof(key));
		if (error == -1) {
			if (i == tries) {
				G_ELI_DEBUG(0,
				    "Wrong key for %s. No tries left.",
				    pp->name);
				g_eli_keyfiles_clear(pp->name);
				return (NULL);
			}
			if (i > 0) {
				G_ELI_DEBUG(0,
				    "Wrong key for %s. Tries left: %u.",
				    pp->name, tries - i);
			}
			/* Try again. */
			continue;
		} else if (error > 0) {
			G_ELI_DEBUG(0,
			    "Cannot decrypt Master Key for %s (error=%d).",
			    pp->name, error);
			g_eli_keyfiles_clear(pp->name);
			return (NULL);
		}
		g_eli_keyfiles_clear(pp->name);
		G_ELI_DEBUG(1, "Using Master Key %u for %s.", nkey, pp->name);
		break;
	}
have_key:

	/*
	 * We have the correct key, let's attach the provider.
	 */
	gp = g_eli_create(NULL, mp, pp, &md, mkey, nkey);
	explicit_bzero(mkey, sizeof(mkey));
	explicit_bzero(&md, sizeof(md));
	if (gp == NULL) {
		G_ELI_DEBUG(0, "Cannot create device %s%s.", pp->name,
		    G_ELI_SUFFIX);
		return (NULL);
	}
	return (gp);
}

static void
g_eli_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_eli_softc *sc;

	g_topology_assert();
	sc = gp->softc;
	if (sc == NULL)
		return;
	if (pp != NULL || cp != NULL)
		return;	/* Nothing here. */

	sbuf_printf(sb, "%s<KeysTotal>%ju</KeysTotal>\n", indent,
	    (uintmax_t)sc->sc_ekeys_total);
	sbuf_printf(sb, "%s<KeysAllocated>%ju</KeysAllocated>\n", indent,
	    (uintmax_t)sc->sc_ekeys_allocated);
	sbuf_printf(sb, "%s<Flags>", indent);
	if (sc->sc_flags == 0)
		sbuf_cat(sb, "NONE");
	else {
		int first = 1;

#define ADD_FLAG(flag, name)	do {					\
	if (sc->sc_flags & (flag)) {					\
		if (!first)						\
			sbuf_cat(sb, ", ");				\
		else							\
			first = 0;					\
		sbuf_cat(sb, name);					\
	}								\
} while (0)
		ADD_FLAG(G_ELI_FLAG_SUSPEND, "SUSPEND");
		ADD_FLAG(G_ELI_FLAG_SINGLE_KEY, "SINGLE-KEY");
		ADD_FLAG(G_ELI_FLAG_NATIVE_BYTE_ORDER, "NATIVE-BYTE-ORDER");
		ADD_FLAG(G_ELI_FLAG_ONETIME, "ONETIME");
		ADD_FLAG(G_ELI_FLAG_BOOT, "BOOT");
		ADD_FLAG(G_ELI_FLAG_WO_DETACH, "W-DETACH");
		ADD_FLAG(G_ELI_FLAG_RW_DETACH, "RW-DETACH");
		ADD_FLAG(G_ELI_FLAG_AUTH, "AUTH");
		ADD_FLAG(G_ELI_FLAG_WOPEN, "W-OPEN");
		ADD_FLAG(G_ELI_FLAG_DESTROY, "DESTROY");
		ADD_FLAG(G_ELI_FLAG_RO, "READ-ONLY");
		ADD_FLAG(G_ELI_FLAG_NODELETE, "NODELETE");
		ADD_FLAG(G_ELI_FLAG_GELIBOOT, "GELIBOOT");
		ADD_FLAG(G_ELI_FLAG_GELIDISPLAYPASS, "GELIDISPLAYPASS");
		ADD_FLAG(G_ELI_FLAG_AUTORESIZE, "AUTORESIZE");
#undef  ADD_FLAG
	}
	sbuf_cat(sb, "</Flags>\n");

	if (!(sc->sc_flags & G_ELI_FLAG_ONETIME)) {
		sbuf_printf(sb, "%s<UsedKey>%u</UsedKey>\n", indent,
		    sc->sc_nkey);
	}
	sbuf_printf(sb, "%s<Version>%u</Version>\n", indent, sc->sc_version);
	sbuf_printf(sb, "%s<Crypto>", indent);
	switch (sc->sc_crypto) {
	case G_ELI_CRYPTO_HW:
		sbuf_cat(sb, "hardware");
		break;
	case G_ELI_CRYPTO_SW:
		sbuf_cat(sb, "software");
		break;
	case G_ELI_CRYPTO_SW_ACCEL:
		sbuf_cat(sb, "accelerated software");
		break;
	default:
		sbuf_cat(sb, "UNKNOWN");
		break;
	}
	sbuf_cat(sb, "</Crypto>\n");
	if (sc->sc_flags & G_ELI_FLAG_AUTH) {
		sbuf_printf(sb,
		    "%s<AuthenticationAlgorithm>%s</AuthenticationAlgorithm>\n",
		    indent, g_eli_algo2str(sc->sc_aalgo));
	}
	sbuf_printf(sb, "%s<KeyLength>%u</KeyLength>\n", indent,
	    sc->sc_ekeylen);
	sbuf_printf(sb, "%s<EncryptionAlgorithm>%s</EncryptionAlgorithm>\n",
	    indent, g_eli_algo2str(sc->sc_ealgo));
	sbuf_printf(sb, "%s<State>%s</State>\n", indent,
	    (sc->sc_flags & G_ELI_FLAG_SUSPEND) ? "SUSPENDED" : "ACTIVE");
}
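
/*
 * The function above emits the geli-specific fragment of the XML exposed
 * via the kern.geom.confxml sysctl and used by the userland geom(8)
 * tools.  Abbreviated sample output (values are illustrative only):
 *
 *	<KeysTotal>4</KeysTotal>
 *	<KeysAllocated>4</KeysAllocated>
 *	<Flags>BOOT, AUTORESIZE</Flags>
 *	<UsedKey>0</UsedKey>
 *	<Version>7</Version>
 *	<Crypto>accelerated software</Crypto>
 *	<KeyLength>256</KeyLength>
 *	<EncryptionAlgorithm>AES-XTS</EncryptionAlgorithm>
 *	<State>ACTIVE</State>
 */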

static void
g_eli_shutdown_pre_sync(void *arg, int howto)
{
	struct g_class *mp;
	struct g_geom *gp, *gp2;
	struct g_provider *pp;
	struct g_eli_softc *sc;
	int error;

	mp = arg;
	g_topology_lock();
	LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) {
		sc = gp->softc;
		if (sc == NULL)
			continue;
		pp = LIST_FIRST(&gp->provider);
		KASSERT(pp != NULL, ("No provider? gp=%p (%s)", gp, gp->name));
		if (pp->acr != 0 || pp->acw != 0 || pp->ace != 0 ||
		    SCHEDULER_STOPPED())
		{
			sc->sc_flags |= G_ELI_FLAG_RW_DETACH;
			gp->access = g_eli_access;
		} else {
			error = g_eli_destroy(sc, TRUE);
		}
	}
	g_topology_unlock();
}

static void
g_eli_init(struct g_class *mp)
{

	g_eli_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync,
	    g_eli_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST);
	if (g_eli_pre_sync == NULL)
		G_ELI_DEBUG(0, "Warning! Cannot register shutdown event.");
}

static void
g_eli_fini(struct g_class *mp)
{

	if (g_eli_pre_sync != NULL)
		EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_eli_pre_sync);
}

DECLARE_GEOM_CLASS(g_eli_class, g_eli);
MODULE_DEPEND(g_eli, crypto, 1, 1, 1);
MODULE_VERSION(geom_eli, 0);